<a href="https://colab.research.google.com/github/sainath5466/asdf/blob/main/notebooks/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qq pycaret category_encoders yellowbrick

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.6/169.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.1/486.1 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.8/106.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.8/21.8 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m11.4 MB/s[0m et

In [2]:
# some builtin imports
import re
import warnings
from collections import Counter
import time
from datetime import datetime
warnings.filterwarnings('ignore')

# Some usual imports here
import csv
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl


# sklearn models
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn import metrics, model_selection
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import (
    StandardScaler
)

from category_encoders.one_hot import OneHotEncoder

# pycaret
from pycaret.internal.preprocess.transformers import (
    CleanColumnNames,
    RemoveMulticollinearity,
    RemoveOutliers,
    TransformerWrapper,
    TransformerWrapperWithInverse,
    EmbedTextFeatures,
)
from pycaret.internal.pipeline import Pipeline
from pycaret.internal.memory import Memory

np.random.seed(17)

In [3]:
## Customize Matplotlib Parameters
%matplotlib inline
mpl.rcParams['figure.dpi']= 120
mpl.rcParams['figure.edgecolor']= 'black'
mpl.rcParams['axes.linewidth']= .5
# Customize Seaborn Parameters
sns.set()
rc = {
      'font.family': ['serif'],
      'font.serif':'Times New Roman',
      'grid.color': 'gainsboro',
      'grid.linestyle': '-',
}
sns.set_style(rc=rc)
sns.set_context("notebook", font_scale=0.8)

# Load dataset

In [6]:
df = pd.read_csv('house_rentals.csv')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17890 entries, 0 to 17889
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   url            17890 non-null  object 
 1   listing_type   17890 non-null  object 
 2   name           17890 non-null  object 
 3   price          17890 non-null  int64  
 4   category       17890 non-null  object 
 5   bedrooms       17890 non-null  float64
 6   bathrooms      17890 non-null  float64
 7   floor_area     17890 non-null  float64
 8   location       17890 non-null  object 
 9   condition      17890 non-null  object 
 10  amenities      17890 non-null  object 
 11  region         17890 non-null  object 
 12  locality       17890 non-null  object 
 13  parking_space  17890 non-null  bool   
 14  is_furnished   17890 non-null  object 
 15  lat            17890 non-null  float64
 16  lng            17890 non-null  float64
dtypes: bool(1), float64(5), int64(1), object(10)
memor

# Feature engineering

In [8]:
drop_cols = ['name', 'url', 'listing_type', 'location', 'region', 'locality']
df = df.drop(columns=drop_cols)

## Amenities count

In [9]:
df['amenities_count'] = df['amenities'].apply(lambda x: len(x.split(',')))

## Furnishing

In [10]:
df['furnishing'] = df['is_furnished']
df = df.drop(columns=['is_furnished'])

## Parking space

In [11]:
df.parking_space.value_counts()

Unnamed: 0_level_0,count
parking_space,Unnamed: 1_level_1
False,17875
True,15


In [12]:
df['parking_space'] = df['parking_space'].apply(lambda x: 1 if x == True else 0)

In [13]:
df.parking_space.value_counts()

Unnamed: 0_level_0,count
parking_space,Unnamed: 1_level_1
0,17875
1,15


## Price transformation

In [14]:
# Move the 'price' column to the end
price_column = df['price']
df = df.drop(columns=['price'])
df['price'] = price_column

df['log1p_price'] = np.log1p(df['price'])

# Preprocessing Pipeline

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17890 entries, 0 to 17889
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   category         17890 non-null  object 
 1   bedrooms         17890 non-null  float64
 2   bathrooms        17890 non-null  float64
 3   floor_area       17890 non-null  float64
 4   condition        17890 non-null  object 
 5   amenities        17890 non-null  object 
 6   parking_space    17890 non-null  int64  
 7   lat              17890 non-null  float64
 8   lng              17890 non-null  float64
 9   amenities_count  17890 non-null  int64  
 10  furnishing       17890 non-null  object 
 11  price            17890 non-null  int64  
 12  log1p_price      17890 non-null  float64
dtypes: float64(6), int64(3), object(4)
memory usage: 1.8+ MB


In [16]:
target_columns = ["log1p_price", "price"]
one_hot_cols = ["category", "condition", "furnishing"]
text_cols = ["amenities"]
text_features_method = "bow"
bool_cols = ['parking_space']
numeric_cols = list(set(df.columns) - set(target_columns) - set(one_hot_cols) - set(text_cols) - set(bool_cols))

# one-hot encoding of categorical columns
category_encoding = TransformerWrapper(
    transformer=OneHotEncoder(
        cols=one_hot_cols,
        use_cat_names=True,
        handle_missing="return_nan",
        handle_unknown="value",
    ),
    include=one_hot_cols,
)

# text emebeding for amenities using BOW
embed_kwards = dict(tokenizer=lambda x: x.split(','), binary=True, max_features=20)
text_embedding = TransformerWrapper(
    transformer=EmbedTextFeatures(method=text_features_method, kwargs=embed_kwards),
    include=text_cols,
)
# transformer to remove highly correlated features
remove_multicollinearity = TransformerWrapper(
    exclude=[], transformer=RemoveMulticollinearity(threshold=0.9)
)
# remove outliers
# remove_outliers = TransformerWrapper(
#     transformer=RemoveOutliers(random_state=123, threshold="auto")
# )

# normalize numeric features
normalize = TransformerWrapper(transformer=StandardScaler(), include=numeric_cols)
clean_column_names = TransformerWrapper(transformer=CleanColumnNames())

preprocessor = Pipeline(
    steps=[
        ("category_encoding", category_encoding),
        ("text_embedding", text_embedding),
        ("remove_multicollinearity", remove_multicollinearity),
        # ("remove_outliers", remove_outliers),
        ("normalize", normalize),
        ("clean_column_names", clean_column_names),
    ],
)

# # prompt: transform the df using the preprocessor and convert the output to a dataframe with the transformed columns
trans_df = pd.DataFrame(preprocessor.fit_transform(df))
trans_df.head()

Unnamed: 0,category_Flats,category_Detached,category_Townhouse,category_Duplex,category_Mansion,category_Semi-Detached,bedrooms,bathrooms,floor_area,condition_Used,...,amenities_wi-fi,parking_space,lat,lng,amenities_count,furnishing_Semi-Furnished,furnishing_Unfurnished,furnishing_Furnished,price,log1p_price
0,1.0,0.0,0.0,0.0,0.0,0.0,-0.363125,-0.409811,-0.393436,1.0,...,1,0,-0.198154,0.264372,1.026783,1.0,0.0,0.0,9196,9.126633
1,1.0,0.0,0.0,0.0,0.0,0.0,-0.363125,-0.409811,-0.436503,1.0,...,1,0,-0.245904,0.158057,0.306848,0.0,1.0,0.0,7500,8.922792
2,0.0,1.0,0.0,0.0,0.0,0.0,1.220567,1.767869,-0.436503,0.0,...,0,0,-0.10051,0.339132,1.50674,1.0,0.0,0.0,11200,9.323758
3,0.0,1.0,0.0,0.0,0.0,0.0,0.428721,0.316082,-0.515938,1.0,...,0,0,-0.314007,0.440336,-0.893045,1.0,0.0,0.0,2500,7.824446
4,0.0,1.0,0.0,0.0,0.0,0.0,0.428721,0.316082,-0.379081,1.0,...,1,0,-0.194226,0.372127,1.026783,1.0,0.0,0.0,9146,9.121181


In [17]:
trans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17890 entries, 0 to 17889
Data columns (total 39 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   category_Flats                 17890 non-null  float64
 1   category_Detached              17890 non-null  float64
 2   category_Townhouse             17890 non-null  float64
 3   category_Duplex                17890 non-null  float64
 4   category_Mansion               17890 non-null  float64
 5   category_Semi-Detached         17890 non-null  float64
 6   bedrooms                       17890 non-null  float64
 7   bathrooms                      17890 non-null  float64
 8   floor_area                     17890 non-null  float64
 9   condition_Used                 17890 non-null  float64
 10  condition_New                  17890 non-null  float64
 11  condition_Renovated            17890 non-null  float64
 12  amenities_24-hour electricity  17890 non-null 

In [18]:
df.to_csv("house_rental_final.csv", index=False)

In [19]:
!cp house_rental_final.csv "/content/drive/MyDrive/Datasets/"

cp: cannot create regular file '/content/drive/MyDrive/Datasets/': No such file or directory
