In [113]:
import sys
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
from typing import Any
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from typing import Any
import warnings
warnings.simplefilter(action='ignore')

#matplotlib.rcParams["figure.figsize"] = (20,10)

# Constants

In [114]:
# Initialize filepaths 

PREPROCESSED_FILE_PATH = 'preprocessed/'

# Reading the Data

In [115]:
pd.set_option('display.max_rows', None)
df = pd.read_csv('apartments_for_rent.csv', sep = ';', encoding = 'ISO-8859-1')

In [116]:
df.columns


Index(['id', 'category', 'title', 'body', 'amenities', 'bathrooms', 'bedrooms',
       'currency', 'fee', 'has_photo', 'pets_allowed', 'price',
       'price_display', 'price_type', 'square_feet', 'address', 'cityname',
       'state', 'latitude', 'longitude', 'source', 'time'],
      dtype='object')

# Inspecting Data

In [117]:
df.describe()

Unnamed: 0,id,bathrooms,bedrooms,price,square_feet,latitude,longitude,time
count,99492.0,99429.0,99368.0,99491.0,99492.0,99467.0,99467.0,99492.0
mean,5358321000.0,1.445323,1.728212,1527.057281,956.430688,36.947988,-91.568656,1559665000.0
std,184740400.0,0.547021,0.7492,904.245882,417.571522,4.599461,15.817168,11050770.0
min,5121046000.0,1.0,0.0,100.0,101.0,19.5738,-159.3698,1544174000.0
25%,5197950000.0,1.0,1.0,1013.0,729.0,33.7465,-104.7919,1550832000.0
50%,5508673000.0,1.0,2.0,1350.0,900.0,37.2282,-84.5623,1568745000.0
75%,5509007000.0,2.0,2.0,1795.0,1115.0,39.953,-77.6082,1568767000.0
max,5669439000.0,9.0,9.0,52500.0,50000.0,64.8332,-68.7788,1577391000.0


In [118]:
def summarise_data(df):
    """Print a four-part profile of ``df`` in the notebook.

    For each section a heading is printed and the matching object is shown
    with ``display``: the first rows, ``describe(include="all")``, the
    fraction of null values per column, and the frame's shape.
    """
    sections = (
        ("\nOverview", lambda frame: frame.head()),
        ("\nSummary", lambda frame: frame.describe(include="all")),
        # Null *fraction* per column (null count divided by row count).
        ("\nNull Values", lambda frame: frame.isnull().sum() / len(frame)),
        ("\nShape", lambda frame: frame.shape),
    )
    for heading, build in sections:
        print(heading)
        display(build(df))


In [119]:
summarise_data(df)


Overview


Unnamed: 0,id,category,title,body,amenities,bathrooms,bedrooms,currency,fee,has_photo,...,price_display,price_type,square_feet,address,cityname,state,latitude,longitude,source,time
0,5668640009,housing/rent/apartment,One BR 507 & 509 Esplanade,"This unit is located at 507 & 509 Esplanade, R...",,1.0,1.0,USD,No,Thumbnail,...,"$2,195",Monthly,542,507 509 Esplanade,Redondo Beach,CA,33.852,-118.3759,RentLingo,1577360355
1,5668639818,housing/rent/apartment,Three BR 146 Lochview Drive,"This unit is located at 146 Lochview Drive, Ne...",,1.5,3.0,USD,No,Thumbnail,...,"$1,250",Monthly,1500,146 Lochview Dr,Newport News,VA,37.0867,-76.4941,RentLingo,1577360340
2,5668639686,housing/rent/apartment,Three BR 3101 Morningside Drive,This unit is located at 3101 Morningside Drive...,,2.0,3.0,USD,No,Thumbnail,...,"$1,395",Monthly,1650,3101 Morningside Dr,Raleigh,NC,35.823,-78.6438,RentLingo,1577360332
3,5668639659,housing/rent/apartment,Two BR 209 Aegean Way,"This unit is located at 209 Aegean Way, Vacavi...",,1.0,2.0,USD,No,Thumbnail,...,"$1,600",Monthly,820,209 Aegean Way,Vacaville,CA,38.3622,-121.9712,RentLingo,1577360330
4,5668639374,housing/rent/apartment,One BR 4805 Marquette NE,"This unit is located at 4805 Marquette NE, Alb...",,1.0,1.0,USD,No,Thumbnail,...,$975,Monthly,624,4805 Marquette NE,Albuquerque,NM,35.1038,-106.611,RentLingo,1577360308



Summary


Unnamed: 0,id,category,title,body,amenities,bathrooms,bedrooms,currency,fee,has_photo,...,price_display,price_type,square_feet,address,cityname,state,latitude,longitude,source,time
count,99492.0,99492,99492,99492,83448,99429.0,99368.0,99492,99492,99492,...,99491,99492,99492.0,7943,99190,99190,99467.0,99467.0,99492,99492.0
unique,,7,58503,94503,9827,,,1,2,3,...,3718,3,,7771,2979,51,,,25,
top,,housing/rent/apartment,Apartment in great location,"When searching for a pet-friendly One-, Two- a...",Parking,,,USD,No,Yes,...,"$1,350",Monthly,,8215 S.W 72nd Avenue,Dallas,TX,,,RentDigs.com,
freq,,99431,1064,43,6188,,,99492,99291,55974,...,574,99488,,19,2858,11257,,,90912,
mean,5358321000.0,,,,,1.445323,1.728212,,,,...,,,956.430688,,,,36.947988,-91.568656,,1559665000.0
std,184740400.0,,,,,0.547021,0.7492,,,,...,,,417.571522,,,,4.599461,15.817168,,11050770.0
min,5121046000.0,,,,,1.0,0.0,,,,...,,,101.0,,,,19.5738,-159.3698,,1544174000.0
25%,5197950000.0,,,,,1.0,1.0,,,,...,,,729.0,,,,33.7465,-104.7919,,1550832000.0
50%,5508673000.0,,,,,1.0,2.0,,,,...,,,900.0,,,,37.2282,-84.5623,,1568745000.0
75%,5509007000.0,,,,,2.0,2.0,,,,...,,,1115.0,,,,39.953,-77.6082,,1568767000.0



Null Values


id               0.000000
category         0.000000
title            0.000000
body             0.000000
amenities        0.161259
bathrooms        0.000633
bedrooms         0.001246
currency         0.000000
fee              0.000000
has_photo        0.000000
pets_allowed     0.556366
price            0.000010
price_display    0.000010
price_type       0.000000
square_feet      0.000000
address          0.920164
cityname         0.003035
state            0.003035
latitude         0.000251
longitude        0.000251
source           0.000000
time             0.000000
dtype: float64


Shape


(99492, 22)

In [120]:
#df.fillna(0, inplace=True)

# Cleaning Data


### Label encoding fees, amenities, category & pets_allowed

#### Handling categorical values: fee

In [122]:
df.value_counts('fee')

fee
No     99291
Yes      201
dtype: int64

In [123]:
# Encode the fee flag as integers.
# NOTE(review): 'No' becomes 1 and 'Yes' becomes 0, the reverse of the usual
# yes=1 convention — confirm downstream consumers expect this.
for answer, code in (('No', 1), ('Yes', 0)):
    df.loc[df['fee'] == answer, 'fee'] = code

In [124]:
df.value_counts('fee')

fee
1    99291
0      201
dtype: int64

In [125]:
#Handling categorical values: amenities

In [126]:
df.value_counts('amenities')

amenities
Parking                                                                                                                                                                                     6188
Parking,Storage                                                                                                                                                                             2116
Gym,Pool                                                                                                                                                                                    1871
Pool                                                                                                                                                                                        1485
Gym,Parking,Pool                                                                                                                                                                            1187
Parking,Pool             

In [127]:
df['amenities'] = df['amenities'].astype(str)


In [128]:
# Lower-case and trim each amenities string (the column was cast to str in the
# previous cell, so missing values are the literal 'nan' at this point).
df['amenities'] = [
    (text.lower() if isinstance(text, str) else text).strip()
    for text in df['amenities']
]

# Collect the distinct amenity tokens across all comma-separated lists.
results = set()
for amenity_list in df['amenities'].str.lower().str.split(','):
    results.update(amenity_list)
print(results)
len(results)

{'washer dryer', 'hot tub', 'tennis', 'pool', 'wood floors', 'internet access', 'parking', 'doorman', 'gated', 'cable or satellite', 'alarm', 'golf', 'elevator', 'garbage disposal', 'luxury', 'playground', 'ac', 'refrigerator', 'storage', 'clubhouse', 'dishwasher', 'view', 'tv', 'patio/deck', 'fireplace', 'basketball', 'gym', 'nan'}


28

In [129]:
df_am = df['amenities'].str.split(',',expand = True)


In [130]:
# Empty slots (rows with fewer amenities than the widest row) and the literal
# 'nan' produced by the earlier str cast both become 0.
df_am = df_am.fillna(0).replace('nan', 0)

In [131]:
# Replace every amenity token in df_am with an integer tier score (1-5).
# Each amenity belongs to exactly one tier; tokens not listed are left as-is.
list_5 = ['luxury']
list_4 = ['basketball','tennis','doorman','clubhouse','playground','gym','golf','pool','view']
list_3 = ['storage','wood floors','fireplace','patio/deck','gated','elevator','parking','garbage disposal']
# 'parking' was previously listed in this tier as well, but it had already been
# replaced by 3 (list_3) before this tier was applied, so that entry could never
# match. It is removed here; parking keeps its score of 3.
list_2 = ['alarm','internet access']
list_1 = ['refrigerator','dishwasher','ac','cable or satellite','tv','washer dryer','hot tub']

# One replace call per tier; the tiers are disjoint, so the order is irrelevant.
for val, tier in ((5, list_5), (4, list_4), (3, list_3), (2, list_2), (1, list_1)):
    df_am = df_am.replace(tier, val)

In [132]:
df_am.head(100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [135]:
df = pd.concat([df,df_am], axis = 'columns')

In [136]:
df.head()

Unnamed: 0,id,category,title,body,amenities,bathrooms,bedrooms,currency,fee,has_photo,...,8,9,10,11,12,13,14,15,16,17
0,5668640009,housing/rent/apartment,One BR 507 & 509 Esplanade,"This unit is located at 507 & 509 Esplanade, R...",,1.0,1.0,USD,1,Thumbnail,...,0,0,0,0,0,0,0,0,0,0
1,5668639818,housing/rent/apartment,Three BR 146 Lochview Drive,"This unit is located at 146 Lochview Drive, Ne...",,1.5,3.0,USD,1,Thumbnail,...,0,0,0,0,0,0,0,0,0,0
2,5668639686,housing/rent/apartment,Three BR 3101 Morningside Drive,This unit is located at 3101 Morningside Drive...,,2.0,3.0,USD,1,Thumbnail,...,0,0,0,0,0,0,0,0,0,0
3,5668639659,housing/rent/apartment,Two BR 209 Aegean Way,"This unit is located at 209 Aegean Way, Vacavi...",,1.0,2.0,USD,1,Thumbnail,...,0,0,0,0,0,0,0,0,0,0
4,5668639374,housing/rent/apartment,One BR 4805 Marquette NE,"This unit is located at 4805 Marquette NE, Alb...",,1.0,1.0,USD,1,Thumbnail,...,0,0,0,0,0,0,0,0,0,0


In [137]:
df.columns

Index([           'id',      'category',         'title',          'body',
           'amenities',     'bathrooms',      'bedrooms',      'currency',
                 'fee',     'has_photo',  'pets_allowed',         'price',
       'price_display',    'price_type',   'square_feet',       'address',
            'cityname',         'state',      'latitude',     'longitude',
              'source',          'time',               0,               1,
                     2,               3,               4,               5,
                     6,               7,               8,               9,
                    10,              11,              12,              13,
                    14,              15,              16,              17],
      dtype='object')

In [138]:
# Columns that came from the raw CSV keep their names; every other column
# (the positional amenity-score columns appended by the concat) gets an
# 'amm_' prefix.
base_columns = (
    'id', 'category', 'title', 'body', 'amenities', 'bathrooms', 'bedrooms', 'currency',
    'fee', 'has_photo', 'pets_allowed', 'price', 'price_display', 'price_type', 'square_feet',
    'address', 'cityname', 'state', 'latitude', 'longitude', 'source', 'time',
)
renamed_columns = []
for label in df.columns:
    if label in base_columns:
        renamed_columns.append(f'{label}')
    else:
        renamed_columns.append(f'amm_{label}')
df.columns = renamed_columns

# The raw amenities text is no longer needed once the score columns exist.
df.drop(columns=['amenities'], inplace=True)

In [139]:
#Handling categorical values: category

In [140]:
df['category'].unique()

array(['housing/rent/apartment', 'housing/rent/home',
       'housing/rent/short_term', 'housing/rent', 'housing/rent/condo',
       'housing/rent/other', 'housing/rent/commercial/retail'],
      dtype=object)

In [141]:
# Keep only the last path segment of the category,
# e.g. 'housing/rent/commercial/retail' -> 'retail'.
category_parts = df['category'].str.split('/')
df['category'] = category_parts.str.get(-1)

In [142]:
df.value_counts('category')

category
apartment     99431
retail           42
rent              7
home              4
short_term        4
condo             3
other             1
dtype: int64

In [143]:
# Any listing with three or more bedrooms is relabelled as 'home',
# whatever its original category was.
has_three_plus_bedrooms = df['bedrooms'] >= 3
df.loc[has_three_plus_bedrooms, 'category'] = 'home'
df.value_counts('category')

category
apartment     87039
home          12403
retail           42
short_term        4
condo             2
rent              2
dtype: int64

In [144]:
# NOTE(review): 'other' is set to the integer 0, whereas every other rare label is
# merged into 'apartment' or 'home'. The value_counts output above shows no 'other'
# rows remain after the bedrooms >= 3 rule, so this is currently a no-op; if such
# rows ever appear they would keep the code 0, which the later apartment=1 / home=2
# encoding does not cover. Confirm the intended target class.
df.loc[df.category == 'other', 'category'] = 0

In [145]:
df.value_counts('category')

category
apartment     87039
home          12403
retail           42
short_term        4
condo             2
rent              2
dtype: int64

In [146]:
# Fold the short-term listings into the 'apartment' class.
# (The former `df.set_index('category')` call was removed: its result was
# discarded, so it only built a throw-away copy of the frame.)
df.loc[df.category == 'short_term', 'category'] = 'apartment'

In [147]:
# Collapse the remaining rare labels into the two classes kept for modelling.
category_merges = (
    ('condo', 'apartment'),
    ('rent', 'home'),
    ('retail', 'apartment'),
)
for rare_label, target_label in category_merges:
    df.loc[df.category == rare_label, 'category'] = target_label

In [186]:
df.head()

Unnamed: 0,id,category,title,body,bathrooms,bedrooms,currency,fee,has_photo,pets_allowed,...,amm_8,amm_9,amm_10,amm_11,amm_12,amm_13,amm_14,amm_15,amm_16,amm_17
0,5668640009,1,One BR 507 & 509 Esplanade,"This unit is located at 507 & 509 Esplanade, R...",1.0,1.0,USD,1,Thumbnail,1,...,0,0,0,0,0,0,0,0,0,0
1,5668639818,2,Three BR 146 Lochview Drive,"This unit is located at 146 Lochview Drive, Ne...",1.5,3.0,USD,1,Thumbnail,3,...,0,0,0,0,0,0,0,0,0,0
2,5668639686,2,Three BR 3101 Morningside Drive,This unit is located at 3101 Morningside Drive...,2.0,3.0,USD,1,Thumbnail,0,...,0,0,0,0,0,0,0,0,0,0
3,5668639659,1,Two BR 209 Aegean Way,"This unit is located at 209 Aegean Way, Vacavi...",1.0,2.0,USD,1,Thumbnail,3,...,0,0,0,0,0,0,0,0,0,0
4,5668639374,1,One BR 4805 Marquette NE,"This unit is located at 4805 Marquette NE, Alb...",1.0,1.0,USD,1,Thumbnail,3,...,0,0,0,0,0,0,0,0,0,0


In [148]:
df.value_counts('category')

category
apartment    87087
home         12405
dtype: int64

In [149]:
# Integer-encode the two remaining classes: apartment -> 1, home -> 2.
category_codes = {'apartment': 1, 'home': 2}
for label, code in category_codes.items():
    df.loc[df.category == label, 'category'] = code

In [150]:
df.value_counts('category')

category
1    87087
2    12405
dtype: int64

In [151]:
#Handling categorical values: pets_allowed

In [152]:
df.value_counts('pets_allowed')

pets_allowed
Cats,Dogs         37097
None               5070
Cats               1843
Dogs                127
Cats,Dogs,None        1
dtype: int64

In [153]:
# Integer-encode the pet policy. Rows where pets_allowed is missing are left
# untouched (still NaN) by this cell.
# (The former `df.set_index('pets_allowed')` call was removed: its result was
# discarded, so it had no effect.)
# NOTE(review): newer pandas releases may parse the literal 'None' as missing in
# read_csv — verify the 'None' rows still exist if the environment is upgraded.
pet_codes = {
    'Cats,Dogs': 3,
    'Cats': 1,
    'Dogs': 1,
    'Cats,Dogs,None': 3,
    'None': 0,
}
for label, code in pet_codes.items():
    df.loc[df.pets_allowed == label, 'pets_allowed'] = code

In [154]:
df.value_counts('pets_allowed')

pets_allowed
3    37098
0     5070
1     1970
dtype: int64

In [157]:
df.value_counts('price_type')

price_type
Monthly           99488
Weekly                3
Monthly|Weekly        1
dtype: int64

## Replace the `Monthly|Weekly` and `Weekly` prices in the price column with monthly prices

In [158]:

# Make price numeric, then overwrite the price of the single 'Monthly|Weekly'
# listing with a monthly figure and relabel its price_type.
# NOTE(review): row label 7245 and the value 1195 are hard-coded — presumably
# taken from inspecting the 'Monthly|Weekly' row; verify against the raw file.
df['price'] = df['price'].astype(float)
df.loc[7245, 'price'] = 1195
df['price_type'] = df['price_type'].replace(to_replace='Monthly|Weekly', value='Monthly')

#df.iloc[7245]

In [159]:
# Overwrite the prices of the three 'Weekly' listings with monthly figures and
# relabel their price_type.
# NOTE(review): the row labels and prices below are hard-coded — presumably
# derived by hand from the weekly prices; verify against the raw file.
weekly_overrides = {6729: 6778, 49908: 36935, 83395: 3693}
for row_label, monthly_price in weekly_overrides.items():
    df.loc[row_label, 'price'] = monthly_price
df['price_type'] = df['price_type'].replace(to_replace='Weekly', value='Monthly')


In [160]:
#df.iloc[83395]

In [161]:
#df.loc[df['price_type'] == 'Weekly']['price']
df.value_counts('price_type')

price_type
Monthly    99492
dtype: int64

In [162]:
#Data Cleaning : Converting all the price values to monthly
df.head()

Unnamed: 0,id,category,title,body,bathrooms,bedrooms,currency,fee,has_photo,pets_allowed,...,amm_8,amm_9,amm_10,amm_11,amm_12,amm_13,amm_14,amm_15,amm_16,amm_17
0,5668640009,1,One BR 507 & 509 Esplanade,"This unit is located at 507 & 509 Esplanade, R...",1.0,1.0,USD,1,Thumbnail,1,...,0,0,0,0,0,0,0,0,0,0
1,5668639818,2,Three BR 146 Lochview Drive,"This unit is located at 146 Lochview Drive, Ne...",1.5,3.0,USD,1,Thumbnail,3,...,0,0,0,0,0,0,0,0,0,0
2,5668639686,2,Three BR 3101 Morningside Drive,This unit is located at 3101 Morningside Drive...,2.0,3.0,USD,1,Thumbnail,0,...,0,0,0,0,0,0,0,0,0,0
3,5668639659,1,Two BR 209 Aegean Way,"This unit is located at 209 Aegean Way, Vacavi...",1.0,2.0,USD,1,Thumbnail,3,...,0,0,0,0,0,0,0,0,0,0
4,5668639374,1,One BR 4805 Marquette NE,"This unit is located at 4805 Marquette NE, Alb...",1.0,1.0,USD,1,Thumbnail,3,...,0,0,0,0,0,0,0,0,0,0


In [163]:
# Build the modelling frame by dropping identifier, free-text, display and
# metadata columns from df.
unused_columns = [
    'id', 'title', 'body', 'currency', 'has_photo', 'latitude', 'longitude',
    'price_display', 'price_type', 'address', 'source', 'time',
]
df1 = df.drop(columns=unused_columns)

In [164]:
df1.head(100)


Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,amm_0,...,amm_8,amm_9,amm_10,amm_11,amm_12,amm_13,amm_14,amm_15,amm_16,amm_17
0,1,1.0,1.0,1,1,2195.0,542,Redondo Beach,CA,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1.5,3.0,1,3,1250.0,1500,Newport News,VA,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2.0,3.0,1,0,1395.0,1650,Raleigh,NC,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1.0,2.0,1,3,1600.0,820,Vacaville,CA,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1.0,1.0,1,3,975.0,624,Albuquerque,NM,0,...,0,0,0,0,0,0,0,0,0,0
5,1,1.5,2.0,1,3,1250.0,965,Albuquerque,NM,0,...,0,0,0,0,0,0,0,0,0,0
6,1,2.0,2.0,1,0,1600.0,1120,Hampton,VA,0,...,0,0,0,0,0,0,0,0,0,0
7,1,2.0,2.0,1,3,1300.0,947,Lakewood,CO,0,...,0,0,0,0,0,0,0,0,0,0
8,1,1.0,2.0,1,1,795.0,600,Charleston,WV,0,...,0,0,0,0,0,0,0,0,0,0
9,1,2.0,2.0,1,0,2150.0,1005,Chatsworth,CA,0,...,0,0,0,0,0,0,0,0,0,0


In [165]:
df1.value_counts('bedrooms')

bedrooms
2.0    45975
1.0    40331
3.0    10561
4.0     1498
0.0      662
5.0      286
6.0       47
7.0        4
8.0        3
9.0        1
dtype: int64

In [166]:
df1.value_counts('square_feet')
df1.value_counts('price')

price
1350.00     574
850.00      547
1200.00     538
950.00      528
1100.00     522
1500.00     514
1250.00     512
1450.00     498
1400.00     489
995.00      465
1300.00     463
1000.00     447
900.00      444
1295.00     431
750.00      425
1550.00     423
1650.00     423
1050.00     418
1395.00     410
895.00      409
1150.00     408
1600.00     407
1800.00     404
1095.00     400
800.00      392
875.00      389
975.00      388
1495.00     385
925.00      377
1750.00     371
1595.00     366
1695.00     357
795.00      356
1700.00     355
2000.00     345
1325.00     334
1195.00     330
1850.00     330
825.00      323
775.00      320
2200.00     317
725.00      308
1175.00     308
1375.00     306
1225.00     301
650.00      301
1025.00     297
1475.00     296
700.00      286
1425.00     284
1275.00     276
1795.00     272
1125.00     271
2300.00     271
695.00      269
1075.00     264
2100.00     256
1525.00     248
1900.00     244
2500.00     242
1245.00     239
675.00      239
19

In [167]:
display(df1.isnull().sum())


category            0
bathrooms          63
bedrooms          124
fee                 0
pets_allowed    55354
price               1
square_feet         0
cityname          302
state             302
amm_0               0
amm_1               0
amm_2               0
amm_3               0
amm_4               0
amm_5               0
amm_6               0
amm_7               0
amm_8               0
amm_9               0
amm_10              0
amm_11              0
amm_12              0
amm_13              0
amm_14              0
amm_15              0
amm_16              0
amm_17              0
dtype: int64

In [168]:
df1.head()

Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,amm_0,...,amm_8,amm_9,amm_10,amm_11,amm_12,amm_13,amm_14,amm_15,amm_16,amm_17
0,1,1.0,1.0,1,1,2195.0,542,Redondo Beach,CA,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1.5,3.0,1,3,1250.0,1500,Newport News,VA,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2.0,3.0,1,0,1395.0,1650,Raleigh,NC,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1.0,2.0,1,3,1600.0,820,Vacaville,CA,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1.0,1.0,1,3,975.0,624,Albuquerque,NM,0,...,0,0,0,0,0,0,0,0,0,0


In [169]:
df1.head()

Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,amm_0,...,amm_8,amm_9,amm_10,amm_11,amm_12,amm_13,amm_14,amm_15,amm_16,amm_17
0,1,1.0,1.0,1,1,2195.0,542,Redondo Beach,CA,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1.5,3.0,1,3,1250.0,1500,Newport News,VA,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2.0,3.0,1,0,1395.0,1650,Raleigh,NC,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1.0,2.0,1,3,1600.0,820,Vacaville,CA,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1.0,1.0,1,3,975.0,624,Albuquerque,NM,0,...,0,0,0,0,0,0,0,0,0,0


In [170]:
df1.head()

Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,amm_0,...,amm_8,amm_9,amm_10,amm_11,amm_12,amm_13,amm_14,amm_15,amm_16,amm_17
0,1,1.0,1.0,1,1,2195.0,542,Redondo Beach,CA,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1.5,3.0,1,3,1250.0,1500,Newport News,VA,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2.0,3.0,1,0,1395.0,1650,Raleigh,NC,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1.0,2.0,1,3,1600.0,820,Vacaville,CA,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1.0,1.0,1,3,975.0,624,Albuquerque,NM,0,...,0,0,0,0,0,0,0,0,0,0


# Feature Engineering to create new feature : Economic Analysis Region

In [171]:
# We will apply categorical binning to the state column, which has 51 unique values.
# We will group the 51 states into 8 groups based on the U.S. Bureau of Economic Analysis regions:
#1.New England Region 
#2.Great Lakes Region 
#3.Southwest Region 
#4.Mideast Region
#5.Plains Region
#6.Far West Region
#7.Southeast Region
#8.Rocky Mountain Region

In [172]:
df1.head()
df1.value_counts('state')
len(df1.state.unique())

52

In [173]:
df1.state.unique()
state = df1.state

In [174]:
groups = {
         'New England' : ('ME','VT','NH','MA','CT','RI'),
         'Mideast' : ('DE','NJ','NY','PA','DC','MD'),
         'Great Lakes' : ('IL','IN','OH','WI','MI'),
         'Plains' : ('IA','KS','MN','MO','NE','ND','SD'),
         'Southeast' : ('AL','AR','FL','GA','KY','LA','MS','NC','SC','TN','VA','WV'),
         'Southwest' : ('AZ','NM','OK','TX'),
         'Rocky Mountain' : ('CO','ID','MT','UT','WY'),
         'Far West' : ('WA','OR','NV','CA','AK','HI')
         }

In [175]:
def state_group_map(series:pd.Series, groups:dict, othervalue: Any=-1) -> pd.Series:
    """Map each value of ``series`` to the name of the group that contains it.

    Args:
        series: Values to look up (here, state codes).
        groups: Mapping of group name -> iterable of member values.
        othervalue: Replacement for values that belong to no group
            (including missing values).

    Returns:
        A Series of group names aligned with ``series``.
    """
    # Invert {group: members} into a flat {member: group} lookup.
    member_to_group = {}
    for group_name, members in groups.items():
        for member in members:
            member_to_group[member] = group_name

    mapped = series.map(member_to_group)
    return mapped.fillna(othervalue)

In [176]:
# Region name per row; states outside the eight groups (and missing states) become 'other'.
grouped_states = state_group_map(state, groups, othervalue='other')
# Two-column lookup frame: the original state next to its economic region.
states_grouped = pd.DataFrame({
    'state': df1.state,
    'economic_region': grouped_states,
})

In [177]:
# df2 = df1 plus the economic_region column (rows are aligned on the shared index).
df2 = df1.assign(economic_region=states_grouped['economic_region'])

In [178]:
# Save df1 (city and state columns still present) to the preprocessed folder.
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# replacing the separate exists() check.
# NOTE(review): df1 is written here before the later drop_duplicates cell runs,
# so this CSV still contains duplicate rows — confirm that is intended.
os.makedirs(PREPROCESSED_FILE_PATH, exist_ok=True)

df1.to_csv(os.path.join(PREPROCESSED_FILE_PATH, '_With_Cities_States_Cleaned.csv'),index=False)

In [179]:
# Save df2 (df1 plus the economic_region column) to the preprocessed folder.
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# replacing the separate exists() check.
os.makedirs(PREPROCESSED_FILE_PATH, exist_ok=True)

df2.to_csv(os.path.join(PREPROCESSED_FILE_PATH, '_With_regions_States_Cleaned.csv'),index=False)

In [180]:
df1.head()

Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,amm_0,...,amm_8,amm_9,amm_10,amm_11,amm_12,amm_13,amm_14,amm_15,amm_16,amm_17
0,1,1.0,1.0,1,1,2195.0,542,Redondo Beach,CA,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1.5,3.0,1,3,1250.0,1500,Newport News,VA,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2.0,3.0,1,0,1395.0,1650,Raleigh,NC,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1.0,2.0,1,3,1600.0,820,Vacaville,CA,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1.0,1.0,1,3,975.0,624,Albuquerque,NM,0,...,0,0,0,0,0,0,0,0,0,0


In [181]:
df2.head()

Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,amm_0,...,amm_9,amm_10,amm_11,amm_12,amm_13,amm_14,amm_15,amm_16,amm_17,economic_region
0,1,1.0,1.0,1,1,2195.0,542,Redondo Beach,CA,0,...,0,0,0,0,0,0,0,0,0,Far West
1,2,1.5,3.0,1,3,1250.0,1500,Newport News,VA,0,...,0,0,0,0,0,0,0,0,0,Southeast
2,2,2.0,3.0,1,0,1395.0,1650,Raleigh,NC,0,...,0,0,0,0,0,0,0,0,0,Southeast
3,1,1.0,2.0,1,3,1600.0,820,Vacaville,CA,0,...,0,0,0,0,0,0,0,0,0,Far West
4,1,1.0,1.0,1,3,975.0,624,Albuquerque,NM,0,...,0,0,0,0,0,0,0,0,0,Southwest


In [182]:
df1.drop_duplicates(inplace=True)

In [183]:
df1.shape

(89219, 27)

In [184]:
len(df1.cityname.unique())

2980

In [185]:
df1.head()

Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,amm_0,...,amm_8,amm_9,amm_10,amm_11,amm_12,amm_13,amm_14,amm_15,amm_16,amm_17
0,1,1.0,1.0,1,1,2195.0,542,Redondo Beach,CA,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1.5,3.0,1,3,1250.0,1500,Newport News,VA,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2.0,3.0,1,0,1395.0,1650,Raleigh,NC,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1.0,2.0,1,3,1600.0,820,Vacaville,CA,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1.0,1.0,1,3,975.0,624,Albuquerque,NM,0,...,0,0,0,0,0,0,0,0,0,0


# One Hot encoding Cityname feature

In [None]:
city_name = df1.groupby('cityname')['cityname'].agg('count').sort_values(ascending = False)
city_name

In [None]:
dummies = pd.get_dummies(df1.cityname)
dummies.head()

In [None]:
#df1 = pd.concat([df1,dummies.drop('Mililani',axis = 'columns')], axis = 'columns')
df1 = pd.concat([df1,dummies], axis = 'columns')

In [None]:
df1 = df1.drop('cityname', axis = 1)

In [None]:
# NOTE(review): this fills every remaining NaN in df1 with 0, not only the city
# dummies. The earlier null summary shows gaps in bathrooms, bedrooms, pets_allowed,
# price and state, so a missing price becomes 0, a missing state becomes the integer 0
# in a string column, and an unknown pet policy gets the same code as 'None'.
# Confirm this is intended before modelling.
df1.fillna(0, inplace = True)

In [None]:
df1.isnull().sum()

In [None]:
summarise_data(df1)

In [None]:
# Save the final df1 (deduplicated, city one-hot encoded, NaNs filled) to the
# preprocessed folder.
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# replacing the separate exists() check.
os.makedirs(PREPROCESSED_FILE_PATH, exist_ok=True)

df1.to_csv(os.path.join(PREPROCESSED_FILE_PATH, '_All_States_Cleaned.csv'),index=False)