In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
from typing import Any
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from typing import Any
import warnings
warnings.simplefilter(action='ignore')

#matplotlib.rcParams["figure.figsize"] = (20,10)

# Constants

In [2]:
# Output directory (relative path) for the preprocessed CSV files written later in this notebook.

PREPROCESSED_FILE_PATH = 'preprocessed/'

# Reading the Data

In [3]:
# Remove the cap on displayed rows, so every later value_counts()/head(n) output is shown in full.
pd.set_option('display.max_rows', None)
# Load the raw listings: the file is semicolon-delimited and decoded as ISO-8859-1.
df = pd.read_csv('apartments_for_rent.csv', sep = ';', encoding = 'ISO-8859-1')

In [4]:
# List the column names of the raw frame.
df.columns


Index(['id', 'category', 'title', 'body', 'amenities', 'bathrooms', 'bedrooms',
       'currency', 'fee', 'has_photo', 'pets_allowed', 'price',
       'price_display', 'price_type', 'square_feet', 'address', 'cityname',
       'state', 'latitude', 'longitude', 'source', 'time'],
      dtype='object')

# Inspecting Data

In [5]:
# Descriptive statistics for the numeric columns only (the default for describe()).
df.describe()

Unnamed: 0,id,bathrooms,bedrooms,price,square_feet,latitude,longitude,time
count,99492.0,99429.0,99368.0,99491.0,99492.0,99467.0,99467.0,99492.0
mean,5358321000.0,1.445323,1.728212,1527.057281,956.430688,36.947988,-91.568656,1559665000.0
std,184740400.0,0.547021,0.7492,904.245882,417.571522,4.599461,15.817168,11050770.0
min,5121046000.0,1.0,0.0,100.0,101.0,19.5738,-159.3698,1544174000.0
25%,5197950000.0,1.0,1.0,1013.0,729.0,33.7465,-104.7919,1550832000.0
50%,5508673000.0,1.0,2.0,1350.0,900.0,37.2282,-84.5623,1568745000.0
75%,5509007000.0,2.0,2.0,1795.0,1115.0,39.953,-77.6082,1568767000.0
max,5669439000.0,9.0,9.0,52500.0,50000.0,64.8332,-68.7788,1577391000.0


In [6]:
def summarise_data(df):
    """Print a four-part summary of a DataFrame.

    For each section a heading is printed and the value is shown via
    ``display``: the first rows, ``describe(include="all")``, the fraction of
    null values per column, and the shape.

    Args:
        df: The DataFrame to summarise.

    Returns:
        None. The function only prints and displays.
    """
    # Each section is built lazily, right before it is displayed, so the
    # print/display sequence is strictly heading -> value -> heading -> ...
    sections = (
        ("\nOverview", lambda: df.head()),
        ("\nSummary", lambda: df.describe(include="all")),
        # Despite the heading, this is a fraction (null count / row count), not a count.
        ("\nNull Values", lambda: df.isnull().sum() / len(df)),
        ("\nShape", lambda: df.shape),
    )
    for heading, build_section in sections:
        print(heading)
        display(build_section())


In [7]:
# Full overview of the raw frame: head, describe, null fractions and shape.
summarise_data(df)


Overview


Unnamed: 0,id,category,title,body,amenities,bathrooms,bedrooms,currency,fee,has_photo,...,price_display,price_type,square_feet,address,cityname,state,latitude,longitude,source,time
0,5668640009,housing/rent/apartment,One BR 507 & 509 Esplanade,"This unit is located at 507 & 509 Esplanade, R...",,1.0,1.0,USD,No,Thumbnail,...,"$2,195",Monthly,542,507 509 Esplanade,Redondo Beach,CA,33.852,-118.3759,RentLingo,1577360355
1,5668639818,housing/rent/apartment,Three BR 146 Lochview Drive,"This unit is located at 146 Lochview Drive, Ne...",,1.5,3.0,USD,No,Thumbnail,...,"$1,250",Monthly,1500,146 Lochview Dr,Newport News,VA,37.0867,-76.4941,RentLingo,1577360340
2,5668639686,housing/rent/apartment,Three BR 3101 Morningside Drive,This unit is located at 3101 Morningside Drive...,,2.0,3.0,USD,No,Thumbnail,...,"$1,395",Monthly,1650,3101 Morningside Dr,Raleigh,NC,35.823,-78.6438,RentLingo,1577360332
3,5668639659,housing/rent/apartment,Two BR 209 Aegean Way,"This unit is located at 209 Aegean Way, Vacavi...",,1.0,2.0,USD,No,Thumbnail,...,"$1,600",Monthly,820,209 Aegean Way,Vacaville,CA,38.3622,-121.9712,RentLingo,1577360330
4,5668639374,housing/rent/apartment,One BR 4805 Marquette NE,"This unit is located at 4805 Marquette NE, Alb...",,1.0,1.0,USD,No,Thumbnail,...,$975,Monthly,624,4805 Marquette NE,Albuquerque,NM,35.1038,-106.611,RentLingo,1577360308



Summary


Unnamed: 0,id,category,title,body,amenities,bathrooms,bedrooms,currency,fee,has_photo,...,price_display,price_type,square_feet,address,cityname,state,latitude,longitude,source,time
count,99492.0,99492,99492,99492,83448,99429.0,99368.0,99492,99492,99492,...,99491,99492,99492.0,7943,99190,99190,99467.0,99467.0,99492,99492.0
unique,,7,58503,94503,9827,,,1,2,3,...,3718,3,,7771,2979,51,,,25,
top,,housing/rent/apartment,Apartment in great location,"When searching for a pet-friendly One-, Two- a...",Parking,,,USD,No,Yes,...,"$1,350",Monthly,,8215 S.W 72nd Avenue,Dallas,TX,,,RentDigs.com,
freq,,99431,1064,43,6188,,,99492,99291,55974,...,574,99488,,19,2858,11257,,,90912,
mean,5358321000.0,,,,,1.445323,1.728212,,,,...,,,956.430688,,,,36.947988,-91.568656,,1559665000.0
std,184740400.0,,,,,0.547021,0.7492,,,,...,,,417.571522,,,,4.599461,15.817168,,11050770.0
min,5121046000.0,,,,,1.0,0.0,,,,...,,,101.0,,,,19.5738,-159.3698,,1544174000.0
25%,5197950000.0,,,,,1.0,1.0,,,,...,,,729.0,,,,33.7465,-104.7919,,1550832000.0
50%,5508673000.0,,,,,1.0,2.0,,,,...,,,900.0,,,,37.2282,-84.5623,,1568745000.0
75%,5509007000.0,,,,,2.0,2.0,,,,...,,,1115.0,,,,39.953,-77.6082,,1568767000.0



Null Values


id               0.000000
category         0.000000
title            0.000000
body             0.000000
amenities        0.161259
bathrooms        0.000633
bedrooms         0.001246
currency         0.000000
fee              0.000000
has_photo        0.000000
pets_allowed     0.556366
price            0.000010
price_display    0.000010
price_type       0.000000
square_feet      0.000000
address          0.920164
cityname         0.003035
state            0.003035
latitude         0.000251
longitude        0.000251
source           0.000000
time             0.000000
dtype: float64


Shape


(99492, 22)

In [8]:
#df.fillna(0, inplace=True)

# Cleaning Data


In [9]:
# The 20 most frequent amenity strings (value_counts() sorts by frequency and skips NaN).
df['amenities'].value_counts()[:20]

Parking                       6188
Parking,Storage               2116
Gym,Pool                      1871
Pool                          1485
Gym,Parking,Pool              1187
Parking,Pool                   886
Washer Dryer                   842
Patio/Deck                     775
Clubhouse,Gym,Pool             760
Gym                            735
Parking,Patio/Deck,Storage     664
Wood Floors                    627
Parking,Washer Dryer           521
Gym,Patio/Deck,Pool            518
Gym,Parking,Pool,Storage       477
Clubhouse,Gym,Parking,Pool     430
Parking,Patio/Deck             427
Parking,Wood Floors            422
Cable or Satellite,TV          408
Refrigerator                   399
Name: amenities, dtype: int64

In [10]:
# Cast to str so the column holds only strings; missing values become the literal text 'nan'.
df['amenities'] = df['amenities'].astype(str)


In [11]:
# Frequency of each amenities string after the cast (missing values now appear as 'nan').
df.value_counts('amenities')

amenities
nan                                                                                                                                                                                         16044
Parking                                                                                                                                                                                      6188
Parking,Storage                                                                                                                                                                              2116
Gym,Pool                                                                                                                                                                                     1871
Pool                                                                                                                                                                                         1485
Gym,Parking,Pool    

In [12]:
# Lower-case each amenities string and split it on commas into a list of tokens
# (rows that were missing become ['nan']).
# NOTE(review): run once per fresh load - re-applying .str.lower() to the resulting
# lists would not reproduce this result.
df['amenities'] = df['amenities'].str.lower().str.split(',')


In [13]:
# Derive one boolean column per amenity of interest.
# Every entry of df['amenities'] is expected to be a list of lower-cased tokens; a flag is True
# when the keyword occurs as a substring of at least one token of the row.
# A plain substring scan is used instead of building a temporary pd.Series for every row,
# which gives the same booleans at a fraction of the cost on ~100k rows.
AMENITY_FLAGS = {
    'with_storage': 'storage',
    'with_parking': 'parking',
    'with_gym': 'gym',
    'with_pool': 'pool',
    'with_woodfloors': 'wood floors',
    'with_patio': 'patio/deck',
    'with_clubhouse': 'clubhouse',
    'with_internet': 'internet',
    'with_gated': 'gated',
}

for flag_column, keyword in AMENITY_FLAGS.items():
    # Bind `keyword` as a default argument so each lambda keeps its own value.
    df[flag_column] = df['amenities'].apply(
        lambda tokens, keyword=keyword: any(keyword in token for token in tokens)
    )





In [14]:
#Handling categorical values: category

In [15]:
# Distinct raw category paths.
df['category'].unique()

array(['housing/rent/apartment', 'housing/rent/home',
       'housing/rent/short_term', 'housing/rent', 'housing/rent/condo',
       'housing/rent/other', 'housing/rent/commercial/retail'],
      dtype=object)

In [16]:
# Keep only the last path segment of the category,
# e.g. 'housing/rent/commercial/retail' -> 'retail' and 'housing/rent' -> 'rent'.
df['category'] = df['category'].str.split('/').str[-1]

In [17]:
# Listings per category after shortening the category labels.
df.value_counts('category')

category
apartment     99431
retail           42
rent              7
home              4
short_term        4
condo             3
other             1
dtype: int64

In [18]:
# Relabel every listing with three or more bedrooms as 'home', whatever its original category.
# Rows with a missing bedroom count do not satisfy the comparison and keep their category.
df.loc[df.bedrooms >= 3, 'category'] = 'home'
# Show the category distribution after the override.
df.value_counts('category')

category
apartment     87039
home          12403
retail           42
short_term        4
condo             2
rent              2
dtype: int64

In [19]:
# Overwrite the 'other' category with 0.
# NOTE(review): the preceding value_counts output lists no 'other' rows any more, so this
# matches nothing in this run; if it did match, it would put an integer into a string column.
df.loc[df.category == 'other', 'category'] = 0

In [20]:
# Re-check the category distribution.
df.value_counts('category')

category
apartment     87039
home          12403
retail           42
short_term        4
condo             2
rent              2
dtype: int64

In [21]:
# Fold 'short_term' listings into 'apartment'.
# No set_index call is needed: the boolean mask selects rows on the existing index.
df.loc[df.category == 'short_term', 'category'] = 'apartment'

In [22]:
# Collapse the remaining rare categories: 'condo' and 'retail' become 'apartment', 'rent' becomes 'home'.
df.loc[df['category'].isin(['condo', 'retail']), 'category'] = 'apartment'
df.loc[df['category'] == 'rent', 'category'] = 'home'

In [23]:
# Preview the frame with the new amenity flags and the simplified categories.
df.head()

Unnamed: 0,id,category,title,body,amenities,bathrooms,bedrooms,currency,fee,has_photo,...,time,with_storage,with_parking,with_gym,with_pool,with_woodfloors,with_patio,with_clubhouse,with_internet,with_gated
0,5668640009,apartment,One BR 507 & 509 Esplanade,"This unit is located at 507 & 509 Esplanade, R...",[nan],1.0,1.0,USD,No,Thumbnail,...,1577360355,False,False,False,False,False,False,False,False,False
1,5668639818,home,Three BR 146 Lochview Drive,"This unit is located at 146 Lochview Drive, Ne...",[nan],1.5,3.0,USD,No,Thumbnail,...,1577360340,False,False,False,False,False,False,False,False,False
2,5668639686,home,Three BR 3101 Morningside Drive,This unit is located at 3101 Morningside Drive...,[nan],2.0,3.0,USD,No,Thumbnail,...,1577360332,False,False,False,False,False,False,False,False,False
3,5668639659,apartment,Two BR 209 Aegean Way,"This unit is located at 209 Aegean Way, Vacavi...",[nan],1.0,2.0,USD,No,Thumbnail,...,1577360330,False,False,False,False,False,False,False,False,False
4,5668639374,apartment,One BR 4805 Marquette NE,"This unit is located at 4805 Marquette NE, Alb...",[nan],1.0,1.0,USD,No,Thumbnail,...,1577360308,False,False,False,False,False,False,False,False,False


In [24]:
# Final category distribution: only 'apartment' and 'home' should remain.
df.value_counts('category')

category
apartment    87087
home         12405
dtype: int64

In [25]:

# Impute missing pets_allowed values with the most frequent value.
# mode() returns a Series (there can be ties), while fillna() needs a scalar, so take the first entry.
mode = df['pets_allowed'].mode().values[0]
# Assign the result back instead of calling fillna(inplace=True) on the selected column:
# the in-place form works on an intermediate object and is not guaranteed to update `df`.
df['pets_allowed'] = df['pets_allowed'].fillna(mode)

In [26]:
#Handling categorical values: pets_allowed

In [27]:
# Distribution of pets_allowed after filling the missing values.
df.value_counts('pets_allowed')

pets_allowed
Cats,Dogs         92451
None               5070
Cats               1843
Dogs                127
Cats,Dogs,None        1
dtype: int64

In [28]:
# Rewrite the raw pets_allowed codes as readable labels.
# Only exact matches of a key are replaced; the odd 'Cats,Dogs,None' value is treated as
# allowing both. No set_index call is needed for this.
PETS_ALLOWED_LABELS = {
    'Cats,Dogs': 'both cats and dogs',
    'Cats': 'only cats',
    'Dogs': 'only dogs',
    'Cats,Dogs,None': 'both cats and dogs',
    'None': 'no pets',
}
df['pets_allowed'] = df['pets_allowed'].replace(PETS_ALLOWED_LABELS)

In [29]:
# Distribution of the relabelled pets_allowed values.
df.value_counts('pets_allowed')

pets_allowed
both cats and dogs    92452
no pets                5070
only cats              1843
only dogs               127
dtype: int64

In [30]:
# How prices are quoted: almost all listings are 'Monthly', a handful are weekly or mixed.
df.value_counts('price_type')

price_type
Monthly           99488
Weekly                3
Monthly|Weekly        1
dtype: int64

## Replace the 'Monthly|Weekly' and 'Weekly' prices in the price column with monthly prices.

In [31]:

# Make sure price is a float column, then hand-correct one listing and relabel its price type.
# NOTE(review): row label 7245 is presumably the single 'Monthly|Weekly' listing and 1195 its
# monthly price; both are hard-coded, so confirm them against the raw data.
df['price'] = df['price'].astype(float)
df.loc[7245, 'price'] = 1195
df['price_type'] = df['price_type'].replace('Monthly|Weekly', 'Monthly')

#df.iloc[7245]

In [32]:
# Hand-entered monthly prices for three listings, followed by relabelling 'Weekly' as 'Monthly'.
# NOTE(review): the row labels are presumably the three 'Weekly' listings; labels and prices are
# hard-coded and depend on the row order of the CSV, so confirm them against the raw data.
df.loc[[6729, 49908, 83395], 'price'] = [6778, 36935, 3693]
df['price_type'] = df['price_type'].replace('Weekly', 'Monthly')


In [33]:
#df.iloc[83395]

In [34]:
# Confirm that every listing is now labelled 'Monthly'.
df.value_counts('price_type')

price_type
Monthly    99492
dtype: int64

In [35]:
# Data cleaning: all price values have been converted to monthly; preview the first 100 rows.
df.head(100)

Unnamed: 0,id,category,title,body,amenities,bathrooms,bedrooms,currency,fee,has_photo,...,time,with_storage,with_parking,with_gym,with_pool,with_woodfloors,with_patio,with_clubhouse,with_internet,with_gated
0,5668640009,apartment,One BR 507 & 509 Esplanade,"This unit is located at 507 & 509 Esplanade, R...",[nan],1.0,1.0,USD,No,Thumbnail,...,1577360355,False,False,False,False,False,False,False,False,False
1,5668639818,home,Three BR 146 Lochview Drive,"This unit is located at 146 Lochview Drive, Ne...",[nan],1.5,3.0,USD,No,Thumbnail,...,1577360340,False,False,False,False,False,False,False,False,False
2,5668639686,home,Three BR 3101 Morningside Drive,This unit is located at 3101 Morningside Drive...,[nan],2.0,3.0,USD,No,Thumbnail,...,1577360332,False,False,False,False,False,False,False,False,False
3,5668639659,apartment,Two BR 209 Aegean Way,"This unit is located at 209 Aegean Way, Vacavi...",[nan],1.0,2.0,USD,No,Thumbnail,...,1577360330,False,False,False,False,False,False,False,False,False
4,5668639374,apartment,One BR 4805 Marquette NE,"This unit is located at 4805 Marquette NE, Alb...",[nan],1.0,1.0,USD,No,Thumbnail,...,1577360308,False,False,False,False,False,False,False,False,False
5,5668639368,apartment,Two BR 7801 Marble NE,"This unit is located at 7801 Marble NE, Albuqu...",[nan],1.5,2.0,USD,No,Thumbnail,...,1577360308,False,False,False,False,False,False,False,False,False
6,5668638765,apartment,Two BR 5 Salt Marsh Quay Apartment H,This unit is located at five Salt Marsh Quay A...,[nan],2.0,2.0,USD,No,Thumbnail,...,1577360261,False,False,False,False,False,False,False,False,False
7,5668638578,apartment,Two BR 11280 W. 20th Ave.,"This unit is located at 11280 W. 20th Ave., La...",[nan],2.0,2.0,USD,No,Thumbnail,...,1577360247,False,False,False,False,False,False,False,False,False
8,5668638476,apartment,Two BR 1427 Lewis Street,"This unit is located at 1427 Lewis Street, Cha...",[nan],1.0,2.0,USD,No,Thumbnail,...,1577360239,False,False,False,False,False,False,False,False,False
9,5668638122,apartment,Two BR 10201 Remmet Avenue,"This unit is located at 10201 Remmet Avenue, C...",[nan],2.0,2.0,USD,No,Thumbnail,...,1577360204,False,False,False,False,False,False,False,False,False


In [36]:
# Drop the columns that are not carried forward: identifiers, free text, constant or
# display-only fields, coordinates, and the raw amenities lists (the with_* flags replace them).
df = df.drop(
    columns=[
        'id', 'title', 'body', 'currency', 'has_photo',
        'latitude', 'longitude', 'price_display', 'price_type',
        'address', 'source', 'time', 'amenities',
    ]
)

In [37]:
# Distribution of the fee flag.
df.value_counts('fee')

fee
No     99291
Yes      201
dtype: int64

In [38]:
# Preview the frame after dropping the unused columns.
df.head()

Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,with_storage,with_parking,with_gym,with_pool,with_woodfloors,with_patio,with_clubhouse,with_internet,with_gated
0,apartment,1.0,1.0,No,only cats,2195.0,542,Redondo Beach,CA,False,False,False,False,False,False,False,False,False
1,home,1.5,3.0,No,both cats and dogs,1250.0,1500,Newport News,VA,False,False,False,False,False,False,False,False,False
2,home,2.0,3.0,No,no pets,1395.0,1650,Raleigh,NC,False,False,False,False,False,False,False,False,False
3,apartment,1.0,2.0,No,both cats and dogs,1600.0,820,Vacaville,CA,False,False,False,False,False,False,False,False,False
4,apartment,1.0,1.0,No,both cats and dogs,975.0,624,Albuquerque,NM,False,False,False,False,False,False,False,False,False


In [39]:
# Re-inspect the frame after cleaning and column pruning.
summarise_data(df)


Overview


Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,with_storage,with_parking,with_gym,with_pool,with_woodfloors,with_patio,with_clubhouse,with_internet,with_gated
0,apartment,1.0,1.0,No,only cats,2195.0,542,Redondo Beach,CA,False,False,False,False,False,False,False,False,False
1,home,1.5,3.0,No,both cats and dogs,1250.0,1500,Newport News,VA,False,False,False,False,False,False,False,False,False
2,home,2.0,3.0,No,no pets,1395.0,1650,Raleigh,NC,False,False,False,False,False,False,False,False,False
3,apartment,1.0,2.0,No,both cats and dogs,1600.0,820,Vacaville,CA,False,False,False,False,False,False,False,False,False
4,apartment,1.0,1.0,No,both cats and dogs,975.0,624,Albuquerque,NM,False,False,False,False,False,False,False,False,False



Summary


Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,with_storage,with_parking,with_gym,with_pool,with_woodfloors,with_patio,with_clubhouse,with_internet,with_gated
count,99492,99429.0,99368.0,99492,99492,99491.0,99492.0,99190,99190,99492,99492,99492,99492,99492,99492,99492,99492,99492
unique,2,,,2,4,,,2979,51,2,2,2,2,2,2,2,2,2
top,apartment,,,No,both cats and dogs,,,Dallas,TX,False,False,False,False,False,False,False,False,False
freq,87087,,,99291,92452,,,2858,11257,77910,55680,62213,55996,90613,72990,80428,88441,90840
mean,,1.445323,1.728212,,,1527.433356,956.430688,,,,,,,,,,,
std,,0.547021,0.7492,,,911.086317,417.571522,,,,,,,,,,,
min,,1.0,0.0,,,100.0,101.0,,,,,,,,,,,
25%,,1.0,1.0,,,1013.5,729.0,,,,,,,,,,,
50%,,1.0,2.0,,,1350.0,900.0,,,,,,,,,,,
75%,,2.0,2.0,,,1795.0,1115.0,,,,,,,,,,,



Null Values


category           0.000000
bathrooms          0.000633
bedrooms           0.001246
fee                0.000000
pets_allowed       0.000000
price              0.000010
square_feet        0.000000
cityname           0.003035
state              0.003035
with_storage       0.000000
with_parking       0.000000
with_gym           0.000000
with_pool          0.000000
with_woodfloors    0.000000
with_patio         0.000000
with_clubhouse     0.000000
with_internet      0.000000
with_gated         0.000000
dtype: float64


Shape


(99492, 18)

In [40]:
# Same summary as the previous cell; `df` was not modified in between.
summarise_data(df)


Overview


Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,with_storage,with_parking,with_gym,with_pool,with_woodfloors,with_patio,with_clubhouse,with_internet,with_gated
0,apartment,1.0,1.0,No,only cats,2195.0,542,Redondo Beach,CA,False,False,False,False,False,False,False,False,False
1,home,1.5,3.0,No,both cats and dogs,1250.0,1500,Newport News,VA,False,False,False,False,False,False,False,False,False
2,home,2.0,3.0,No,no pets,1395.0,1650,Raleigh,NC,False,False,False,False,False,False,False,False,False
3,apartment,1.0,2.0,No,both cats and dogs,1600.0,820,Vacaville,CA,False,False,False,False,False,False,False,False,False
4,apartment,1.0,1.0,No,both cats and dogs,975.0,624,Albuquerque,NM,False,False,False,False,False,False,False,False,False



Summary


Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,with_storage,with_parking,with_gym,with_pool,with_woodfloors,with_patio,with_clubhouse,with_internet,with_gated
count,99492,99429.0,99368.0,99492,99492,99491.0,99492.0,99190,99190,99492,99492,99492,99492,99492,99492,99492,99492,99492
unique,2,,,2,4,,,2979,51,2,2,2,2,2,2,2,2,2
top,apartment,,,No,both cats and dogs,,,Dallas,TX,False,False,False,False,False,False,False,False,False
freq,87087,,,99291,92452,,,2858,11257,77910,55680,62213,55996,90613,72990,80428,88441,90840
mean,,1.445323,1.728212,,,1527.433356,956.430688,,,,,,,,,,,
std,,0.547021,0.7492,,,911.086317,417.571522,,,,,,,,,,,
min,,1.0,0.0,,,100.0,101.0,,,,,,,,,,,
25%,,1.0,1.0,,,1013.5,729.0,,,,,,,,,,,
50%,,1.0,2.0,,,1350.0,900.0,,,,,,,,,,,
75%,,2.0,2.0,,,1795.0,1115.0,,,,,,,,,,,



Null Values


category           0.000000
bathrooms          0.000633
bedrooms           0.001246
fee                0.000000
pets_allowed       0.000000
price              0.000010
square_feet        0.000000
cityname           0.003035
state              0.003035
with_storage       0.000000
with_parking       0.000000
with_gym           0.000000
with_pool          0.000000
with_woodfloors    0.000000
with_patio         0.000000
with_clubhouse     0.000000
with_internet      0.000000
with_gated         0.000000
dtype: float64


Shape


(99492, 18)

In [41]:
# Replace every remaining missing value with 0.
# NOTE(review): this applies to all columns, so the missing `cityname`/`state` strings and the
# one missing `price` also become 0; confirm that this is intended.
df = df.fillna(0)

In [42]:
# Save the cleaned frame as CSV, without the index column.
# exist_ok=True creates the directory only when it is missing, with no separate existence check.
os.makedirs(PREPROCESSED_FILE_PATH, exist_ok=True)

df.to_csv(os.path.join(PREPROCESSED_FILE_PATH, '_All_States_Cleaned.csv'), index=False)

In [43]:
# Preview the last 50 rows of the cleaned frame.
df.tail(50)

Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,with_storage,with_parking,with_gym,with_pool,with_woodfloors,with_patio,with_clubhouse,with_internet,with_gated
99442,apartment,2.0,2.0,No,both cats and dogs,930.0,1162,Warner Robins,GA,False,False,False,False,False,False,False,False,False
99443,apartment,1.0,1.0,No,both cats and dogs,1290.0,722,Jacksonville,FL,True,False,False,True,False,True,False,False,False
99444,apartment,2.0,2.0,No,both cats and dogs,1750.0,950,Philadelphia,PA,False,False,True,False,False,False,False,False,False
99445,apartment,1.0,1.0,No,both cats and dogs,1240.0,781,Jacksonville,FL,False,False,True,True,False,False,False,False,False
99446,apartment,2.0,2.0,No,both cats and dogs,1256.0,1323,Jacksonville,FL,False,False,False,False,False,False,False,False,False
99447,apartment,1.0,1.0,No,both cats and dogs,605.0,533,Milwaukee,WI,True,False,False,False,False,False,False,False,False
99448,apartment,2.0,2.0,No,both cats and dogs,1438.0,1286,Jacksonville,FL,False,False,False,True,False,True,False,False,False
99449,apartment,1.0,2.0,No,both cats and dogs,2200.0,800,Boston,MA,False,False,False,False,True,True,False,False,False
99450,apartment,2.0,2.0,No,both cats and dogs,1537.0,1601,Jacksonville,FL,True,True,True,True,False,True,True,False,True
99451,apartment,2.0,2.0,No,both cats and dogs,1760.0,1082,Jacksonville,FL,True,False,False,True,False,True,False,False,False


# Feature engineering: creating a new feature, the Economic Analysis Region

In [44]:
# We will apply categorical binning to the `state` column, which has 51 unique values.
# The 51 states are grouped into 8 groups based on the U.S. Bureau of Economic Analysis regions:
#1.New England Region 
#2.Great Lakes Region 
#3.Southwest Region 
#4.Mideast Region
#5.Plains Region
#6.Far West Region
#7.Southeast Region
#8.Rocky Mountain Region

In [45]:
# Number of distinct values in `state`, counting any placeholder left by the earlier fillna.
df['state'].nunique(dropna=False)

52

In [46]:
# Keep a handle on the state column; it is the input of the region mapping below.
state = df['state']

In [47]:
# Economic region -> tuple of two-letter state codes belonging to it.
# Each region's codes are written as one space-separated string and split into a tuple.
groups = {
    region: tuple(state_codes.split())
    for region, state_codes in (
        ('New England', 'ME VT NH MA CT RI'),
        ('Mideast', 'DE NJ NY PA DC MD'),
        ('Great Lakes', 'IL IN OH WI MI'),
        ('Plains', 'IA KS MN MO NE ND SD'),
        ('Southeast', 'AL AR FL GA KY LA MS NC SC TN VA WV'),
        ('Southwest', 'AZ NM OK TX'),
        ('Rocky Mountain', 'CO ID MT UT WY'),
        ('Far West', 'WA OR NV CA AK HI'),
    )
}

In [48]:
def state_group_map(series: pd.Series, groups: dict, othervalue: Any = -1) -> pd.Series:
    """Map each value of ``series`` to the name of the group that contains it.

    Args:
        series: Values to classify (e.g. state codes).
        groups: Mapping of group name -> iterable of member values.
        othervalue: Replacement for values that belong to no group
            (missing values included).

    Returns:
        A Series aligned with ``series`` that holds the group name of each
        value, or ``othervalue`` where no group matched.
    """
    # Invert the mapping: member -> group name. If a member is listed in
    # several groups, the group that comes last wins.
    member_to_group = {}
    for group_name, members in groups.items():
        for member in members:
            member_to_group[member] = group_name

    mapped = series.map(member_to_group)
    return mapped.fillna(othervalue)

In [49]:
# Assign every state to its economic region; anything not listed in `groups` becomes 'other'.
grouped_states = state_group_map(state, groups, othervalue='other')
# Put each state code next to its region (the region column is named 'economic_region').
states_grouped = pd.concat(
    [df['state'], grouped_states.rename('economic_region')],
    axis='columns',
)

In [50]:
# Cleaned frame plus the new 'economic_region' column appended on the right.
df2 = pd.concat(
    [df, states_grouped['economic_region']],
    axis='columns',
)

In [51]:
# Save the frame that includes the economic regions as CSV, without the index column.
# exist_ok=True creates the directory only when it is missing, with no separate existence check.
os.makedirs(PREPROCESSED_FILE_PATH, exist_ok=True)

df2.to_csv(os.path.join(PREPROCESSED_FILE_PATH, '_With_regions_States_Cleaned.csv'), index=False)

In [52]:






# Summary of `df` after the fillna step.
# NOTE: this inspects `df`, not `df2`, so the 'economic_region' column is not included.
summarise_data(df)


Overview


Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,with_storage,with_parking,with_gym,with_pool,with_woodfloors,with_patio,with_clubhouse,with_internet,with_gated
0,apartment,1.0,1.0,No,only cats,2195.0,542,Redondo Beach,CA,False,False,False,False,False,False,False,False,False
1,home,1.5,3.0,No,both cats and dogs,1250.0,1500,Newport News,VA,False,False,False,False,False,False,False,False,False
2,home,2.0,3.0,No,no pets,1395.0,1650,Raleigh,NC,False,False,False,False,False,False,False,False,False
3,apartment,1.0,2.0,No,both cats and dogs,1600.0,820,Vacaville,CA,False,False,False,False,False,False,False,False,False
4,apartment,1.0,1.0,No,both cats and dogs,975.0,624,Albuquerque,NM,False,False,False,False,False,False,False,False,False



Summary


Unnamed: 0,category,bathrooms,bedrooms,fee,pets_allowed,price,square_feet,cityname,state,with_storage,with_parking,with_gym,with_pool,with_woodfloors,with_patio,with_clubhouse,with_internet,with_gated
count,99492,99492.0,99492.0,99492,99492,99492.0,99492.0,99492,99492,99492,99492,99492,99492,99492,99492,99492,99492,99492
unique,2,,,2,4,,,2980,52,2,2,2,2,2,2,2,2,2
top,apartment,,,No,both cats and dogs,,,Dallas,TX,False,False,False,False,False,False,False,False,False
freq,87087,,,99291,92452,,,2858,11257,77910,55680,62213,55996,90613,72990,80428,88441,90840
mean,,1.444408,1.726058,,,1527.418003,956.430688,,,,,,,,,,,
std,,0.548055,0.751212,,,911.094608,417.571522,,,,,,,,,,,
min,,0.0,0.0,,,0.0,101.0,,,,,,,,,,,
25%,,1.0,1.0,,,1013.0,729.0,,,,,,,,,,,
50%,,1.0,2.0,,,1350.0,900.0,,,,,,,,,,,
75%,,2.0,2.0,,,1795.0,1115.0,,,,,,,,,,,



Null Values


category           0.0
bathrooms          0.0
bedrooms           0.0
fee                0.0
pets_allowed       0.0
price              0.0
square_feet        0.0
cityname           0.0
state              0.0
with_storage       0.0
with_parking       0.0
with_gym           0.0
with_pool          0.0
with_woodfloors    0.0
with_patio         0.0
with_clubhouse     0.0
with_internet      0.0
with_gated         0.0
dtype: float64


Shape


(99492, 18)