In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Upper bound on how many columns pandas renders before truncating a wide frame
MAX_DISPLAY_COLUMNS = 200
pd.set_option('display.max_columns', MAX_DISPLAY_COLUMNS)

In [3]:
# Load the listings; the relative path means housing.csv must sit in the working directory
house = pd.read_csv('housing.csv')

In [4]:
# Preview the first five rows to sanity-check the load
house.head(5)

Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,image_url,description,lat,long,state
0,7049044568,https://reno.craigslist.org/apa/d/reno-beautif...,reno / tahoe,https://reno.craigslist.org,1148,apartment,1078,3,2.0,1,1,0,0,0,0,w/d in unit,carport,https://images.craigslist.org/01616_daghmBUvTC...,Ridgeview by Vintage is where you will find al...,39.5483,-119.796,ca
1,7049047186,https://reno.craigslist.org/apa/d/reno-reduced...,reno / tahoe,https://reno.craigslist.org,1200,condo,1001,2,2.0,0,0,0,0,0,0,w/d hookups,carport,https://images.craigslist.org/00V0V_5va0MkgO9q...,Conveniently located in the middle town of Ren...,39.5026,-119.789,ca
2,7043634882,https://reno.craigslist.org/apa/d/sparks-state...,reno / tahoe,https://reno.craigslist.org,1813,apartment,1683,2,2.0,1,1,1,0,0,0,w/d in unit,attached garage,https://images.craigslist.org/00t0t_erYqC6LgB8...,2BD | 2BA | 1683SQFTDiscover exceptional servi...,39.6269,-119.708,ca
3,7049045324,https://reno.craigslist.org/apa/d/reno-1x1-fir...,reno / tahoe,https://reno.craigslist.org,1095,apartment,708,1,1.0,1,1,1,0,0,0,w/d in unit,carport,https://images.craigslist.org/00303_3HSJz75zlI...,MOVE IN SPECIAL FREE WASHER/DRYER WITH 6 OR 12...,39.4477,-119.771,ca
4,7049043759,https://reno.craigslist.org/apa/d/reno-no-long...,reno / tahoe,https://reno.craigslist.org,289,apartment,250,0,1.0,1,1,1,1,0,1,laundry on site,,https://images.craigslist.org/01616_fALAWFV8zQ...,"Move In Today: Reno Low-Cost, Clean & Furnishe...",39.5357,-119.805,ca


In [5]:
# (row count, column count) of the raw frame
house.shape

(384977, 22)

In [6]:
# Per-column dtype and non-null count
house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384977 entries, 0 to 384976
Data columns (total 22 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       384977 non-null  int64  
 1   url                      384977 non-null  object 
 2   region                   384977 non-null  object 
 3   region_url               384977 non-null  object 
 4   price                    384977 non-null  int64  
 5   type                     384977 non-null  object 
 6   sqfeet                   384977 non-null  int64  
 7   beds                     384977 non-null  int64  
 8   baths                    384977 non-null  float64
 9   cats_allowed             384977 non-null  int64  
 10  dogs_allowed             384977 non-null  int64  
 11  smoking_allowed          384977 non-null  int64  
 12  wheelchair_access        384977 non-null  int64  
 13  electric_vehicle_charge  384977 non-null  int64  
 14  come

In [7]:
# Column labels of the loaded frame
house.columns

Index(['id', 'url', 'region', 'region_url', 'price', 'type', 'sqfeet', 'beds',
       'baths', 'cats_allowed', 'dogs_allowed', 'smoking_allowed',
       'wheelchair_access', 'electric_vehicle_charge', 'comes_furnished',
       'laundry_options', 'parking_options', 'image_url', 'description', 'lat',
       'long', 'state'],
      dtype='object')

In [8]:
# Number of rows per region, most frequent first
house['region'].value_counts()

region
jacksonville      4246
columbus          3738
rochester         3677
jackson           3667
fayetteville      3652
                  ... 
southwest MS        12
st louis             9
southwest TX         9
fort smith, AR       5
kansas city          3
Name: count, Length: 404, dtype: int64

In [9]:
# The eight states with the most rows
house['state'].value_counts().head(8)

state
ca    33085
fl    31929
tx    31137
nc    18628
mi    14529
ga    13841
oh    12884
tn    11541
Name: count, dtype: int64

In [10]:
# Distinct housing types; bracket access keeps the column named "type"
# from being confused with an attribute lookup
house['type'].unique()

array(['apartment', 'condo', 'house', 'duplex', 'townhouse', 'loft',
       'manufactured', 'cottage/cabin', 'flat', 'in-law', 'land',
       'assisted living'], dtype=object)

In [11]:
# Summarise missing data: for every column that has at least one null,
# report the null count and its share of all rows, largest share first.
null_counts = house.isnull().sum()
null_values = (
    null_counts[null_counts > 0]
    .to_frame(name='null_sum')
    .assign(percentage=lambda frame: frame['null_sum'] / len(house) * 100)
    .sort_values(by='percentage', ascending=False)
)
null_values

Unnamed: 0,null_sum,percentage
parking_options,140687,36.544261
laundry_options,79026,20.52746
lat,1918,0.498212
long,1918,0.498212
description,2,0.00052


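Before imputing null values, it is prudent to remove features that do not contribute to estimating house rent. We will drop the columns 'id', 'url', 'region_url', 'image_url', 'description', 'lat', and 'long', as they are not highly relevant for our analysis. Dropping 'description', 'lat', and 'long' also removes three of the five columns that contain nulls, leaving only 'parking_options' and 'laundry_options' to impute.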

In [12]:
# Remove columns that are not used as predictors: listing identifiers and
# links, the free-text description, and the lat/long coordinates.
# The result is reassigned instead of using inplace=True so the step is a
# plain expression and no frame is mutated behind the reader's back.
unused_columns = ['id', 'url', 'lat', 'long', 'region_url', 'image_url',
                  'description']
house = house.drop(columns=unused_columns)

In [13]:
# Spot-check rows 20000-20004 (selected by position) after the column drop
house.iloc[20000:20005]

Unnamed: 0,region,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,state
20000,colorado springs,999,apartment,561,1,1.0,1,1,1,0,0,0,,,co
20001,colorado springs,1350,apartment,701,1,1.0,1,1,0,0,0,0,w/d in unit,attached garage,co
20002,colorado springs,999,apartment,561,1,1.0,1,1,1,0,0,0,laundry in bldg,,co
20003,colorado springs,1169,apartment,1109,3,2.0,1,1,1,0,0,0,,no parking,co
20004,colorado springs,958,apartment,561,1,1.0,1,1,1,0,0,0,,,co


In [14]:
# Distinct parking categories (NaN appears for rows with no value)
house['parking_options'].unique()

array(['carport', 'attached garage', nan, 'off-street parking',
       'detached garage', 'street parking', 'no parking', 'valet parking'],
      dtype=object)

In [15]:
# Distinct laundry categories (NaN appears for rows with no value)
house['laundry_options'].unique()

array(['w/d in unit', 'w/d hookups', 'laundry on site', 'laundry in bldg',
       nan, 'no laundry on site'], dtype=object)

# Data Cleaning

In [16]:
# Dtype of each remaining column at the start of cleaning
house.dtypes

region                      object
price                        int64
type                        object
sqfeet                       int64
beds                         int64
baths                      float64
cats_allowed                 int64
dogs_allowed                 int64
smoking_allowed              int64
wheelchair_access            int64
electric_vehicle_charge      int64
comes_furnished              int64
laundry_options             object
parking_options             object
state                       object
dtype: object

In [17]:
# Null count per column before imputation
house.isna().sum()

region                          0
price                           0
type                            0
sqfeet                          0
beds                            0
baths                           0
cats_allowed                    0
dogs_allowed                    0
smoking_allowed                 0
wheelchair_access               0
electric_vehicle_charge         0
comes_furnished                 0
laundry_options             79026
parking_options            140687
state                           0
dtype: int64

In [18]:
# Treat a missing option as the listing not offering it.
# NOTE(review): 'no parking' reuses a category already present in
# parking_options, while 'no laundry' is a new label that sits alongside the
# existing 'no laundry on site' - confirm the two are meant to stay separate.
missing_option_labels = {
    'parking_options': 'no parking',
    'laundry_options': 'no laundry',
}
for option_column, placeholder in missing_option_labels.items():
    house[option_column] = house[option_column].fillna(placeholder)

In [19]:
# Re-count nulls to confirm the fill above left no missing values
house.isna().sum()

region                     0
price                      0
type                       0
sqfeet                     0
beds                       0
baths                      0
cats_allowed               0
dogs_allowed               0
smoking_allowed            0
wheelchair_access          0
electric_vehicle_charge    0
comes_furnished            0
laundry_options            0
parking_options            0
state                      0
dtype: int64

# Feature Engineering

In [20]:
# Collapse the two pet flags into a single pets_allowed column and remove the
# originals. The flag is the bitwise AND of the two columns, so it is set only
# when BOTH cats and dogs are allowed.
# NOTE(review): a listing that allows just one of the two gets 0 here -
# confirm that AND (rather than OR) is the intended meaning of "pets allowed".
cats_and_dogs = house['cats_allowed'] & house['dogs_allowed']
house = (
    house
    .assign(pets_allowed=cats_and_dogs.astype(int))
    .drop(columns=['cats_allowed', 'dogs_allowed'])
)