# Data Preprocessing and model iteration

This code will clean the training data and iterate through various machine learning models for selecting the best model. Five fold cross validation will be used for testing the accuracy of the models

## Importing all libraries

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestRegressor
import statistics as st

## Loading the data

In [6]:
input_df = pd.read_csv('../../data/raw/train.csv')

## Summarizing the data

In [7]:
input_df.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [8]:
input_df.set_index('Id')

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,307000
8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,Shed,350,11,2009,WD,Normal,200000
9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2008,WD,Abnorml,129900
10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,Corner,...,0,,,,0,1,2008,WD,Normal,118000


In [9]:
#Creating a remodelled flag
input_df['RemodelledFlag'] = 'No'
input_df.loc[(input_df['YearBuilt'] < input_df['YearRemodAdd']) ,'RemodelledFlag'] = 'Yes'

In [10]:
#Inputation to 0 list
field_list1 = ['LotFrontage', 'MasVnrArea']

for i in field_list1:
    input_df[i].fillna(0,inplace=True)

#Inputation to None list
field_list2 = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtFinType1','BsmtFinType2', 'BsmtExposure', 'Electrical', 'FireplaceQu', 'GarageType',\
              'GarageYrBlt', 'GarageFinish','GarageQual', 'GarageCond','PoolQC', 'Fence', 'MiscFeature']

for i in field_list2:
    input_df[i].fillna("Not_Available",inplace=True)


In [11]:
field_list3 = ['YearRemodAdd', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']

#Removong unnecessary fields
for i in field_list3:
    input_df.drop(i, axis = 1, inplace = True)

In [12]:
# Check for NULLS in all columns
for i in input_df.columns:
    print("{} : {}".format(i,input_df[i].isnull().sum().sum()))

Id : 0
MSSubClass : 0
MSZoning : 0
LotFrontage : 0
LotArea : 0
Street : 0
Alley : 0
LotShape : 0
LandContour : 0
Utilities : 0
LotConfig : 0
LandSlope : 0
Neighborhood : 0
Condition1 : 0
Condition2 : 0
BldgType : 0
HouseStyle : 0
OverallQual : 0
OverallCond : 0
YearBuilt : 0
RoofStyle : 0
RoofMatl : 0
Exterior1st : 0
Exterior2nd : 0
MasVnrType : 0
MasVnrArea : 0
ExterQual : 0
ExterCond : 0
Foundation : 0
BsmtQual : 0
BsmtCond : 0
BsmtExposure : 0
BsmtFinType1 : 0
BsmtFinSF1 : 0
BsmtFinType2 : 0
BsmtFinSF2 : 0
BsmtUnfSF : 0
TotalBsmtSF : 0
Heating : 0
HeatingQC : 0
CentralAir : 0
Electrical : 0
1stFlrSF : 0
2ndFlrSF : 0
LowQualFinSF : 0
GrLivArea : 0
BsmtFullBath : 0
BsmtHalfBath : 0
FullBath : 0
HalfBath : 0
BedroomAbvGr : 0
KitchenAbvGr : 0
KitchenQual : 0
TotRmsAbvGrd : 0
Functional : 0
Fireplaces : 0
FireplaceQu : 0
GarageType : 0
GarageYrBlt : 0
GarageFinish : 0
GarageCars : 0
GarageArea : 0
GarageQual : 0
GarageCond : 0
PavedDrive : 0
WoodDeckSF : 0
OpenPorchSF : 0
EnclosedPorch :

In [13]:
#Check total Nulls
input_df.isnull().sum().sum()

0

In [14]:
field_list4 = ['MSSubClass']

# Converting the variables to strings
for i in field_list4:
    input_df[i] = input_df[i].apply(str)

In [15]:
input_df.describe()

Unnamed: 0,Id,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,57.623288,10516.828082,6.099315,5.575342,1971.267808,103.117123,443.639726,46.549315,567.240411,...,0.613014,1.767123,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,180921.19589
std,421.610009,34.664304,9981.264932,1.382997,1.112799,30.202904,180.731373,456.098091,161.319273,441.866955,...,0.644666,0.747315,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,79442.502883
min,1.0,0.0,1300.0,1.0,1.0,1872.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34900.0
25%,365.75,42.0,7553.5,5.0,5.0,1954.0,0.0,0.0,0.0,223.0,...,0.0,1.0,334.5,0.0,0.0,0.0,0.0,0.0,0.0,129975.0
50%,730.5,63.0,9478.5,6.0,5.0,1973.0,0.0,383.5,0.0,477.5,...,1.0,2.0,480.0,0.0,25.0,0.0,0.0,0.0,0.0,163000.0
75%,1095.25,79.0,11601.5,7.0,6.0,2000.0,164.25,712.25,0.0,808.0,...,1.0,2.0,576.0,168.0,68.0,0.0,0.0,0.0,0.0,214000.0
max,1460.0,313.0,215245.0,10.0,9.0,2010.0,1600.0,5644.0,1474.0,2336.0,...,3.0,4.0,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,755000.0


In [16]:
#Converting to dummies

In [17]:
input_df2 = pd.get_dummies(input_df)

In [18]:
input_df2.shape

(1460, 400)

## Fitting the Model

In [19]:
input_df2.columns

Index(['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       ...
       'Fence_MnPrv', 'Fence_MnWw', 'Fence_Not_Available', 'MiscFeature_Gar2',
       'MiscFeature_Not_Available', 'MiscFeature_Othr', 'MiscFeature_Shed',
       'MiscFeature_TenC', 'RemodelledFlag_No', 'RemodelledFlag_Yes'],
      dtype='object', length=400)

In [20]:
# Labels are the values we want to predict
labels = np.array(input_df2['SalePrice'])

# Remove the labels from the features
# axis 1 refers to the columns
input_df2= input_df2.drop('SalePrice', axis = 1)

# Saving feature names for later use
feature_list = list(input_df2.columns)

# Convert to numpy array
features = np.array(input_df2)

In [21]:
# Running hyperparameter optimization
max_depth_iter =list(range(10,31,2))
max_features_iter = list(range(150,251,10))

Depth = []
Feature_count = []
OOB_Score = []

for i in max_depth_iter:
    for j in max_features_iter:
        rf = RandomForestRegressor(n_estimators = 1000, min_samples_leaf = 5,oob_score = True ,max_depth = i, max_features = j,random_state = 12345)
        Depth.append(i)
        Feature_count.append(j)
        OOB_Score.append(rf.fit(features,labels).oob_score_)
        print("iteration: Max Depth = {} Feature count = {}".format(i,j))

iteration: Max Depth = 10 Feature count = 150
iteration: Max Depth = 10 Feature count = 160
iteration: Max Depth = 10 Feature count = 170
iteration: Max Depth = 10 Feature count = 180
iteration: Max Depth = 10 Feature count = 190
iteration: Max Depth = 10 Feature count = 200
iteration: Max Depth = 10 Feature count = 210
iteration: Max Depth = 10 Feature count = 220
iteration: Max Depth = 10 Feature count = 230
iteration: Max Depth = 10 Feature count = 240
iteration: Max Depth = 10 Feature count = 250
iteration: Max Depth = 12 Feature count = 150
iteration: Max Depth = 12 Feature count = 160
iteration: Max Depth = 12 Feature count = 170
iteration: Max Depth = 12 Feature count = 180
iteration: Max Depth = 12 Feature count = 190
iteration: Max Depth = 12 Feature count = 200
iteration: Max Depth = 12 Feature count = 210
iteration: Max Depth = 12 Feature count = 220
iteration: Max Depth = 12 Feature count = 230
iteration: Max Depth = 12 Feature count = 240
iteration: Max Depth = 12 Feature 

In [22]:
Results = pd.DataFrame({'Depth':Depth, 'Feature_count':Feature_count, 'OOB_Score':OOB_Score })

In [23]:
Results[Results['OOB_Score'] == Results['OOB_Score'].max()]

Unnamed: 0,Depth,Feature_count,OOB_Score
72,22,210,0.859521
83,24,210,0.859521
94,26,210,0.859521
105,28,210,0.859521
116,30,210,0.859521


## The best model which gives an out of bag error of around 85.95% has depth = 22 and Feature_count = 210

In [24]:
# Fitting the final model
rf = RandomForestRegressor(n_estimators = 1000, min_samples_leaf = 5,oob_score = True ,max_depth = 22, max_features = 210,random_state = 12345)
rf1 = rf.fit(features,labels)

In [25]:
feature_importances = pd.DataFrame(rf1.feature_importances_,
                                   index = input_df2.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)

In [26]:
feature_importances

Unnamed: 0,importance
OverallQual,0.377309
GrLivArea,0.130659
GarageCars,0.104271
ExterQual_TA,0.050616
TotalBsmtSF,0.045286
YearBuilt,0.030720
1stFlrSF,0.030338
GarageArea,0.028087
BsmtFinSF1,0.024839
BsmtQual_Ex,0.021240


## Feature Selection
We will only look at the features which have importance > 0.005 and take those in our final model. Quality scores are subjective and hence will not be considerd in our model.

The features are:
* GrLivArea
* GarageCars
* TotalBsmtSF
* YearBuilt
* 1stFlrSF
* GarageArea
* FullBath
* LotArea
* TotRmsAbvGrd
* Fireplaces

In [27]:
## Fitting another Random forest with the above features
final_feature_list = ['GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt', '1stFlrSF', 'GarageArea', 
                      'BsmtFinSF1', 'FullBath', 'LotArea', 'TotRmsAbvGrd', 'Fireplaces']

final_df = input_df[final_feature_list]

In [28]:
final_df2 = pd.get_dummies(final_df)

In [29]:
# Saving feature names for later use
feature_list = list(final_df2.columns)

# Convert to numpy array
features = np.array(final_df2)
features.shape

(1460, 11)

The final model has only ten total features. So, we will perform cross validation again to determine the optimal depth and feature count

In [30]:
# Running hyperparameter optimization
max_depth_iter =list(range(2,31,2))
max_features_iter = list(range(4,11,2))

Depth = []
Feature_count = []
OOB_Score = []

for i in max_depth_iter:
    for j in max_features_iter:
        rf = RandomForestRegressor(n_estimators = 1000, min_samples_leaf = 5,oob_score = True ,max_depth = i, max_features = j,random_state = 12345)
        Depth.append(i)
        Feature_count.append(j)
        res = rf.fit(features,labels).oob_score_
        OOB_Score.append(res)
        print("iteration: Max Depth = {} Feature count = {} Res = {}".format(i,j, res))

iteration: Max Depth = 2 Feature count = 4 Res = 0.6571218509157541
iteration: Max Depth = 2 Feature count = 6 Res = 0.6504092972186435
iteration: Max Depth = 2 Feature count = 8 Res = 0.6368426767625113
iteration: Max Depth = 2 Feature count = 10 Res = 0.6224986734206988
iteration: Max Depth = 4 Feature count = 4 Res = 0.7760914262200886
iteration: Max Depth = 4 Feature count = 6 Res = 0.7782295116535561
iteration: Max Depth = 4 Feature count = 8 Res = 0.7767392914182852
iteration: Max Depth = 4 Feature count = 10 Res = 0.7732285306292722
iteration: Max Depth = 6 Feature count = 4 Res = 0.8137324636674312
iteration: Max Depth = 6 Feature count = 6 Res = 0.8164511926779583
iteration: Max Depth = 6 Feature count = 8 Res = 0.8183835104924877
iteration: Max Depth = 6 Feature count = 10 Res = 0.8178613436689033
iteration: Max Depth = 8 Feature count = 4 Res = 0.823867632548483
iteration: Max Depth = 8 Feature count = 6 Res = 0.8262959637796593
iteration: Max Depth = 8 Feature count = 8 Res

In [31]:
Results = pd.DataFrame({'Depth':Depth, 'Feature_count':Feature_count, 'OOB_Score':OOB_Score })

In [32]:
Results[Results['OOB_Score'] == Results['OOB_Score'].max()]

Unnamed: 0,Depth,Feature_count,OOB_Score
35,18,10,0.831096
39,20,10,0.831096
43,22,10,0.831096
47,24,10,0.831096
51,26,10,0.831096
55,28,10,0.831096
59,30,10,0.831096


## Looking at the filtered set of features and running the model, we see that the best result is for depth = 18 and Feature count = 10. We get an OOB R2 value of 83.1% which is a good result 

In [38]:
# Fitting final model

rf = RandomForestRegressor(n_estimators = 1000, min_samples_leaf = 5,oob_score = True ,max_depth = 18,max_features = 10,random_state = 12345)
rf1 = rf.fit(features,labels)

In [39]:
feature_importances = pd.DataFrame(rf1.feature_importances_,
                                   index = final_df2.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
feature_importances

Unnamed: 0,importance
GarageCars,0.323435
GrLivArea,0.275172
YearBuilt,0.116943
TotalBsmtSF,0.090199
1stFlrSF,0.046398
BsmtFinSF1,0.045513
GarageArea,0.036441
LotArea,0.021877
Fireplaces,0.017689
FullBath,0.014489


In [40]:
import math
math.sqrt(sum((np.log(rf1.predict(features)) - np.log(labels))**2))

4.579873112942877