In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import KFold

pd.options.display.max_columns = None
pd.set_option("display.max_rows", None, "display.max_columns", None)
sns.set_theme()
%matplotlib inline

# Data Preparation
#### Import
 - import cleaned dataset
 - extract target
 - drop cols not relevant to model


In [2]:
# Load the engineered dataset; the first CSV column becomes the row index.
homes = pd.read_csv('../data/engineered.csv', index_col=0)

# Set the target aside before it is removed from the feature frame.
prices = homes['SalePrice']

# Columns excluded from the feature frame (this includes the target itself).
dropcols = [
    'PID', 'SalePrice', 'MSSubClass', 'MoSold', 'YrSold', 'DateSold',
    'IsNearNegativeCondition', 'IsNearPositiveCondition',
    'Combine_BathroomsAbvGrd', 'Combine_BathroomsBsmt', 'AllBathrooms',
    'LogSalePrice', 'latitude', 'longitude', 'district',
]
homes = homes.drop(columns=dropcols)
homes.head()

Unnamed: 0,GrLivArea,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,SaleType,SaleCondition,IsPUD,NumFloors,Collapse_MSSubClass,Combine_Age,IsRenovated
0,856,RL,66.0,7890,2,0,4,Lvl,4,Corner,3,SWISU,Norm,Norm,1Fam,1Story,6,6,1939,1950,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,3,3,CBlock,3,3,1,3,238.0,1,0.0,618.0,856.0,GasA,3,Y,SBrkr,856,0,0,1.0,0.0,1,0,2,1,3,4,8,1,4,Detchd,1939.0,Unf,2.0,399.0,3,3,3,0,0,0,0,166,0,0,NoFence,NoMisc,0,WD,Normal,False,1.0,Traditional,60,True
1,1049,RL,42.0,4235,2,0,4,Lvl,4,Inside,3,Edwards,Norm,Norm,TwnhsE,1Story,5,5,1984,1984,Gable,CompShg,HdBoard,HdBoard,BrkFace,149.0,4,3,CBlock,4,3,2,6,552.0,5,393.0,104.0,1049.0,GasA,3,Y,SBrkr,1049,0,0,1.0,0.0,2,0,2,1,4,5,8,0,0,Attchd,1984.0,Fin,1.0,266.0,3,3,3,0,105,0,0,0,0,0,NoFence,NoMisc,0,WD,Normal,True,1.0,Traditional,25,False
2,1001,C (all),60.0,6060,2,0,4,Lvl,4,Inside,3,IDOTRR,Norm,Norm,1Fam,1Story,5,9,1930,2007,Hip,CompShg,MetalSd,MetalSd,,0.0,4,3,BrkTil,3,3,1,5,737.0,1,0.0,100.0,837.0,GasA,5,Y,SBrkr,1001,0,0,0.0,0.0,1,0,2,1,4,5,8,0,0,Detchd,1930.0,Unf,1.0,216.0,3,1,1,154,0,42,86,0,0,0,NoFence,NoMisc,0,WD,Normal,False,1.0,Traditional,0,True
3,1039,RL,80.0,8146,2,0,4,Lvl,4,Corner,3,OldTown,Norm,Norm,1Fam,2Story,4,8,1900,2003,Gable,CompShg,MetalSd,MetalSd,,0.0,4,4,BrkTil,2,3,1,1,0.0,1,0.0,405.0,405.0,GasA,4,Y,SBrkr,717,322,0,0.0,0.0,1,0,2,1,3,6,8,0,0,Detchd,1940.0,Unf,1.0,281.0,3,3,1,0,0,168,0,111,0,0,NoFence,NoMisc,0,WD,Normal,False,2.0,Traditional,6,True
4,1665,RL,70.0,8400,2,0,4,Lvl,4,Inside,3,NWAmes,Norm,Norm,1Fam,2Story,8,6,2001,2001,Gable,CompShg,VinylSd,VinylSd,,0.0,4,3,PConc,4,3,1,6,643.0,1,0.0,167.0,810.0,GasA,5,Y,SBrkr,810,855,0,1.0,0.0,2,1,3,1,4,6,8,0,0,Attchd,2001.0,Fin,2.0,528.0,3,3,3,0,45,0,0,0,0,0,NoFence,NoMisc,0,WD,Normal,False,2.0,Traditional,8,False


#### Label encode categorical features

In [3]:
# Columns that are label-encoded below.
catfeats = [
    'MSZoning', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating',
    'Electrical', 'GarageType', 'GarageFinish', 'Fence', 'MiscFeature',
    'SaleType', 'CentralAir', 'SaleCondition', 'IsPUD', 'Collapse_MSSubClass',
    'IsRenovated',
]

# A single encoder instance is re-fitted for every column, so after this cell
# it only remembers the classes of the last column in `catfeats`.
labelencoder = LabelEncoder()

# Integer codes for each categorical column, keyed by "<column>_Encoded".
encoded_columns = {
    f"{column}_Encoded": labelencoder.fit_transform(homes[column])
    for column in catfeats
}

# Append the encoded columns, then remove the original categorical ones.
homes = homes.assign(**encoded_columns).drop(columns=catfeats)
homes.head()

Unnamed: 0,GrLivArea,LotFrontage,LotArea,Street,Alley,LotShape,Utilities,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,LowQualFinSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageYrBlt,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,MiscVal,NumFloors,Combine_Age,MSZoning_Encoded,LandContour_Encoded,LotConfig_Encoded,Neighborhood_Encoded,Condition1_Encoded,Condition2_Encoded,BldgType_Encoded,HouseStyle_Encoded,RoofStyle_Encoded,RoofMatl_Encoded,Exterior1st_Encoded,Exterior2nd_Encoded,MasVnrType_Encoded,Foundation_Encoded,Heating_Encoded,Electrical_Encoded,GarageType_Encoded,GarageFinish_Encoded,Fence_Encoded,MiscFeature_Encoded,SaleType_Encoded,CentralAir_Encoded,SaleCondition_Encoded,IsPUD_Encoded,Collapse_MSSubClass_Encoded,IsRenovated_Encoded
0,856,66.0,7890,2,0,4,4,3,6,6,1939,1950,0.0,3,3,3,3,1,3,238.0,1,0.0,618.0,856.0,3,856,0,0,1.0,0.0,1,0,2,1,3,4,8,1,4,1939.0,2.0,399.0,3,3,3,0,0,0,0,166,0,0,0,1.0,60,5,3,0,21,2,2,0,2,1,0,13,14,2,1,1,3,5,3,4,1,9,1,4,0,2,1
1,1049,42.0,4235,2,0,4,4,3,5,5,1984,1984,149.0,4,3,4,3,2,6,552.0,5,393.0,104.0,1049.0,3,1049,0,0,1.0,0.0,2,0,2,1,4,5,8,0,0,1984.0,1.0,266.0,3,3,3,0,105,0,0,0,0,0,0,1.0,25,5,3,4,7,2,2,4,2,1,0,6,6,1,1,1,3,1,0,4,1,9,1,4,1,2,0
2,1001,60.0,6060,2,0,4,4,3,5,9,1930,2007,0.0,4,3,3,3,1,5,737.0,1,0.0,100.0,837.0,5,1001,0,0,0.0,0.0,1,0,2,1,4,5,8,0,0,1930.0,1.0,216.0,3,1,1,154,0,42,86,0,0,0,0,1.0,0,1,3,4,11,2,2,0,2,3,0,8,8,2,0,1,3,5,3,4,1,9,1,4,0,2,1
3,1039,80.0,8146,2,0,4,4,3,4,8,1900,2003,0.0,4,4,2,3,1,1,0.0,1,0.0,405.0,405.0,4,717,322,0,0.0,0.0,1,0,2,1,3,6,8,0,0,1940.0,1.0,281.0,3,3,1,0,0,168,0,111,0,0,0,2.0,6,5,3,0,20,2,2,0,5,1,0,8,8,2,0,1,3,5,3,4,1,9,1,4,0,2,1
4,1665,70.0,8400,2,0,4,4,3,8,6,2001,2001,0.0,4,3,4,3,1,6,643.0,1,0.0,167.0,810.0,5,810,855,0,1.0,0.0,2,1,3,1,4,6,8,0,0,2001.0,2.0,528.0,3,3,3,0,45,0,0,0,0,0,0,2.0,8,5,3,4,17,2,2,0,5,1,0,12,13,2,2,1,3,1,0,4,1,9,1,4,0,2,0


# Modeling - All Features


#### Naive Model

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    homes,
    prices,
    test_size=0.3,
    random_state=0,
)

In [17]:
def get_score(model, train=None, test=None):
    """Fit ``model`` and print its score on the train and test splits.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``score(X, y)`` methods
        Fitted in place by this call.
    train : tuple ``(X, y)``, optional
        Data used for fitting and for the "Train" score. Defaults to the
        notebook-level ``X_train`` / ``Y_train``.
    test : tuple ``(X, y)``, optional
        Data used for the "Test" score. Defaults to the notebook-level
        ``X_test`` / ``Y_test``.

    Returns
    -------
    None
        The two scores are printed, not returned.

    Notes
    -----
    The printed label says "Accuracy", but the number is whatever
    ``model.score`` returns; for scikit-learn regressors that is R^2.
    """
    # Fall back to the notebook globals only when no data is supplied, so
    # existing ``get_score(model)`` calls behave exactly as before.
    X_fit, Y_fit = (X_train, Y_train) if train is None else train
    X_eval, Y_eval = (X_test, Y_test) if test is None else test

    model.fit(X_fit, Y_fit)
    print('Train Accuracy: ', model.score(X_fit, Y_fit))
    print('Test Accuracy: ', model.score(X_eval, Y_eval))

In [10]:
# Baseline: a random forest with default hyperparameters and a fixed seed.
forest = RandomForestRegressor(random_state=0)

get_score(forest)

Train Accuracy:  0.9851904986631612
Test Accuracy:  0.9046183675840183


#### Hyperparameter Tuning - All Features

First Attempt - focus on depth and criterion
- R² scores are similar to the defaults (train ≈ 0.99, test ≈ 0.91) - still overfitting
- limiting max depth appears good
- `squared_error` seems like the better split criterion (it was chosen over `poisson`)
- having a very large forest doesn't provide much improvement

In [11]:
# First search space: forest size, split criterion, tree depth and the
# minimum number of samples a node needs before it may be split.
grid_para_forest = [
    {
        'n_estimators': range(100, 500, 50),
        # absolute_error is left out: sklearn doc says it is slow compared to others
        'criterion': ['squared_error', 'poisson'],
        'max_depth': range(10, 50, 10),
        'min_samples_split': range(2, 20, 2),
    }
]

# 5-fold cross-validated search, scored by R^2, run with n_jobs=-1.
grid_search_forest = GridSearchCV(
    estimator=forest,
    param_grid=grid_para_forest,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

get_score(grid_search_forest)

Train Accuracy:  0.9862523247269853
Test Accuracy:  0.9056004659340895


In [12]:
# Parameter setting that gave the best cross-validated result in the first search.
grid_search_forest.best_params_

{'criterion': 'squared_error',
 'max_depth': 20,
 'min_samples_split': 2,
 'n_estimators': 150}

In [33]:
# Best estimator found by the first grid search.
best_forest = grid_search_forest.best_estimator_

# One row per feature, sorted from highest to lowest importance. The row
# index keeps each feature's column position in `homes`.
forest_params1 = pd.DataFrame(
    {'Feature': homes.columns, 'Score': best_forest.feature_importances_}
).sort_values(by='Score', ascending=False)
forest_params1

Unnamed: 0,Feature,Score
8,OverallQual,0.5406479
0,GrLivArea,0.1388422
25,1stFlrSF,0.05457879
23,TotalBsmtSF,0.04061743
41,GarageArea,0.03536719
19,BsmtFinSF1,0.02485385
2,LotArea,0.01424035
13,ExterQual,0.01219379
12,MasVnrArea,0.01178312
10,YearBuilt,0.009661028


Second Attempt - Focus on features and splitting
- 150 trees (`n_estimators`) and a `max_depth` of 20 are still best (should I try a smaller depth?)
- limiting features appears to have a good result
- min_samples_split seems to have a positive effect

In [26]:
# Second search space: narrower depth range, plus leaf size and the number
# of features considered at each split.
grid_para_forest2 = [
    {
        'n_estimators': range(100, 300, 50),
        # absolute_error is left out: sklearn doc says it is slow compared to others
        'criterion': ['squared_error'],
        'max_depth': [20, 25, 30],
        'min_samples_split': [2, 5, 20],
        'min_samples_leaf': [2, 5, 10],
        'max_features': range(5, 80, 10),
    }
]

# 5-fold cross-validated search, scored by R^2, run with n_jobs=-1.
grid_search_forest2 = GridSearchCV(
    estimator=forest,
    param_grid=grid_para_forest2,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

get_score(grid_search_forest2)

Train Accuracy:  0.9760240772488377
Test Accuracy:  0.9120073306131777


In [27]:
# Parameter setting that gave the best cross-validated result in the second search.
grid_search_forest2.best_params_

{'criterion': 'squared_error',
 'max_depth': 20,
 'max_features': 25,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 150}

In [34]:
# Best estimator found by the second grid search (rebinds `best_forest`).
best_forest = grid_search_forest2.best_estimator_

# One row per feature, sorted from highest to lowest importance. The row
# index keeps each feature's column position in `homes`.
forest_params2 = pd.DataFrame(
    {'Feature': homes.columns, 'Score': best_forest.feature_importances_}
).sort_values(by='Score', ascending=False)
forest_params2

Unnamed: 0,Feature,Score
8,OverallQual,0.1997258
13,ExterQual,0.1267874
0,GrLivArea,0.1137751
40,GarageCars,0.08737853
41,GarageArea,0.05613286
10,YearBuilt,0.05502789
23,TotalBsmtSF,0.05361323
25,1stFlrSF,0.05025388
34,KitchenQual,0.02656012
19,BsmtFinSF1,0.02282199


Third attempt
- revisit lower max_depth with max_features
- look for more clarity in min_samples_split

In [35]:
# Third search space: wider depth range including shallower trees, a finer
# min_samples_split range and a tighter max_features range.
# min_samples_leaf is not searched in this round.
grid_para_forest3 = [
    {
        'n_estimators': range(100, 300, 50),
        # absolute_error is left out: sklearn doc says it is slow compared to others
        'criterion': ['squared_error'],
        'max_depth': [10, 15, 20, 25, 30],
        'min_samples_split': range(2, 10, 2),
        'max_features': range(10, 30, 5),
    }
]

# 5-fold cross-validated search, scored by R^2, run with n_jobs=-1.
grid_search_forest3 = GridSearchCV(
    estimator=forest,
    param_grid=grid_para_forest3,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

get_score(grid_search_forest3)

Train Accuracy:  0.9868194698859898
Test Accuracy:  0.9104944324716694


In [41]:
# Parameter setting that gave the best cross-validated result in the third search.
grid_search_forest3.best_params_

{'criterion': 'squared_error',
 'max_depth': 25,
 'max_features': 20,
 'min_samples_split': 2,
 'n_estimators': 200}

In [36]:
# Best estimator found by the third grid search (rebinds `best_forest`).
best_forest = grid_search_forest3.best_estimator_

# One row per feature, sorted from highest to lowest importance. The row
# index keeps each feature's column position in `homes`.
forest_params3 = pd.DataFrame(
    {'Feature': homes.columns, 'Score': best_forest.feature_importances_}
).sort_values(by='Score', ascending=False)
forest_params3

Unnamed: 0,Feature,Score
8,OverallQual,0.1986168
0,GrLivArea,0.1193489
13,ExterQual,0.1058613
40,GarageCars,0.06987671
23,TotalBsmtSF,0.0568927
10,YearBuilt,0.0551779
41,GarageArea,0.05329777
25,1stFlrSF,0.05305677
34,KitchenQual,0.02918973
19,BsmtFinSF1,0.02357946


Merge Estimator Results

In [40]:
# Line up the three importance tables on the feature name (inner joins).
# With pandas' default suffixes the columns come out as Score_x (first
# search), Score_y (second search) and Score (third search).
# NOTE(review): explicit per-search column names would be clearer; kept as-is
# in case later cells rely on the current names.
forest_params = (
    forest_params1
    .merge(forest_params2, on='Feature')
    .merge(forest_params3, on='Feature')
)
forest_params

Unnamed: 0,Feature,Score_x,Score_y,Score
0,OverallQual,0.5406479,0.1997258,0.1986168
1,GrLivArea,0.1388422,0.1137751,0.1193489
2,1stFlrSF,0.05457879,0.05025388,0.05305677
3,TotalBsmtSF,0.04061743,0.05361323,0.0568927
4,GarageArea,0.03536719,0.05613286,0.05329777
5,BsmtFinSF1,0.02485385,0.02282199,0.02357946
6,LotArea,0.01424035,0.01812988,0.01905011
7,ExterQual,0.01219379,0.1267874,0.1058613
8,MasVnrArea,0.01178312,0.0106639,0.009899292
9,YearBuilt,0.009661028,0.05502789,0.0551779
