In [38]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None
pd.set_option("display.max_rows", None, "display.max_columns", None)
sns.set_theme()
%matplotlib inline

In [39]:
housing = pd.read_csv("ames_house_cleaned.csv")
housing.head()

Unnamed: 0.1,Unnamed: 0,PID,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtUnfSF,Heating,HeatingQC,CentralAir,Electrical,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,NumbrFlrs,HouseAge,Remodeled,LogPrice,AllBathrooms,TotalLivArea,TotalOutdoorSF,HasPool
0,1,909176150,126000,1-STORY 1945 & OLDER,RL,60,7890,2,0,4,Lvl,4,Corner,1,Southwest of ISU,Norm,Norm,1Fam,1Story,6,6,1939,1950,Gable,CompShg,Wd Sdng,Wd Sdng,,0,3,3,CBlock,3,3,1,3,1,618,GasA,3,Y,SBrkr,2,1,3,4,8,1,4,Detchd,1939,1,2,399,3,3,2,0,0,No Misc,0,Mar,2010,Warranty Deed,Normal,1,71,1,11.744037,2.0,1094,166,0
1,2,905476230,139500,1-STORY PUD 1946 & NEWER,RL,42,4235,2,0,4,Lvl,4,Inside,1,Edwards,Norm,Norm,TwnhsE,1Story,5,5,1984,1984,Gable,CompShg,HdBoard,HdBoard,BrkFace,149,4,3,CBlock,4,3,2,6,5,104,GasA,3,Y,SBrkr,2,1,4,5,8,0,0,Attchd,1984,3,1,266,3,3,2,0,0,No Misc,0,Feb,2009,Warranty Deed,Normal,1,26,0,11.84582,3.0,1994,105,0
2,3,911128020,124900,1-STORY 1945 & OLDER,C (all),60,6060,2,0,4,Lvl,4,Inside,1,"Iowa DOT,Rail Road",Norm,Norm,1Fam,1Story,5,9,1930,2007,Hip,CompShg,MetalSd,MetalSd,,0,4,3,BrkTil,3,3,1,5,1,100,GasA,5,Y,SBrkr,2,1,4,5,8,0,0,Detchd,1930,1,1,216,3,1,0,0,0,No Misc,0,Nov,2007,Warranty Deed,Normal,1,80,1,11.735269,1.0,1738,282,0
3,4,535377150,114000,2-STORY 1945 & OLDER,RL,80,8146,2,0,4,Lvl,4,Corner,1,Old Town,Norm,Norm,1Fam,2Story,4,8,1900,2003,Gable,CompShg,MetalSd,MetalSd,,0,4,4,BrkTil,2,3,1,1,1,405,GasA,4,Y,SBrkr,2,1,3,6,8,0,0,Detchd,1940,1,1,281,3,3,0,0,0,No Misc,0,May,2009,Warranty Deed,Normal,2,110,1,11.643954,1.0,1039,279,0
4,5,534177230,227000,2-STORY 1946 & NEWER,RL,70,8400,2,0,4,Lvl,4,Inside,1,Northwest Ames,Norm,Norm,1Fam,2Story,8,6,2001,2001,Gable,CompShg,VinylSd,VinylSd,,0,4,3,PConc,4,3,1,6,1,167,GasA,5,Y,SBrkr,3,1,4,6,8,0,0,Attchd,2001,3,2,528,3,3,2,0,0,No Misc,0,Nov,2009,Warranty Deed,Normal,2,9,0,12.332705,3.5,2308,45,0


In [40]:
housing.drop(['Unnamed: 0'],inplace=True, axis=1)

In [41]:
house_price = housing.SalePrice
log_price = housing.LogPrice

In [42]:
X = housing[[
    'TotalLivArea','TotalOutdoorSF','MasVnrArea','LotFrontage',
    'LotArea','GarageArea','BedroomAbvGr','TotRmsAbvGrd',
    'AllBathrooms','GarageCars','LotShape','YearBuilt',
    'YearRemodAdd','HouseAge','OverallQual','ExterQual','BsmtQual',
    'OverallCond','FireplaceQu','KitchenQual','SaleCondition',
    'Neighborhood','YrSold','CentralAir','Fireplaces',
    'MSSubClass','MSZoning'
]]

X.head()

Unnamed: 0,TotalLivArea,TotalOutdoorSF,MasVnrArea,LotFrontage,LotArea,GarageArea,BedroomAbvGr,TotRmsAbvGrd,AllBathrooms,GarageCars,LotShape,YearBuilt,YearRemodAdd,HouseAge,OverallQual,ExterQual,BsmtQual,OverallCond,FireplaceQu,KitchenQual,SaleCondition,Neighborhood,YrSold,CentralAir,Fireplaces,MSSubClass,MSZoning
0,1094,166,0,60,7890,399,2,4,2.0,2,4,1939,1950,71,6,3,3,6,4,3,Normal,Southwest of ISU,2010,Y,1,1-STORY 1945 & OLDER,RL
1,1994,105,149,42,4235,266,2,5,3.0,1,4,1984,1984,26,5,4,4,5,0,4,Normal,Edwards,2009,Y,0,1-STORY PUD 1946 & NEWER,RL
2,1738,282,0,60,6060,216,2,5,1.0,1,4,1930,2007,80,5,4,3,9,0,4,Normal,"Iowa DOT,Rail Road",2007,Y,0,1-STORY 1945 & OLDER,C (all)
3,1039,279,0,80,8146,281,2,6,1.0,1,4,1900,2003,110,4,4,2,8,0,3,Normal,Old Town,2009,Y,0,2-STORY 1945 & OLDER,RL
4,2308,45,0,70,8400,528,3,6,3.5,2,4,2001,2001,9,8,4,4,6,0,4,Normal,Northwest Ames,2009,Y,0,2-STORY 1946 & NEWER,RL


In [43]:
X.dtypes

TotalLivArea        int64
TotalOutdoorSF      int64
MasVnrArea          int64
LotFrontage         int64
LotArea             int64
GarageArea          int64
BedroomAbvGr        int64
TotRmsAbvGrd        int64
AllBathrooms      float64
GarageCars          int64
LotShape            int64
YearBuilt           int64
YearRemodAdd        int64
HouseAge            int64
OverallQual         int64
ExterQual           int64
BsmtQual            int64
OverallCond         int64
FireplaceQu         int64
KitchenQual         int64
SaleCondition      object
Neighborhood       object
YrSold              int64
CentralAir         object
Fireplaces          int64
MSSubClass         object
MSZoning           object
dtype: object

In [44]:
feats_to_encode = X.columns[X.dtypes==object].tolist() 

In [49]:
feats_to_encode

['SaleCondition', 'Neighborhood', 'CentralAir', 'MSSubClass', 'MSZoning']

In [46]:
#label encoding
labelencoder = LabelEncoder()

for i in feats_to_encode:
    X[i+"_Encoded"] = labelencoder.fit_transform(X[i])

X.drop(feats_to_encode,axis=1,inplace=True)
X.head()

Unnamed: 0,TotalLivArea,TotalOutdoorSF,MasVnrArea,LotFrontage,LotArea,GarageArea,BedroomAbvGr,TotRmsAbvGrd,AllBathrooms,GarageCars,LotShape,YearBuilt,YearRemodAdd,HouseAge,OverallQual,ExterQual,BsmtQual,OverallCond,FireplaceQu,KitchenQual,YrSold,Fireplaces,SaleCondition_Encoded,Neighborhood_Encoded,CentralAir_Encoded,MSSubClass_Encoded,MSZoning_Encoded
0,1094,166,0,60,7890,399,2,4,2.0,2,4,1939,1950,71,6,3,3,6,4,3,2010,1,3,24,1,3,5
1,1994,105,149,42,4235,266,2,5,3.0,1,4,1984,1984,26,5,4,4,5,0,4,2009,0,3,7,1,5,5
2,1738,282,0,60,6060,216,2,5,1.0,1,4,1930,2007,80,5,4,3,9,0,4,2007,0,3,11,1,3,1
3,1039,279,0,80,8146,281,2,6,1.0,1,4,1900,2003,110,4,4,2,8,0,3,2009,0,3,20,1,9,5
4,2308,45,0,70,8400,528,3,6,3.5,2,4,2001,2001,9,8,4,4,6,0,4,2009,0,3,19,1,10,5


In [50]:
#create scoring functions 

def get_score(model):

    model.fit(X_train,Y_train)
    print('Train Accuracy: ', model.score(X_train,Y_train))
    print('Test Accuracy: ', model.score(X_test,Y_test))

def get_score_log(model):

    model.fit(X_train_log,Y_train_log)
    print('Train Accuracy (log): ', model.score(X_train_log,Y_train_log))
    print('Test Accuracy (log): ', model.score(X_test_log,Y_test_log))

In [51]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, house_price, test_size=0.3, random_state=0)
X_train_log, X_test_log, Y_train_log, Y_test_log = train_test_split(X, log_price, random_state=0, test_size=0.3)


In [52]:
forest = RandomForestRegressor()
forest.set_params(random_state=0)

score_model(forest)

Train Score:  0.9866945301649861
Test Score:  0.9116174256174505


In [57]:
mse = np.sqrt(mean_squared_error(y_test, forest.predict(X_test)))
rmse = mse**0.5
print(mse)
print(rmse)

190451.77392454626
436.4078069014649


In [53]:
get_score_log(forest)

Train Accuracy (log):  0.9856190563170815
Test Accuracy (log):  0.9150622684151759


In [55]:
log_mse = np.sqrt(mean_squared_error(Y_test_log,forest.predict(X_test_log)))
log_rmse = log_mse**0.5
print(log_mse)
print(log_rmse)

0.11142318020884018
0.3338011087591534


In [22]:
#rf = RandomForestRegressor(n_estimators = 300, max_features = 'sqrt', max_depth =5, random_state=18).fit(X_train, y_train)

In [58]:
forest_importances = pd.DataFrame(forest.feature_importances_, index=X.columns).reset_index().sort_values(by=0,ascending=False)
forest_importances.rename({'index':'Feature',0:'RF_Score'},inplace=True, axis=1)
forest_importances

Unnamed: 0,Feature,RF_Score
14,OverallQual,0.491657
0,TotalLivArea,0.271497
5,GarageArea,0.048001
4,LotArea,0.026861
24,CentralAir_Encoded,0.014154
13,HouseAge,0.013753
11,YearBuilt,0.01324
8,AllBathrooms,0.012748
3,LotFrontage,0.011873
12,YearRemodAdd,0.01171


In [61]:
# grid_para_forest = [{
#     'n_estimators' : range(100,400,50),
#     'criterion' : ['mean_squared_error'],  
#     'max_depth' : range(15,25,5),
#     'min_samples_split' : range(2,10,2),
#     'max_features' : range(15,30,5)
# }]

# grid_search_forest = GridSearchCV(forest, grid_para_forest, scoring='r2', cv=5, n_jobs=-1)

# %time get_score_log(grid_search_forest)

In [None]:
grid_search_forest.best_params_

In [28]:
from datetime import datetime

In [32]:
rf_grid = {
    'n_estimators' : [200,300,400,500],
    'max_features' : ['sqrt','log2'],
    'max_depth' : [3,4,5,6,7],
    'random_state': [18]
}

## show start time
print(datetime.now())

## Grid Search function
CV_rfr = GridSearchCV(estimator=RandomForestRegressor(), param_grid=rf_grid, cv= 5)
CV_rfr.fit(X_train, y_train)

## show end time
print(datetime.now())

#grid_search_forest = GridSearchCV(forest, grid_para_forest, scoring='r2', cv=5, n_jobs=-1)

score_model(CV_rfr)

2022-10-14 15:52:23.571169
2022-10-14 15:54:27.954597
Train Score:  0.9397493819924866
Test Score:  0.896835485336888


In [None]:
grid_forest = {
    'n_estimators' : [100,150,200,250,300,350,400,450,500],
    'max_depth' : [10,15,20,25,30,35,40,45,50],
    'random_state': [18]
}

## show start time
print(datetime.now())

## Grid Search function
grid_search_forest = GridSearchCV(estimator=RandomForestRegressor(), param_grid=grid_forest, cv= 5)
grid_search_forest.fit(X_train, y_train)

## show end time
print(datetime.now())

score_model(grid_search_forest)

In [None]:
grid_search_forest.best_params_

In [None]:
best_forest = grid_search_forest.best_estimator_

forest_params1 = pd.DataFrame(best_forest.feature_importances_, index=homes.columns).reset_index().sort_values(by=0,ascending=False)
forest_params1.rename({'index':'Feature',0:'Score'},inplace=True, axis=1)
forest_params1