# [ Random Forest Regressor ]

## 1. Import Dataset 

In [70]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings(action="ignore")

In [207]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(train.shape, test.shape)

(1460, 81) (1459, 80)


In [72]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## 2. Data Preprocessing

### data overview

In [73]:
import pandas_profiling

pandas_profiling.ProfileReport(train)



#### NA Imputation

In [208]:
dropcol = ['Id','Alley','Fence','FireplaceQu','MiscFeature','PoolQC']
train.drop(dropcol, axis=1, inplace=True)
test.drop(dropcol, axis=1, inplace=True)

In [209]:
# NA imputation ( mode )
NAmode = ['BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','BsmtQual','BsmtFullBath','BsmtHalfBath','Exterior1st','Exterior2nd','Functional','KitchenQual','MSZoning','GarageCond','GarageFinish','GarageQual','GarageType','MasVnrType']
for col in NAmode:
    train[col].fillna(train[col].mode(), inplace=True)
    test[col].fillna(test[col].mode(), inplace=True)
    
# Na imputation ( mean )
NAmean = ['BsmtFinSF1','BsmtUnfSF','BsmtFinSF2','GarageArea','GarageCars','GarageYrBlt','LotFrontage','MasVnrArea','TotalBsmtSF',]
for col in NAmean:
    train[col].fillna(train[col].mean(), inplace=True)
    test[col].fillna(test[col].mean(), inplace=True)

#### dummy variables

In [210]:
for col in train.dtypes[train.dtypes =='object'].index:
    for_dummy = train.pop(col)
    train = pd.concat([train, pd.get_dummies(for_dummy, prefix=col)], axis=1)
    
for col in test.dtypes[test.dtypes =='object'].index:
    for_dummy = test.pop(col)
    test = pd.concat([test, pd.get_dummies(for_dummy, prefix=col)], axis=1)

In [211]:
missing_cols = set( train.columns ) - set( test.columns )
for cols in missing_cols:
    test[cols] = 0

test.drop('SalePrice',axis=1,inplace=True)

In [219]:
test['BsmtFullBath'].fillna(0,inplace=True)
test['BsmtHalfBath'].fillna(0,inplace=True)

## 3. Modeling

In [221]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [222]:
X_train = train.iloc[:,train.columns!='SalePrice']
y_train = train[['SalePrice']]

In [223]:
clf = RandomForestRegressor()
clf.fit(X_train,y_train.values.ravel())

score = np.sqrt(-cross_val_score(clf, X_train, y_train.values.ravel(), cv=10,scoring = 'neg_mean_squared_error'))

In [224]:
score.mean()

30059.01674394802

In [225]:
param_grid = { "n_estimators" : [20,40,60,80],  "max_depth" : [5,10,15,20,25] }
grid = GridSearchCV(clf, param_grid, n_jobs=-1, cv=10)
grid.fit(X_train, y_train.values.ravel())

print(grid.best_score_ , grid.best_params_)

0.8655263390587464 {'max_depth': 20, 'n_estimators': 80}


In [226]:
clf2 = RandomForestRegressor(n_estimators=80,max_depth=20)
clf2.fit(X_train,y_train.values.ravel())

score2 = np.sqrt(-cross_val_score(clf2,X_train,y_train.values.ravel(),cv=10, scoring='neg_mean_squared_error'))

In [228]:
score2.mean()

29358.329180610308

In [229]:
clf2.predict(test)

array([121403.25      , 150172.69166667, 179016.125     , ...,
       156032.4875    , 113338.4375    , 233626.95      ])