In [40]:
import sys
sys.path.append('..')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import CustomPipeline as cp
from sklearn.model_selection import GridSearchCV, KFold

In [41]:
housingData = pd.read_csv('./data/Ames_Housing_Price_Data.csv')
housingData.drop('Unnamed: 0', axis=1, inplace=True)
train, test = np.split(housingData.sample(frac=1, random_state=42), [int(.9*len(housingData))])
train.to_csv('./data/trainData.csv')
test.to_csv('./data/testData.csv')
trainData=cp.clean('./data/trainData.csv')
testData=cp.clean('./data/testData.csv')
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

features=list(trainData.columns)
features.remove("PID")
features.remove("SalePrice")
XTrain=trainData[features]
yTrain=np.log(trainData['SalePrice'])
XTest=testData[features]
yTest=np.log(testData['SalePrice'])

In [43]:
from sklearn.ensemble import RandomForestRegressor 
rf=RandomForestRegressor(n_estimators = 100, random_state = 42) 
rf.fit(XTrain,yTrain)

RandomForestRegressor(random_state=42)

In [44]:
feat_imp = pd.Series(rf.feature_importances_, index=XTrain.columns)
feat_imp.sort_values(ascending=False)

OverallQual             5.543526e-01
GrLivArea               8.482950e-02
GarageArea              6.543389e-02
GarageCars              4.347122e-02
TotalBsmtSF             3.775953e-02
1stFlrSF                3.078101e-02
BsmtFinSF1              1.359797e-02
FullBath                1.297862e-02
LotArea                 1.234731e-02
YearBuilt               1.046886e-02
YearRemodAdd            9.830200e-03
OverallCond             9.722761e-03
GarageType              6.745590e-03
BsmtQual                6.415922e-03
GarageYrBlt             6.276332e-03
LotFrontage             6.153536e-03
BsmtFinType1            5.566699e-03
OpenPorchSF             5.463243e-03
BsmtUnfSF               4.852099e-03
MasVnrArea              4.635824e-03
2ndFlrSF                4.063824e-03
FireplaceQu             3.755354e-03
MSZoning_RM             3.470030e-03
MoSold                  2.981522e-03
MSSubClass_30           2.637849e-03
WoodDeckSF              2.601651e-03
Fireplaces              2.160113e-03
G

In [37]:
pred=rf.predict(XTest)

In [58]:
metrics.r2_score(yTest, pred)

0.97681988719092

In [45]:
params = {'ccp_alpha': [0,0.5,1],'n_estimators':[10,50,100],'min_samples_leaf':[1,2,5,10]}

cv = KFold(n_splits=5, shuffle=True)

grid = GridSearchCV(estimator=rf,param_grid=params,cv=cv,return_train_score=True)
grid.fit(XTrain,yTrain)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             estimator=RandomForestRegressor(random_state=42),
             param_grid={'ccp_alpha': [0, 0.5, 1],
                         'min_samples_leaf': [1, 2, 5, 10],
                         'n_estimators': [10, 50, 100]},
             return_train_score=True)

In [50]:
grid.best_params_

{'ccp_alpha': 0, 'min_samples_leaf': 2, 'n_estimators': 1000}

In [51]:
rfmod=RandomForestRegressor(n_estimators = 100, random_state = 42, ccp_alpha=0, min_samples_leaf=2) 
rfmod.fit(XTrain,yTrain)

RandomForestRegressor(ccp_alpha=0, min_samples_leaf=2, random_state=42)

In [52]:
feat_imp = pd.Series(rfmod.feature_importances_, index=XTrain.columns)
feat_imp.sort_values(ascending=False)

OverallQual             5.621970e-01
GrLivArea               8.495340e-02
GarageArea              6.228725e-02
GarageCars              4.362516e-02
TotalBsmtSF             3.802971e-02
1stFlrSF                3.052583e-02
BsmtFinSF1              1.342684e-02
FullBath                1.305381e-02
LotArea                 1.229462e-02
YearRemodAdd            1.099934e-02
YearBuilt               1.036058e-02
OverallCond             9.583586e-03
GarageType              6.750060e-03
GarageYrBlt             6.599581e-03
BsmtQual                6.415483e-03
LotFrontage             6.228683e-03
BsmtUnfSF               5.383676e-03
BsmtFinType1            5.352621e-03
OpenPorchSF             4.560968e-03
MasVnrArea              4.550900e-03
2ndFlrSF                4.130774e-03
MSZoning_RM             3.422180e-03
MoSold                  3.161457e-03
FireplaceQu             2.777805e-03
MSSubClass_30           2.654687e-03
WoodDeckSF              2.609680e-03
Fireplaces              2.249711e-03
G

In [53]:
predmod=rfmod.predict(XTest)

In [56]:
metrics.r2_score(yTest, predmod)
metrics.mean_squared_error(yTest,predmod)

0.010430539808264234