In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
from scipy.stats import skew
import seaborn as sns
import sklearn
import warnings
from sklearn.cross_validation import KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso



In [11]:
train_clean = pd.read_csv('train_clean.csv', dtype={'MSSubClass': str})  
train = pd.read_csv('train.csv', dtype={'MSSubClass': str})

Remove outlier records we previously identified as being harmful to the model

In [12]:
train_clean = train_clean[(train_clean.Id != 1299) & (train_clean.Id != 524)]

In [13]:
y_train = train_clean['SalePrice']
train_clean = train_clean.drop(['SalePrice', 'Id'], axis=1)

## Fit the model.

Run grid search to fit find the best shrinkage hyperparameter.

In [14]:
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import cross_val_score
def runGSAndGetRMSE(est, params):
    gs = GridSearchCV(est, param_grid= params, verbose=5, scoring='neg_mean_squared_error')
    gs.fit(train_clean, y_train)
    print(np.sqrt(gs.best_score_*-1))
    print(gs.best_params_)
    return gs.best_estimator_

In [19]:
from sklearn.ensemble import RandomForestRegressor
rfParams = {'n_estimators': [10,25,50], 'max_depth':list(np.arange(2,10,2))}
bestRf = runGSAndGetRMSE(RandomForestRegressor(), rfParams)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_depth=2, n_estimators=10 ....................................
[CV] .......... max_depth=2, n_estimators=10, score=-0.053405 -   0.1s
[CV] max_depth=2, n_estimators=10 ....................................
[CV] .......... max_depth=2, n_estimators=10, score=-0.058434 -   0.0s
[CV] max_depth=2, n_estimators=10 ....................................
[CV] .......... max_depth=2, n_estimators=10, score=-0.051154 -   0.0s
[CV] max_depth=2, n_estimators=25 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s


[CV] .......... max_depth=2, n_estimators=25, score=-0.055098 -   0.1s
[CV] max_depth=2, n_estimators=25 ....................................
[CV] .......... max_depth=2, n_estimators=25, score=-0.059147 -   0.1s
[CV] max_depth=2, n_estimators=25 ....................................
[CV] .......... max_depth=2, n_estimators=25, score=-0.049920 -   0.1s
[CV] max_depth=2, n_estimators=50 ....................................
[CV] .......... max_depth=2, n_estimators=50, score=-0.054375 -   0.2s
[CV] max_depth=2, n_estimators=50 ....................................
[CV] .......... max_depth=2, n_estimators=50, score=-0.060029 -   0.2s
[CV] max_depth=2, n_estimators=50 ....................................
[CV] .......... max_depth=2, n_estimators=50, score=-0.048684 -   0.2s
[CV] max_depth=4, n_estimators=10 ....................................
[CV] .......... max_depth=4, n_estimators=10, score=-0.031591 -   0.1s
[CV] max_depth=4, n_estimators=10 ....................................
[CV] .

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:    8.4s finished


0.146033913163
{'max_depth': 8, 'n_estimators': 25}


In [20]:
from sklearn.ensemble import RandomForestRegressor
rfParams = {'n_estimators': [25], 'max_depth':[7,8,9]}
bestRf = runGSAndGetRMSE(RandomForestRegressor(), rfParams)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] max_depth=7, n_estimators=25 ....................................
[CV] .......... max_depth=7, n_estimators=25, score=-0.021877 -   0.3s
[CV] max_depth=7, n_estimators=25 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] .......... max_depth=7, n_estimators=25, score=-0.024848 -   0.3s
[CV] max_depth=7, n_estimators=25 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[CV] .......... max_depth=7, n_estimators=25, score=-0.020075 -   0.3s
[CV] max_depth=8, n_estimators=25 ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.8s remaining:    0.0s


[CV] .......... max_depth=8, n_estimators=25, score=-0.021543 -   0.3s
[CV] max_depth=8, n_estimators=25 ....................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.1s remaining:    0.0s


[CV] .......... max_depth=8, n_estimators=25, score=-0.023766 -   0.3s
[CV] max_depth=8, n_estimators=25 ....................................
[CV] .......... max_depth=8, n_estimators=25, score=-0.019057 -   0.3s
[CV] max_depth=9, n_estimators=25 ....................................
[CV] .......... max_depth=9, n_estimators=25, score=-0.020597 -   0.3s
[CV] max_depth=9, n_estimators=25 ....................................
[CV] .......... max_depth=9, n_estimators=25, score=-0.024350 -   0.3s
[CV] max_depth=9, n_estimators=25 ....................................
[CV] .......... max_depth=9, n_estimators=25, score=-0.019115 -   0.3s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    2.7s finished


0.146129833306
{'max_depth': 9, 'n_estimators': 25}


In [21]:
from sklearn.ensemble import RandomForestRegressor
rfParams = {'n_estimators': [15, 25, 40], 'max_depth':[9]}
bestRf = runGSAndGetRMSE(RandomForestRegressor(), rfParams)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] max_depth=9, n_estimators=15 ....................................
[CV] .......... max_depth=9, n_estimators=15, score=-0.022637 -   0.2s
[CV] max_depth=9, n_estimators=15 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] .......... max_depth=9, n_estimators=15, score=-0.024062 -   0.2s
[CV] max_depth=9, n_estimators=15 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] .......... max_depth=9, n_estimators=15, score=-0.019114 -   0.2s
[CV] max_depth=9, n_estimators=25 ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.6s remaining:    0.0s


[CV] .......... max_depth=9, n_estimators=25, score=-0.021540 -   0.3s
[CV] max_depth=9, n_estimators=25 ....................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.0s remaining:    0.0s


[CV] .......... max_depth=9, n_estimators=25, score=-0.024148 -   0.3s
[CV] max_depth=9, n_estimators=25 ....................................
[CV] .......... max_depth=9, n_estimators=25, score=-0.018924 -   0.3s
[CV] max_depth=9, n_estimators=40 ....................................
[CV] .......... max_depth=9, n_estimators=40, score=-0.021035 -   0.5s
[CV] max_depth=9, n_estimators=40 ....................................
[CV] .......... max_depth=9, n_estimators=40, score=-0.022998 -   0.5s
[CV] max_depth=9, n_estimators=40 ....................................
[CV] .......... max_depth=9, n_estimators=40, score=-0.018222 -   0.5s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    3.3s finished


0.144055163702
{'max_depth': 9, 'n_estimators': 40}


In [22]:
from sklearn.ensemble import RandomForestRegressor
rfParams = {'n_estimators': [35, 40,45], 'max_depth':[9]}
bestRf = runGSAndGetRMSE(RandomForestRegressor(), rfParams)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] max_depth=9, n_estimators=35 ....................................
[CV] .......... max_depth=9, n_estimators=35, score=-0.021528 -   0.5s
[CV] max_depth=9, n_estimators=35 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV] .......... max_depth=9, n_estimators=35, score=-0.023285 -   0.5s
[CV] max_depth=9, n_estimators=35 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s


[CV] .......... max_depth=9, n_estimators=35, score=-0.018573 -   0.5s
[CV] max_depth=9, n_estimators=40 ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.4s remaining:    0.0s


[CV] .......... max_depth=9, n_estimators=40, score=-0.021346 -   0.6s
[CV] max_depth=9, n_estimators=40 ....................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.0s remaining:    0.0s


[CV] .......... max_depth=9, n_estimators=40, score=-0.024979 -   0.6s
[CV] max_depth=9, n_estimators=40 ....................................
[CV] .......... max_depth=9, n_estimators=40, score=-0.018512 -   0.8s
[CV] max_depth=9, n_estimators=45 ....................................
[CV] .......... max_depth=9, n_estimators=45, score=-0.020302 -   0.6s
[CV] max_depth=9, n_estimators=45 ....................................
[CV] .......... max_depth=9, n_estimators=45, score=-0.023225 -   0.7s
[CV] max_depth=9, n_estimators=45 ....................................
[CV] .......... max_depth=9, n_estimators=45, score=-0.018232 -   0.6s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    5.3s finished


0.143478807231
{'max_depth': 9, 'n_estimators': 45}
