In [9]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
from scipy.stats import skew
import seaborn as sns
import sklearn
import pdb
import warnings
from sklearn.cross_validation import KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge

In [14]:
train_clean = pd.read_csv('train_clean.csv', dtype={'MSSubClass': str})  
train = pd.read_csv('train.csv', dtype={'MSSubClass': str})

#### This notebook follows the stacking approach explained [here](http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/).

Remove outlier records we previously identified as being harmful to the model

In [15]:
train_clean = train_clean[(train_clean.Id != 1299) & (train_clean.Id != 524)]

Split the data into five folds

In [16]:
div = len(train_clean) // 5
train_clean = train_clean.assign(fold=0)
train_shuf = train_clean.sample(frac=1).copy()

for i in range(1, 6):
    start = div*i - div
    end = div*i
    train_shuf.iloc[start:end,-1]= i #fold is added as last column, so we can use -1 to select it
train_shuf.iloc[end:, -1] = 5   #left overs go in last fold

print(train_shuf.fold.head())
train_shuf.fold.tail()

930    1
89     1
122    1
769    1
111    1
Name: fold, dtype: int64


1263    5
1212    5
890     5
1393    5
1112    5
Name: fold, dtype: int64

We loop through the folds we created. Inside the loop, we fit each base model on all the folds except the current fold, then make predictions on the current fold. These predictions will be the features we feed into our meta-model later. This process of dividing into folds prevents leakage of sales price information into these features.

Note that hyperparameters for our base models have been determined by expermentation in other notebooks in this project.

In [18]:
test_folds = []
for i in train_shuf.fold.unique():
    test = train_shuf[train_shuf.fold == i]
    tr = train_shuf[train_shuf.fold != i]
    #tr = train_clean[train_shuf.fold != i]
    X_tr =  tr.drop(['SalePrice', 'Id', 'fold'], axis=1)
    X_test =  test.drop(['SalePrice', 'Id', 'fold'], axis=1)
    y_tr = tr.loc[:,'SalePrice']    

    ls = Lasso(alpha=0.001)
    ls.fit(X_tr, y_tr)
    test.loc[:,'m1'] = ls.predict(X_test)
    
    params = {'eta':0.01, 'max_depth':4, 'min_child_weight':4, 'subsample':0.4, 'colsample_bytree':0.8, 'scale_pos_weight':1}
    dtr = xgb.DMatrix(X_tr, y_tr)
    dtest = xgb.DMatrix(X_test)
    num_rounds = 1334
    bst = xgb.train(params, dtr, num_rounds)
    test.loc[:,'m2'] = bst.predict(dtest)
    
    knn = KNeighborsRegressor(n_neighbors=8)
    knn.fit(X_tr, y_tr)
    test.loc[:,'m3'] = knn.predict(X_test)
    
                    
                    
    rf= RandomForestRegressor(max_depth=9, n_estimators=45)
    rf.fit(X_tr, y_tr)
    test.loc[:,'m4'] = rf.predict(X_test)

    hb= HuberRegressor(epsilon=1.25)
    hb.fit(X_tr, y_tr)
    test.loc[:,'m5'] = hb.predict(X_test)

    r= Ridge(alpha=15)
    r.fit(X_tr, y_tr)
    test.loc[:, 'm6'] = r.predict(X_test)    
    
    test_folds.append(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the document

In [19]:
tr_meta = pd.concat(test_folds)

In [24]:
tr_meta.iloc[:, -6:].head()

Unnamed: 0,m1,m2,m3,m4,m5,m6
930,12.304979,12.310223,12.199615,12.392673,12.33994,12.262643
89,11.686624,11.698879,11.588229,11.684445,11.643924,11.699539
122,11.86114,11.878616,11.813866,11.829899,11.870404,11.862354
769,13.032097,13.070211,12.840238,12.886018,13.068641,13.123636
111,12.117168,12.068261,12.235435,12.021047,12.130108,12.132389


In [28]:
test_set = pd.read_csv('test_clean.csv', dtype={'MSSubClass': str})
ids = test_set.Id.values
test_set = test_set.drop('Id', axis=1)


We fit each of our base models on the entire training set, then use these models to make predictions on the test set. Once we've trained our meta-model, we will feed these predictions into it as features in order to get the housing prices for the test set that we will submit to Kaggle. 

In [29]:
X_tr =  train_clean.drop(['SalePrice', 'fold', 'Id'], axis=1)
y_tr = train_clean['SalePrice']    

ls = Lasso(alpha=0.001)
ls.fit(X_tr, y_tr)

knn = KNeighborsRegressor(n_neighbors=8)
knn.fit(X_tr, y_tr)
                    
rf= RandomForestRegressor(max_depth=9, n_estimators=45)
rf.fit(X_tr, y_tr)

hb= HuberRegressor(epsilon=1.25)
hb.fit(X_tr,y_tr)

r = Ridge(alpha=15)
r.fit(X_tr, y_tr)

params = {'eta':0.01, 'max_depth':4, 'min_child_weight':4, 'subsample':0.4, 'colsample_bytree':0.8, 'scale_pos_weight':1}
dtr = xgb.DMatrix(X_tr, y_tr)
dtest = xgb.DMatrix(test_set)
num_rounds = 1334
bst = xgb.train(params, dtr, num_rounds)


m1 = ls.predict(test_set)
m2 = bst.predict(dtest)
m3 = knn.predict(test_set)
m4 = rf.predict(test_set)
m5 = hb.predict(test_set)
m6 = r.predict(test_set)


test_set.loc[:, 'm1'] = m1
test_set.loc[:, 'm2'] = m2
test_set.loc[:, 'm3'] = m3
test_set.loc[:, 'm4'] = m4
test_set.loc[:, 'm5'] = m5
test_set.loc[:, 'm6'] = m6


In [32]:
test_set.iloc[:, -6:].head()

Unnamed: 0,m1,m2,m3,m4,m5,m6
0,11.704515,11.759861,11.804009,11.749297,11.699601,11.684129
1,11.933021,11.98443,11.829794,11.937045,11.966786,11.918219
2,12.107182,12.105458,12.156579,12.114277,12.116783,12.126022
3,12.206209,12.13735,12.21596,12.096477,12.234749,12.19152
4,12.181544,12.173636,12.087263,12.209609,12.158725,12.176629


To get an idea of how our meta-model is performing, we loop through all of our folds, using the current fold as test set and the rest of the folds as train set. We fit our meta-model (SVR) on the train set and make predictions on the test set, then compare our predictions with the actual sales prices to get our score.

In [34]:
scores = []
for i in tr_meta.fold.unique():
    test = tr_meta[tr_meta.fold == i]
    tr = tr_meta[tr_meta.fold != i]
    X_tr =  tr[['m1',  'm2', 'm3', 'm4', 'm5', 'm6']]
    X_test =  test[['m1', 'm2', 'm3', 'm4', 'm5', 'm6']]
    y_test = test['SalePrice']
    y_tr = tr['SalePrice']    
    
    sv = SVR(kernel='linear', C=.4)
    sv.fit(X_tr, y_tr)
    preds = sv.predict(X_test)
    score = np.sqrt(np.mean((y_test.values - preds) **2))
    print(score)
    scores.append(score)
print('mean score: {}'.format(np.mean(scores)))

0.107248999412
0.111884210576
0.1070539283
0.0993367889917
0.115702654343
mean score: 0.10824531632464432


Performance looks reasonable, so we fit our meta-model on all of the data created by our base-models, then make predictions to submit.

In [43]:
X_tr =  tr_meta[['m1', 'm2', 'm3', 'm4', 'm5', 'm6']]
X_test =  test_set[['m1', 'm2', 'm3', 'm4', 'm5', 'm6']]
y_tr = tr_meta['SalePrice']
sv = SVR(kernel='linear', C=.4)
sv.fit(X_tr, y_tr)    

preds = np.expm1(sv.predict(X_test))

solution = pd.DataFrame({"id":ids, "SalePrice":preds}, columns=['id', 'SalePrice'])
solution.to_csv("stack_sv_refac.csv", index = False)

Comparing our solution to a previous one as a sanity check.

In [44]:
pd.read_csv('stack_mult.csv').head()

Unnamed: 0,id,SalePrice
0,1461,123935.827919
1,1462,154055.4083
2,1463,184739.131214
3,1464,196656.755681
4,1465,191362.929464


In [45]:
solution.head()

Unnamed: 0,id,SalePrice
0,1461,122932.308682
1,1462,154215.9301
2,1463,183381.564393
3,1464,194679.457561
4,1465,194963.189982
