In [27]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
from scipy.stats import skew
import seaborn as sns
import sklearn
import warnings
from sklearn.cross_validation import KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso

In [58]:
train_clean = pd.read_csv('train_clean.csv', dtype={'MSSubClass': str})  
train = pd.read_csv('train.csv', dtype={'MSSubClass': str})

In [59]:
coef_df = pd.read_csv('coef_df.csv')


Remove outlier records we previously identified as being harmful to the model

In [60]:
train_clean = train_clean[(train_clean.Id != 1299) & (train_clean.Id != 524)]

In [61]:
y_train = train_clean['SalePrice']
train_clean = train_clean.drop(['SalePrice', 'Id'], axis=1)

In [62]:
to_expand = list(coef_df.feature[:30].values)

In [63]:
train_no_expand = train_clean.drop(to_expand, axis=1)

In [64]:
train_to_expand = train_clean[to_expand]

In [65]:
poly = PolynomialFeatures(2)
train_to_expand = poly.fit_transform(train_to_expand)

In [66]:
X_train = np.hstack([train_to_expand, train_no_expand.values])

In [67]:
X_train.shape

(1458, 886)

from sklearn.decomposition import PCA

#X_train = train_clean.drop(['SalePrice', 'Id'], axis=1)
pca = PCA()
pca.fit(to_expand)

ratios = np.cumsum(pca.explained_variance_ratio_)

ratios[ratios < 0.98].shape

pca = PCA(100)
X_train = pca.fit_transform(X_train)


feat_cols = coef_df.feature.values[:50]

In [68]:
kf = KFold(len(X_train), 4, shuffle=True)

In [69]:
val_idx_list = []
val_preds = []
train_preds = []
val_errors = []
train_rmse = []
val_rmse = []




for tr_idx, val_idx in kf:
    X_tr, y_tr = X_train[tr_idx, :], y_train.iloc[tr_idx]
    X_val, y_val = X_train[val_idx, :], y_train.iloc[val_idx]
    val_idx_list.append(val_idx)
    ls = Lasso(alpha=.001, max_iter=10000) #the hyperparameter we found to be best
    ls.fit(X_tr, y_tr)
    preds = ls.predict(X_val)
    val_preds.append(preds)
    val_errors.append(np.abs(y_val.values - preds))
    tr_preds = ls.predict(X_tr)
    train_preds.append(tr_preds)
    train_score = np.mean(np.sqrt((y_tr.values - tr_preds)**2))
    train_rmse.append(train_score)
    print('train score: {}'.format(train_score))
    val_score = np.mean(np.sqrt((y_val.values - preds)**2))
    val_rmse.append(val_score)
    print('val_score score: {}'.format(val_score))
    
print("mean val error: {}".format(np.mean(val_rmse)))
print("mean train error: {}".format(np.mean(train_rmse)))


train score: 0.06738257733913645
val_score score: 0.07205926037681104
train score: 0.06352207689177738
val_score score: 0.08151550546928997
train score: 0.06474725668629422
val_score score: 0.08095462131958743
train score: 0.06700755169456961
val_score score: 0.07371962799507446
mean val error: 0.07706225379019073
mean train error: 0.06566486565294442


## Fit the model.

Run grid search to fit find the best shrinkage hyperparameter.

In [70]:
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import cross_val_score
def runGSAndGetRMSE(est, params):
    gs = GridSearchCV(est, param_grid= params, scoring='neg_mean_squared_error')
    gs.fit(X_train, y_train)
    rmse = np.mean(np.sqrt(-cross_val_score(gs.best_estimator_, X_train,y_train, scoring="neg_mean_squared_error", cv = 5))) 
    print('mean rmse: {}'.format(rmse))
    print('best alpha: {}'.format(gs.best_params_['alpha']))
    return gs.best_estimator_

In [71]:
from sklearn.linear_model import Lasso
lassoParams = {'alpha': [0.001, .01, 0.1]}
bestLassoEst = runGSAndGetRMSE(Lasso(max_iter=10000), lassoParams)

mean rmse: 0.11195550003012626
best alpha: 0.001


Fit our tuned model on all the data prior to submission.

In [72]:
bestLassoEst.fit(X_train, y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=10000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

Make predictions on the test set and undo our log transform so that the values will be on their original scale.

In [81]:
test = pd.read_csv('test_clean.csv', dtype={'MSSubClass': str})

In [82]:
ids = test.Id.values
test = test.drop('Id', axis=1)
test_to_expand = test[to_expand]
test_to_expand = poly.transform(test_to_expand)
test = test.drop(to_expand, axis=1)

test = np.hstack([test_to_expand, test.values])

In [83]:
preds = np.expm1(bestLassoEst.predict(test))
solution = pd.DataFrame({"id":ids, "SalePrice":preds}, columns=['id', 'SalePrice'])

solution.to_csv("lasso_expand_35.csv", index = False)

This one scores .12046

Let's save the model so we can use it later

In [115]:
from sklearn.externals import joblib
joblib.dump(bestLassoEst, 'poly_features_35_2.pkl')

['poly_features_35_2.pkl']