# Project 2 - Ames Housing Data and Kaggle Challenge
Author: _Ritchie Kwan_

---



## Table of Contents
1. [EDA and Data Cleaning](01-EDA-and-Cleaning.ipynb)
2. [Preprocessing and Feature Engineering](02-Preprocessing-and-Feature-Engineering.ipynb)
3. [Modeling Benchmarks](03-Model-Benchmarks.ipynb)  
4. [Model Tuning](#Model-Tuning)
5. [Production Model and Insights](05-Production-Model-and-Insights.ipynb)  
 

### Import Libraries

In [86]:
import numpy as np
import pandas as pd

from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression
# NOTE(review): sklearn.grid_search was removed in scikit-learn 0.20; on newer
# releases GridSearchCV must be imported from sklearn.model_selection instead.
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

### Load Data

In [3]:
# Read the full training set, its train/test split, and the Kaggle scoring set.
df, df_train, df_test, df_kaggle = map(
    pd.read_csv,
    (
        '../datasets/train_processed.csv',
        '../datasets/train_split_processed.csv',
        '../datasets/test_split_processed.csv',
        '../datasets/kaggle_processed.csv',
    ),
)

### Define Predictors and Target

In [4]:
# 'saleprice' is the target; every other column is a predictor.
X = df.loc[:, df.columns != 'saleprice']
y = df['saleprice']

X_train = df_train.loc[:, df_train.columns != 'saleprice']
y_train = df_train['saleprice']

X_test = df_test.loc[:, df_test.columns != 'saleprice']
y_test = df_test['saleprice']

# The Kaggle frame has no target to drop. It is restricted to the predictor
# columns of the test split (same names, same order); any other column it
# carries is left out (presumably e.g. its 'id' column -- confirm).
X_kaggle = df_kaggle[list(X_test.columns)]



## Model Tuning

To improve our model, we will use the regularization properties of `Lasso`, `Ridge`, and `ElasticNet`.

### Define a function to compare cross-validation scores of different models

In [5]:
def compare_r2(X_train, X_test, y_train, y_test, model_type = 'linear'):
    '''
    Fit one regression model and summarise its R2 scores.

    Arguments:
    X_train : Features the model is fitted on
    X_test : Held-out features
    y_train : True target for X_train
    y_test : True target for X_test
    model_type : 'linear', 'ridge', 'lasso' or 'elasticnet' (case insensitive)

    The model is fitted on the training data and scored on both sets.
    A shuffled 10-fold cross-validation (random_state = 42) is then run
    separately on the training data and on the test data, using the
    estimator's default scorer.

    Return:
    DataFrame with a single 'Score' column whose rows are
    'Train', 'Test' : R2 of the fitted model on each set
    'Train CV', 'Test CV' : mean cross-validation score on each set
    'Model' : the fitted estimator
    'alpha', 'coef' : only for ridge, lasso and elasticnet
    'l1_ratio' : only for elasticnet

    Prints a message and returns None when model_type is not recognised.
    '''

    # case insensitive
    kind = model_type.lower()

    def _alpha_grid():
        # candidate regularisation strengths shared by ridge and elasticnet
        return np.logspace(0, 5, 200)

    # builders are lambdas so that only the requested estimator is created
    builders = {
        'linear' : lambda: LinearRegression(),
        'ridge' : lambda: RidgeCV(alphas = _alpha_grid(), scoring = 'r2'),
        'lasso' : lambda: LassoCV(),
        'elasticnet' : lambda: ElasticNetCV(alphas = _alpha_grid(),
                                            l1_ratio = [.1, .5, .9, 1]),
    }

    if kind not in builders:
        print('Invalid model_type. Try linear, ridge, lasso, or elasticnet')
        return None

    # instantiate and fit
    estimator = builders[kind]().fit(X_train, y_train)

    # K-Folds Cross Validation
    folds = KFold(n_splits = 10, shuffle = True, random_state = 42)

    # keys are inserted in the order the report rows are expected
    summary = {
        'Train' : r2_score(y_train, estimator.predict(X_train)),
        'Test' : r2_score(y_test, estimator.predict(X_test)),
        'Train CV' : cross_val_score(estimator, X_train, y_train, cv = folds).mean(),
        'Test CV' : cross_val_score(estimator, X_test, y_test, cv = folds).mean(),
        'Model' : estimator,
    }

    # the regularised models also report what their internal search selected
    if kind != 'linear':
        summary['alpha'] = estimator.alpha_
        summary['coef'] = estimator.coef_
    if kind == 'elasticnet':
        summary['l1_ratio'] = estimator.l1_ratio_

    # return output as a DataFrame
    return pd.DataFrame({'Score' : summary})

### Linear

In [6]:
lm_comp = compare_r2(X_train, X_test, y_train, y_test, 'Linear')

In [7]:
lm = lm_comp.loc['Model', 'Score']
lm_comp

Unnamed: 0,Score
Model,"LinearRegression(copy_X=True, fit_intercept=Tr..."
Test,0.858923
Test CV,0.716141
Train,0.955155
Train CV,0.878245


In [8]:
# Pair each predictor with its linear-regression coefficient, largest first.
lm_feature_coefs = (
    pd.DataFrame({'feature' : X_train.columns, 'coef' : lm.coef_})
      .sort_values(by = 'coef', ascending = False)
)
lm_feature_coefs.head(10)

Unnamed: 0,feature,coef
3,gr_liv_area,20975.77481
1,overall_qual,15522.154684
11,year_remod/add,11797.834978
0,neighborhood,9673.633827
6,total_bsmt_sf,9393.944347
21,lot_area,7864.788463
116,bsmt_qual total_bsmt_sf,7704.321072
17,bsmtfin_sf_1,7652.803427
96,gr_liv_area total_bsmt_sf,7244.913784
56,overall_qual 1st_flr_sf,6520.820542


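Interaction features have the most negative coefficients in this model. A negative coefficient on an interaction term does not necessarily mean that one of its component features is negatively correlated with sale price: the components (e.g. `gr_liv_area`) also appear as main effects with large positive coefficients, so the interaction term may simply be offsetting them. 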

In [9]:
lm_feature_coefs.tail(10)

Unnamed: 0,feature,coef
187,garage_area half_bath,-4670.987884
74,exter_qual kitchen_qual,-5118.835017
163,total_bsmt_sf fireplaces,-5497.083228
208,1st_flr_sf mas_vnr_area,-5697.975926
119,bsmt_qual 1st_flr_sf,-6067.949684
120,bsmt_qual year_built,-6100.273411
98,gr_liv_area garage_cars,-6147.698186
222,year_built mas_vnr_area,-7184.258403
104,gr_liv_area totrms_abvgrd,-7930.910394
99,gr_liv_area 1st_flr_sf,-12268.809155


### Ridge

In [10]:
ridge_comp = compare_r2(X_train, X_test, y_train, y_test, 'Ridge')

In [11]:
ridge = ridge_comp.loc['Model', 'Score']
ridge_comp

Unnamed: 0,Score
Model,"RidgeCV(alphas=array([1.00000e+00, 1.05956e+00..."
Test,0.914675
Test CV,0.897936
Train,0.947838
Train CV,0.904751
alpha,153.437
coef,"[7164.693699458214, 11663.574692696051, 3927.8..."


In [12]:
# Pair each predictor with its ridge coefficient, largest first.
ridge_feature_coefs = (
    pd.DataFrame({'feature' : X_train.columns, 'coef' : ridge.coef_})
      .sort_values(by = 'coef', ascending = False)
)
ridge_feature_coefs.head(10)

Unnamed: 0,feature,coef
3,gr_liv_area,12278.318269
1,overall_qual,11663.574693
0,neighborhood,7164.693699
21,lot_area,5723.416202
6,total_bsmt_sf,5699.757921
17,bsmtfin_sf_1,5697.497098
11,year_remod/add,5642.711125
16,fireplaces,4601.727282
23,half_bath,4212.172551
5,kitchen_qual,4105.454967


In [13]:
ridge_feature_coefs.tail(10)

Unnamed: 0,feature,coef
209,1st_flr_sf totrms_abvgrd,-1552.815326
232,year_built half_bath,-1568.969016
320,lot_area half_bath,-1627.974851
298,bsmtfin_sf_1 wood_deck_sf,-1670.444418
99,gr_liv_area 1st_flr_sf,-1938.356655
128,bsmt_qual open_porch_sf,-1989.191189
208,1st_flr_sf mas_vnr_area,-2241.58114
323,half_bath^2,-2301.175707
74,exter_qual kitchen_qual,-2481.73894
270,totrms_abvgrd garage_yr_blt,-2488.815088


### Lasso

In [14]:
lasso_comp = compare_r2(X_train, X_test, y_train, y_test, 'Lasso')

In [15]:
lasso = lasso_comp.loc['Model', 'Score']
lasso_comp

Unnamed: 0,Score
Model,"LassoCV(alphas=None, copy_X=True, cv=None, eps..."
Test,0.920037
Test CV,0.913841
Train,0.933021
Train CV,0.91364
alpha,989.016
coef,"[7163.404451022859, 15991.30717835233, 2478.11..."


### Feature Selection

The features with the largest coefficients under `Lasso` are the strongest predictors of sale price.

In [16]:
# Pair each predictor with its lasso coefficient, largest first.
lasso_feature_coefs = (
    pd.DataFrame({'feature' : X_train.columns, 'coef' : lasso.coef_})
      .sort_values(by = 'coef', ascending = False)
)
lasso_feature_coefs.head(10)

Unnamed: 0,feature,coef
3,gr_liv_area,20850.957544
1,overall_qual,15991.307178
17,bsmtfin_sf_1,7307.88537
0,neighborhood,7163.404451
6,total_bsmt_sf,6922.192705
11,year_remod/add,6444.974805
21,lot_area,5628.517402
10,year_built,5189.339285
95,gr_liv_area kitchen_qual,4276.964196
5,kitchen_qual,3932.596174


In [17]:
lasso_feature_coefs.tail(10)

Unnamed: 0,feature,coef
318,lot_area^2,-102.534329
275,totrms_abvgrd lot_frontage,-151.287245
294,fireplaces bsmt_full_bath,-179.045896
188,garage_cars^2,-201.175553
298,bsmtfin_sf_1 wood_deck_sf,-211.383502
299,bsmtfin_sf_1 lot_frontage,-496.50487
323,half_bath^2,-574.743599
225,year_built fireplaces,-599.061055
314,lot_frontage^2,-668.786164
261,mas_vnr_area fireplaces,-919.38613


### Elastic Net

`ElasticNetCV` took too long to fit with this search grid, so the cells below are left commented out.

In [18]:
# enet_comp = compare_r2(X_train, X_test, y_train, y_test, 'ElasticNet')

In [19]:
# enet = enet_comp.loc['Model', 'Score']
# enet_comp

In [20]:
# enet_feature_coefs = pd.DataFrame({'feature' : X_train.columns, 'coef' : enet.coef_})
# enet_feature_coefs = enet_feature_coefs.sort_values(by = 'coef', ascending = False)
# enet_feature_coefs.head(10)

### Predictions

In [21]:
# Predict sale prices for the Kaggle set with each fitted model
# (linear, ridge, lasso -- in that order).
y_kaggle_linear_pred, y_kaggle_ridge_pred, y_kaggle_lasso_pred = (
    fitted.predict(X_kaggle) for fitted in (lm, ridge, lasso)
)


### BONUS: Pipeline

In [51]:
# Lasso regression wrapped in a preprocessing pipeline. The step names below
# are the prefixes used to address each step's parameters
# ('<step name>__<parameter>') when tuning the pipeline.
# Lasso must be imported from sklearn.linear_model for this cell to run.
steps = [
    # filter out low-variance features (threshold 0.05)
    ('var_thresh', VarianceThreshold(0.05)),
    # standardise what is left before expanding it
    ('ss', StandardScaler()),
    # polynomial / interaction terms, without the constant bias column
    ('poly', PolynomialFeatures(include_bias = False)),
    # keep the 5 expanded features that score highest under f_regression
    ('kbest', SelectKBest(f_regression, k = 5)),
    # final estimator
    ('lasso', Lasso())
]

pipe = Pipeline(steps = steps)


In [52]:
pipe = pipe.fit(X_train, y_train)

In [53]:
pipe.score(X_train, y_train)

0.832290675627192

In [54]:
pipe.score(X_test, y_test)

0.8265861441339105

### BONUS: GridSearch

In [57]:
# Search grid; each keyword is '<pipeline step name>__<parameter>'.
# NOTE(review): the recorded output of the fit below lists lasso__alpha as
# 850..890, which np.arange(800, 850, 10) does not produce -- confirm which
# range is intended.
params = dict(
    var_thresh__threshold = [0.05, 0.1, 0.25],
    kbest__k = [9, 15, 21],
    lasso__alpha = np.arange(800, 850, 10),
)

gs = GridSearchCV(estimator = pipe, param_grid = params)

In [58]:
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('var_thresh', VarianceThreshold(threshold=0.05)), ('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)), ('kbest', SelectKBest(k=5, score_func=<function f_regre...=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'var_thresh__threshold': [0.05, 0.1, 0.25], 'kbest__k': [9, 15, 21], 'lasso__alpha': array([850, 860, 870, 880, 890])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [59]:
gs.score(X_train, y_train)

0.8920710273008724

In [60]:
gs.score(X_test, y_test)

0.8842753238310664

#### Look at best parameters

In [61]:
gs.best_estimator_

Pipeline(memory=None,
     steps=[('var_thresh', VarianceThreshold(threshold=0.05)), ('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)), ('kbest', SelectKBest(k=15, score_func=<function f_regression at 0x1a1cbc00d0>)), ('lasso', Lasso(alpha=850, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [62]:
gs.best_params_

{'kbest__k': 15, 'lasso__alpha': 850, 'var_thresh__threshold': 0.05}

In [78]:
gs.best_estimator_

Pipeline(memory=None,
     steps=[('var_thresh', VarianceThreshold(threshold=0.05)), ('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)), ('kbest', SelectKBest(k=15, score_func=<function f_regression at 0x1a1cbc00d0>)), ('lasso', Lasso(alpha=850, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [71]:
y_kaggle_gs_pred = gs.predict(X_kaggle)

### Write predictions to CSV file

In [83]:
# Submission tables: the 'id' column of df_kaggle next to each model's
# predicted sale price, under the headers 'Id' and 'SalePrice'.
ridge_predictions = pd.DataFrame(
    {'Id' : df_kaggle['id'], 'SalePrice' : y_kaggle_ridge_pred},
    columns = ['Id', 'SalePrice'],
)

lasso_predictions = pd.DataFrame(
    {'Id' : df_kaggle['id'], 'SalePrice' : y_kaggle_lasso_pred},
    columns = ['Id', 'SalePrice'],
)



In [84]:
# Write one submission file per model; index = False keeps the DataFrame
# index out of the CSV so only the 'Id' and 'SalePrice' columns are written.
ridge_predictions.to_csv('../datasets/predictions_ridge.csv', index = False)
lasso_predictions.to_csv('../datasets/predictions_lasso.csv', index = False)