# Feature Selection on the 111101011 dataset

In this notebook we examine the result of dropping subsets of features that are linearly dependent or correlated to some degree.  We're specifically working on the dataset formed by dropping (31, 496, 524, 534, 917, 1183, 1299), since that resulted in the 4th lowest validation error.

In [30]:
import itertools
import numpy as np
import pandas as pd

# Display floats with 20 digits of precision and allow up to 100 characters
# per cell before pandas truncates the rendered value.
pd.set_option('display.precision',20)
pd.set_option('display.max_colwidth',100)

from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_predict, KFold, cross_val_score, \
                                    GridSearchCV, RandomizedSearchCV, ShuffleSplit 
from time import time
from scipy.stats import randint as sp_randint

import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
from matplotlib import pyplot
# Default matplotlib figure size as (width, height).
rcParams['figure.figsize'] = 12, 4
%matplotlib inline

In [31]:
# Goodness-of-fit metric used to compare models.
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Computed as the square root of `mean_squared_error(y_true, y_pred)`.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [32]:
# Cross-validation sets: 10 unshuffled folds.
# `random_state` is intentionally not passed. It only has an effect when
# shuffle=True, and scikit-learn >= 0.24 raises ValueError when it is set
# while shuffle is False. Omitting it leaves the fold assignment unchanged.
kfold = KFold(n_splits=10)

# We are using LassoLarsCV as part of our metric
lr = linear_model.LassoLarsCV(
    verbose=False,
    max_iter=5000,
    precompute='auto',
    cv=kfold,
    max_n_alphas=1000,
    n_jobs=-1,
)

In [33]:
# Load the tidy training table for the 111101011 dataset from ./input.
df = pd.read_csv("./input/train_tidy_111101011.csv")

In [34]:
# One seeded random split that holds out 20% of the rows for validation.
ss = ShuffleSplit(
    test_size=0.20,
    n_splits=1,
    random_state=573,
)

In [35]:
# Array form of the full frame; passed to ss.split to generate row indices.
X = df.values

In [36]:
# ss is configured with n_splits=1, so the generator yields a single
# (train, validation) pair of positional index arrays; take it directly.
train_idx, validation_idx = next(ss.split(X))
train_df = df.iloc[train_idx]
validation_df = df.iloc[validation_idx]

We will establish a baseline by keeping all features.

In [37]:
# Columns removed from the design matrix in every fit: the response itself
# plus HouseId and the two garage-age columns.
excluded_cols = ['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin']

# Training design matrix and response.
x_train = train_df.drop(excluded_cols, axis=1).values
y_train = train_df['SalePrice'].values

# Validation design matrix and response.
x_validation = validation_df.drop(excluded_cols, axis=1).values
y_validation = validation_df['SalePrice'].values

# Fit on the training split and score on the held-out split.
lr.fit(x_train, y_train)
y_pred = lr.predict(x_validation)
baseline = rmse(y_validation, y_pred)
baseline



0.10515807312252047

## Features

We have a collection of features. Some were identified as potentially low-quality predictors of the response; others are known to be highly correlated with other features. We want to identify subsets of features that we can drop to improve the regression.

In [38]:
# Full list of candidate columns to consider dropping (order preserved).
drop_cands = [
    'LotFrontage',
    'LotArea',
    'BsmtUnfSF',
    'LowQualFinSF',
    'LogGrLivArea',
    'GrLivArea',
    'TotalHouseArea',
    'LivArea',
    'LivAreaWt',
    'AllSizesSum',
    'AllSizesSumLin',
    'AreasSum',
    'X1stFlrSF',
    'X1stLin',
    'X2ndFlrSF',
    'X2ndLin',
    'TotalBath',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'Age',
    'AgeLin',
    'RemodAgeLin',
    'RemodAge',
    'MasVnrArea',
    'MasVnrAreaLin',
    'DeckPorchLin',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    'X3SsnPorch',
    'ScreenPorch',
]

In [39]:
# Pairwise correlation matrix restricted to the candidate columns.
corr_df = df.loc[:, drop_cands].corr()

In [40]:
# Keep only entries above 0.75 and blank the rest to NaN. The comparison is
# on the signed value, so strong negative correlations are not shown.
corr_df.where(corr_df > 0.75)

Unnamed: 0,LotFrontage,LotArea,BsmtUnfSF,LowQualFinSF,LogGrLivArea,GrLivArea,TotalHouseArea,LivArea,LivAreaWt,AllSizesSum,...,RemodAgeLin,RemodAge,MasVnrArea,MasVnrAreaLin,DeckPorchLin,WoodDeckSF,OpenPorchSF,EnclosedPorch,X3SsnPorch,ScreenPorch
LotFrontage,1.0,,,,,,,,,,...,,,,,,,,,,
LotArea,,1.0,,,,,,,,0.9860637239724525,...,,,,,,,,,,
BsmtUnfSF,,,1.0,,,,,,,,...,,,,,,,,,,
LowQualFinSF,,,,1.0,,,,,,,...,,,,,,,,,,
LogGrLivArea,,,,,1.0,0.9757273479067332,0.8605893722090229,0.7637333250153961,0.9188398216109024,,...,,,,,,,,,,
GrLivArea,,,,,0.9757273479067332,1.0,0.8355955585785779,0.767251955830606,0.9137756066094788,,...,,,,,,,,,,
TotalHouseArea,,,,,0.8605893722090229,0.8355955585785779,1.0,0.8024916912584044,0.8759542307405424,,...,,,,,,,,,,
LivArea,,,,,0.7637333250153961,0.767251955830606,0.8024916912584044,1.0,0.9457503620141537,,...,,,,,,,,,,
LivAreaWt,,,,,0.9188398216109024,0.9137756066094788,0.8759542307405424,0.9457503620141537,1.0,,...,,,,,,,,,,
AllSizesSum,,0.9860637239724525,,,,,,,,1.0,...,,,,,,,,,,


We'll restrict our drop set to the highly correlated features to make this more readable.

In [41]:
# Reduced candidate list (order preserved).
drop_cands = [
    'LotArea',
    'LogGrLivArea',
    'GrLivArea',
    'TotalHouseArea',
    'LivArea',
    'LivAreaWt',
    'AllSizesSum',
    'AreasSum',
    'X1stFlrSF',
    'X1stLin',
    'TotalBath',
    'FullBath',
    'MasVnrArea',
    'MasVnrAreaLin',
]

In [42]:
# Pairwise correlation matrix restricted to the reduced candidate list.
corr_df = df.loc[:, drop_cands].corr()

In [43]:
# Keep only entries above 0.75 and blank the rest to NaN. The comparison is
# on the signed value, so strong negative correlations are not shown.
corr_df.where(corr_df > 0.75)

Unnamed: 0,LotArea,LogGrLivArea,GrLivArea,TotalHouseArea,LivArea,LivAreaWt,AllSizesSum,AreasSum,X1stFlrSF,X1stLin,TotalBath,FullBath,MasVnrArea,MasVnrAreaLin
LotArea,1.0,,,,,,0.9860637239724525,,,,,,,
LogGrLivArea,,1.0,0.9757273479067332,0.8605893722090229,0.7637333250153961,0.9188398216109024,,0.8318550567154617,,,,,,
GrLivArea,,0.9757273479067332,1.0,0.8355955585785779,0.767251955830606,0.9137756066094788,,0.8198966698406291,,,,,,
TotalHouseArea,,0.8605893722090229,0.8355955585785779,1.0,0.8024916912584044,0.8759542307405424,,0.932335556586879,,,,,,
LivArea,,0.7637333250153961,0.767251955830606,0.8024916912584044,1.0,0.9457503620141537,,0.7950261426742665,,,,,,
LivAreaWt,,0.9188398216109024,0.9137756066094788,0.8759542307405424,0.9457503620141537,1.0,,0.8613257811627024,,,,,,
AllSizesSum,0.9860637239724525,,,,,,1.0,,,,,,,
AreasSum,,0.8318550567154617,0.8198966698406291,0.932335556586879,0.7950261426742665,0.8613257811627024,,1.0,,,,,,
X1stFlrSF,,,,,,,,,1.0,0.9790291234372676,,,,
X1stLin,,,,,,,,,0.9790291234372676,1.0,,,,


Let's first compare the pairs.

In [44]:
# First pair to evaluate; these two columns correlate at about 0.986.
drop_cands = [
    'LotArea',
    'AllSizesSum',
]

In [45]:
# Columns excluded from the design matrix in every fit (same list the
# baseline fit uses); hoisted so it is built once rather than per subset.
non_feature_cols = ['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin']

# Refit the model once per subset of drop_cands, including the empty subset,
# and score it on the validation split. Results are collected as records and
# turned into a DataFrame in one step instead of enlarging a frame row by row.
records = []
for subset_size in range(0, len(drop_cands) + 1):
    for subset in itertools.combinations(drop_cands, subset_size):
        drop_cols = non_feature_cols + list(subset)
        x_train = train_df.drop(drop_cols, axis=1).values
        x_validation = validation_df.drop(drop_cols, axis=1).values
        lr.fit(x_train, y_train)
        y_pred = lr.predict(x_validation)
        error = rmse(y_validation, y_pred)
        records.append({
            'Dropped': str(subset),
            'RMSE': error,
            # Negative values mean the subset beats the all-features baseline.
            'Diff from Base': error - baseline,
        })

col_drop_results_df = pd.DataFrame(records, columns=['Dropped', 'RMSE', 'Diff from Base'])
# Best (lowest validation RMSE) subsets first.
output_df = col_drop_results_df.sort_values(['RMSE'])



In [46]:
output_df

Unnamed: 0,Dropped,RMSE,Diff from Base
3,"('LotArea', 'AllSizesSum')",0.1047098773643123,-0.000448195758208
1,"('LotArea',)",0.1048985506468915,-0.0002595224756288
0,(),0.1051580731225204,0.0
2,"('AllSizesSum',)",0.1055859712672646,0.0004278981447442


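We should keep these features: even the best case (dropping both) lowers the validation RMSE by only about 0.00045 relative to the baseline.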

In [47]:
# Next pair to evaluate; these two columns correlate at about 0.979.
drop_cands = [
    'X1stFlrSF',
    'X1stLin',
]

In [48]:
# Columns excluded from the design matrix in every fit (same list the
# baseline fit uses); hoisted so it is built once rather than per subset.
non_feature_cols = ['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin']

# Refit the model once per subset of drop_cands, including the empty subset,
# and score it on the validation split. Results are collected as records and
# turned into a DataFrame in one step instead of enlarging a frame row by row.
records = []
for subset_size in range(0, len(drop_cands) + 1):
    for subset in itertools.combinations(drop_cands, subset_size):
        drop_cols = non_feature_cols + list(subset)
        x_train = train_df.drop(drop_cols, axis=1).values
        x_validation = validation_df.drop(drop_cols, axis=1).values
        lr.fit(x_train, y_train)
        y_pred = lr.predict(x_validation)
        error = rmse(y_validation, y_pred)
        records.append({
            'Dropped': str(subset),
            'RMSE': error,
            # Negative values mean the subset beats the all-features baseline.
            'Diff from Base': error - baseline,
        })

col_drop_results_df = pd.DataFrame(records, columns=['Dropped', 'RMSE', 'Diff from Base'])
# Best (lowest validation RMSE) subsets first.
output_df = col_drop_results_df.sort_values(['RMSE'])



In [49]:
output_df

Unnamed: 0,Dropped,RMSE,Diff from Base
1,"('X1stFlrSF',)",0.1046325227774257,-0.0005255503450947
2,"('X1stLin',)",0.1047327552100857,-0.0004253179124347
3,"('X1stFlrSF', 'X1stLin')",0.1048181741654083,-0.0003398989571121
0,(),0.1051580731225204,0.0


In [50]:
# Next pair to evaluate: the two bath-count columns.
drop_cands = [
    'TotalBath',
    'FullBath',
]

In [51]:
# Columns excluded from the design matrix in every fit (same list the
# baseline fit uses); hoisted so it is built once rather than per subset.
non_feature_cols = ['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin']

# Refit the model once per subset of drop_cands, including the empty subset,
# and score it on the validation split. Results are collected as records and
# turned into a DataFrame in one step instead of enlarging a frame row by row.
records = []
for subset_size in range(0, len(drop_cands) + 1):
    for subset in itertools.combinations(drop_cands, subset_size):
        drop_cols = non_feature_cols + list(subset)
        x_train = train_df.drop(drop_cols, axis=1).values
        x_validation = validation_df.drop(drop_cols, axis=1).values
        lr.fit(x_train, y_train)
        y_pred = lr.predict(x_validation)
        error = rmse(y_validation, y_pred)
        records.append({
            'Dropped': str(subset),
            'RMSE': error,
            # Negative values mean the subset beats the all-features baseline.
            'Diff from Base': error - baseline,
        })

col_drop_results_df = pd.DataFrame(records, columns=['Dropped', 'RMSE', 'Diff from Base'])
# Best (lowest validation RMSE) subsets first.
output_df = col_drop_results_df.sort_values(['RMSE'])



In [52]:
output_df

Unnamed: 0,Dropped,RMSE,Diff from Base
1,"('TotalBath',)",0.1043992942588775,-0.0007587788636429
3,"('TotalBath', 'FullBath')",0.1044426874149492,-0.0007153857075711
2,"('FullBath',)",0.1048985506468915,-0.0002595224756289
0,(),0.1051580731225204,0.0


The gain from dropping both of these isn't big enough to justify dropping a predictor that we'd consider strong by hedonic reasoning.

In [53]:
# Next pair to evaluate: the two MasVnrArea columns.
drop_cands = [
    'MasVnrArea',
    'MasVnrAreaLin',
]

In [54]:
# Columns excluded from the design matrix in every fit (same list the
# baseline fit uses); hoisted so it is built once rather than per subset.
non_feature_cols = ['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin']

# Refit the model once per subset of drop_cands, including the empty subset,
# and score it on the validation split. Results are collected as records and
# turned into a DataFrame in one step instead of enlarging a frame row by row.
records = []
for subset_size in range(0, len(drop_cands) + 1):
    for subset in itertools.combinations(drop_cands, subset_size):
        drop_cols = non_feature_cols + list(subset)
        x_train = train_df.drop(drop_cols, axis=1).values
        x_validation = validation_df.drop(drop_cols, axis=1).values
        lr.fit(x_train, y_train)
        y_pred = lr.predict(x_validation)
        error = rmse(y_validation, y_pred)
        records.append({
            'Dropped': str(subset),
            'RMSE': error,
            # Negative values mean the subset beats the all-features baseline.
            'Diff from Base': error - baseline,
        })

col_drop_results_df = pd.DataFrame(records, columns=['Dropped', 'RMSE', 'Diff from Base'])
# Best (lowest validation RMSE) subsets first.
output_df = col_drop_results_df.sort_values(['RMSE'])



In [55]:
output_df

Unnamed: 0,Dropped,RMSE,Diff from Base
2,"('MasVnrAreaLin',)",0.1046325227774257,-0.0005255503450947
3,"('MasVnrArea', 'MasVnrAreaLin')",0.1046883937455733,-0.000469679376947
1,"('MasVnrArea',)",0.1046885137798311,-0.0004695593426893
0,(),0.1051580731225204,0.0


In [56]:
# Group of six columns whose pairwise correlations all exceed 0.75 in the
# thresholded correlation table.
drop_cands = [
    'LogGrLivArea',
    'GrLivArea',
    'TotalHouseArea',
    'LivArea',
    'LivAreaWt',
    'AreasSum',
]

In [57]:
# Columns excluded from the design matrix in every fit (same list the
# baseline fit uses); hoisted so it is built once rather than per subset.
non_feature_cols = ['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin']

# Refit the model once per subset of drop_cands, including the empty subset,
# and score it on the validation split. Results are collected as records and
# turned into a DataFrame in one step instead of enlarging a frame row by row.
records = []
for subset_size in range(0, len(drop_cands) + 1):
    for subset in itertools.combinations(drop_cands, subset_size):
        drop_cols = non_feature_cols + list(subset)
        x_train = train_df.drop(drop_cols, axis=1).values
        x_validation = validation_df.drop(drop_cols, axis=1).values
        lr.fit(x_train, y_train)
        y_pred = lr.predict(x_validation)
        error = rmse(y_validation, y_pred)
        records.append({
            'Dropped': str(subset),
            'RMSE': error,
            # Negative values mean the subset beats the all-features baseline.
            'Diff from Base': error - baseline,
        })

col_drop_results_df = pd.DataFrame(records, columns=['Dropped', 'RMSE', 'Diff from Base'])
# Best (lowest validation RMSE) subsets first.
output_df = col_drop_results_df.sort_values(['RMSE'])



In [58]:
output_df

Unnamed: 0,Dropped,RMSE,Diff from Base
59,"('LogGrLivArea', 'GrLivArea', 'TotalHouseArea', 'LivAreaWt', 'AreasSum')",0.10357174216942341416,-0.00158633095309705652
58,"('LogGrLivArea', 'GrLivArea', 'TotalHouseArea', 'LivArea', 'AreasSum')",0.10391536481195215447,-0.00124270831056831621
28,"('LogGrLivArea', 'TotalHouseArea', 'AreasSum')",0.10392310840042916220,-0.00123496472209130848
18,"('TotalHouseArea', 'AreasSum')",0.10393780236413910356,-0.00122027075838136712
63,"('LogGrLivArea', 'GrLivArea', 'TotalHouseArea', 'LivArea', 'LivAreaWt', 'AreasSum')",0.10394131820957328016,-0.00121675491294719051
49,"('LogGrLivArea', 'TotalHouseArea', 'LivArea', 'AreasSum')",0.10397951960735733745,-0.00117855351516313323
39,"('TotalHouseArea', 'LivArea', 'AreasSum')",0.10399046011626059127,-0.00116761300625987940
40,"('TotalHouseArea', 'LivAreaWt', 'AreasSum')",0.10417000486085599420,-0.00098806826166447648
47,"('LogGrLivArea', 'GrLivArea', 'LivAreaWt', 'AreasSum')",0.10419650500599279830,-0.00096156811652767238
34,"('GrLivArea', 'TotalHouseArea', 'AreasSum')",0.10423569331575052721,-0.00092237980676994347
