In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV, LinearRegression
from sklearn.feature_selection import SelectFromModel
# Show every column when displaying wide DataFrames (no column truncation).
# Uses the public pd.set_option entry point rather than reaching it through
# the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.metrics import mean_squared_error

In [2]:
# Read the "dum" train and test feature files into DataFrames.
# (The X_train.csv / X_test.csv variants are not loaded in this notebook.)
X_train_dum, X_test_dum = (
    pd.read_csv(csv_name) for csv_name in ('X_train_dum.csv', 'X_test_dum.csv')
)

In [3]:
# Extract the target variable (SalePrice) from train.csv.
train = pd.read_csv('train.csv')
y_train = train['SalePrice']

# Quick look at the target: first rows, shape, and container type.
for target_summary in (y_train.head(), y_train.shape, type(y_train)):
    print(target_summary)

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64
(1460,)
<class 'pandas.core.series.Series'>


##### Apply Feature Selection

First, I will fit a LassoCV regression model, which uses cross-validation to find the best alpha value.

Then I will fit a Lasso model with that alpha value.

The bigger the alpha, the fewer features will be selected.

Then I will use the `SelectFromModel` object from sklearn, which selects the features that have non-zero coefficients.


#### For Dummy datasets

In [4]:
# Candidate regularisation strengths: 1000 log-spaced values between
# 10**2 and 10**2.5.
alphas = np.logspace(2, 2.5, 1000)

# 10-fold cross-validated Lasso over the alpha grid. fit() returns the
# estimator itself, so construction and fitting are chained.
lassocv = LassoCV(
    alphas=alphas,
    cv=10,
    max_iter=1000,
    random_state=0,
    precompute=True,
    n_jobs=-1,
    selection='random',
).fit(X_train_dum, y_train)

print('Best Alpha Value from LassoCV:', lassocv.alpha_)

Best Alpha Value from LassoCV: 126.35766783624274


In [5]:
# Build a Lasso at the alpha chosen by LassoCV (seed fixed at 0) and wrap it
# in SelectFromModel, then fit the selector on the training data.
# NOTE(review): this Lasso keeps its default `selection` setting, so
# random_state may have no effect here — confirm against the sklearn docs.
lasso_at_best_alpha = Lasso(alpha=lassocv.alpha_, random_state=0)
feature_sel_model = SelectFromModel(lasso_at_best_alpha)
feature_sel_model.fit(X_train_dum, y_train)

SelectFromModel(estimator=Lasso(alpha=126.35766783624274, random_state=0))

In [6]:
# Boolean mask over the input columns: True where the selector kept the feature.
support_mask = feature_sel_model.get_support()
print(support_mask)

[ True False  True  True  True  True  True  True False False False False
  True  True False False  True False  True  True False False  True  True
  True  True False  True False False False  True False False  True  True
 False False False False False False False False False False False False
 False  True False  True False False False  True False False  True False
  True False  True  True False False  True False False  True False False
  True  True  True  True False  True  True  True  True  True  True False
 False  True False  True  True False False  True False False False False
  True False  True False  True  True  True False False False  True False
  True  True  True  True False False False False False  True False False
 False False False False  True False  True  True False False  True False
 False False False False  True False  True False False False  True  True
 False False  True  True False  True False  True False  True  True  True
 False  True  True  True  True False False  True Fa

In [7]:
# Column labels kept by the selector: apply its boolean mask to the
# training frame's column index.
keep_mask = feature_sel_model.get_support()
selected_feat = X_train_dum.columns[keep_mask]

# Report how many features went in and how many were retained.
n_total = X_train_dum.shape[1]
n_selected = len(selected_feat)
print('total features: {}'.format(n_total))
print('No. of features selected by Lasso: {}'.format(n_selected))

total features: 226
No. of features selected by Lasso: 95


In [8]:
# Show the labels of the columns kept by the feature selector.
print(selected_feat)

Index(['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', '1stFlrSF', '2ndFlrSF', 'BsmtFullBath',
       'FullBath', 'HalfBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MoSold', 'YrSold',
       'MSZoning_RL', 'MSZoning_Rare_val', 'LotShape_IR2', 'LandContour_HLS',
       'LandContour_Lvl', 'LotConfig_CulDSac', 'LotConfig_FR2',
       'LandSlope_Mod', 'Neighborhood_BrkSide', 'Neighborhood_Crawfor',
       'Neighborhood_Edwards', 'Neighborhood_Gilbert', 'Neighborhood_IDOTRR',
       'Neighborhood_Mitchel', 'Neighborhood_NAmes', 'Neighborhood_NWAmes',
       'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_OldTown',
       'Neighborhood_Sawyer', 'Neighborhood_Somerst', 'Neighborhood_StoneBr',
       'Condition1_Norm', 'Condition2_Rare_val', 'BldgType_Duplex',
       'BldgType_TwnhsE', 'HouseStyle_1Story', 'HouseStyle_2Story',
       'RoofStyle_Hip', 'RoofMatl_Rare_va

In [9]:
# Keep only the selected columns in the training features and preview them.
# NOTE: this rebinds X_train_dum to the narrower frame, so the cell that
# builds `selected_feat` from X_train_dum.columns should not be re-run
# afterwards without reloading the data first.
X_train_dum = X_train_dum.loc[:, selected_feat]
display(X_train_dum.head())

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,1stFlrSF,2ndFlrSF,BsmtFullBath,FullBath,HalfBath,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,WoodDeckSF,ScreenPorch,MoSold,YrSold,MSZoning_RL,MSZoning_Rare_val,LotShape_IR2,LandContour_HLS,LandContour_Lvl,LotConfig_CulDSac,LotConfig_FR2,LandSlope_Mod,Neighborhood_BrkSide,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_Sawyer,Neighborhood_Somerst,Neighborhood_StoneBr,Condition1_Norm,Condition2_Rare_val,BldgType_Duplex,BldgType_TwnhsE,HouseStyle_1Story,HouseStyle_2Story,RoofStyle_Hip,RoofMatl_Rare_val,Exterior1st_BrkFace,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_Wd Sdng,Exterior2nd_Plywood,Exterior2nd_Stucco,Exterior2nd_VinylSd,MasVnrType_BrkFace,ExterQual_Gd,ExterQual_TA,Foundation_CBlock,Foundation_PConc,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_TA,BsmtCond_Missing,BsmtCond_TA,BsmtExposure_Gd,BsmtExposure_Missing,BsmtExposure_No,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_Unf,BsmtFinType2_Unf,HeatingQC_Gd,HeatingQC_TA,CentralAir_Y,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,Functional_Typ,GarageType_Missing,GarageType_Rare_val,GarageFinish_Missing,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Missing,GarageQual_Rare_val,GarageCond_Missing,PavedDrive_Y,Fence_MnPrv,SaleType_New,SaleType_WD,SaleCondition_Normal
0,0.235294,0.366344,0.666667,0.5,0.036765,0.098361,0.1225,0.356155,0.413559,0.333333,0.666667,0.5,0.5,0.0,0.046729,0.5,0.0,0.0,0.090909,0.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1,0.0,0.391317,0.555556,0.875,0.227941,0.52459,0.0,0.503056,0.0,0.0,0.666667,0.0,0.333333,0.333333,0.28972,0.5,0.347725,0.0,0.363636,0.25,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
2,0.235294,0.422359,0.666667,0.5,0.051471,0.114754,0.10125,0.383441,0.41937,0.333333,0.666667,0.5,0.333333,0.333333,0.065421,0.5,0.0,0.0,0.727273,0.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
3,0.294118,0.390295,0.666667,0.5,0.669118,0.606557,0.0,0.399941,0.366102,0.333333,0.333333,0.0,0.416667,0.333333,0.074766,0.75,0.0,0.0,0.090909,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.235294,0.468761,0.777778,0.5,0.058824,0.147541,0.21875,0.466237,0.509927,0.333333,0.666667,0.5,0.583333,0.333333,0.074766,0.75,0.224037,0.0,1.0,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0


In [10]:
# Apply the same column selection to the test features and preview them.
# A KeyError here means the test frame lacks one of the selected columns.
X_test_dum = X_test_dum.loc[:, selected_feat]
display(X_test_dum.head())

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,1stFlrSF,2ndFlrSF,BsmtFullBath,FullBath,HalfBath,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,WoodDeckSF,ScreenPorch,MoSold,YrSold,MSZoning_RL,MSZoning_Rare_val,LotShape_IR2,LandContour_HLS,LandContour_Lvl,LotConfig_CulDSac,LotConfig_FR2,LandSlope_Mod,Neighborhood_BrkSide,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_Sawyer,Neighborhood_Somerst,Neighborhood_StoneBr,Condition1_Norm,Condition2_Rare_val,BldgType_Duplex,BldgType_TwnhsE,HouseStyle_1Story,HouseStyle_2Story,RoofStyle_Hip,RoofMatl_Rare_val,Exterior1st_BrkFace,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_Wd Sdng,Exterior2nd_Plywood,Exterior2nd_Stucco,Exterior2nd_VinylSd,MasVnrType_BrkFace,ExterQual_Gd,ExterQual_TA,Foundation_CBlock,Foundation_PConc,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_TA,BsmtCond_Missing,BsmtCond_TA,BsmtExposure_Gd,BsmtExposure_Missing,BsmtExposure_No,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_Unf,BsmtFinType2_Unf,HeatingQC_Gd,HeatingQC_TA,CentralAir_Y,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,Functional_Typ,GarageType_Missing,GarageType_Rare_val,GarageFinish_Missing,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Missing,GarageQual_Rare_val,GarageCond_Missing,PavedDrive_Y,Fence_MnPrv,SaleType_New,SaleType_WD,SaleCondition_Normal
0,0.0,0.428726,0.444444,0.625,0.360294,0.819672,0.0,0.373438,0.0,0.0,0.333333,0.0,0.25,0.0,0.457944,0.25,0.163361,0.25,0.454545,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
1,0.0,0.468857,0.555556,0.625,0.382353,0.868852,0.0675,0.522632,0.0,0.0,0.333333,0.5,0.333333,0.0,0.485981,0.25,0.458576,0.0,0.454545,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
2,0.235294,0.462769,0.444444,0.5,0.095588,0.213115,0.0,0.386718,0.339467,0.0,0.666667,0.5,0.333333,0.333333,0.121495,0.5,0.247375,0.0,0.181818,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
3,0.235294,0.398875,0.555556,0.625,0.088235,0.213115,0.0125,0.385901,0.328329,0.0,0.666667,0.5,0.416667,0.333333,0.11215,0.5,0.42007,0.0,0.454545,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
4,0.588235,0.263841,0.777778,0.5,0.132353,0.311475,0.0,0.508416,0.0,0.0,0.666667,0.0,0.25,0.0,0.168224,0.5,0.0,0.3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0


In [11]:
# Write the reduced train and test feature frames to CSV, without the index.
for frame, csv_name in (
    (X_train_dum, 'X_train_dum_lasso.csv'),
    (X_test_dum, 'X_test_dum_lasso.csv'),
):
    frame.to_csv(csv_name, index = False)