# Feature Selection

In [1]:
# import basic libraries
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed dataset from disk.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column holding
# 0, 1, 2, ... which suggests the CSV was written with its row index — confirm,
# and consider reading it with index_col=0.
dataset = pd.read_csv('processed_data.csv')

In [3]:
# Sanity check: print (rows, columns), then display the first rows of the frame.
print(dataset.shape)
dataset.head()

(1459, 84)


Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,12.247694,0.235294,0.75,0.418208,0.366344,1.0,0.5,1.0,1.0,...,0.5,0.25,0.0,0.090909,0.5,1.0,0.8,0.0,0.0,0.0
1,2,12.109011,0.0,0.75,0.495064,0.391317,1.0,0.5,1.0,1.0,...,0.5,0.25,0.0,0.363636,0.25,1.0,0.8,0.0,0.0,0.0
2,3,12.317167,0.235294,0.75,0.434909,0.422359,1.0,0.5,0.0,1.0,...,0.5,0.25,0.0,0.727273,0.5,1.0,0.8,0.0,0.0,0.0
3,4,11.849398,0.294118,0.75,0.388581,0.390295,1.0,0.5,0.0,1.0,...,0.5,0.25,0.0,0.090909,0.0,1.0,0.0,0.0,0.0,0.0
4,5,12.429216,0.235294,0.75,0.513123,0.468761,1.0,0.5,0.0,1.0,...,0.5,0.25,0.0,1.0,0.5,1.0,0.8,0.0,0.0,0.0


In [4]:
# Columns that must never be offered to the selectors as predictors:
#   - 'Unnamed: 0' is the leftover row-index column visible in dataset.head()
#     (it just counts 0, 1, 2, ...), so it carries no real signal,
#   - 'Id' is the row identifier,
#   - 'SalePrice' is the target itself.
non_feature_cols = ['Unnamed: 0', 'Id', 'SalePrice']
train_features = [feature for feature in dataset.columns if feature not in non_feature_cols]

# X: candidate predictors, y: regression target.
X = dataset[train_features]
y = dataset['SalePrice']

## SelectKBest

In [5]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Univariate filter: score every predictor against the target with
# f_regression and keep the 50 highest-scoring ones.
selectKbest = SelectKBest(k=50, score_func=f_regression)
selectKbest.fit(X, y)

# fit() hands back the selector itself, so `fit` is simply the fitted selector.
fit = selectKbest

In [6]:
# Pair each candidate column with its univariate score, then show the 21
# highest-scoring features.
feature_score = pd.DataFrame({'feature': X.columns, 'score': fit.scores_})
print(feature_score.nlargest(n=21, columns='score'))

         feature        score
16   OverallQual  2932.749675
45     GrLivArea  1664.757505
60    GarageCars  1257.624983
61    GarageArea  1071.162399
37   TotalBsmtSF   875.430604
42      1stFlrSF   860.097475
48      FullBath   798.054058
18     YearBuilt   768.132931
26     ExterQual   747.309552
29      BsmtQual   727.278126
19  YearRemodAdd   695.202393
53  TotRmsAbvGrd   582.535942
52   KitchenQual   582.467212
57    GarageType   493.870606
58   GarageYrBlt   478.712834
55    Fireplaces   459.391465
25    MasVnrArea   324.569169
39     HeatingQC   322.882114
59  GarageFinish   308.018309
28    Foundation   298.184675
3        LotArea   277.390577


## SelectFromModel

In [7]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Embedded selection with an L1-penalised linear model: the Lasso shrinks the
# coefficients of weak predictors to zero and SelectFromModel keeps the
# features whose coefficients survive.
# random_state is pinned so reruns give the same result.
# NOTE(review): the estimator repr shows selection='cyclic'; per the
# scikit-learn docs the seed only matters for selection='random' — confirm.
lasso_estimator = Lasso(alpha=0.005, random_state=0)
feature_sel_model = SelectFromModel(estimator=lasso_estimator)
feature_sel_model.fit(X, y)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=0,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [8]:
# Boolean mask aligned with X's columns: True marks a feature the selector kept.
feature_sel_model.get_support()

array([False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False,  True, False,  True, False,
        True,  True, False, False, False, False, False, False,  True,
       False, False,  True, False,  True,  True, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True, False,  True, False,  True,  True, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False])

In [9]:
# Turn the selector's boolean mask into the names of the surviving columns.
support_mask = feature_sel_model.get_support()
selected_feat = X.columns[support_mask]

# Summary counts: candidate features, features kept, and coefficients that
# the fitted Lasso set to exactly zero.
n_total = X.shape[1]
n_selected = len(selected_feat)
n_zeroed = (feature_sel_model.estimator_.coef_ == 0).sum()

print('total features: {}'.format(n_total))
print('selected features: {}'.format(n_selected))
print('features with coefficients shrank to zero: {}'.format(n_zeroed))

total features: 82
selected features: 21
features with coefficients shrank to zero: 61


In [10]:
# Display the names of the columns retained by the Lasso-based selector.
selected_feat

Index(['LotShape', 'BldgType', 'OverallQual', 'YearBuilt', 'YearRemodAdd',
       'ExterQual', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC',
       'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'KitchenQual',
       'Fireplaces', 'GarageType', 'GarageFinish', 'GarageCars', 'PavedDrive',
       'SaleCondition'],
      dtype='object')

In [11]:
# Assemble the modelling table: identifier and target first, followed by the
# selected predictors. Picking all columns in one indexing operation keeps the
# rows aligned by construction, whereas concatenating separately indexed
# frames (with only one of them reset) would misalign rows and introduce NaNs
# if `dataset` ever carried a non-default index.
final_columns = ['Id', 'SalePrice'] + list(selected_feat)
final_data = dataset[final_columns].reset_index(drop=True)

In [12]:
# Persist the reduced dataset; index=False keeps the row index out of the file.
final_data.to_csv('final_data.csv', index=False)