# Wrapper Method Feature Selection

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

In [3]:
#Loading House prices dataset.
df=pd.read_csv(r'C:\Users\divesh.kubal\PycharmProjects\OCR_Layer\house_price_data.csv')

In [4]:
df.shape

(1460, 81)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [6]:
#Feature selection should be done after data preprocessing.
#Ideally  all the categorical variables should be encoded into numbers, so that we can assess how deterministic they are for target.
#Currently I will be dealling with numerical columns only.
colType = ['int64','float64']
#Select the columns which are either int64 or float64.
numCols=list(df.select_dtypes(include=colType).columns)
#Assigning numerical columns from df to data variable. We can use the same variable as well.
data=df[numCols]

In [7]:
data.shape

(1460, 38)

In [8]:
#Lets split the data in training set and test set.
X_train,X_test,y_train,y_test=train_test_split(data.drop('SalePrice',axis=1),data['SalePrice'],test_size=.2,random_state=1)

In [9]:
def correlation(dataset,threshold):
    col_corr=set() # set will contains unique values.
    corr_matrix=dataset.corr() #finding the correlation between columns.
    for i in range(len(corr_matrix.columns)): #number of columns
        for j in range(i):
            if abs(corr_matrix.iloc[i,j])>threshold: #checking the correlation between columns.
                colName=corr_matrix.columns[i] #getting the column name
                col_corr.add(colName) #adding the correlated column name heigher than threshold value.
    return col_corr #returning set of column names
col=correlation(X_train,0.8)
print('Correlated columns:',col)    

Correlated columns: {'GarageArea', 'GarageYrBlt', '1stFlrSF', 'TotRmsAbvGrd'}


In [10]:
#remove correlated columns
X_train.drop(columns=col,axis=1,inplace=True)
X_test.drop(columns=col,axis=1,inplace=True)
#lets check the shape of training set and test set.
X_train.shape,X_test.shape

((1168, 33), (292, 33))

In [12]:
#Filling null values with 0.
X_train.fillna(0,inplace=True)

In [13]:
#Checking if there is null values.
X_train.isnull().sum().max()

0

# Forward feature selection

In [15]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
#I am going to use RandomForestRegressor algoritham as an estimator. Your can select other regression alogritham as well.
from sklearn.ensemble import RandomForestRegressor
#k_features=10 (It will get top 10 features best suited for prediction)
#forward=True (Forward feature selection model)
#verbose=2 (It will show details output as shown below.)
#cv=5 (Kfold cross valiation: it will split the training set in 5 set and 4 will be using for training the model and 1 will using as validation)
#n_jobs=-1 (Number of cores it will use for execution.-1 means it will use all the cores of CPU for execution.)
#scoring='r2'(R-squared is a statistical measure of how close the data are to the fitted regression line)
model=sfs(RandomForestRegressor(),k_features=10,forward=True,verbose=2,cv=5,n_jobs=-1,scoring='r2')
model.fit(X_train,y_train)

  from numpy.core.umath_tests import inner1d
[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:    3.8s finished

[2019-03-15 16:28:53] Features: 1/10 -- score: 0.6525384968803336[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:    3.9s finished

[2019-03-15 16:28:57] Features: 2/10 -- score: 0.7138199435294974[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:    3.9s finished

[2019-03-15 16:29:02] Features: 3/10 -- score: 0.7232702143057248[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    3.9s finished

[2019-03-15 16:29:06] Features: 4/10 -- score: 0.744480578588591[Parallel(n_jobs=-1)]: Done  29 out of  29 | elapsed:    4.1s finished

[2019-03-15 16:29:10] Features: 5/10 -- score: 0.7901799222600141[Parallel(n_jobs=-1)]: Done  28 out of  28 | elapsed:    4.3s finished

[2019-03-15 16:29:15] Features: 6/10 -- score: 0.8167183166186268[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    4.2s finished

[2019-03-15 16:29:19] Features: 7/10 -- score: 0.8310190208470

SequentialFeatureSelector(clone_estimator=True, cv=5,
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
             floating=False, forward=True, k_features=10, n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='r2', verbose=2)

In [16]:
#Get the selected feature index.
model.k_feature_idx_

(1, 4, 5, 7, 9, 10, 15, 16, 22, 23)

In [17]:
#Get the column name for the selected feature.
model.k_feature_names_

('MSSubClass',
 'OverallQual',
 'OverallCond',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'GrLivArea',
 'BsmtFullBath',
 'Fireplaces',
 'GarageCars')

# Backward Feature Selection

In [18]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.ensemble import RandomForestRegressor
#k_features=10 (It will get top 10 features best suited for prediction)
#forward=False (Backward feature selection model)
#verbose=2 (It will show details output as shown below.)
#cv=5 (Kfold cross valiation: it will split the training set in 5 set and 4 will be using for training the model and 1 will using as validation)
#n_jobs=-1 (Number of cores it will use for execution.-1 means it will use all the cores of CPU for execution.)
#scoring='r2'(R-squared is a statistical measure of how close the data are to the fitted regression line)
backwardModel=sfs(RandomForestRegressor(),k_features=10,forward=False,verbose=2,cv=5,n_jobs=-1,scoring='r2')
#We will convert our training data into numpy array. If we will not convert it, model is not able to read some of the column names. 
backwardModel.fit(np.array(X_train),y_train)

[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:    6.8s finished

[2019-03-15 16:37:44] Features: 32/10 -- score: 0.8414892484492942[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:    6.6s finished

[2019-03-15 16:37:51] Features: 31/10 -- score: 0.8322308983930305[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:    6.2s finished

[2019-03-15 16:37:58] Features: 30/10 -- score: 0.8378245149167822[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.2s finished

[2019-03-15 16:38:04] Features: 29/10 -- score: 0.8338692358294433[Parallel(n_jobs=-1)]: Done  29 out of  29 | elapsed:    6.5s finished

[2019-03-15 16:38:11] Features: 28/10 -- score: 0.8328731570959986[Parallel(n_jobs=-1)]: Done  28 out of  28 | elapsed:    6.6s finished

[2019-03-15 16:38:18] Features: 27/10 -- score: 0.8322616199110943[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    5.8s finished

[2019-03-15 16:38:24] Features: 26/10 -- score: 0.8333262784943309[Parallel(n_jobs=-1)]: Done  26 ou

SequentialFeatureSelector(clone_estimator=True, cv=5,
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
             floating=False, forward=False, k_features=10, n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='r2', verbose=2)

In [19]:
#Get the selected feature index.
backwardModel.k_feature_idx_

(0, 1, 4, 5, 7, 9, 12, 15, 23, 24)

In [20]:
#Get the column name for the selected feature.
X_train.columns[list(backwardModel.k_feature_idx_)]

Index(['Id', 'MSSubClass', 'OverallQual', 'OverallCond', 'YearRemodAdd',
       'BsmtFinSF1', 'TotalBsmtSF', 'GrLivArea', 'GarageCars', 'WoodDeckSF'],
      dtype='object')