In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

import warnings
warnings.filterwarnings('ignore')

## 1. Classification Feature Selection

In [5]:
# load the Paribas dataset from Kaggle
data=pd.read_csv('../datasets/paribas.csv')
data.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,...,8.0,1.98978,0.035754,AU,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,,,C,,9.191265,,,2.30163,...,,,0.598896,AF,,,1.957825,0,,
2,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,C,4.22593,11.627438,2.0977,1.987549,0.171947,...,7.018256,1.812795,0.002267,CJ,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,8,1,,,C,,,,,,...,,,,Z,,,,0,,


In [6]:
# Inpractise, feature selection should be done after data preprocessing
# So Ideally, all the categorical variables are enocded into numbers,
# and then you can assess whether they are correlated with other features

# here for simplicity I will use only numerical variables
# Select numerical columns
numericals=['int16','int32','int64','float16','float32','float64']
numerical_vars=list(data.select_dtypes(include=numericals).columns)
data=data[numerical_vars]
data.shape

(114321, 114)

In [7]:
# Seperate the train and the test dataset to avoid overfitting in data
x_train,x_test,y_train,y_test=train_test_split(data.drop(labels=['target'],axis=1),
                                              data['target'],test_size=0.3,random_state=0)
x_train.shape,x_test.shape

((80024, 113), (34297, 113))

In [8]:
# find and remove some features so as to
# reduce some feature space.
# We'll be removing certain correlated features.

def correlation(dataset,threshold):
    col_corr=set()
    corr_matrix=dataset.corr()
    
    for i in range(len(corr_matrix.columns)):
        for j in range(0,i):
            if abs(corr_matrix.iloc[i,j])>threshold:
                col_name=corr_matrix.columns[i]
                col_corr.add(col_name)
                
    return col_corr

In [10]:
corr_features=correlation(x_train,0.8)
len(corr_features) # 55 features are correlated

55

In [11]:
# Removing these features
x_train.drop(labels=corr_features,axis=1,inplace=True)
x_test.drop(labels=corr_features,axis=1,inplace=True)
x_train.shape

(80024, 58)

In [12]:
# We can now apply SFS for selecting the features
# we'll be selecting 10 features from the total,
# based on roc_auc values.

sfs1=SFS(RandomForestClassifier(n_jobs=4),      # Classifier that'll be used
        k_features=10,     #indicate 10 features to be selected
        forward=True,      # as we are preforming Step forward FS
        verbose=2,         # produces the output as the code is running
        scoring='roc_auc',
        cv=3)

sfs1.fit(x_train.fillna(0),y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  58 out of  58 | elapsed:  2.1min finished

[2020-04-10 20:36:40] Features: 1/10 -- score: 0.6273563865754349[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  57 out of  57 | elapsed:  1.3min finished

[2020-04-10 20:37:59] Features: 2/10 -- score: 0.6509413170380162[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  56 out of  56 | elapsed:  1.6min finished

[2020-04-10 20:39:33] Features: 3/10 -- score: 0.6822388687027305[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

SequentialFeatureSelector(clone_estimator=True, cv=3,
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
             floating=False, forward=True, k_features=10, n_jobs=1,
             pre_dispatch='2*n_jobs', scoring='roc_auc', verbose=2)

Conclusion :-<br>
We could have stopped at 3 features itself, as the accuracy is not increasing much after that.

In [15]:
# 'k_feature_idx_' can be used to return the index of the features which can later be 
# used for selecting the features.

sel_features=x_train.columns[list(sfs1.k_feature_idx_)]
sel_features

Index(['v6', 'v10', 'v23', 'v34', 'v38', 'v39', 'v50', 'v57', 'v72', 'v129'], dtype='object')

In [19]:
def run_randomforest(x_train,y_train,x_test,y_test):
    rf=RandomForestClassifier(n_jobs=4,max_depth=4,random_state=39)
    rf.fit(x_train,y_train)
    print('Train set')
    y_scored=rf.predict_proba(x_train)
    print('RandomForest roc_auc value :{}'.format(roc_auc_score(y_train,y_scored[:,1])))
    print('Test set')
    y_scored=rf.predict_proba(x_test)
    print('RandomForest roc_auc value :{}'.format(roc_auc_score(y_test,y_scored[:,1])))

In [20]:
run_randomforest(x_train[sel_features].fillna(0),
                y_train,x_test[sel_features].fillna(0),y_test)

Train set
RandomForest roc_auc value :0.7059609825996227
Test set
RandomForest roc_auc value :0.6970169506277791


## 2. Regression Feature Selection

In [21]:
# Load houseprice prediction dataset from Kaggle
data=pd.read_csv('../datasets/houseprice.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [24]:
# Inpractise, feature selection should be done after data preprocessing
# So Ideally, all the categorical variables are enocded into numbers,
# and then you can assess whether they are correlated with other features

# here for simplicity I will use only numerical variables
# Select numerical columns
numericals=['int16','int32','int64','float16','float32','float64']
numerical_vars=list(data.select_dtypes(include=numericals).columns)
data=data[numerical_vars]
data.shape

(1460, 38)

In [25]:
# Split the dataset into train and test
x_train,x_test,y_train,y_test=train_test_split(data.drop(labels=['Id','SalePrice'],axis=1),
                                              data['SalePrice'],test_size=0.3,random_state=0)
x_train.shape,x_test.shape

((1022, 36), (438, 36))

In [26]:
# Removing correlated features and reducing
# the feature space
corr_features=correlation(data,0.8)
len(corr_features)

4

In [27]:
# removing the correlated features
x_train.drop(labels=corr_features,axis=1,inplace=True)
x_test.drop(labels=corr_features,axis=1,inplace=True)

x_train.shape,x_test.shape

((1022, 32), (438, 32))

In [29]:
# Performing SFS on the dataset with 32 features
sfs1=SFS(RandomForestRegressor(n_jobs=4),
        k_features=10,
        scoring='r2',
        verbose=2,
        forward=True,
        cv=3)

In [32]:
sfs1.fit(np.array(x_train.fillna(0)),y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:   12.2s finished

[2020-04-10 21:31:23] Features: 1/10 -- score: 0.6679893014021792[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  31 out of  31 | elapsed:   11.9s finished

[2020-04-10 21:31:34] Features: 2/10 -- score: 0.7209511069475774[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   12.0s finished

[2020-04-10 21:31:46] Features: 3/10 -- score: 0.7432501148042517[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

SequentialFeatureSelector(clone_estimator=True, cv=3,
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=4,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
             floating=False, forward=True, k_features=10, n_jobs=1,
             pre_dispatch='2*n_jobs', scoring='r2', verbose=2)

In [33]:
sfs1.k_feature_idx_

(0, 3, 4, 8, 12, 13, 14, 15, 17, 22)

In [34]:
x_train.columns[list(sfs1.k_feature_idx_)]

Index(['MSSubClass', 'OverallQual', 'OverallCond', 'BsmtFinSF1', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'GarageCars'],
      dtype='object')