In [43]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

In [44]:
# load dataset
data = pd.read_csv('paribas.csv', nrows=50000)
data.shape

(50000, 133)

In [45]:
data.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,...,8.0,1.98978,0.035754,AU,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,,,C,,9.191265,,,2.30163,...,,,0.598896,AF,,,1.957825,0,,
2,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,C,4.22593,11.627438,2.0977,1.987549,0.171947,...,7.018256,1.812795,0.002267,CJ,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,8,1,,,C,,,,,,...,,,,Z,,,,0,,


In [46]:
# here for simplicity I will use only numerical variables
# select numerical columns:

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = list(data.select_dtypes(include=numerics).columns)
data = data[numerical_vars]
data.shape

(50000, 114)

# Important
In all feature selection procedures, it is good practice to select the features by examining only the training set. And this is to avoid overfit.

In [47]:
# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target', 'ID'], axis=1),
    data['target'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((35000, 112), (15000, 112))

In [48]:
y_train.shape,y_test.shape

((35000,), (15000,))

# Selecting Feature using SelectFromModel

In [49]:
# first I specify the Random Forest instance, indicating
# the number of trees

# Then I use the selectFromModel object from sklearn
# to automatically select the features

best_select = SelectFromModel(RandomForestClassifier(n_estimators=100))

In [50]:
best_select.fit(X_train.fillna(0),y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                                 class_weight=None,
                                                 criterion='gini',
                                                 max_depth=None,
                                                 max_features='auto',
                                                 max_leaf_nodes=None,
                                                 max_samples=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100, n_jobs=None,
                                                 oob_score=False,

In [51]:
# sklearn will select those features which importance values
# are greater than the mean of all the coefficients.

best_select.get_support()

array([False, False, False, False, False, False, False, False,  True,
       False,  True, False,  True, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False])

In [52]:
# let's make a list and count the selected features
selected_feat = X_train.columns[best_select.get_support()]
len(selected_feat)

10

In [53]:
selected_feat

Index(['v10', 'v12', 'v14', 'v21', 'v34', 'v40', 'v50', 'v62', 'v114', 'v129'], dtype='object')

In [54]:
np.mean(best_select.estimator_.feature_importances_)

0.008928571428571428

In [55]:
best_select.estimator_.feature_importances_

array([0.0053375 , 0.00528349, 0.00480911, 0.00541616, 0.00550023,
       0.0048144 , 0.0047725 , 0.00513324, 0.05451907, 0.00521831,
       0.05878131, 0.00486081, 0.05184513, 0.00478442, 0.005242  ,
       0.0041772 , 0.00521166, 0.00480856, 0.00498663, 0.05251564,
       0.00378589, 0.00509831, 0.00465419, 0.00481822, 0.00572589,
       0.00419278, 0.00446934, 0.00447745, 0.05176214, 0.00475455,
       0.00585006, 0.00514133, 0.00276503, 0.00523292, 0.05086946,
       0.00429786, 0.00464146, 0.00488145, 0.00481801, 0.00509163,
       0.0048728 , 0.00426787, 0.00449079, 0.10011517, 0.00491956,
       0.00482791, 0.00518716, 0.00478337, 0.0056511 , 0.00493258,
       0.00448655, 0.00462461, 0.00467624, 0.01214836, 0.00492069,
       0.00428395, 0.00440482, 0.00424455, 0.00550974, 0.00538823,
       0.00552372, 0.0086689 , 0.0044022 , 0.00424507, 0.00431128,
       0.00577249, 0.00525157, 0.00522554, 0.00615632, 0.00455972,
       0.00483544, 0.00514459, 0.00461357, 0.00527632, 0.00536

In [56]:
# now, let's compare the  amount of selected features
# with the amount of features which importance is above the
# mean importance, to make sure we understand the output of
# sklearn

print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients greater than the mean coefficient: {}'.format(
    np.sum(best_select.estimator_.feature_importances_ > best_select.estimator_.feature_importances_.mean())))

total features: 112
selected features: 10
features with coefficients greater than the mean coefficient: 10


In [57]:
X_train_srf = best_select.transform(X_train.fillna(0))
X_test_srf = best_select.transform(X_test.fillna(0))

In [58]:
X_train_srf.shape, X_test_srf.shape

((35000, 10), (15000, 10))

In [59]:
y_train.shape, y_test.shape

((35000,), (15000,))

In [69]:
#def run_randomforest(x_train, x_test, y_train, y_test):
clf = RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1)
clf.fit(X_train_srf, y_train)
y_pred = clf.predict(X_test_srf)
print('Accuracy:',accuracy_score(y_test, y_pred))

Accuracy: 0.7597333333333334


Accuracy: 0.7597333333333334


# Feature selection using RFE

In [79]:
best_rfe = RFE(RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1), n_features_to_select=10)

In [80]:
best_rfe.fit(X_train.fillna(0),y_train)

RFE(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                     class_weight=None, criterion='gini',
                                     max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=100, n_jobs=-1,
                                     oob_score=False, random_state=0, verbose=0,
                                     warm_start=False),
    n_features_to_select=10, step=1, verbose=0)

In [81]:
select_rfe_feat = X_train.columns[best_rfe.get_support()]

In [82]:
len(select_rfe_feat)

10

In [83]:
select_rfe_feat

Index(['v10', 'v11', 'v12', 'v14', 'v21', 'v34', 'v36', 'v40', 'v50', 'v114'], dtype='object')

In [84]:
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(select_rfe_feat)))

total features: 112
selected features: 10


In [85]:
X_train_rfe = best_rfe.transform(X_train.fillna(0))
X_test_rfe = best_rfe.transform(X_test.fillna(0))

In [86]:
X_train_rfe.shape, X_test_rfe.shape

((35000, 10), (15000, 10))

In [87]:
clf = RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1)
clf.fit(X_train_rfe, y_train)
y_pred = clf.predict(X_test_rfe)
print('Accuracy:',accuracy_score(y_test, y_pred))

Accuracy: 0.7625333333333333


# In my opinion the RFE from sklearn does not bring forward a massive advantage respect to the SelectFromModel method, and personally I tend to use the SelectFromModelmy features.