### Wrapper Methods for Feature Selection
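Wrapper methods search over subsets of the features: each candidate combination is evaluated by training a specific machine learning algorithm on it, and the combination that produces the best result for that algorithm is selected. Step forward and step backward selection perform this search greedily, adding or removing one feature at a time, whereas exhaustive selection evaluates all possible combinations of the features. A downside to this approach is that testing many (or all possible) combinations of the features can be computationally very expensive, particularly if the feature set is very large.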

In [1]:
# Data Pre-processing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read only the first 10000 rows of the CSV, then show (rows, columns).
paribas_data = pd.read_csv(
    filepath_or_buffer="paribas_data.csv",
    nrows=10000,
)
paribas_data.shape

(10000, 133)

In [2]:
# Keep only the columns whose dtype is one of the listed integer/float types;
# every other column is removed from the dataset.
num_colums = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = paribas_data.select_dtypes(include=num_colums).columns.tolist()
paribas_data = paribas_data.loc[:, numerical_columns]
paribas_data.shape

(10000, 114)

In [3]:
# Hold out 20% of the rows for testing. The features are every column except
# 'target' and 'ID'; the labels are the 'target' column. The seed is fixed so
# the split is the same on every run.
train_features, test_features, train_labels, test_labels = train_test_split(
    paribas_data.drop(columns=['target', 'ID']),
    paribas_data['target'],
    test_size=0.2,
    random_state=41,
)

In [4]:
# Creating a set of all the columns whose absolute correlation with some
# earlier column is greater than 0.8.
#
# The correlations are measured on the training features only. Measuring them
# on the full dataset would (a) include the 'ID' and 'target' columns, so a
# feature strongly correlated with the target would be discarded and
# 'target'/'ID' themselves could end up in the set, and (b) let the held-out
# test rows influence which features are kept.
correlation_matrix = train_features.corr()

correlated_features = set()
for i, colname in enumerate(correlation_matrix.columns):
    # Compare column i only with the columns before it, so the first member of
    # each correlated group is kept and the later ones are flagged.
    # NaN correlations compare as False and are therefore never flagged.
    if (correlation_matrix.iloc[i, :i].abs() > 0.8).any():
        correlated_features.add(colname)

In [5]:
# Removing the columns as per above set.
# Only labels that are actually present in the feature frames are dropped, and
# the frames are rebound instead of mutated in place. This keeps the cell safe
# to re-run (a second run drops nothing instead of raising KeyError) and
# tolerates labels in the set that are not feature columns.
columns_to_drop = [col for col in train_features.columns if col in correlated_features]
train_features = train_features.drop(columns=columns_to_drop)
test_features = test_features.drop(columns=columns_to_drop)

train_features.shape, test_features.shape

((8000, 57), (2000, 57))

In [6]:
# Step Forward Feature Selection, also known as Sequential Feature Selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector

# RandomForestClassifier is passed as the estimator to the SequentialFeatureSelector.
# k_features specifies the number of features to select. You can set any number of features here.
# The forward parameter, if set to True, performs step forward feature selection.
# The verbose parameter is used for logging the progress of the feature selector,
# the scoring parameter defines the performance evaluation criteria,
# and cv refers to the number of cross-validation folds.
# random_state is fixed on the estimator so the forests built during the search
# (and therefore the features selected) are repeatable between runs.
feature_selector = SequentialFeatureSelector(
    RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=41),
    k_features=15, forward=True, verbose=2, scoring='roc_auc', cv=4)

In [7]:
# Run the forward search. Missing values are replaced with 0 and the frame is
# handed to the selector as a plain NumPy array.
features = feature_selector.fit(
    X=np.array(train_features.fillna(0)),
    y=train_labels,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  57 out of  57 | elapsed:  1.0min finished

[2020-03-22 18:45:52] Features: 1/15 -- score: 0.6110139717340152[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  56 out of  56 | elapsed:   50.3s finished

[2020-03-22 18:46:42] Features: 2/15 -- score: 0.6404757207332129[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed:   51.7s finished

[2020-03-22 18:47:34] Features: 3/15 -- score: 0.6608260238276449[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

In [8]:
# The selector reports positional indices; translate them back into the
# corresponding column names of the training frame.
selected_positions = list(features.k_feature_idx_)
filtered_features = train_features.columns[selected_positions]
filtered_features

Index(['v1', 'v4', 'v7', 'v8', 'v10', 'v14', 'v15', 'v16', 'v18', 'v23', 'v38',
       'v50', 'v51', 'v69', 'v94'],
      dtype='object')

In [9]:
# Performance of the random forest algorithm using these 15 features.
# The metric is ROC AUC (roc_auc_score on column 1 of predict_proba), so the
# printed labels say "ROC AUC" rather than "Accuracy".
train_inputs = train_features[filtered_features].fillna(0)
test_inputs = test_features[filtered_features].fillna(0)

clf = RandomForestClassifier(n_estimators=100, random_state=41, max_depth=3)
clf.fit(train_inputs, train_labels)

train_pred = clf.predict_proba(train_inputs)
print('ROC AUC on training set: {}'.format(roc_auc_score(train_labels, train_pred[:, 1])))

test_pred = clf.predict_proba(test_inputs)
print('ROC AUC on test set: {}'.format(roc_auc_score(test_labels, test_pred[:, 1])))

Accuracy on training set: 0.7136601229972543
Accuracy on test set: 0.6856697224344284


In [None]:
# Step Backwards Feature Selection
# forward=False starts from the full feature set and removes features until
# k_features remain. random_state is fixed on the estimator so the forests
# built during the search (and therefore the selection) are repeatable.
feature_selector = SequentialFeatureSelector(
    RandomForestClassifier(n_jobs=-1, n_estimators=10, random_state=41),
    k_features=15, forward=False, verbose=2, scoring='roc_auc', cv=4)

# Missing values are replaced with 0 before fitting.
features = feature_selector.fit(np.array(train_features.fillna(0)), train_labels)

In [None]:
# Convert the positional indices kept by the backward search into the
# matching column names of the training frame.
selected_positions = list(features.k_feature_idx_)
filtered_features = train_features.columns[selected_positions]
filtered_features

In [None]:
# Performance of the random forest algorithm using the currently selected features.
# The metric is ROC AUC (roc_auc_score on column 1 of predict_proba), so the
# printed labels say "ROC AUC" rather than "Accuracy".
train_inputs = train_features[filtered_features].fillna(0)
test_inputs = test_features[filtered_features].fillna(0)

clf = RandomForestClassifier(n_estimators=100, random_state=41, max_depth=3)
clf.fit(train_inputs, train_labels)

train_pred = clf.predict_proba(train_inputs)
print('ROC AUC on training set: {}'.format(roc_auc_score(train_labels, train_pred[:, 1])))

test_pred = clf.predict_proba(test_inputs)
print('ROC AUC on test set: {}'.format(roc_auc_score(test_labels, test_pred[:, 1])))

In [None]:
# Exhaustive Feature Selection 
from mlxtend.feature_selection import ExhaustiveFeatureSelector

# The min_features and max_features attributes specify the minimum
# and the maximum number of features in the combination.
# NOTE: every subset of 2 to 4 features is scored. If 57 features remain (as in
# the output recorded earlier in this notebook) that is 425,866 subsets, each
# cross-validated with cv=2, so expect a very long run.
# random_state is fixed on the estimator so the forests built during the search
# (and therefore the subset scores) are repeatable between runs.
feature_selector = ExhaustiveFeatureSelector(
    RandomForestClassifier(n_jobs=-1, n_estimators=10, random_state=41),
    min_features=2, max_features=4, scoring='roc_auc', print_progress=True, cv=2)

In [None]:
# Run the exhaustive search. Missing values are replaced with 0 and the frame
# is handed to the selector as a plain NumPy array.
features = feature_selector.fit(
    X=np.array(train_features.fillna(0)),
    y=train_labels,
)

In [None]:
# ExhaustiveFeatureSelector reports the winning subset as best_idx_;
# k_feature_idx_ is the SequentialFeatureSelector attribute and does not exist
# on this selector.
filtered_features = train_features.columns[list(features.best_idx_)]
filtered_features

In [None]:
# Performance of the random forest algorithm using the currently selected features.
# The metric is ROC AUC (roc_auc_score on column 1 of predict_proba), so the
# printed labels say "ROC AUC" rather than "Accuracy".
train_inputs = train_features[filtered_features].fillna(0)
test_inputs = test_features[filtered_features].fillna(0)

clf = RandomForestClassifier(n_estimators=100, random_state=41, max_depth=3)
clf.fit(train_inputs, train_labels)

train_pred = clf.predict_proba(train_inputs)
print('ROC AUC on training set: {}'.format(roc_auc_score(train_labels, train_pred[:, 1])))

test_pred = clf.predict_proba(test_inputs)
print('ROC AUC on test set: {}'.format(roc_auc_score(test_labels, test_pred[:, 1])))