In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

## Classification

In [3]:
# Read the paribas dataset; only the first 50,000 rows are loaded
data = pd.read_csv(
    '../datasets/paribas.csv',
    nrows=50000,
)
data.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,...,8.0,1.98978,0.035754,AU,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,,,C,,9.191265,,,2.30163,...,,,0.598896,AF,,,1.957825,0,,
2,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,C,4.22593,11.627438,2.0977,1.987549,0.171947,...,7.018256,1.812795,0.002267,CJ,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,8,1,,,C,,,,,,...,,,,Z,,,,0,,


In [4]:
# In practice, feature selection should be done after data preprocessing,
# so ideally all categorical variables are already encoded as numbers
# before assessing whether they are correlated with other features.

# Here, for simplicity, only the numerical variables are used:
# keep the columns whose dtype is one of the listed integer/float types.
numericals = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = data.select_dtypes(include=numericals).columns.tolist()
data = data[numerical_vars]
data.shape

(50000, 114)

In [5]:
# Split into training and testing sets (30% held out, fixed seed).
# 'ID' and 'target' are removed from the predictors; 'target' is the label.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns=['ID', 'target']),
    data['target'],
    test_size=0.3,
    random_state=0,
)
x_train.shape, x_test.shape

((35000, 112), (15000, 112))

In [6]:
# To reduce the computation time
# we will reduce the feature space by removing 
# correlated features

def correlation(dataset,threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``DataFrame.corr()``. A column is flagged when the absolute value of its
    correlation with at least one column positioned before it is strictly
    greater than ``threshold``. The first column of each correlated group is
    therefore never flagged by that group.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Absolute-correlation cut-off (exclusive).

    Returns
    -------
    set
        Names of the flagged columns.
    """
    abs_corr = dataset.corr().abs()
    flagged = set()

    for position, name in enumerate(abs_corr.columns):
        # Only the entries left of the diagonal are inspected, so each pair
        # is considered once and a column is never compared with itself.
        earlier = abs_corr.iloc[position, :position]
        if (earlier > threshold).any():
            flagged.add(name)

    return flagged

In [7]:
# Collect the features whose absolute correlation with an earlier feature exceeds 0.8
corr_features = correlation(dataset=x_train, threshold=0.8)
len(corr_features)

55

In [8]:
# Remove all the correlated features from both splits.
# The frames are reassigned rather than modified with inplace=True, so the
# objects returned by train_test_split are not mutated. errors='ignore' keeps
# the cell re-runnable: on a second execution the columns are already gone and
# a plain drop would raise KeyError.
x_train = x_train.drop(columns=list(corr_features), errors='ignore')
x_test = x_test.drop(columns=list(corr_features), errors='ignore')

x_train.shape, x_test.shape

((35000, 57), (15000, 57))

In [12]:
# Exhaustive feature selection is computationally very expensive, so because
# of hardware limits the search is restricted to the first 4 columns of
# x_train (subsets of 1 to 4 features, scored by ROC-AUC with 2-fold CV).
candidate_cols = x_train.columns[0:4]

efs1 = EFS(
    RandomForestClassifier(n_jobs=2, random_state=0),
    min_features=1,
    max_features=4,
    scoring='roc_auc',
    print_progress=True,
    cv=2,
)
# Missing values are filled with 0 before fitting
efs1.fit(np.array(x_train[candidate_cols].fillna(0)), y_train)

Features: 15/15

ExhaustiveFeatureSelector(clone_estimator=True, cv=2,
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
             max_features=4, min_features=1, n_jobs=1,
             pre_dispatch='2*n_jobs', print_progress=True,
             scoring='roc_auc')

In [13]:
# Indices of the best-scoring feature subset found by the search
# (presumably positions within the array passed to efs1.fit — confirm against mlxtend docs)
efs1.best_idx_

(0, 1, 2)

In [15]:
# Translate the selected indices into column names. The selector was fitted on
# the leading columns of x_train, so the indices line up with x_train.columns.
best_positions = list(efs1.best_idx_)
selected_feat = x_train.columns[best_positions]
selected_feat

Index(['v1', 'v2', 'v4'], dtype='object')

In [17]:
# Evaluate performance of classifier using selected features
def run_randomForests(x_train,x_test,y_train,y_test):
    """Fit a random forest on the training split and print its ROC-AUC on both splits.

    A ``RandomForestClassifier`` with 200 trees, depth limited to 4 and a
    fixed random state is fitted on ``x_train`` / ``y_train``. For the
    training split and then the test split, a heading is printed followed by
    the ROC-AUC computed from the second column of ``predict_proba``.

    Parameters
    ----------
    x_train, x_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Labels matching ``x_train`` and ``x_test``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    forest = RandomForestClassifier(n_estimators=200, max_depth=4, random_state=39)
    forest.fit(x_train, y_train)

    splits = (
        ('Train set', x_train, y_train),
        ('Test set', x_test, y_test),
    )
    for heading, features, labels in splits:
        print(heading)
        # Column 1 of predict_proba is used as the score for roc_auc_score
        scores = forest.predict_proba(features)[:, 1]
        print('RandomForest roc_auc :{}'.format(roc_auc_score(labels, scores)))

In [18]:
# Evaluate on the selected features only, filling missing values with 0
train_selected = x_train[selected_feat].fillna(0)
test_selected = x_test[selected_feat].fillna(0)
run_randomForests(train_selected, test_selected, y_train, y_test)

Train set
RandomForest roc_auc :0.5433561866210962
Test set
RandomForest roc_auc :0.5253970921093112


The ROC-AUC is low because the exhaustive search was restricted to a feature space of only 4 features. Given more powerful hardware, the search can be run over a larger set of candidate features to find a better-performing subset.

The same steps can be followed for REGRESSION, using a regressor (e.g. RandomForestRegressor) as the estimator and 'r2' as the scoring metric in place of 'roc_auc'.
