# Step Forward Selection Classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load only the first 50,000 rows of the training file and show the frame's shape.
data = pd.read_csv('train.csv', nrows=50000)
data.shape

(50000, 133)

In [3]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,...,8.0,1.98978,0.035754,AU,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,,,C,,9.191265,,,2.30163,...,,,0.598896,AF,,,1.957825,0,,
2,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,C,4.22593,11.627438,2.0977,1.987549,0.171947,...,7.018256,1.812795,0.002267,CJ,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,8,1,,,C,,,,,,...,,,,Z,,,,0,,


In [4]:
# Keep only the numeric columns: the correlation filter and the random forest
# used below are applied to numeric features.
# The 'number' selector matches every numeric dtype (all integer and float
# widths), so a column is not silently dropped just because its dtype is
# missing from a hand-written list such as ['int16', ..., 'float64'].
numerics = ['number']
numerical_vars = data.select_dtypes(include=numerics).columns
data = data[numerical_vars]
data.shape

(50000, 114)

In [5]:
# Preview the frame again after restricting it to numeric columns.
data.head()

Unnamed: 0,ID,target,v1,v2,v4,v5,v6,v7,v8,v9,...,v121,v122,v123,v124,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,3.921026,7.915266,2.599278,3.176895,0.012941,9.999999,...,0.803572,8.0,1.98978,0.035754,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,,,,9.191265,,,2.30163,,...,,,,0.598896,,,1.957825,0,,
2,5,1,0.943877,5.310079,4.410969,5.326159,3.979592,3.928571,0.019645,12.666667,...,2.238806,9.333333,2.477596,0.013452,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,4.22593,11.627438,2.0977,1.987549,0.171947,8.965516,...,1.956521,7.018256,1.812795,0.002267,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,8,1,,,,,,,,,...,,,,,,,,0,,


In [6]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
# 'target' is the label and 'ID' is presumably a row identifier, so neither is
# kept among the features.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target', 'ID']),
    data['target'],
    test_size=0.3,
    random_state=0,
)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(35000, 112) (35000,)
(15000, 112) (15000,)


Find and remove correlated features to reduce the feature space, so that the feature selection algorithm takes less
time to run.

In [9]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The correlation matrix is taken from ``dataset.corr()``. A column is reported
    when the absolute value of its correlation with at least one column that
    precedes it in that matrix is strictly greater than ``threshold``. The first
    column of each correlated group is therefore never reported, only the later ones.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Absolute correlation above which a column is flagged.

    Returns
    -------
    set
        Names of the flagged columns. NaN correlations never exceed the
        threshold, so they do not flag a column.
    """
    abs_corr = dataset.corr().abs()
    # Strict lower triangle only: row i is compared with columns j < i, which
    # skips the diagonal and avoids looking at each pair twice.
    above_threshold = np.tril(abs_corr.values > threshold, k=-1)
    return set(abs_corr.columns[above_threshold.any(axis=1)])

# Flag, on the training split only, every feature whose absolute correlation
# with an earlier feature exceeds 0.8, then report how many were flagged.
corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(corr_features))

correlated features:  55


In [10]:
# Remove the correlated columns from both splits.
# The result is assigned back instead of dropping in place, and errors='ignore'
# skips labels that are no longer present, so re-running this cell is a no-op
# rather than a KeyError on columns that were already removed.
X_train = X_train.drop(columns=list(corr_features), errors='ignore')
X_test = X_test.drop(columns=list(corr_features), errors='ignore')

In [11]:
# Confirm both splits still have the same number of columns after the drop.
print(X_train.shape, X_test.shape)

(35000, 57) (15000, 57)


Let us use `SequentialFeatureSelector` from mlxtend to implement step forward selection.

In [15]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Configure step forward selection of 10 features, scored by 3-fold
# cross-validated ROC-AUC.
sfs = SFS(
    # The estimator evaluated on each candidate feature subset. random_state is
    # fixed so that repeated runs build the same forests and therefore select
    # the same features; without it the result changes from run to run.
    RandomForestClassifier(n_jobs=4, random_state=0),
    k_features=10,      # number of features to select
    forward=True,       # True -> step forward selection
    floating=False,
    verbose=2,
    scoring='roc_auc',  # ROC-AUC, as this is a classification problem
    cv=3,
    n_jobs=-1)

# Missing values are replaced with 0 before fitting.
# The selector is fitted on a plain array, so sfs.k_feature_idx_ holds column
# positions; they refer to X_train's column order at the time of this call.
sfs.fit(np.array(X_train.fillna(0)), y_train)

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done  57 out of  57 | elapsed:   29.5s finished

[2018-04-20 20:46:04] Features: 1/10 -- score: 0.6254185766574373[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done  56 out of  56 | elapsed:   21.2s finished

[2018-04-20 20:46:25] Features: 2/10 -- score: 0.6413894864587263[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:   26.8s finished

[2018-04-20 20:46:52] Features: 3/10 -- score: 0.6663670454924108

In [23]:
# SFS was fitted on a plain array, so k_feature_idx_ contains column positions;
# translate them back into the column names of X_train.
selected_features = X_train.columns[list(sfs.k_feature_idx_)]
selected_features

Index(['v10', 'v20', 'v23', 'v34', 'v38', 'v50', 'v72', 'v102', 'v117',
       'v129'],
      dtype='object')

In [22]:
def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and print ROC-AUC for both splits.

    The forest uses 200 trees of maximum depth 4 and a fixed random_state, so
    repeated calls on the same data print the same scores.

    Parameters
    ----------
    X_train, X_test : feature matrices for the training and test splits.
    y_train, y_test : labels matching the rows of ``X_train`` and ``X_test``.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    forest = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    forest.fit(X_train, y_train)

    # Score each split in turn using the second column of predict_proba.
    splits = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for split_name, features, labels in splits:
        print(split_name)
        second_column_proba = forest.predict_proba(features)[:, 1]
        print('Random Forests roc-auc: {}'.format(roc_auc_score(labels, second_column_proba)))

In [24]:
# Evaluate a random forest trained on the selected features only; missing
# values are filled with 0, the same way as when the selector was fitted.
run_randomForests(X_train[selected_features].fillna(0),
                  X_test[selected_features].fillna(0),
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.7159820896526661
Test set
Random Forests roc-auc: 0.7012170206977828
