# Feature Selection

## Import Data

In [1]:
import pandas as pd
import numpy as np

%matplotlib notebook 

In [8]:
# Read the semicolon-separated CSV at the relative path below into `data`.
data = pd.read_csv(
    filepath_or_buffer="../../data/v2/data_merged_2018.csv",
    sep=";",
)

In [9]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,uuid,Act_Raw_Score,ActieveCopingPercentage,ActieveCopingScore,Age,AlgIntakeOpleidingsniveauScore_Raw,AlgIntakeWoonsituatieScore_Raw,AlgemeneGezondheidsbelevingScore,Average,BewegingsangstScore,...,TotaalLevel,TotaalScore,TransformerenScore,Ver_Raw_Score,VerminderingVanEisen,VitaliteitScore,educationLevel,livingConditions,got_go,finished_treatment
0,-9214014786609792531,16.0,38.89,26.0,44.0,1.0,4.0,70.0,39.0,27.0,...,1,6.0,9.0,15.0,4.0,80.0,20.0,20.0,1,1.0
1,-9204323589684605317,14.0,47.22,29.0,40.0,6.0,1.0,25.0,31.0,40.0,...,1,5.0,11.0,16.0,7.0,35.0,42.0,10.0,1,1.0
2,-9189315961929324040,18.0,61.11,34.0,30.0,5.0,2.0,75.0,48.0,39.0,...,2,4.0,14.0,20.0,6.0,70.0,41.0,30.0,1,0.0
3,-9187839909081422277,18.0,72.22,38.0,48.0,9.0,4.0,55.0,50.0,37.0,...,2,5.0,14.0,20.0,7.0,60.0,43.0,20.0,0,
4,-9184078185923068786,16.0,55.56,32.0,69.0,3.0,3.0,30.0,79.0,51.0,...,2,9.0,11.0,19.0,8.0,55.0,32.0,40.0,1,0.0


## Feature Selection

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import math

In [11]:
# Separate dataframe into scores, go (yes/no) and finished (yes/no) labels.
# The feature slice drops the first column (uuid) and the last three columns
# (livingConditions, got_go, finished_treatment in the preview above).
# NOTE(review): livingConditions is therefore not part of the features --
# confirm that excluding it is intended.
X = data.iloc[:, 1:-3]
# Labels are selected by name rather than by position: position -3 is
# livingConditions, not got_go, so a positional lookup picks the wrong column.
s = data["got_go"]
y = data["finished_treatment"]

In [12]:
# Impute missing values column by column with the floored column mean,
# then cast every column to float64 and display summary statistics.
column_means = X.mean()
fill_values = column_means.apply(math.floor)
X = X.fillna(fill_values).astype(np.float64)
X.describe()

Unnamed: 0,Act_Raw_Score,ActieveCopingPercentage,ActieveCopingScore,Age,AlgIntakeOpleidingsniveauScore_Raw,AlgIntakeWoonsituatieScore_Raw,AlgemeneGezondheidsbelevingScore,Average,BewegingsangstScore,Bsi_Age,...,SociaalFunctionerenScore,SubuitslagScore,Terugtrekken,TotaalLevel,TotaalScore,TransformerenScore,Ver_Raw_Score,VerminderingVanEisen,VitaliteitScore,educationLevel
count,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,...,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0
mean,17.650673,44.314373,27.948232,48.234428,4.874158,3.948653,45.911195,46.194865,39.097222,48.235269,...,49.11553,2.535774,13.988215,1.475589,8.00463,8.789141,16.582071,6.805976,40.374579,38.799242
std,3.786628,13.244941,4.768698,14.487983,2.175594,10.818069,19.143252,19.505257,7.889888,14.487127,...,25.55849,1.456305,3.866383,0.499509,8.165936,2.51374,3.270409,1.888643,18.444416,11.292498
min,7.0,0.0,12.0,18.0,1.0,0.0,0.0,0.0,17.0,18.0,...,0.0,0.0,7.0,1.0,0.0,4.0,8.0,3.0,0.0,20.0
25%,15.0,36.11,25.0,37.0,3.0,2.0,30.0,33.0,33.0,37.0,...,25.0,1.0,11.0,1.0,4.0,7.0,14.0,6.0,30.0,32.0
50%,18.0,44.44,28.0,49.0,5.0,3.0,45.0,46.0,39.0,49.0,...,50.0,3.0,14.0,1.0,6.0,9.0,16.0,7.0,40.0,41.0
75%,20.0,52.78,31.0,59.0,7.0,4.0,60.0,59.0,44.25,59.0,...,62.5,4.0,16.0,2.0,8.0,10.0,19.0,8.0,50.0,43.0
max,28.0,88.89,44.0,89.0,10.0,99.0,100.0,99.0,66.0,89.0,...,100.0,5.0,28.0,2.0,52.0,16.0,29.0,12.0,100.0,70.0


## Pipeline: Recursive Feature Elimination with Cross Validation + GridSearchCV

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from yellowbrick.features import RFECV
import matplotlib.pyplot as plt

In [7]:
# Pipeline subclass that surfaces the model's feature importances on the
# pipeline object itself after fitting.
class PipelineRFE(Pipeline):
    """Pipeline that copies the 'RFC' step's feature importances onto itself.

    After `fit`, `feature_importances_` is available directly on the pipeline
    instance, mirroring the attribute of the step registered as 'RFC'.
    """

    def fit(self, X, y=None, **fit_params):
        """Fit all pipeline steps, then expose the 'RFC' step's importances.

        Parameters
        ----------
        X, y, **fit_params
            Forwarded unchanged to `Pipeline.fit`.

        Returns
        -------
        PipelineRFE
            This instance, with `feature_importances_` set.
        """
        super(PipelineRFE, self).fit(X, y, **fit_params)
        # The step must be registered under the name 'RFC' for this lookup to work.
        rfc_step = self.named_steps['RFC']
        self.feature_importances_ = rfc_step.feature_importances_
        return self

In [8]:
# Steps of the estimator handed to the feature-elimination runs: scale the
# features, then fit a class-weighted random forest with 150 trees.
# The forest step must stay named 'RFC' because PipelineRFE.fit looks it up
# under that name.
# random_state is fixed so repeated runs build the same forest and therefore
# produce the same feature importances and ranking.
pipeline = [
    ('scaler', StandardScaler()),
    ('RFC', RandomForestClassifier(
        class_weight="balanced",
        n_estimators=150,
        random_state=42,
    )),
]
estimator = PipelineRFE(pipeline)

In [25]:
# Stratified cross validation for class imbalance
cv = StratifiedKFold(2)


def fit_rfecv(scoring):
    """Run recursive feature elimination with cross-validation for one metric.

    Builds a yellowbrick RFECV visualizer around the notebook-level
    `estimator` (step=5, folds from `cv`), fits it on the features `X` and
    the label `s`, and finalizes its plot.

    Parameters
    ----------
    scoring : str
        Scorer name passed to RFECV, e.g. "roc_auc".

    Returns
    -------
    RFECV
        The fitted, finalized visualizer.
    """
    visualizer = RFECV(estimator, step=5, cv=cv, scoring=scoring)
    visualizer.fit(X, s)
    visualizer.finalize()
    return visualizer


## Recursive Feature Elimination (each run takes about 5 minutes to complete given the current parameters)

# The runs are executed in this fixed order; the colour after each arrow is
# the curve colour noted for that metric in the plot.
clf_roc = fit_rfecv("roc_auc")            # ROC AUC --> BLUE
clf_recall = fit_rfecv("recall")          # Recall --> GREEN
clf_precision = fit_rfecv("precision")    # Precision --> RED
clf_accuracy = fit_rfecv("accuracy")      # Accuracy --> PURPLE

<IPython.core.display.Javascript object>