## Feature Selection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the raw loan records from the CSV that sits next to the notebook.
loans = pd.read_csv(filepath_or_buffer='loan_data.csv')

In [3]:
# Preview the first rows of the raw loan table.
loans.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [4]:
# One-hot encode the categorical `purpose` column. drop_first=True omits the
# first category so the dummy columns are not perfectly collinear.
final_data = pd.get_dummies(
    data=loans,
    columns=['purpose'],
    drop_first=True,
)
final_data.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0,1,0,0,0,0,0


In [5]:
# (rows, columns) of the encoded frame; the column count still includes the
# target column `not.fully.paid`.
final_data.shape

(9578, 19)

In [6]:
# Separate the label column from the feature matrix used by every selector below.
target = final_data['not.fully.paid']
predictors = final_data.drop(labels='not.fully.paid', axis=1)

## Feature selection using Variance Threshold


    VarianceThreshold is a simple baseline approach to feature selection. 
    It removes all features whose variance doesn’t meet some threshold. 
    By default, it removes all zero-variance features, i.e. features that have the same value in all samples.

    As an example, suppose that we have a dataset with boolean features, and we want to remove all features that are either one or zero (on or off) in more than 80% of the samples. 
    Boolean features are Bernoulli random variables, and the variance of such variables is given by Var[X] = p(1 - p), so we can select using the threshold 0.8 * (1 - 0.8).

In [7]:
from sklearn.feature_selection import VarianceThreshold

In [8]:
# Threshold = variance of a Bernoulli variable with p = 0.8, i.e. p * (1 - p).
# NOTE(review): this cut-off is derived for 0/1 features; continuous columns are
# compared on their raw scale, so small-valued ones fall below it - confirm intended.
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))

In [9]:
# Fit the selector on the predictors; the target is not passed to this selector.
sel.fit(predictors)

VarianceThreshold(threshold=0.15999999999999998)

In [10]:
# Apply the fitted selector to the predictors and keep the result in `data`.
# NOTE(review): `data` is not referenced again in the visible cells.
data = sel.transform(predictors)

In [11]:
# Boolean mask over the predictor columns: True marks a feature that was kept.
sel.get_support()

array([False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False,  True, False, False, False, False], dtype=bool)

In [12]:
# Use the support mask to show the retained columns with their original names.
predictors.loc[:, sel.get_support()].head()

Unnamed: 0,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,purpose_debt_consolidation
0,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,1
1,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0
2,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,1
3,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,1
4,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0


##  Feature Selection using SelectKBest

    Based on the scoring function, this method picks the top K best features.
    Since our problem is classification, we can try chi2, f_classif and mutual_info_classif.

## Chi2

Compute the chi-squared statistic between each feature and the target variable. The features with the highest scores are selected.

In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [14]:
# Score every predictor against the target with chi2 and keep the 10 best.
sel = SelectKBest(score_func=chi2, k=10).fit(predictors, target)
sel

SelectKBest(k=10, score_func=<function chi2 at 0x000000000B909A60>)

In [15]:
# Boolean mask over the predictor columns: True marks one of the k selected features.
sel.get_support()

array([ True, False,  True, False,  True,  True,  True,  True,  True,
        True, False,  True, False, False, False, False, False,  True], dtype=bool)

In [16]:
# Show the chi2-selected columns with their original names.
predictors.loc[:, sel.get_support()].head()

Unnamed: 0,credit.policy,installment,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,pub.rec,purpose_small_business
0,1,829.1,19.48,737,5639.958333,28854,52.1,0,0,0
1,1,228.22,14.29,707,2760.0,33623,76.7,0,0,0
2,1,366.86,11.63,682,4710.0,3511,25.6,1,0,0
3,1,162.34,8.1,712,2699.958333,33667,73.2,1,0,0
4,1,102.92,14.97,667,4066.0,4740,39.5,0,0,0


## f_classif

Compute the ANOVA F-value between features and target

In [17]:
from sklearn.feature_selection import f_classif

# Same top-10 selection as before, scored with the ANOVA F-value instead of chi2.
sel = SelectKBest(score_func=f_classif, k=10).fit(predictors, target)

# Preview the columns that were kept.
predictors.loc[:, sel.get_support()].head()

Unnamed: 0,credit.policy,int.rate,installment,fico,revol.bal,revol.util,inq.last.6mths,pub.rec,purpose_credit_card,purpose_small_business
0,1,0.1189,829.1,737,28854,52.1,0,0,0,0
1,1,0.1071,228.22,707,33623,76.7,0,0,1,0
2,1,0.1357,366.86,682,3511,25.6,1,0,0,0
3,1,0.1008,162.34,712,33667,73.2,1,0,0,0
4,1,0.1426,102.92,667,4740,39.5,0,0,1,0


##  Feature Selection using SelectPercentile

In [18]:
from sklearn.feature_selection import SelectPercentile

# Keep the top 30% of predictors as ranked by the ANOVA F-value.
sel = SelectPercentile(score_func=f_classif, percentile=30).fit(predictors, target)

# Preview the columns that were kept.
predictors.loc[:, sel.get_support()].head()

Unnamed: 0,credit.policy,int.rate,fico,revol.util,inq.last.6mths,purpose_small_business
0,1,0.1189,737,52.1,0,0
1,1,0.1071,707,76.7,0,0
2,1,0.1357,682,25.6,1,0
3,1,0.1008,712,73.2,1,0
4,1,0.1426,667,39.5,0,0


In [19]:
# Keep the top 30% of predictors as ranked by the chi2 statistic.
sel = SelectPercentile(score_func=chi2, percentile=30).fit(predictors, target)

# Preview the columns that were kept.
predictors.loc[:, sel.get_support()].head()

Unnamed: 0,installment,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths
0,829.1,737,5639.958333,28854,52.1,0
1,228.22,707,2760.0,33623,76.7,0
2,366.86,682,4710.0,3511,25.6,1
3,162.34,712,2699.958333,33667,73.2,1
4,102.92,667,4066.0,4740,39.5,0


## Recursive Feature Elimination (RFE)

    Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. 
    First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a  feature_importances_ attribute. 
    Then, the least important features are pruned from the current set of features.
    That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.

In [20]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE


# Create the RFE object: a linear-kernel SVC is the estimator, and one feature is eliminated per step (step=1) until 10 remain.
svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=10, step=1)



In [21]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every predictor to [-1, 1] before it is handed to the linear SVC.
scaling = MinMaxScaler(feature_range=(-1, 1))
scaling.fit(predictors)
# NOTE: the name `sclaed_predictors` (sic) is kept because the RFE fit cell uses it.
sclaed_predictors = scaling.transform(predictors)


In [22]:
# Run the recursive elimination on the scaled predictors (a numpy array, so the
# column names are recovered later through the support mask).
rfe.fit(sclaed_predictors, target)

RFE(estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
  n_features_to_select=10, step=1, verbose=0)

In [23]:
# Map the RFE support mask back onto the unscaled frame to show the kept columns by name.
predictors.loc[:, rfe.get_support()].head()

Unnamed: 0,credit.policy,dti,fico,revol.bal,inq.last.6mths,delinq.2yrs,purpose_credit_card,purpose_educational,purpose_home_improvement,purpose_major_purchase
0,1,19.48,737,28854,0,0,0,0,0,0
1,1,14.29,707,33623,0,0,1,0,0,0
2,1,11.63,682,3511,1,0,0,0,0,0
3,1,8.1,712,33667,1,0,0,0,0,0
4,1,14.97,667,4740,0,1,1,0,0,0


## Recursive Feature Elimination with cross-validation

http://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html#sphx-glr-auto-examples-feature-selection-plot-rfe-with-cross-validation-py

In [24]:
import numpy as np
from sklearn.svm import SVC
# sklearn.cross_validation was removed in scikit-learn 0.20; the splitters now
# live in sklearn.model_selection.
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV, RFE
from sklearn.datasets import make_classification

# Build a synthetic 8-class task: 25 features, 3 informative and 2 redundant.
X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
                           n_redundant=2, n_repeated=0, n_classes=8,
                           n_clusters_per_class=1, random_state=0)

# A linear-kernel SVC is the estimator used by both RFE and RFECV below.
svc = SVC(kernel="linear")

# RFECV picks the number of features by cross-validated accuracy (the fraction
# of correct classifications). The model_selection splitter takes n_splits;
# the labels are supplied by fit() rather than by the constructor.
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(n_splits=2), scoring='accuracy')
rfecv.fit(X, y)

# Plain RFE for comparison, with n_features_to_select left at its default.
rfe = RFE(estimator=svc, step=1)
rfe.fit(X, y)

print('Original number of features is %s' % X.shape[1])
print("RFE final number of features : %d" % rfe.n_features_)
print("RFECV final number of features : %d" % rfecv.n_features_)
print('')

# `grid_scores_` was removed in scikit-learn 1.2; `cv_results_` replaces it.
# Fall back to the old attribute so the cell also runs on older releases.
if hasattr(rfecv, "cv_results_"):
    g_scores = np.asarray(rfecv.cv_results_["mean_test_score"])
else:
    g_scores = np.asarray(rfecv.grid_scores_)

# Best score first. With step=1 and the default minimum of one feature,
# entry i of the scores corresponds to i + 1 features.
indices = np.argsort(g_scores)[::-1]
print('Printing RFECV results:')
# Iterate over the scores actually produced instead of assuming one per feature.
for rank, idx in enumerate(indices, start=1):
    print("%d. Number of features: %d; Grid_Score: %f" % (rank, idx + 1, g_scores[idx]))



Original number of features is 25
RFE final number of features : 12
RFECV final number of features : 3

Printing RFECV results:
1. Number of features: 3; Grid_Score: 0.818041
2. Number of features: 4; Grid_Score: 0.816065
3. Number of features: 5; Grid_Score: 0.816053
4. Number of features: 6; Grid_Score: 0.799107
5. Number of features: 7; Grid_Score: 0.797047
6. Number of features: 8; Grid_Score: 0.783034
7. Number of features: 10; Grid_Score: 0.783022
8. Number of features: 9; Grid_Score: 0.781992
9. Number of features: 11; Grid_Score: 0.778028
10. Number of features: 12; Grid_Score: 0.774052
11. Number of features: 14; Grid_Score: 0.762015
12. Number of features: 13; Grid_Score: 0.760075
13. Number of features: 15; Grid_Score: 0.752003
14. Number of features: 16; Grid_Score: 0.750015
15. Number of features: 18; Grid_Score: 0.750003
16. Number of features: 22; Grid_Score: 0.748039
17. Number of features: 17; Grid_Score: 0.746003
18. Number of features: 19; Grid_Score: 0.739105
19. Nu

## https://www.youtube.com/watch?v=wjKvyk8xStg