# Importance of features

In [2]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.datasets import make_classification

x_data_generated, y_data_generated = make_classification()
x_data_generated.shape

(100, 20)

In [17]:
x_data_generated

array([[ 1.34793915,  1.34259015,  1.16845447, ...,  0.53151904,
         1.39887731, -0.31297436],
       [-2.33777249, -0.70077721, -1.39820784, ..., -0.4780427 ,
         0.77924417, -1.36335541],
       [-0.05809792, -1.41985548, -0.28210733, ...,  1.74307962,
         0.10511891, -1.66012329],
       ..., 
       [ 0.61067974, -0.11910972,  0.62508341, ..., -2.1158184 ,
        -0.30088853, -0.24810091],
       [-0.79704326, -0.17195651, -0.83610533, ..., -1.09960286,
        -0.65877379, -1.25722951],
       [-0.14186409, -0.0334284 , -0.02995202, ..., -0.73775394,
        -1.1258902 ,  1.71525909]])

In [3]:
VarianceThreshold(.7).fit_transform(x_data_generated).shape

(100, 20)

In [4]:
VarianceThreshold(.8).fit_transform(x_data_generated).shape

(100, 15)

In [5]:
VarianceThreshold(.9).fit_transform(x_data_generated).shape

(100, 11)

In [6]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

x_data_kbest = SelectKBest(f_classif, k=5).fit_transform(x_data_generated, y_data_generated)
x_data_varth = VarianceThreshold(.9).fit_transform(x_data_generated)

In [7]:
logit = LogisticRegression(solver='lbfgs', random_state=17)

In [8]:
cross_val_score(logit, x_data_generated, y_data_generated, 
                scoring='neg_log_loss', cv=5).mean()

-0.46766120599809791

In [9]:
cross_val_score(logit, x_data_kbest, y_data_generated, 
                scoring='neg_log_loss', cv=5).mean()

-0.33649662605524971

In [10]:
cross_val_score(logit, x_data_varth, y_data_generated, 
                scoring='neg_log_loss', cv=5).mean()

-0.40040131482579

# Selection by Modeling

In [11]:
# Synthetic example

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

x_data_generated, y_data_generated = make_classification()

rf = RandomForestClassifier(n_estimators=10, random_state=17)
pipe = make_pipeline(SelectFromModel(estimator=rf), logit)

print(cross_val_score(logit, x_data_generated, y_data_generated, 
                      scoring='neg_log_loss', cv=5).mean())
print(cross_val_score(rf, x_data_generated, y_data_generated, 
                      scoring='neg_log_loss', cv=5).mean())
print(cross_val_score(pipe, x_data_generated, y_data_generated, 
                      scoring='neg_log_loss', cv=5).mean())

-0.373226313776
-0.671146525851
-0.293887045943


In [12]:
from sklearn.preprocessing import StandardScaler
#x_data, y_data = get_data() 
x_data = x_data_generated
y_data = y_data_generated

pipe1 = make_pipeline(StandardScaler(), 
                      SelectFromModel(estimator=rf), logit)

pipe2 = make_pipeline(StandardScaler(), logit)

print('LR + selection: ', cross_val_score(pipe1, x_data, y_data, 
                                          scoring='neg_log_loss', cv=5).mean())
print('LR: ', cross_val_score(pipe2, x_data, y_data, 
                              scoring='neg_log_loss', cv=5).mean())
print('RF: ', cross_val_score(rf, x_data, y_data, 
                              scoring='neg_log_loss', cv=5).mean())

LR + selection:  -0.296954216937
LR:  -0.374785725048
RF:  -0.671146525851


# Selection by Grid Search

In [13]:
# pip install mlxtend
from mlxtend.feature_selection import SequentialFeatureSelector

selector = SequentialFeatureSelector(logit, scoring='neg_log_loss', 
                                     verbose=2, k_features=3, forward=False, n_jobs=-1)

selector.fit(x_data, y_data)

[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    2.2s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    3.0s finished

[2019-07-02 20:26:14] Features: 19/3 -- score: -0.340573304272[Parallel(n_jobs=-1)]: Done   6 out of  19 | elapsed:    1.8s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done  16 out of  19 | elapsed:    3.0s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  19 out of  19 | elapsed:    3.0s finished

[2019-07-02 20:26:17] Features: 18/3 -- score: -0.311905426986[Parallel(n_jobs=-1)]: Done   5 out of  18 | elapsed:    1.6s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:    2.9s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    2.9s finished

[2019-07-02 20:26:20] Features: 17/3 -- score: -0.290231558596[Parallel(n_jobs=-1)]: Done   3 out of  17 | elapsed:    1.3s remaining:    6.7s
[Parallel(n_jobs=-1)]: Done  12 out of  17 | elapsed:    3.1s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  17

SequentialFeatureSelector(clone_estimator=True, cv=5,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False),
             floating=False, forward=False, k_features=3, n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='neg_log_loss', verbose=2)