#  Before we put everything in a pipeline
# Some more feature selection method
- Select most predictive features.
- Reduce the dimension.


# Methods
- Recursive methods(Wrapper method)
- $\ell_1$ based regularization. Effectively a feature selection method.(embedded method)
- Decision tree based method
- Randomized Linear Lasso or logistic regression
- Univariate selection.(Filter method)

## [Variance threshold](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html)
- Baseline feature selector
- Removes features with no variance.
    + We can control the variance threshold
    

In [82]:
from sklearn.datasets import make_regression
import numpy as np

In [2]:
X,y = make_regression(n_samples=10, n_features= 5, n_informative=3, random_state=1)

In [4]:
np.var(X, axis=0)

array([0.28863843, 0.77628566, 0.86544211, 1.07244455, 1.25635731])

In [5]:
from sklearn.feature_selection import VarianceThreshold

In [6]:
X_sel = VarianceThreshold(threshold=.8).fit_transform(X)

In [7]:
X[1,:], X_sel[1,:]

(array([ 1.13376944, -0.38405435, -0.3224172 , -2.06014071,  1.46210794]),
 array([-0.3224172 , -2.06014071,  1.46210794]))

# Should we  normalize our feature before applying Variance threshold?

# Selection based on statistical test
# f_regression

Uses F test 

# f_classif

for regression

In [81]:
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
from sklearn.preprocessing import Binarizer, scale
from sklearn.datasets import make_classification

In [83]:
X,y = make_regression(n_samples=500,
    n_features=100,
    n_informative=25,random_state =10)

In [85]:
sel_f_classif = SelectPercentile(f_classif, percentile=25).fit(X,y)

  msw = sswn / float(dfwn)


In [86]:
f_classif_support = sel_f_classif.get_support()

In [87]:
np.sum(f_classif_support)

25

Can select a subset containing redundant information

# Chi2

- Use if feature are count.
- Same usage as f_classif

# Recursive method
use previous notes on feature selection.

In [88]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [89]:
X, y = make_classification(n_samples=100, n_features=100, n_informative=5, random_state= 2)

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= .2, random_state =5)

In [91]:
clf = LogisticRegression(random_state=1)

In [92]:
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [None]:
clf.score(X_train, y_train), clf.score(X_test, y_test)

In [3]:
# can you use RFECV to get better features and generalization



# L1 (Lasso based selection)

In [4]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Can you write the code to do lasso

# Stability selection

In [114]:
from sklearn.linear_model import RandomizedLogisticRegression

In [115]:
sel = RandomizedLogisticRegression(n_resampling=200, random_state= 1)

In [116]:
sel.fit(X_train, y_train)

RandomizedLogisticRegression(C=1, fit_intercept=True, memory=None,
               n_jobs=None, n_resampling=200, normalize=True,
               pre_dispatch='3*n_jobs', random_state=1,
               sample_fraction=0.75, scaling=0.5, selection_threshold=0.25,
               tol=0.001, verbose=False)

In [117]:
sel.scores_

array([0.   , 0.   , 0.005, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.025, 0.   , 0.   , 0.   , 0.   , 0.   , 0.04 , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.56 , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.1  , 0.   , 0.   , 0.   , 0.535,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.005, 0.   , 0.035,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.005, 0.415,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.01 , 0.   , 0.015, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.015, 0.   , 0.   , 0.   , 0.   , 0.38 , 0.   , 0.   ,
       0.   ])

In [118]:
np.sum(sel.get_support())

4

In [119]:
X_train_s = sel.transform(X_train)
X_test_s = sel.transform(X_test)

In [120]:
clf= LogisticRegression( penalty='l1', random_state= 1)
clf.fit(X_train_s,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [121]:
clf.score(X_test_s, y_test)

1.0

In [5]:
from sklearn.linear_model import RandomizedLasso

In [None]:
X,y = make_regression(n_samples=100, n_features=10, n_informative=4, random_state=4)

In [None]:
# can u use Random lasso


# Building pipeline in sklearn

# Two kind of data flow serial or parallel
- Use pipeline class for serial flow
- FeatureUnion for parallel flow

# Pipeline
 - Give sequence as a tuple

In [52]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.decomposition import PCA, KernelPCA, TruncatedSVD
from sklearn.linear_model import RandomizedLogisticRegression, LogisticRegression

In [54]:
parallel = FeatureUnion(transformer_list=[('pcs', PCA()), ('kpca', KernelPCA()), ('svd',TruncatedSVD())])

In [55]:
clf = LogisticRegression(C=.1, penalty='l1', random_state=1)

In [56]:
sel = RandomizedLogisticRegression(n_resampling=200, random_state=1)

In [57]:
pipeline =Pipeline(steps=[('parallel_transform', parallel), ('select',sel), ('log_reg_clf', clf)])

In [58]:
from sklearn.model_selection import GridSearchCV

In [59]:
hyper_param= [{'log_reg_clf__C':[10, 5, .1], 'log_reg_clf__penalty': ['l1', 'l2']}]

In [60]:
search_clf= GridSearchCV(estimator=pipeline, param_grid=hyper_param, cv = 5)

In [61]:
search_clf.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('parallel_transform', FeatureUnion(n_jobs=None,
       transformer_list=[('pcs', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kpca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
     f...e, penalty='l1', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'log_reg_clf__penalty': ['l1', 'l2'], 'log_reg_clf__C': [10, 5, 0.1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [70]:
search_clf.best_estimator_

Pipeline(memory=None,
     steps=[('parallel_transform', FeatureUnion(n_jobs=None,
       transformer_list=[('pcs', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kpca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
     f...e, penalty='l1', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [71]:
search_clf.best_score_

0.725

In [72]:
search_clf.best_params_

{'log_reg_clf__C': 10, 'log_reg_clf__penalty': 'l1'}

In [73]:
from sklearn.metrics import classification_report


In [74]:
print(classification_report(y_test, search_clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.89      0.80      0.84        10
           1       0.82      0.90      0.86        10

   micro avg       0.85      0.85      0.85        20
   macro avg       0.85      0.85      0.85        20
weighted avg       0.85      0.85      0.85        20

