### RFE (recurssive feature elimination) on diabetes data

In [11]:
import pandas as pd
from sklearn.feature_selection import RFE

import warnings
warnings.filterwarnings(action= "ignore")

In [12]:
url      = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names    = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
colnames = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']

df = pd.read_csv(url, names=names)

data = df.copy()

In [13]:
data.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
X = data.drop("class", axis=1)
y = data['class']

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X , y, test_size = 0.20,random_state = 0)
print(X_train.shape, X_test.shape)

(614, 8) (154, 8)


In [20]:
from sklearn.linear_model import LogisticRegression

lore = LogisticRegression()

In [27]:
rfe = RFE(lore, n_features_to_select=3, step= 2)

In [28]:
rfe.fit(X_train, y_train)

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 l1_ratio=None, max_iter=100,
                                 multi_class='warn', n_jobs=None, penalty='l2',
                                 random_state=None, solver='warn', tol=0.0001,
                                 verbose=0, warm_start=False),
    n_features_to_select=3, step=2, verbose=0)

In [29]:
rfe.get_support()

array([ True, False, False, False, False,  True,  True, False])

In [30]:
features = X.columns[rfe.get_support()]
features

Index(['preg', 'mass', 'pedi'], dtype='object')

In [31]:
new_data = X[features]

In [32]:
new_data.head()

Unnamed: 0,preg,mass,pedi
0,6,33.6,0.627
1,1,26.6,0.351
2,8,23.3,0.672
3,1,28.1,0.167
4,0,43.1,2.288


### Now our feature selection is done, Now let us try pyperparameter tuning

In [34]:
from sklearn.pipeline import Pipeline

pipe = Pipeline( [ ('f_selector', rfe) , ('algorithm' , lore)])

from sklearn.model_selection import GridSearchCV

pipe.get_params()

{'memory': None,
 'steps': [('f_selector',
   RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None, penalty='l2',
                                    random_state=None, solver='warn', tol=0.0001,
                                    verbose=0, warm_start=False),
       n_features_to_select=3, step=2, verbose=0)),
  ('algorithm',
   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='warn', n_jobs=None, penalty='l2',
                      random_state=None, solver='warn', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'f_selector': RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=F

In [42]:
param = {'f_selector__n_features_to_select': [3,5],
         'f_selector__step': [1,2],
         'algorithm__C': [0.5,1.0, 2],
         'algorithm__max_iter': [90,100,110]       
}

In [43]:
grid = GridSearchCV(pipe, param, cv = 5, scoring= "recall_weighted")

In [44]:
grid.fit(X_train, y_train)

print("Train", grid.score(X_train, y_train))
print("Test", grid.score(X_test, y_test))

Train 0.7671009771986971
Test 0.8116883116883117


In [45]:
grid.best_params_

{'algorithm__C': 1.0,
 'algorithm__max_iter': 90,
 'f_selector__n_features_to_select': 5,
 'f_selector__step': 1}

In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [59]:
for index in range(1,9):
    sel = RFE(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1), n_features_to_select = index)
    sel.fit(X_train, y_train)
    X_train_rfe = sel.transform(X_train)
    X_test_rfe = sel.transform(X_test)
    print("Selected Features: " , index)
    clf = LogisticRegression()
    clf.fit(X_train_rfe, y_train)
    y_pred = clf.predict(X_test_rfe)
    print("Accuracy : ", accuracy_score(y_test, y_pred))

Selected Features:  1
Accuracy :  0.7792207792207793
Selected Features:  2
Accuracy :  0.7922077922077922
Selected Features:  3
Accuracy :  0.7857142857142857
Selected Features:  4
Accuracy :  0.7922077922077922
Selected Features:  5
Accuracy :  0.7987012987012987
Selected Features:  6
Accuracy :  0.8116883116883117
Selected Features:  7
Accuracy :  0.8181818181818182
Selected Features:  8
Accuracy :  0.8181818181818182


In [54]:
from sklearn.ensemble import GradientBoostingClassifier

In [60]:
for index in range(1,10):
    sel = RFE(GradientBoostingClassifier(n_estimators=100, random_state=0), n_features_to_select = index)
    sel.fit(X_train, y_train)
    X_train_rfe = sel.transform(X_train)
    X_test_rfe = sel.transform(X_test)
    print("Selected Features: " , index)
    clf = LogisticRegression()
    clf.fit(X_train_rfe, y_train)
    y_pred = clf.predict(X_test_rfe)
    print("Accuracy : ", accuracy_score(y_test, y_pred))

Selected Features:  1
Accuracy :  0.7792207792207793
Selected Features:  2
Accuracy :  0.7922077922077922
Selected Features:  3
Accuracy :  0.7987012987012987
Selected Features:  4
Accuracy :  0.7922077922077922
Selected Features:  5
Accuracy :  0.8051948051948052
Selected Features:  6
Accuracy :  0.8051948051948052
Selected Features:  7
Accuracy :  0.8181818181818182
Selected Features:  8
Accuracy :  0.8181818181818182
Selected Features:  9
Accuracy :  0.8181818181818182
