<h1>Pipeline for Selection of features</h1>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import shap

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import mutual_info_classif, SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
pd.options.plotting.backend = "plotly"

In [3]:
warnings.filterwarnings('ignore')

def setup_matplot(titlesize=int(14),labelsize=int(12)):
    import matplotlib.pyplot as plt
    plt.rcParams['figure.figsize'] = [10,8]
    plt.rcParams['figure.titlesize'] = titlesize
    plt.rcParams['figure.labelsize'] = labelsize
    return(plt)

plt = setup_matplot()

In [4]:
path = '../database/diabetes.csv'

df = pd.read_csv(f'{path}')

In [17]:
df.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [19]:
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [20]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [22]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [5]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [6]:
X = df.drop('Outcome', axis=1)
X.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [7]:
y = df[['Outcome']]

y.columns

Index(['Outcome'], dtype='object')

In [8]:
X = X.to_numpy()
y = y.to_numpy()

In [9]:
print(f'{df.shape}')
print(f'{X.shape}')
print(f'{y.shape}')

(768, 9)
(768, 8)
(768, 1)


In [10]:
tree = SelectFromModel(ExtraTreesClassifier(n_estimators=10, random_state=444), 
                      threshold='mean')

rfc = RandomForestClassifier(n_estimators=5000, random_state=444)

In [11]:
pipe = Pipeline(steps=[('tree',tree),('rfc',rfc)])

pipe.fit(X,y)

Pipeline(steps=[('tree',
                 SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=10,
                                                                random_state=444),
                                 threshold='mean')),
                ('rfc',
                 RandomForestClassifier(n_estimators=5000, random_state=444))])

In [16]:
pipe[:-1].get_feature_names_out()

array(['x1', 'x5', 'x7'], dtype=object)

In [25]:
params = {'clf__max_features': ['auto', 'sqrt', 'log2']}

In [27]:
gs = GridSearchCV(pipe, params)

In [32]:
pipe.fit(X, y).score(X, y)

1.0

In [34]:
clf = GridSearchCV(pipe, cv=10, verbose=0, param_grid=params)
clf = clf.fit(X, y)

ValueError: Invalid parameter clf for estimator Pipeline(steps=[('tree',
                 SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=10,
                                                                random_state=444),
                                 threshold='mean')),
                ('rfc',
                 RandomForestClassifier(n_estimators=5000, random_state=444))]). Check the list of available parameters with `estimator.get_params().keys()`.