In [1]:
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [2]:
digits = load_digits()
trainX, testX, trainY, testY = train_test_split(digits.data, digits.target)

In [3]:
digits.data.shape

(1797, 64)

In [4]:
digit_pipeline = make_pipeline(StandardScaler(),RandomForestClassifier())

In [5]:
digit_pipeline.fit(trainX,trainY)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier', RandomForestClassifier())])

In [6]:
digit_pipeline.predict(testX)

array([0, 6, 8, 4, 7, 9, 0, 7, 3, 2, 8, 1, 3, 8, 8, 6, 6, 3, 3, 2, 2, 7,
       1, 6, 9, 9, 7, 6, 6, 2, 1, 2, 1, 1, 5, 3, 3, 6, 0, 5, 5, 3, 9, 7,
       9, 7, 2, 6, 7, 7, 7, 3, 2, 1, 7, 7, 5, 9, 2, 7, 2, 2, 0, 2, 9, 6,
       8, 3, 5, 3, 5, 3, 1, 4, 5, 7, 9, 7, 6, 1, 4, 9, 1, 7, 5, 9, 9, 6,
       0, 4, 1, 8, 1, 9, 7, 5, 1, 4, 8, 5, 1, 8, 1, 3, 5, 0, 6, 1, 2, 5,
       3, 0, 5, 9, 9, 7, 3, 1, 4, 9, 9, 2, 8, 1, 2, 8, 2, 5, 6, 4, 0, 7,
       5, 8, 3, 5, 5, 9, 0, 9, 1, 9, 0, 7, 6, 7, 0, 0, 4, 6, 1, 8, 7, 8,
       7, 5, 8, 5, 2, 5, 3, 7, 8, 3, 8, 0, 2, 6, 2, 8, 3, 0, 2, 7, 7, 1,
       3, 4, 0, 3, 7, 9, 1, 5, 3, 2, 8, 5, 2, 2, 7, 4, 7, 6, 0, 2, 5, 2,
       8, 1, 6, 4, 3, 6, 1, 0, 1, 5, 8, 9, 3, 9, 2, 5, 0, 5, 5, 1, 4, 3,
       3, 1, 5, 1, 6, 0, 6, 0, 5, 5, 2, 2, 0, 2, 8, 4, 5, 1, 4, 7, 3, 4,
       1, 7, 2, 9, 3, 0, 5, 5, 2, 3, 0, 0, 2, 3, 4, 3, 1, 8, 4, 6, 4, 6,
       4, 6, 2, 7, 7, 3, 0, 2, 5, 5, 4, 3, 7, 8, 7, 0, 0, 0, 0, 1, 9, 5,
       6, 5, 6, 1, 9, 9, 0, 5, 3, 5, 8, 2, 8, 9, 1,

In [7]:
digit_pipeline.steps

[('standardscaler', StandardScaler()),
 ('randomforestclassifier', RandomForestClassifier())]

In [8]:
digit_pipeline.steps[1].feature_importances_

AttributeError: 'tuple' object has no attribute 'feature_importances_'

In [None]:
digit_pipeline.steps[1][1].feature_importances_

Connecting Pipeline with Hyper-parameter Tuning

    We could determine the best combination of hyper-parameters for preprocessor & estimator

In [9]:
from sklearn.feature_selection import SelectKBest,f_classif

In [10]:
digit_pipeline = make_pipeline(StandardScaler(),SelectKBest(k=20,score_func=f_classif),RandomForestClassifier(n_estimators=100))

In [11]:
digit_pipeline.fit(trainX,trainY)

  f = msb / msw


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('selectkbest', SelectKBest(k=20)),
                ('randomforestclassifier', RandomForestClassifier())])

In [12]:
digit_pipeline.steps

[('standardscaler', StandardScaler()),
 ('selectkbest', SelectKBest(k=20)),
 ('randomforestclassifier', RandomForestClassifier())]

In [13]:
params = {'selectkbest__k':[10,50,70],'randomforestclassifier__n_estimators':[500,600,700]}

In [14]:
gs = GridSearchCV(digit_pipeline,param_grid= params,cv = 5, n_jobs = 3)

In [15]:
gs.fit(trainX,trainY)

  f = msb / msw


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('selectkbest', SelectKBest(k=20)),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             n_jobs=3,
             param_grid={'randomforestclassifier__n_estimators': [500, 600,
                                                                  700],
                         'selectkbest__k': [10, 50, 70]})

In [16]:
gs.best_score_

0.9747459727385378

In [17]:
gs.best_params_

{'randomforestclassifier__n_estimators': 500, 'selectkbest__k': 50}

In [18]:
gs.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('selectkbest', SelectKBest(k=50)),
                ('randomforestclassifier',
                 RandomForestClassifier(n_estimators=500))])

In [19]:

Column Transformer for dealing with hetrogenous data

    Regular Pipeline intends to do same processing for all the columns
    This doesn't work for hetrogenous data

SyntaxError: invalid syntax (<ipython-input-19-fc6644270a6f>, line 1)

In [None]:
import pandas as pd

In [None]:
hr_data = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/HR_comma_sep.csv.txt')

In [None]:
hr_data.rename(columns={'sales':'dept'},inplace=True)

In [None]:
hr_data.sample(5)

In [None]:
feature_data = hr_data.drop(columns=['left'])

In [None]:
target_data = hr_data.left

In [None]:
feature_data.dtypes

satisfaction_level & last_evaluation don't need preprocessing
number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years need MinMaxScaler
dept & Salary need OrdinalEncoder

In [None]:
cat_data = feature_data.select_dtypes(include=['object'])
int_data = feature_data.select_dtypes(include=['int64'])
float_data = feature_data.select_dtypes(include=['float64']) #doesnt require scaling

In [None]:
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

In [None]:
cat_pipeline = make_pipeline(OrdinalEncoder())
int_pipeline = make_pipeline(MinMaxScaler(),SelectKBest(k=5,score_func=f_classif))

In [None]:
#if missing values are there:
#cat_pipeline = make_pipeline(SimpleImputer(),OrdinalEncoder())

In [None]:
from sklearn.compose import make_column_transformer

In [None]:
preprocessor_pipeline = make_column_transformer(
    (cat_pipeline,cat_data.columns),
    (int_pipeline,int_data.columns),
    remainder = 'passthrough')

In [None]:
pipeline = make_pipeline(preprocessor_pipeline,RandomForestClassifier())

trainX,testX, trainY, testY = train_test_split(feature_data, target_data)

In [None]:
pipeline.fit(trainX, trainY)

In [None]:
pipeline.predict(testX[:2])

In [None]:
testY[:2]

In [None]:
pipeline.score(testX,testY)

In [None]:
pipeline.steps[0]

In [None]:
params = {'columntransformer__pipeline-2__selectkbest__k':[4,5,6],
          'randomforestclassifier__n_estimators':[50,100,200]}

In [None]:
gs = GridSearchCV(pipeline,param_grid=params,cv=5,n_jobs=4)

In [None]:
gs.fit(trainX,trainY)

In [None]:
gs.best_score_


Disadvantages of Pipeline

    Doesn't support Online Learning
    Online Learning will be discussed later

Dealing with imbalanced data in pipeline

    Use imblearn make_pipeline rather than scikit make_pipeline as RandomOverSampler is not supported in scikit



In [None]:
target_data.value_counts()

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline

#This make_pipeline is not scikit pipeline but imblearn pipeline which support Oversampler as part of pipeline

In [None]:
imb_pipeline = make_pipeline(preprocessor_pipeline,RandomOverSampler,RandomForestClassifier())

In [None]:
target_data.value_counts()