# Advanced Pipelines with scikit-learn

Author: Konstantin Rink

Article from [towardsdatascience.com](https://towardsdatascience.com/advanced-pipelines-with-scikit-learn-4204bb71019b).

> Note: This notebook works through the article mentioned above. Parts of the code may differ from the original because they were adapted while reproducing it.

# Libraries

In [1]:
# The usual suspects
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Sklearn
from sklearn.model_selection import (train_test_split, RandomizedSearchCV, RepeatedStratifiedKFold, cross_validate)

# Assemble pipeline(s)
from sklearn import set_config
from sklearn.pipeline import make_pipeline, Pipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Handle constant/duplicates and missing features/columns
from feature_engine.selection import (DropFeatures, DropConstantFeatures, DropDuplicateFeatures)

# Sampling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Models
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance
from scipy.stats import loguniform

set_config(display='diagram') # show estimators/pipelines as diagrams when a cell displays them

# Step 0: Preparation and data loading

In [2]:
# Load the raw churn export and normalise the column names to snake_case
XX = pd.read_excel('datasets/NewspaperChurn new version.xlsx')
XX.columns = [k.lower().replace(' ', '_') for k in XX.columns]

# The 'subscriber' column is the prediction target; call it 'churn' from here on
XX = XX.rename(columns={'subscriber': 'churn'})

# Recode the target to booleans and assign the result back explicitly.
# An in-place replace on XX['churn'] operates on a column selection, which
# pandas warns about and which does not update XX under Copy-on-Write.
# Any label other than 'NO'/'YES' becomes NaN here instead of passing through.
XX['churn'] = XX['churn'].map({'NO': False, 'YES': True})

  XX = pd.read_excel('datasets/NewspaperChurn new version.xlsx')


In [3]:
# Store every string (object) column as a pandas categorical;
# the pipelines below pick their categorical inputs by this dtype
object_cols = XX.select_dtypes(['object']).columns
XX[object_cols] = XX[object_cols].astype('category')

In [4]:
# Separate the features from the churn target
X = XX.drop('churn', axis=1)
y = XX['churn']

# Hold out 20% for validation.
# random_state makes the split repeatable across kernel restarts;
# stratify=y keeps the churn ratio the same in both partitions.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [6]:
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0,subscriptionid,hh_income,home_ownership,ethnicity,dummy_for_children,year_of_residence,age_range,language,address,state,city,county,zip_code,weekly_fee,deliveryperiod,nielsen_prizm,reward_program,source_channel
0,180590686,"$ 30,000 - $39,999",RENTER,German,N,1,25-29,German,1 3RD PL UNIT 703,CA,LONG BEACH,LOS ANGELES,90802,$7.00 - $7.99,7Day,FM,0,CircAdm
1,181401653,"$500,000 Plus",OWNER,unknown,Y,14,50-54,,1 AVIGNON,CA,NEWPORT COAST,ORANGE,92657,$0.01 - $0.50,SunOnly,MW,0,Partner
2,180374685,"$100,000 - $124,999",OWNER,Italian,Y,7,45-49,English,1 BLACKSWAN,CA,IRVINE,ORANGE,92604,$0.01 - $0.50,SunOnly,MW,0,Partner
3,180703483,"$200,000 - $249,999",OWNER,English,N,23,55-59,English,1 BLUE HORIZON,CA,LAGUNA NIGUEL,ORANGE,92677,$1.00 - $1.99,SunOnly,MW,1,Internet
4,180358906,"$ 50,000 - $59,999",OWNER,Italian,N,23,60-64,English,1 BRISA DEL LAGO,CA,RANCHO SANTA MARGARITA,ORANGE,92688,$8.00 - $8.99,Thu-Sun,MM,0,Crew


In [7]:
# (rows, columns) of the feature matrix
X.shape

(15855, 18)

In [8]:
# Preview the first values of the churn target
y.head()

0    False
1     True
2     True
3    False
4     True
Name: churn, dtype: bool

In [8]:
# Number of target values
y.shape

(15855,)

# Step 1: Exclude features

In [9]:
# Step 1: Drop irrelevant columns/features
# Each entry is a (step name, transformer) pair, applied in order.
drop_steps = [
    # the subscription id column is removed by name
    ('drop_columns', DropFeatures(['subscriptionid'])),
    ('drop_constant_values', DropConstantFeatures(tol=1, missing_values='ignore')),
    ('drop_duplicates', DropDuplicateFeatures()),
]

ppl = Pipeline(drop_steps)

# Step 2: Impute and transform values

In [11]:
ppl = Pipeline(
    [
        # Step 1: Drop irrelevant columns/features
        ('drop_columns', DropFeatures(['subscriptionid'])),
        ('drop_constant_values', DropConstantFeatures(tol=1, missing_values='ignore')),
        ('drop_duplicates', DropDuplicateFeatures()),

        # Step 2: Impute and scale columns/features
        (
            'cleaning',
            ColumnTransformer(
                [
                    # Step 2.1: apply steps for numerical features
                    # Select every numeric dtype rather than int64 only: a numeric
                    # column that contains NaN is stored as float64, so an
                    # int64-only selector would skip exactly the columns the mean
                    # imputer exists for (unselected columns are dropped by the
                    # ColumnTransformer's default remainder).
                    (
                        'num',
                        make_pipeline(
                            SimpleImputer(strategy='mean'),
                            MinMaxScaler()
                        ),
                        make_column_selector(dtype_include=np.number)
                    ),
                    # Step 2.2: apply steps for categorical features
                    # NOTE: scikit-learn 1.2 renamed `sparse` to `sparse_output`;
                    # switch the keyword when upgrading scikit-learn.
                    (
                        'cat',
                        make_pipeline(
                            SimpleImputer(strategy='most_frequent'),
                            OneHotEncoder(sparse=False, handle_unknown='ignore')
                        ),
                        make_column_selector(dtype_include='category')
                    )
                ]
            )
        )
    ]
)

# Step 3: Sampling

In [5]:
# SMOTE is a sampler, not a transformer, so the pipeline is assembled with
# imbalanced-learn's Pipeline (imbPipeline), the same class used for the final
# pipeline further down in this notebook.
ppl = imbPipeline(
    [
        # Step 1: Drop irrelevant columns/features
        ('drop_columns', DropFeatures(['subscriptionid'])),
        ('drop_constant_values', DropConstantFeatures(tol=1, missing_values='ignore')),
        ('drop_duplicates', DropDuplicateFeatures()),

        # Step 2: Impute and scale columns/features
        (
            'cleaning',
            ColumnTransformer(
                [
                    # Step 2.1: apply steps for numerical features
                    # Select every numeric dtype rather than int64 only: a numeric
                    # column that contains NaN is stored as float64, so an
                    # int64-only selector would skip exactly the columns the mean
                    # imputer exists for (unselected columns are dropped by the
                    # ColumnTransformer's default remainder).
                    (
                        'num',
                        make_pipeline(
                            SimpleImputer(strategy='mean'),
                            MinMaxScaler()
                        ),
                        make_column_selector(dtype_include=np.number)
                    ),
                    # Step 2.2: apply steps for categorical features
                    # NOTE: scikit-learn 1.2 renamed `sparse` to `sparse_output`;
                    # switch the keyword when upgrading scikit-learn.
                    (
                        'cat',
                        make_pipeline(
                            SimpleImputer(strategy='most_frequent'),
                            OneHotEncoder(sparse=False, handle_unknown='ignore')
                        ),
                        make_column_selector(dtype_include='category')
                    )
                ]
            )
        ),

        # Step 3: Sampling (seeded so the synthetic samples are repeatable)
        ('smote', SMOTE(random_state=42))
    ]
)

In [6]:
# Display the pipeline assembled so far
ppl

## Extract transformed and scaled features from the pipeline


In [6]:
# Keep only the first four steps (column dropping + 'cleaning'), i.e. everything
# before the sampler, and fit that slice so the output feature names are known
ppl_fts = ppl[:4]
features = ppl_fts.fit(X_train, y_train).get_feature_names_out()

# Show the names as a Series so the notebook truncates the long listing
pd.Series(features)

0                    num__year_of_residence
1                             num__zip_code
2                       num__reward_program
3        cat__hh_income_$  20,000 - $29,999
4        cat__hh_income_$  30,000 - $39,999
                        ...                
12900               cat__source_channel_TMC
12901            cat__source_channel_TeleIn
12902           cat__source_channel_TeleOut
12903               cat__source_channel_VRU
12904          cat__source_channel_iSrvices
Length: 12905, dtype: object

# Step 4: Building the ensemble classifier

In [7]:
# Linear model (logistic regression)
lr = LogisticRegression(warm_start=True, max_iter=400)
# RandomForest (seeded so repeated runs build the same forest)
rf = RandomForestClassifier(random_state=42)
# XGB: verbosity=0 already silences XGBoost's logging, so the deprecated
# `silent` flag is no longer passed; seeded for repeatable results
xgb = XGBClassifier(tree_method='hist', verbosity=0, random_state=42)
# Ensemble: soft voting combines the class probabilities predicted by the three models
lr_xgb_rf = VotingClassifier(
    estimators=[
        ('lr', lr), ('xgb', xgb), ('rf', rf)
    ],
    voting='soft'
)

In [8]:
# imbalanced-learn's Pipeline is required here: SMOTE is a sampler placed in the
# middle of the pipeline, and it would crash with the Pipeline object from sklearn
ppl = imbPipeline(
    [
        # Step 1: Drop irrelevant columns/features
        ('drop_columns', DropFeatures(['subscriptionid'])),
        ('drop_constant_values', DropConstantFeatures(tol=1, missing_values='ignore')),
        ('drop_duplicates', DropDuplicateFeatures()),

        # Step 2: Impute and scale columns/features
        (
            'cleaning',
            ColumnTransformer(
                [
                    # Step 2.1: apply steps for numerical features
                    # Select every numeric dtype rather than int64 only: a numeric
                    # column that contains NaN is stored as float64, so an
                    # int64-only selector would skip exactly the columns the mean
                    # imputer exists for (unselected columns are dropped by the
                    # ColumnTransformer's default remainder).
                    (
                        'num',
                        make_pipeline(
                            SimpleImputer(strategy='mean'),
                            MinMaxScaler()
                        ),
                        make_column_selector(dtype_include=np.number)
                    ),
                    # Step 2.2: apply steps for categorical features
                    # NOTE: scikit-learn 1.2 renamed `sparse` to `sparse_output`;
                    # switch the keyword when upgrading scikit-learn.
                    (
                        'cat',
                        make_pipeline(
                            SimpleImputer(strategy='most_frequent'),
                            OneHotEncoder(sparse=False, handle_unknown='ignore')
                        ),
                        make_column_selector(dtype_include='category')
                    )
                ]
            )
        ),

        # Step 3: Sampling (seeded so the synthetic samples are repeatable)
        ('smote', SMOTE(random_state=42)),

        # Step 4: Voting classifier
        ('ensemble', lr_xgb_rf)
    ]
)

In [10]:
# Display the complete pipeline, including the sampler and the voting ensemble
ppl

# Step 5: Hyperparameter tuning and feature importance

## Hyperparameter tuning

In [None]:
# Hyperparameter tuning
# Settings sampled for every candidate, whichever logistic-regression solver is drawn
shared_params = {
    'ensemble__lr__C': loguniform(1e-5, 100),
    'ensemble__xgb__learning_rate': [0.1],
    'ensemble__xgb__max_depth': [7, 10, 15, 20],
    'ensemble__xgb__min_child_weight': [10, 15, 20, 25],
    'ensemble__xgb__colsample_bytree': [0.8, 0.9, 1],
    'ensemble__xgb__n_estimators': [300, 400, 500, 600],
    'ensemble__xgb__reg_alpha': [0.5, 0.2, 1],
    'ensemble__xgb__reg_lambda': [2, 3, 5],
    'ensemble__xgb__gamma': [1, 2, 3],
    'ensemble__rf__max_depth': [7, 10, 15, 20],
    'ensemble__rf__min_samples_leaf': [1, 2, 4],
    'ensemble__rf__min_samples_split': [2, 5, 10],
    'ensemble__rf__n_estimators': [300, 400, 500, 600],
}

# LogisticRegression accepts only certain solver/penalty pairs. Sampling the two
# independently produces invalid candidates (e.g. lbfgs + l1) whose fits fail and
# waste search iterations, so each solver group is paired with the penalties it
# supports. 'elasticnet' is left out because none of these solvers implement it.
# NOTE(review): the string 'none' is the spelling used by older scikit-learn
# releases; newer ones expect None instead - confirm against the installed version.
lr_solver_penalty_params = [
    {
        'ensemble__lr__solver': ['newton-cg', 'lbfgs'],
        'ensemble__lr__penalty': ['none', 'l2'],
    },
    {
        'ensemble__lr__solver': ['liblinear'],
        'ensemble__lr__penalty': ['l1', 'l2'],
    },
]

# RandomizedSearchCV first picks one of these dicts, then samples inside it
params = [{**shared_params, **lr_params} for lr_params in lr_solver_penalty_params]

rsf = RepeatedStratifiedKFold(random_state=42)
# random_state makes the sampled candidates repeatable between runs
clf = RandomizedSearchCV(
    ppl, params, scoring='roc_auc', verbose=2, cv=rsf, n_jobs=-2, random_state=42
)
clf.fit(X_train, y_train)

print("Best Score: ", clf.best_score_)
print("Best Params: ", clf.best_params_)
# ROC AUC ranks continuous scores; feeding it hard True/False predictions collapses
# the curve to a single threshold. Use the predicted probability of the positive
# class (second column, churn == True) instead.
print("AUC:", roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1]))

Fitting 50 folds for each of 10 candidates, totalling 500 fits


## Feature importance plot

In [11]:
# https://inria.github.io/scikit-learn-mooc/python_scripts/dev_features_importance.html
def plot_feature_importances(perm_importance_result, feat_name):
    """Draw permutation importances as a horizontal bar chart.

    Bars are ordered by ascending mean importance and carry the standard
    deviation as an error bar.

    Parameters
    ----------
    perm_importance_result : mapping
        Must provide 'importances_mean' and 'importances_std' arrays.
    feat_name : array-like
        Feature labels, indexed in the same order as the importance arrays.
    """
    fig, ax = plt.subplots()

    means = perm_importance_result['importances_mean']
    order = means.argsort()
    positions = range(len(order))

    # ax is the current axes right after plt.subplots(), so draw on it directly
    ax.barh(
        positions,
        means[order],
        xerr=perm_importance_result['importances_std'][order]
    )
    ax.set_yticks(positions)
    ax.set_title("Permutation importance")

    # Reorder the labels with the same permutation as the bars
    labels = np.array(feat_name)
    ax.set_yticklabels(labels[order])

# We provide the function our hyperparameter-tuned model/pipeline: clf
# (clf must already be fitted, i.e. the hyperparameter tuning cell has to run first)
# In case we do not use hyperparameter tuning, we could provide here a fitted version of ppl
# For example: ppl.fit(X_train, y_train)
perm_importance_result_train = permutation_importance(clf, X_train, y_train, random_state=42, n_jobs=-2)

# permutation_importance shuffles the columns of the data it is given. clf wraps the
# whole pipeline and receives the raw X_train, so the result holds one importance
# per original column. The bars are therefore labelled with X_train's column names;
# the one-hot-expanded names produced by the preprocessing steps would not line up
# with these values, and fitting the preprocessing slice again is not needed here.
plot_feature_importances(perm_importance_result_train, X_train.columns)

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.