In [32]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/srivatsan88/YouTubeLI/master/dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [6]:
df.shape

(7043, 21)

In [11]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [9]:
X,y = (df.drop(columns='Churn'), df.Churn)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=124)

In [12]:
drop_feat = ['customerID', 'gender', 'MultipleLines', 'PaperlessBilling', 'PaymentMethod']
numeric_feat = ['tenure', 'TotalCharges']
categorical_feat = ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract']

In [13]:
drop_transformer = ColumnTransformer(transformers=[('drop_columns', 'drop', drop_feat)], remainder='passthrough')

In [14]:
pipeline = Pipeline([('drop_column', drop_transformer)])

In [15]:
pipeline.fit(X_train)

In [16]:
df.replace(r'^\s*$', np.nan, regex=True).isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [17]:
def remove_space(input_df):
    input_df['TotalCharges'] = input_df['TotalCharges'].replace(r'^\s*$', np.nan, regex=True)
    return input_df

In [18]:
class SpaceImputeTransformer():

    def __init__(self, func) -> None:
        self.func = func

    def transform(self, input_df, **transform_params):
        return self.func(input_df)
    
    def fit(self, X, y=None, **fit_params):
        return self

In [19]:
pipeline = Pipeline([
    ('space_remover', SpaceImputeTransformer(remove_space)),
    ('drop_column', drop_transformer)
])

In [20]:
pipeline.fit(X_train)

In [25]:
numeric_transformer = Pipeline(steps=[
    ('meanimpute', SimpleImputer(strategy='mean')),
    ('stdscaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

In [26]:
col_transformer = ColumnTransformer(transformers=[
    ('drop_columns', 'drop', drop_feat),
    ('numeric_processing', numeric_transformer, numeric_feat),
    ('categorical_processing', categorical_transformer, categorical_feat)
], remainder='drop')

In [27]:
pipeline = Pipeline([
    ('space_remover', SpaceImputeTransformer(remove_space)),
    ('transform_column', col_transformer)
])

In [28]:
pipeline.fit(X_train)

In [34]:
pipeline_lr = Pipeline([
    ('space_remover', SpaceImputeTransformer(remove_space)),
    ('transform_column', col_transformer),
    ('logistic', LogisticRegression())
])

In [35]:
col_transformer_rf = ColumnTransformer(transformers=[
    ('drop_columns', 'drop', drop_feat),
    ('numeric_processing', numeric_transformer, numeric_feat)
], remainder='drop')

In [42]:
pipeline_rf = Pipeline([
    ('space_remover', SpaceImputeTransformer(remove_space)),
    ('transform_columns', col_transformer_rf),
    ('RForest', RandomForestClassifier())
])

In [37]:
pipeline_all = [pipeline_lr, pipeline_rf]

In [38]:
for pipe in pipeline_all:
    pipe.fit(X_train, y_train)

In [39]:
for i, model in enumerate(pipeline_all):
    print(model.score(X_test, y_test))

0.808329389493611
0.7321344060577378


In [43]:
grid_param = [
    {
        'RForest': [RandomForestClassifier()],
        'RForest__n_estimators': [10, 50, 100],
        'RForest__max_leaf_nodes': [5, 6, 8]
    }
]

grid_search = GridSearchCV(pipeline_rf, param_grid=grid_param, cv=5, verbose=1, n_jobs=1)

In [44]:
rf_model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [45]:
rf_model.best_score_

0.765314401622718

In [46]:
rf_model.best_params_

{'RForest': RandomForestClassifier(),
 'RForest__max_leaf_nodes': 8,
 'RForest__n_estimators': 100}

In [47]:
lr_grid_param = [
    {
        'logistic': [LogisticRegression()],
        'logistic__penalty': ['l1', 'l2'],
        'logistic__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'logistic__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
        'logistic__max_iter': [100, 200, 300]
    }
]

In [48]:
lr_grid_search = GridSearchCV(pipeline_lr, param_grid=lr_grid_param, cv=5, verbose=1, n_jobs=1)

In [49]:
lr_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


In [50]:
lr_grid_search.best_score_

0.7941176470588235

In [51]:
lr_grid_search.best_params_

{'logistic': LogisticRegression(),
 'logistic__C': 10,
 'logistic__max_iter': 100,
 'logistic__penalty': 'l1',
 'logistic__solver': 'saga'}

In [53]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(lr_grid_search, f)