# Part 3: Modeling and Evaluation

This notebook demonstrates how to build a **customer churn model** and evaluate its **performance**.

In [14]:
# Standard
import pandas as pd
import numpy as np

# Datapath and Setup
data_path = "C:/Users/Sadek/Documents/GitHub/Project/telecom-customer-churn/data/"

import time

from sklearn import set_config
set_config(display="diagram")

from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv(data_path+"WA_Fn-UseC_-Telco-Customer-Churn.csv")
## data preprocessing and munging
# drope customerID and TotalCharges columns
dataset = dataset.drop(["customerID", "TotalCharges"], axis=1)
# Change 1/0 to Yes/No to match the other binary features
dataset = dataset.replace({'SeniorCitizen': {1: 'Yes', 0: 'No'}})
dataset = dataset.replace(regex=r'No.*service', value='No')
dataset.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn
0,Female,No,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,Yes


In [3]:
target_name = "Churn"
target = dataset[target_name]
data = dataset.drop(columns=target_name)

In [4]:
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

In [5]:
data_train, data_test, target_train, target_test = train_test_split(data, target, random_state=42)

print(f"Number of samples in training: {data_train.shape[0]} => "
      f"{data_train.shape[0] / dataset.shape[0] * 100:.1f}% of the"
      f" original set")

print(f"Number of samples in testing: {data_test.shape[0]} => "
      f"{data_test.shape[0] / dataset.shape[0] * 100:.1f}% of the"
      f" original set")

Number of samples in training: 5282 => 75.0% of the original set
Number of samples in testing: 1761 => 25.0% of the original set


## Logistic regression model

In [77]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)
])

model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression())
])

### Manual tuning

In [97]:
from sklearn.model_selection import cross_val_score

best_score = 0
best_params = {}
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
for C in [1e-3, 1e-2, 1e-1, 1, 10]:
    print(f"Evaluating model with C = {C:.3f}")
    model.set_params(classifier__C=C)
    scores = cross_val_score(model, data_train, target_train, cv=cv, scoring="balanced_accuracy")
    mean_score = scores.mean()
    print(f"score: {mean_score:.3f}")
    if mean_score > best_score:
        best_score = mean_score
        best_params = {'C': C}
        print(f"Found new best model with score {best_score:.4f}!")
    print("\n")

print(f"The best accuracy obtained is {best_score:.3f}")
print(f"The best parameters found:\n {best_params}")

Evaluating model with C = 0.001
score: 0.513
Found new best model with score 0.5130!


Evaluating model with C = 0.010
score: 0.688
Found new best model with score 0.6880!


Evaluating model with C = 0.100
score: 0.709
Found new best model with score 0.7087!


Evaluating model with C = 1.000
score: 0.712
Found new best model with score 0.7119!


Evaluating model with C = 10.000
score: 0.711


The best accuracy obtained is 0.712
The best parameters found:
 {'C': 1}


In [98]:
best_C = best_params['C']

model.set_params(classifier__C=best_C)
model.fit(data_train, target_train)
test_score = cross_val_score(model, data_test, target_test, cv=cv, scoring="balanced_accuracy")

print(f"The mean cross-validated test score is: "
      f"{test_score.mean():.2f}")

print(f"The standard deviation of the test score is: "
      f"{test_score.std():.2f}")

The mean cross-validated test score is: 0.74
The standard deviation of the test score is: 0.02


### Hyperparameter tuning with grid-search

In [102]:
%%time

param_grid = {
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__max_iter' : [100, 1000,2500, 5000]
}

model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression())
])

model_grid_search = GridSearchCV(model, param_grid=param_grid,
                                 n_jobs=2, cv=cv, scoring="balanced_accuracy")

model_grid_search.fit(data_train, target_train)

CPU times: total: 4.59 s
Wall time: 46.2 s


In [104]:
# get the parameter names
column_results = [
    f"param_{name}" for name in param_grid.keys()]
column_results += [
    "mean_test_score", "std_test_score", "rank_test_score"]

cv_results = pd.DataFrame(model_grid_search.cv_results_)
cv_results = cv_results[column_results].sort_values(
    "mean_test_score", ascending=False)

def shorten_param(param_name):
    if "__" in param_name:
        return param_name.rsplit("__", 1)[1]
    return param_name

cv_results = cv_results.rename(shorten_param, axis=1)
cv_results

Unnamed: 0,C,max_iter,mean_test_score,std_test_score,rank_test_score
59,78.475997,5000,0.712076,0.011088,1
56,78.475997,100,0.712076,0.011088,1
57,78.475997,1000,0.712076,0.011088,1
58,78.475997,2500,0.712076,0.011088,1
43,1.623777,5000,0.711993,0.010729,5
...,...,...,...,...,...
8,0.000695,100,0.500000,0.000000,69
9,0.000695,1000,0.500000,0.000000,69
10,0.000695,2500,0.500000,0.000000,69
11,0.000695,5000,0.500000,0.000000,69


In [106]:
cv_test_scores = cv_results['mean_test_score']
print(
    "Generalization score with hyperparameters tuning:\n"
    f"{cv_test_scores.mean():.3f} ± {cv_test_scores.std():.3f}"
)

Generalization score with hyperparameters tuning:
0.671 ± 0.077


In [None]:
accuracy = model_grid_search.score(data_test, target_test)
print(
    f"The test accuracy score of the grid-searched pipeline is: "
    f"{accuracy:.2f}"
)

In [100]:
test_score = cross_val_score(model_grid_search, data_test, target_test, cv=cv, scoring="balanced_accuracy")

In [89]:
print(f"The mean cross-validated score is: "
      f"{test_score.mean():.2f}")

print(f"The standard deviation of the score is: "
      f"{test_score.std():.2f}")

The mean cross-validated score is: 0.73
The standard deviation of the score is: 0.03


In [72]:
accuracy = model_grid_search.score(data_test, target_test)
print(
    f"The test accuracy score of the grid-searched pipeline is: "
    f"{accuracy:.2f}"
)

The test accuracy score of the grid-searched pipeline is: 0.81


In [12]:
print(f"The best set of parameters is: "
      f"{model_grid_search.best_params_}")

The best set of parameters is: {'classifier__C': 545.5594781168514, 'classifier__max_iter': 100}


In [73]:
cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__C,param_classifier__max_iter,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
64,0.064388,0.000911,0.030369,0.000236,545.559478,100,"{'classifier__C': 545.5594781168514, 'classifi...",0.800833,0.801212,0.801022,0.000189,1
65,0.066351,0.004596,0.031154,9.6e-05,545.559478,1000,"{'classifier__C': 545.5594781168514, 'classifi...",0.800833,0.801212,0.801022,0.000189,1
66,0.066002,0.002004,0.031751,0.000756,545.559478,2500,"{'classifier__C': 545.5594781168514, 'classifi...",0.800833,0.801212,0.801022,0.000189,1
67,0.065471,0.001991,0.029493,0.000498,545.559478,5000,"{'classifier__C': 545.5594781168514, 'classifi...",0.800833,0.801212,0.801022,0.000189,1
61,0.06222,0.000486,0.03087,0.001222,206.913808,1000,"{'classifier__C': 206.913808111479, 'classifie...",0.800833,0.800833,0.800833,0.0,5


In [74]:
# get the parameter names
column_results = [f"param_{name}" for name in param_grid.keys()]
column_results += [
    "mean_test_score", "std_test_score", "rank_test_score"]
cv_results = cv_results[column_results]

In [75]:
def shorten_param(param_name):
    if "__" in param_name:
        return param_name.rsplit("__", 1)[1]
    return param_name


cv_results = cv_results.rename(shorten_param, axis=1)
cv_results

Unnamed: 0,C,max_iter,mean_test_score,std_test_score,rank_test_score
64,545.559478,100,0.801022,0.000189,1
65,545.559478,1000,0.801022,0.000189,1
66,545.559478,2500,0.801022,0.000189,1
67,545.559478,5000,0.801022,0.000189,1
61,206.913808,1000,0.800833,0.000000,5
...,...,...,...,...,...
8,0.000695,100,0.736842,0.000000,69
9,0.000695,1000,0.736842,0.000000,69
10,0.000695,2500,0.736842,0.000000,69
11,0.000695,5000,0.736842,0.000000,69


In [76]:
# cv_results = pd.DataFrame(cv_results)
cv_test_scores = cv_results['mean_test_score']
print(
    "Generalization score with hyperparameters tuning:\n"
    f"{cv_test_scores.mean():.3f} ± {cv_test_scores.std():.3f}"
)

Generalization score with hyperparameters tuning:
0.788 ± 0.024


In [21]:
for cv_fold, estimator_in_fold in enumerate(cv_results["estimator"]):
    print(
        f"Best hyperparameters for fold #{cv_fold + 1}:\n"
        f"{estimator_in_fold.best_params_}"
    )

KeyError: 'estimator'