# Part 3: Model Building and Evaluation

This notebook demonstrates how to build a **customer churn model** and evaluate its **performance**.

In [1]:
# Standard
import pandas as pd
import numpy as np

# Datapath and Setup
# The data directory can be overridden with the TELCO_CHURN_DATA_DIR environment
# variable so the notebook is runnable on machines other than the one it was
# written on. When the variable is unset (or empty) the original location is used.
import os

data_path = os.path.join(
    os.environ.get("TELCO_CHURN_DATA_DIR")
    or "C:/Users/Sadek/Documents/GitHub/Project/telecom-customer-churn/data/",
    "",  # joining with "" guarantees a trailing separator for `data_path + <file name>`
)

import time

from sklearn import set_config
set_config(display="diagram")

from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

In [2]:
## Data loading, preprocessing and munging
# Read the raw CSV and tidy it in one chained expression:
#   1. drop the customerID and TotalCharges columns
#   2. recode SeniorCitizen from 1/0 to Yes/No to match the other binary features
#   3. replace text matching the pattern `No.*service` with plain "No"
dataset = (
    pd.read_csv(data_path + "WA_Fn-UseC_-Telco-Customer-Churn.csv")
    .drop(columns=["customerID", "TotalCharges"])
    .replace({'SeniorCitizen': {1: 'Yes', 0: 'No'}})
    .replace(regex=r'No.*service', value='No')
)
dataset.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn
0,Female,No,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,Yes


In [3]:
# Separate the label column ("Churn") from the feature columns.
target_name = "Churn"
data = dataset.drop(columns=[target_name])
target = dataset[target_name]

In [4]:
# Split the feature columns by dtype.
# Numeric columns are selected explicitly (np.number) instead of as "anything
# that is not object": with the previous rule, a bool, categorical, datetime or
# dedicated string-dtype column would have been routed to the numerical
# preprocessor. Everything that is not numeric is treated as categorical.
# For a frame holding only int/float and object columns the two rules select
# the same columns.
numerical_columns_selector = selector(dtype_include=np.number)
categorical_columns_selector = selector(dtype_exclude=np.number)

# Resolve the selectors against the feature frame to get concrete column lists.
numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

In [5]:
# Hold out a test set; the seed is fixed so the partition is reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    random_state=42,
)

# Report how many rows landed in each partition, as a share of the full dataset.
print(
    f"Number of samples in training: {data_train.shape[0]} => "
    f"{data_train.shape[0] / dataset.shape[0] * 100:.1f}% of the"
    f" original set",
    f"Number of samples in testing: {data_test.shape[0]} => "
    f"{data_test.shape[0] / dataset.shape[0] * 100:.1f}% of the"
    f" original set",
    sep="\n",
)

Number of samples in training: 5282 => 75.0% of the original set
Number of samples in testing: 1761 => 25.0% of the original set


## Logistic regression model

In [6]:
# Per-dtype preprocessing: one-hot encode the categorical columns
# (handle_unknown="ignore") and standardise the numerical columns.
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ('one-hot-encoder', categorical_preprocessor, categorical_columns),
        ('standard_scaler', numerical_preprocessor, numerical_columns),
    ]
)

# Baseline model: preprocessing followed by a default logistic regression.
model = make_pipeline(preprocessor, LogisticRegression())

In [7]:
# Cross-validate the baseline pipeline on the training set only:
# 10 seeded random shuffles, 30% of the rows held out in each, scored with
# balanced accuracy.
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
cv_results = cross_validate(
    model,
    data_train,
    target_train,
    cv=cv,
    scoring="balanced_accuracy",
)

In [8]:
# Wrap the cross_validate output in a DataFrame so it renders as a table.
# The name `cv_results` is rebound on purpose: the next cell reads it as a frame.
cv_results = pd.DataFrame(data=cv_results)
cv_results

Unnamed: 0,fit_time,score_time,test_score
0,0.062647,0.017636,0.710479
1,0.07,0.023538,0.718246
2,0.080005,0.016522,0.717264
3,0.070039,0.009808,0.709851
4,0.079529,0.015974,0.728384
5,0.063245,0.015254,0.724978
6,0.069889,0.029864,0.717079
7,0.069842,0.01006,0.699176
8,0.07086,0.020985,0.69504
9,0.078457,0.019819,0.698229


In [9]:
# Summarise the per-split test scores: mean and standard deviation.
print(
    f"The mean cross-validated score is: "
    f"{cv_results['test_score'].mean():.2f}",
    f"The standard deviation of the score is: "
    f"{cv_results['test_score'].std():.2f}",
    sep="\n",
)

The mean cross-validated score is: 0.71
The standard deviation of the score is: 0.01


In [10]:
%%time

param_grid = {
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__max_iter' : [100, 1000,2500, 5000]
}

model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression())
])

model_grid_search = GridSearchCV(model, param_grid=param_grid,
                                 n_jobs=2, cv=2)

model_grid_search.fit(data_train, target_train)

CPU times: total: 1.36 s
Wall time: 9.57 s


In [11]:
# Score the fitted grid search on the held-out test set and report it.
accuracy = model_grid_search.score(data_test, target_test)
print(f"The test accuracy score of the grid-searched pipeline is: "
      f"{accuracy:.2f}")

The test accuracy score of the grid-searched pipeline is: 0.81


In [12]:
# Show the parameter combination selected by the grid search.
print(
    f"The best set of parameters is: "
    f"{model_grid_search.best_params_}"
)

The best set of parameters is: {'classifier__C': 545.5594781168514, 'classifier__max_iter': 100}
