In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

In [4]:
data = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(data)

In [5]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   str    
 1   gender            7043 non-null   str    
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   str    
 4   Dependents        7043 non-null   str    
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   str    
 7   MultipleLines     7043 non-null   str    
 8   InternetService   7043 non-null   str    
 9   OnlineSecurity    7043 non-null   str    
 10  OnlineBackup      7043 non-null   str    
 11  DeviceProtection  7043 non-null   str    
 12  TechSupport       7043 non-null   str    
 13  StreamingTV       7043 non-null   str    
 14  StreamingMovies   7043 non-null   str    
 15  Contract          7043 non-null   str    
 16  PaperlessBilling  7043 non-null   str    
 17  Paymen

## Clean the data

In [6]:
# Dropping ID because its not needed for modelling
df = df.drop(columns=["customerID"])

In [7]:
# Identify unique categorical values 
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
for col in df.columns:
    if col not in numerical_features:
        print(col, df[col].unique())
        print("-"*50)

gender <StringArray>
['Female', 'Male']
Length: 2, dtype: str
--------------------------------------------------
SeniorCitizen [0 1]
--------------------------------------------------
Partner <StringArray>
['Yes', 'No']
Length: 2, dtype: str
--------------------------------------------------
Dependents <StringArray>
['No', 'Yes']
Length: 2, dtype: str
--------------------------------------------------
PhoneService <StringArray>
['No', 'Yes']
Length: 2, dtype: str
--------------------------------------------------
MultipleLines <StringArray>
['No phone service', 'No', 'Yes']
Length: 3, dtype: str
--------------------------------------------------
InternetService <StringArray>
['DSL', 'Fiber optic', 'No']
Length: 3, dtype: str
--------------------------------------------------
OnlineSecurity <StringArray>
['No', 'Yes', 'No internet service']
Length: 3, dtype: str
--------------------------------------------------
OnlineBackup <StringArray>
['Yes', 'No', 'No internet service']
Length: 3, 

In [8]:
# Since there exist whitespace values, we must filter them out
print(f"There are {len(df[df['TotalCharges'] == " "])} empty values \' \'")

There are 11 empty values ' '


In [9]:
df['TotalCharges'] = df['TotalCharges'].replace(" ", "0.0")
df['TotalCharges'] = df['TotalCharges'].astype(float)

In [10]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   str    
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   str    
 3   Dependents        7043 non-null   str    
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   str    
 6   MultipleLines     7043 non-null   str    
 7   InternetService   7043 non-null   str    
 8   OnlineSecurity    7043 non-null   str    
 9   OnlineBackup      7043 non-null   str    
 10  DeviceProtection  7043 non-null   str    
 11  TechSupport       7043 non-null   str    
 12  StreamingTV       7043 non-null   str    
 13  StreamingMovies   7043 non-null   str    
 14  Contract          7043 non-null   str    
 15  PaperlessBilling  7043 non-null   str    
 16  PaymentMethod     7043 non-null   str    
 17  Monthl

In [11]:
# checking the distribution of the target column
print(df["Churn"].value_counts())

Churn
No     5174
Yes    1869
Name: count, dtype: int64


In [12]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692,2279.734304
std,0.368612,24.559481,30.090047,2266.79447
min,0.0,0.0,18.25,0.0
25%,0.0,9.0,35.5,398.55
50%,0.0,29.0,70.35,1394.55
75%,0.0,55.0,89.85,3786.6
max,1.0,72.0,118.75,8684.8


In [13]:
# Label encoding of target column
df["Churn"] = df["Churn"].replace({"Yes" : 1, "No": 0})
df["Churn"] = df["Churn"].astype(int)  # Add this line to make sure the column become integer and not object. FACED THIS PROBLEM
print(df["Churn"].value_counts())

Churn
0    5174
1    1869
Name: count, dtype: int64


In [14]:
# Label encoding of categorical feature columns
categorical_cols = df.select_dtypes(include="str").columns
print(categorical_cols)

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='str')


In [15]:
# Initialize dictionary for saving encoders
# OR DO ONE HOT ENCODING
encoders = {}

# Apply label encoding and store the encoders
for col in categorical_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    encoders[col] = label_encoder

# Save the encoders to a pickle file
with open("encoders_under.pkl", "wb") as f:
    pickle.dump(encoders, f)

In [16]:
encoders

{'gender': LabelEncoder(),
 'Partner': LabelEncoder(),
 'Dependents': LabelEncoder(),
 'PhoneService': LabelEncoder(),
 'MultipleLines': LabelEncoder(),
 'InternetService': LabelEncoder(),
 'OnlineSecurity': LabelEncoder(),
 'OnlineBackup': LabelEncoder(),
 'DeviceProtection': LabelEncoder(),
 'TechSupport': LabelEncoder(),
 'StreamingTV': LabelEncoder(),
 'StreamingMovies': LabelEncoder(),
 'Contract': LabelEncoder(),
 'PaperlessBilling': LabelEncoder(),
 'PaymentMethod': LabelEncoder()}

# Splitting the data

In [17]:
# splitting the features and target
X = df.drop(columns=['Churn'])
y = df['Churn']

In [23]:
# split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
pd.Series(y_train).value_counts()

Churn
0    4138
1    1496
Name: count, dtype: int64

# Perform undersampling

In [22]:
rus = RandomUnderSampler(random_state=42, sampling_strategy='auto')
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)

In [28]:
pd.Series(y_train_under).value_counts()

Churn
0    1496
1    1496
Name: count, dtype: int64

In [35]:
# dictionary to store the cross-val scores
cv_scores = {}

# perform 5-fold cross-val for each model
for model_name, model in models.items():
    print(f"Training {model_name} with default parameters")
    scores = cross_val_score(model, X_train_under, y_train_under, cv=5, scoring="accuracy")
    cv_scores[model_name] = scores
    print(f"{model_name} cross-validation accuracy: {np.mean(scores):.2f}")
    print("-"*50)

Training Logistic Regression Model with default parameters


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

Logistic Regression Model cross-validation accuracy: 0.76
--------------------------------------------------
Training Decision Tree Model with default parameters
Decision Tree Model cross-validation accuracy: 0.68
--------------------------------------------------
Training Random Forest Model with default parameters
Random Forest Model cross-validation accuracy: 0.75
--------------------------------------------------
Training XGBoost Model with default parameters
XGBoost Model cross-validation accuracy: 0.73
--------------------------------------------------


# Perform oversampling

In [30]:
smote = smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [32]:
pd.Series(y_train_smote).value_counts()

Churn
0    4138
1    4138
Name: count, dtype: int64

In [33]:
models = {
    "Logistic Regression Model" : LogisticRegression(random_state=42),
    "Decision Tree Model" : DecisionTreeClassifier(random_state=42),
    "Random Forest Model" : RandomForestClassifier(random_state=42),
    "XGBoost Model" : XGBClassifier(random_state=42)
}

In [34]:
# dictionary to store the cross-val scores
cv_scores = {}

# perform 5-fold cross-val for each model
for model_name, model in models.items():
    print(f"Training {model_name} with default parameters")
    scores = cross_val_score(model, X_train_smote, y_train_smote, cv=5, scoring="accuracy")
    cv_scores[model_name] = scores
    print(f"{model_name} cross-validation accuracy: {np.mean(scores):.2f}")
    print("-"*50)

Training Logistic Regression Model with default parameters


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

Logistic Regression Model cross-validation accuracy: 0.79
--------------------------------------------------
Training Decision Tree Model with default parameters
Decision Tree Model cross-validation accuracy: 0.78
--------------------------------------------------
Training Random Forest Model with default parameters
Random Forest Model cross-validation accuracy: 0.84
--------------------------------------------------
Training XGBoost Model with default parameters
XGBoost Model cross-validation accuracy: 0.83
--------------------------------------------------


# Perform NearMiss sampling

In [37]:
nm = NearMiss()
X_train_nm, y_train_nm = nm.fit_resample(X_train, y_train)

In [39]:
pd.Series(y_train_nm).value_counts()

Churn
0    1496
1    1496
Name: count, dtype: int64

In [40]:
# dictionary to store the cross-val scores
cv_scores = {}

# perform 5-fold cross-val for each model
for model_name, model in models.items():
    print(f"Training {model_name} with default parameters")
    scores = cross_val_score(model, X_train_nm, y_train_nm, cv=5, scoring="accuracy")
    cv_scores[model_name] = scores
    print(f"{model_name} cross-validation accuracy: {np.mean(scores):.2f}")
    print("-"*50)

Training Logistic Regression Model with default parameters
Logistic Regression Model cross-validation accuracy: 0.65
--------------------------------------------------
Training Decision Tree Model with default parameters


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

Decision Tree Model cross-validation accuracy: 0.55
--------------------------------------------------
Training Random Forest Model with default parameters
Random Forest Model cross-validation accuracy: 0.61
--------------------------------------------------
Training XGBoost Model with default parameters
XGBoost Model cross-validation accuracy: 0.60
--------------------------------------------------


# Cost sensitive learning of class weights for Random Forest

In [47]:
rfc_weighted = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rfc_weighted.fit(X_train, y_train) # Use original X_train, not SMOTE
y_pred = rfc_weighted.predict(X_test)
print("Accuracy Score:\n", accuracy_score(y_test, y_pred))

Accuracy Score:
 0.7991483321504613


# Cost sensitive learning of class weights for XGBoost

In [48]:
weight = y_train.value_counts()[0] / y_train.value_counts()[1]
from xgboost import XGBClassifier
xgb_weighted = XGBClassifier(scale_pos_weight=weight, random_state=42)
xgb_weighted.fit(X_train, y_train)
y_pred = xgb_weighted.predict(X_test)
print("Accuracy Score:\n", accuracy_score(y_test, y_pred))

Accuracy Score:
 0.7679205110007097


# SMOTETomek (Hybrid Method)

In [51]:
smote_tomek = SMOTETomek()
X_train_smote_tomek, y_train_smote_tomek = smote_tomek.fit_resample(X_train, y_train)

In [52]:
pd.Series(y_train_smote_tomek).value_counts()

Churn
0    3814
1    3814
Name: count, dtype: int64

In [53]:
# dictionary to store the cross-val scores
cv_scores = {}

# perform 5-fold cross-val for each model
for model_name, model in models.items():
    print(f"Training {model_name} with default parameters")
    scores = cross_val_score(model, X_train_smote_tomek, y_train_smote_tomek, cv=5, scoring="accuracy")
    cv_scores[model_name] = scores
    print(f"{model_name} cross-validation accuracy: {np.mean(scores):.2f}")
    print("-"*50)

Training Logistic Regression Model with default parameters


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

Logistic Regression Model cross-validation accuracy: 0.80
--------------------------------------------------
Training Decision Tree Model with default parameters
Decision Tree Model cross-validation accuracy: 0.80
--------------------------------------------------
Training Random Forest Model with default parameters
Random Forest Model cross-validation accuracy: 0.86
--------------------------------------------------
Training XGBoost Model with default parameters
XGBoost Model cross-validation accuracy: 0.85
--------------------------------------------------
