In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the pre-processed churn data and discard its first column
# (dropped by position, whatever its name is), then preview the result.
df = (
    pd.read_csv('/content/Customer_Churn_Pre_Processed_Data.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)
df.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,0.0,0,0,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
1,0,0,0,0.464789,0,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
2,0,0,0,0.014085,0,1,1,0,0,0,...,1,0,0,1,0,0,0,0,0,1
3,0,0,0,0.619718,0,1,0,1,1,0,...,1,0,0,0,1,0,1,0,0,0
4,0,0,0,0.014085,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0


In [3]:
# Separate the target from the predictors:
#   X -> every column of df except 'Churn' (a new DataFrame; df is untouched)
#   Y -> the 'Churn' column as a plain array
X = df.drop(columns='Churn')
Y = df['Churn'].values
X.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,0.0,0,0,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
1,0,0,0,0.464789,0,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
2,0,0,0,0.014085,0,1,1,0,0,0,...,1,0,0,1,0,0,0,0,0,1
3,0,0,0,0.619718,0,1,0,1,1,0,...,1,0,0,0,1,0,1,0,0,0
4,0,0,0,0.014085,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0


In [4]:
# Stage 1: keep 70% of the rows for training, hold out the remaining 30%.
X_train, X_temp, Y_train, Y_temp = train_test_split(
    X, Y, test_size=0.3, random_state=42
)
# Stage 2: cut the 30% hold-out in half, giving test and validation sets
# that are each 15% of the full data.
X_test, X_val, Y_test, Y_val = train_test_split(
    X_temp, Y_temp, test_size=0.5, random_state=42
)

# Method 1: Logistic Regression

In [5]:
# Build a default-configured logistic regression and fit it on the
# original (not resampled) training split; fit() returns the estimator itself.
dt_clf_lr = LogisticRegression().fit(X_train, Y_train)
# Show the fitted estimator as the cell output.
dt_clf_lr

In [6]:
# Predict labels for the held-out test and validation splits, in that order.
Y_test_pred_lr, Y_val_pred_lr = (
    dt_clf_lr.predict(features) for features in (X_test, X_val)
)

In [7]:
# Estimate the held-out TEST and VALIDATION accuracies.
# No accuracy is computed on the training split in this cell: the score
# below is measured on (Y_test, Y_test_pred_lr).
test_acc_lr = accuracy_score(Y_test, Y_test_pred_lr)
val_acc_lr = accuracy_score(Y_val, Y_val_pred_lr)
# `train_acc_lr` is the historical (misleading) name for the test accuracy;
# it is kept as an alias in case other cells read it.
train_acc_lr = test_acc_lr
# Labelled output, matching the other accuracy cells in this notebook.
print('Accuracy Score Test', test_acc_lr)
print('Accuracy Score Val', val_acc_lr)

0.8018957345971564
0.7895734597156399


In [8]:
# Per-class precision / recall / F1 (4 decimals, classes 0 and 1) for the
# test split and then the validation split.
Classification_report_test_lr, Classification_report_val_lr = (
    classification_report(truth, predicted, labels=[0, 1], digits=4)
    for truth, predicted in ((Y_test, Y_test_pred_lr), (Y_val, Y_val_pred_lr))
)
print(Classification_report_test_lr, Classification_report_val_lr, sep='\n')

              precision    recall  f1-score   support

           0     0.8396    0.8997    0.8686       768
           1     0.6681    0.5401    0.5973       287

    accuracy                         0.8019      1055
   macro avg     0.7539    0.7199    0.7330      1055
weighted avg     0.7930    0.8019    0.7948      1055

              precision    recall  f1-score   support

           0     0.8380    0.8873    0.8619       781
           1     0.6140    0.5109    0.5578       274

    accuracy                         0.7896      1055
   macro avg     0.7260    0.6991    0.7099      1055
weighted avg     0.7798    0.7896    0.7829      1055



# Method 2: Decision Tree Classifier

In [9]:
# Gini-based decision tree, limited to depth 6 with at least 8 samples per
# leaf; random_state is fixed so repeated runs build the same tree.
dt_clf_dtc = DecisionTreeClassifier(
    criterion='gini',
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)
# Train on the original (not resampled) training split.
dt_clf_dtc.fit(X_train, Y_train)

In [10]:
# Predict labels for the held-out test and validation splits, in that order.
Y_test_pred_dtc, Y_val_pred_dtc = map(dt_clf_dtc.predict, (X_test, X_val))

In [11]:
# Estimate the held-out TEST and VALIDATION accuracies of the decision tree.
# No accuracy is computed on the training split in this cell: the score
# below is measured on (Y_test, Y_test_pred_dtc).
test_acc_dtc = accuracy_score(Y_test, Y_test_pred_dtc)
val_acc_dtc = accuracy_score(Y_val, Y_val_pred_dtc)
# `train_acc_dtc` is the historical (misleading) name for the test accuracy;
# it is kept as an alias in case other cells read it.
train_acc_dtc = test_acc_dtc
print('Accuracy Score Test', test_acc_dtc)
print('Accuracy Score Val', val_acc_dtc)

Accuracy Score Test 0.7829383886255924
Accuracy Score Val 0.776303317535545


In [12]:
# Per-class precision / recall / F1 (4 decimals, classes 0 and 1) for the
# decision tree on the test split and then the validation split.
Classification_report_test_dtc, Classification_report_val_dtc = (
    classification_report(truth, predicted, labels=[0, 1], digits=4)
    for truth, predicted in zip((Y_test, Y_val), (Y_test_pred_dtc, Y_val_pred_dtc))
)
print(Classification_report_test_dtc)
print(Classification_report_val_dtc)

              precision    recall  f1-score   support

           0     0.8267    0.8880    0.8562       768
           1     0.6261    0.5017    0.5571       287

    accuracy                         0.7829      1055
   macro avg     0.7264    0.6949    0.7067      1055
weighted avg     0.7721    0.7829    0.7749      1055

              precision    recall  f1-score   support

           0     0.8344    0.8707    0.8521       781
           1     0.5792    0.5073    0.5409       274

    accuracy                         0.7763      1055
   macro avg     0.7068    0.6890    0.6965      1055
weighted avg     0.7681    0.7763    0.7713      1055



# Data Resampling (SMOTEENN)

In [13]:
# Upscale the data using smoteenn
sm = SMOTEENN()
X_resampled, Y_resampled = sm.fit_resample(X,Y)

In [14]:
# split the resampled data into train and test (70%-30% split)
Xr_train, Xr_temp, Yr_train, Yr_temp = train_test_split(X_resampled, Y_resampled, test_size= 0.3, random_state= 42)
# split the resampled data into test and validation (15%-15% split)
Xr_test, Xr_val, Yr_test, Yr_val = train_test_split(Xr_temp, Yr_temp, test_size = 0.5, random_state= 42)

# Method 1: Logistic Regression (with Resampled Data)

In [15]:
# Build a default-configured logistic regression and fit it on the
# resampled training split; fit() returns the estimator itself.
dt_r_clf_lr = LogisticRegression().fit(Xr_train, Yr_train)
# Show the fitted estimator as the cell output.
dt_r_clf_lr

In [16]:
# Predict labels for the Xr_test and Xr_val splits, in that order.
Yr_test_pred_lr, Yr_val_pred_lr = (
    dt_r_clf_lr.predict(features) for features in (Xr_test, Xr_val)
)

In [17]:
# Estimate the TEST and VALIDATION accuracies of the logistic regression
# trained on resampled data.
# No accuracy is computed on the training split in this cell: the score
# below is measured on (Yr_test, Yr_test_pred_lr).
test_acc_lr_r = accuracy_score(Yr_test, Yr_test_pred_lr)
val_acc_lr_r = accuracy_score(Yr_val, Yr_val_pred_lr)
# `train_acc_lr_r` is the historical (misleading) name for the test accuracy;
# it is kept as an alias in case other cells read it.
train_acc_lr_r = test_acc_lr_r
print('Accuracy Score Test', test_acc_lr_r)
print('Accuracy Score Val', val_acc_lr_r)

Accuracy Score Test 0.8997844827586207
Accuracy Score Val 0.9031216361679225


In [18]:
# Per-class precision / recall / F1 (4 decimals, classes 0 and 1) for the
# resampled-data logistic regression: test split first, then validation.
Classification_report_test_r_lr, Classification_report_val_r_lr = (
    classification_report(truth, predicted, labels=[0, 1], digits=4)
    for truth, predicted in ((Yr_test, Yr_test_pred_lr), (Yr_val, Yr_val_pred_lr))
)
print(Classification_report_test_r_lr, Classification_report_val_r_lr, sep='\n')

              precision    recall  f1-score   support

           0     0.8951    0.8889    0.8920       432
           1     0.9038    0.9093    0.9065       496

    accuracy                         0.8998       928
   macro avg     0.8995    0.8991    0.8993       928
weighted avg     0.8998    0.8998    0.8998       928

              precision    recall  f1-score   support

           0     0.8930    0.8972    0.8951       428
           1     0.9118    0.9082    0.9100       501

    accuracy                         0.9031       929
   macro avg     0.9024    0.9027    0.9026       929
weighted avg     0.9032    0.9031    0.9031       929



# Method 2: Decision Tree Classifier (with Resampled Data)

In [19]:
# Same tree configuration as the non-resampled experiment: Gini criterion,
# depth capped at 6, at least 8 samples per leaf, fixed random_state.
dt_r_clf_dtc = DecisionTreeClassifier(
    criterion='gini',
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)
# Train on the resampled training split.
dt_r_clf_dtc.fit(Xr_train, Yr_train)

In [20]:
# Predict labels for the Xr_test and Xr_val splits, in that order.
Yr_test_pred_dtc, Yr_val_pred_dtc = map(dt_r_clf_dtc.predict, (Xr_test, Xr_val))

In [21]:
# Estimate the TEST and VALIDATION accuracies of the decision tree trained
# on resampled data.
# No accuracy is computed on the training split in this cell: the score
# below is measured on (Yr_test, Yr_test_pred_dtc).
test_acc_dtc_r = accuracy_score(Yr_test, Yr_test_pred_dtc)
val_acc_dtc_r = accuracy_score(Yr_val, Yr_val_pred_dtc)
# `train_acc_dtc_r` is the historical (misleading) name for the test accuracy;
# it is kept as an alias in case other cells read it.
train_acc_dtc_r = test_acc_dtc_r
print('Accuracy Score Test', test_acc_dtc_r)
print('Accuracy Score Val', val_acc_dtc_r)

Accuracy Score Test 0.9148706896551724
Accuracy Score Val 0.9128094725511302


In [22]:
# Per-class precision / recall / F1 (4 decimals, classes 0 and 1) for the
# resampled-data decision tree: test split first, then validation.
Classification_report_test_r_dtc, Classification_report_val_r_dtc = (
    classification_report(truth, predicted, labels=[0, 1], digits=4)
    for truth, predicted in zip((Yr_test, Yr_val), (Yr_test_pred_dtc, Yr_val_pred_dtc))
)
print(Classification_report_test_r_dtc)
print(Classification_report_val_r_dtc)

              precision    recall  f1-score   support

           0     0.9233    0.8912    0.9069       432
           1     0.9080    0.9355    0.9215       496

    accuracy                         0.9149       928
   macro avg     0.9156    0.9133    0.9142       928
weighted avg     0.9151    0.9149    0.9148       928

              precision    recall  f1-score   support

           0     0.9161    0.8925    0.9041       428
           1     0.9102    0.9301    0.9200       501

    accuracy                         0.9128       929
   macro avg     0.9131    0.9113    0.9121       929
weighted avg     0.9129    0.9128    0.9127       929

