In [34]:
# Import Libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_auc_score, classification_report

from imblearn.over_sampling import SMOTE



In [26]:
# Load Data

df = pd.read_csv('Telco_customer_churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [27]:
# Remove CustomerID column

df.drop('customerID', axis=1, inplace=True)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [28]:
# Encode Target Variable

df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})



In [29]:
# One hot encode categorical variables

X = df.drop(columns=["Churn"])
y = df["Churn"]

X = pd.get_dummies(X, drop_first=True)

X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,...,TotalCharges_995.35,TotalCharges_996.45,TotalCharges_996.85,TotalCharges_996.95,TotalCharges_997.65,TotalCharges_997.75,TotalCharges_998.1,TotalCharges_999.45,TotalCharges_999.8,TotalCharges_999.9
0,0,1,29.85,False,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,34,56.95,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,2,53.85,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,45,42.3,True,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,2,70.7,False,False,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [31]:
# Split data

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [32]:
# Feature Scaling

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [35]:
# Treat Imbalance with SMOTE

smote = SMOTE(random_state=42)

X_train_res, y_train_res = smote.fit_resample(
    X_train_scaled, y_train
)

In [36]:
# PCA to reduce dimention of the dataset

pca = PCA(n_components=0.95)

X_train_pca = pca.fit_transform(X_train_res)
X_test_pca = pca.transform(X_test_scaled)

print("PCA Components:", X_train_pca.shape[1])

PCA Components: 4965


In [37]:
# Logistic Regression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_pca, y_train_res)

y_prob_lr = lr.predict_proba(X_test_pca)[:, 1]
print("Logistic Regression ROC-AUC:", roc_auc_score(y_test, y_prob_lr))
print(classification_report(y_test, lr.predict(X_test_pca)))

Logistic Regression ROC-AUC: 0.8116407037123149
              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1035
           1       0.62      0.45      0.52       374

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.69      1409
weighted avg       0.77      0.78      0.77      1409



In [38]:
# Support Vector Machine (SVM)

svm = SVC(kernel="rbf", probability=True)
svm.fit(X_train_pca, y_train_res)

y_prob_svm = svm.predict_proba(X_test_pca)[:, 1]
print("SVM ROC-AUC:", roc_auc_score(y_test, y_prob_svm))
print(classification_report(y_test, svm.predict(X_test_pca)))


SVM ROC-AUC: 0.796450437882663
              precision    recall  f1-score   support

           0       0.90      0.62      0.74      1035
           1       0.44      0.81      0.57       374

    accuracy                           0.67      1409
   macro avg       0.67      0.72      0.65      1409
weighted avg       0.78      0.67      0.69      1409



In [39]:
# Naive Bayes

nb = GaussianNB()
nb.fit(X_train_pca, y_train_res)

y_prob_nb = nb.predict_proba(X_test_pca)[:, 1]
print("Naive Bayes ROC-AUC:", roc_auc_score(y_test, y_prob_nb))
print(classification_report(y_test, nb.predict(X_test_pca)))


Naive Bayes ROC-AUC: 0.49110284429977524
              precision    recall  f1-score   support

           0       0.67      0.05      0.09      1035
           1       0.26      0.94      0.41       374

    accuracy                           0.28      1409
   macro avg       0.46      0.49      0.25      1409
weighted avg       0.56      0.28      0.17      1409



In [40]:
# K-Nearest Neighbors

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train_pca, y_train_res)

y_prob_knn = knn.predict_proba(X_test_pca)[:, 1]
print("KNN ROC-AUC:", roc_auc_score(y_test, y_prob_knn))
print(classification_report(y_test, knn.predict(X_test_pca)))


KNN ROC-AUC: 0.5373233614921594
              precision    recall  f1-score   support

           0       0.74      0.99      0.85      1035
           1       0.69      0.05      0.09       374

    accuracy                           0.74      1409
   macro avg       0.72      0.52      0.47      1409
weighted avg       0.73      0.74      0.65      1409



In [41]:
# Decision Tree

dt = DecisionTreeClassifier(
    max_depth=6,
    class_weight="balanced",
    random_state=42
)
dt.fit(X_train_pca, y_train_res)

y_prob_dt = dt.predict_proba(X_test_pca)[:, 1]
print("Decision Tree ROC-AUC:", roc_auc_score(y_test, y_prob_dt))
print(classification_report(y_test, dt.predict(X_test_pca)))


Decision Tree ROC-AUC: 0.6437675992663205
              precision    recall  f1-score   support

           0       0.81      0.11      0.20      1035
           1       0.27      0.93      0.42       374

    accuracy                           0.33      1409
   macro avg       0.54      0.52      0.31      1409
weighted avg       0.66      0.33      0.26      1409

