#### Import all the required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import (train_test_split, cross_validate, StratifiedKFold, GridSearchCV, cross_val_score)

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, recall_score, f1_score, roc_auc_score)

import joblib

#### Data import and Basic Exploratory Data Analysis

In [2]:
# Load the raw Telco churn CSV.
# A forward slash is used as the separator: it works on Windows, macOS and
# Linux alike, and avoids the invalid "\W" escape sequence that the
# backslash form produced (SyntaxWarning on Python 3.12+).
df = pd.read_csv("data_folder/WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [3]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape

(7043, 21)

In [4]:
# Column dtypes and non-null counts for the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
# Per-column count of missing values in the raw frame.
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
# Summary statistics of the numeric columns, one row per column, rounded to 1 decimal.
df.describe().T.round(1)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SeniorCitizen,7043.0,0.2,0.4,0.0,0.0,0.0,0.0,1.0
tenure,7043.0,32.4,24.6,0.0,9.0,29.0,55.0,72.0
MonthlyCharges,7043.0,64.8,30.1,18.2,35.5,70.4,89.8,118.8


In [7]:
# Class balance of the target column.
df["Churn"].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

#### Data Cleaning

In [8]:
# Clean the frame through one non-mutating chain so this cell can be re-run
# safely. The previous in-place version failed on a second run: dropping
# "customerID" raised KeyError once the column was gone, and re-mapping an
# already encoded Churn column would have produced NaN for every row.
churn_codes = {"Yes": 1, "No": 0}

df = (
    # errors="ignore" makes the drop a no-op when customerID is already gone.
    df.drop(columns=["customerID"], errors="ignore")
    # TotalCharges is parsed as numeric; values that cannot be parsed become NaN.
    .assign(TotalCharges=lambda d: pd.to_numeric(d["TotalCharges"], errors="coerce"))
    # Remove the rows that now contain NaN.
    .dropna()
    # Encode the target as 1 (churned) / 0 (stayed); values that are already
    # encoded pass through unchanged, which keeps the cell idempotent.
    .assign(Churn=lambda d: d["Churn"].map(lambda v: churn_codes.get(v, v)))
)

# Fail loudly if the target contains anything other than the two known classes.
assert df["Churn"].isin([0, 1]).all(), "Unexpected label found in Churn column"

#### Splitting Features and Target Variables

In [9]:
# Target is the encoded Churn column; every remaining column is a feature.
y = df["Churn"]
X = df.drop("Churn", axis=1)

In [10]:
# Feature names that remain after removing the target column.
X.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [11]:
# Split the feature names by dtype: object columns are treated as categorical,
# everything else as numeric. These lists feed the ColumnTransformer below.
object_features = X.select_dtypes(include=["object"])
non_object_features = X.select_dtypes(exclude=["object"])

cat = list(object_features.columns)
num = list(non_object_features.columns)

#### Train & Test Split

In [12]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same churn ratio, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [13]:
# Shapes of the training features and training target.
print(X_train.shape, y_train.shape)

(5625, 19) (5625,)


In [14]:
# Shapes of the held-out test features and test target.
print(X_test.shape, y_test.shape)

(1407, 19) (1407,)


#### Preprocessing Pipeline

In [15]:
# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown="ignore" lets the encoder tolerate categories at predict time
# that it did not see while fitting.
numeric_step = ("num", StandardScaler(), num)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

#### Logistic Regression

In [16]:
# Baseline model: preprocessing followed by logistic regression.
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("model", LogisticRegression(max_iter=2000)),
    ]
)

#### Cross-Validation

In [17]:
# 5-fold stratified CV on the training split only; the same `cv` splitter is
# reused by the later comparison and grid-search cells.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {"accuracy": "accuracy", "recall": "recall"}

cv_results = cross_validate(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
)

# Report the fold-averaged scores, recall first.
for label, key in (("CV Recall mean:", "test_recall"), ("CV Accuracy mean:", "test_accuracy")):
    print(label, cv_results[key].mean())

CV Recall mean: 0.5464882943143812
CV Accuracy mean: 0.8024888888888888


#### Define Model Pipelines for Other Algorithms

In [18]:
# One estimator per candidate algorithm; each is wrapped with the shared
# preprocessing step so all candidates see identically prepared features.
estimators = {
    "log_reg": LogisticRegression(max_iter=2000),
    "rf": RandomForestClassifier(random_state=42, n_jobs=-1),
    "xgb": XGBClassifier(objective="binary:logistic", eval_metric="logloss", random_state=42),
}

pipelines = {
    model_key: Pipeline(steps=[("prep", preprocessor), ("model", estimator)])
    for model_key, estimator in estimators.items()
}

#### Cross Validation for ALL Models

In [19]:
# Cross-validate every candidate pipeline on the training split with the same
# folds, and keep the fold-averaged recall and accuracy per model.
# `cross_validate` comes from the notebook's import cell; it is not
# re-imported here so all dependencies stay declared in one place.
scoring = {"recall": "recall", "accuracy": "accuracy"}

cv_results = {}

for name, pipe in pipelines.items():

    scores = cross_validate(pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)

    cv_results[name] = {
        "recall_mean": scores["test_recall"].mean(),
        "accuracy_mean": scores["test_accuracy"].mean(),
    }

    print(f"{name} CV Recall: {cv_results[name]['recall_mean']:.4f}")
    print(f"{name} CV Accuracy: {cv_results[name]['accuracy_mean']:.4f}")

log_reg CV Recall: 0.5465
log_reg CV Accuracy: 0.8025
rf CV Recall: 0.4936
rf CV Accuracy: 0.7916
xgb CV Recall: 0.5164
xgb CV Accuracy: 0.7867


#### Hyperparameter Tuning

In [20]:
# Search spaces for GridSearchCV. The "model__" prefix routes each parameter
# to the pipeline step named "model".
log_reg_grid = {
    "model__C": [0.01, 0.1, 1, 10],
    "model__class_weight": [None, "balanced"],
}

rf_grid = {
    "model__n_estimators": [200, 400],
    "model__max_depth": [None, 8, 12],
    "model__min_samples_split": [2, 5],
}

xgb_grid = {
    "model__n_estimators": [200, 400],
    "model__max_depth": [3, 5],
    "model__learning_rate": [0.05, 0.1],
    "model__subsample": [0.8, 1.0],
}

# Keys must match the keys of `pipelines`.
param_grids = {"log_reg": log_reg_grid, "rf": rf_grid, "xgb": xgb_grid}

#### GridSearch for ALL Models

In [None]:
# Tune each pipeline with an exhaustive grid search and keep the refitted
# best estimator per model name.
grid_models = {}

for name, pipe in pipelines.items():
    print(f"\nTuning {name}...")

    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grids[name],
        scoring="recall",  # optimise for catching churners
        cv=cv,
        n_jobs=-1,
        verbose=1,
    )
    grid.fit(X_train, y_train)

    # best_estimator_ is the pipeline refitted with the winning parameters.
    grid_models[name] = grid.best_estimator_

    print("Best Params:", grid.best_params_)
    print("Best CV Recall:", grid.best_score_)


Tuning log_reg...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Params: {'model__C': 1, 'model__class_weight': 'balanced'}
Best CV Recall: 0.8020066889632107

Tuning rf...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Params: {'model__max_depth': 12, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Best CV Recall: 0.5270903010033445

Tuning xgb...
Fitting 5 folds for each of 16 candidates, totalling 80 fits


#### Model Selection and Evaluation on the Test Set

In [None]:
# Score every tuned model on the held-out test split and keep the one with the
# highest recall.
#
# NOTE(review): the winner is chosen using test-set recall, so the test metrics
# reported for it afterwards are optimistically biased. Consider selecting on
# the cross-validated recall instead and using the test set only once.
if not grid_models:
    raise RuntimeError("grid_models is empty - run the grid-search cell before selecting a model.")

test_recalls = {}

for name, model in grid_models.items():

    preds = model.predict(X_test)
    rec = recall_score(y_test, preds)
    test_recalls[name] = rec

    print(f"{name} Test Recall: {rec:.4f}")

# max() returns the first model with the top score, so ties resolve exactly as
# before; unlike the previous `rec > 0` comparison it still selects a model
# when every recall is 0, so `best_model` can never be left as None.
best_model_name = max(test_recalls, key=test_recalls.get)
best_recall = test_recalls[best_model_name]
best_model = grid_models[best_model_name]

print("\nBest Model:", best_model_name)

In [None]:
# Evaluate the selected model on the held-out test split.
y_pred = best_model.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class.
y_prob = best_model.predict_proba(X_test)[:, 1]

# (label, metric function, model output it consumes) - ROC-AUC needs
# probabilities, the other metrics need hard class predictions.
metric_specs = (
    ("Accuracy:", accuracy_score, y_pred),
    ("Recall:", recall_score, y_pred),
    ("F1:", f1_score, y_pred),
    ("ROC-AUC:", roc_auc_score, y_prob),
)
for label, metric, model_output in metric_specs:
    print(label, metric(y_test, model_output))

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

In [None]:
# Persist the full fitted pipeline (preprocessing + model) so it can be
# reloaded and applied to raw feature rows.
model_path = "telco_churn_best.pkl"
joblib.dump(best_model, model_path)
print("Best model saved:", best_model_name)