3. Customer Analytics & Churn

5️⃣ Telco Customer Churn 

Problem: Predict customer churn 
ML Type: Classification

Extensions:

- Churn probability score
- Customer segmentation combined with churn

In [51]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Read the Telco churn CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="Datasets/telco_churn_synthetic.csv")

# Mean of the dataset's own "ChurnProbability" column
# (taken straight from the file, not produced by the model trained below).
overall_churn_probability = df.loc[:, "ChurnProbability"].mean()


# Keep only rows whose Churn value is "Yes", count them per Contract value,
# and order the counts from largest to smallest.
churned_rows = df[df["Churn"] == "Yes"]
churn_by_contract = (
    churned_rows
    .groupby("Contract")["Churn"]
    .count()
    .sort_values(ascending=False)
)

print("Total churn count by Contract:", churn_by_contract, sep="\n")


# Columns shown in the high-risk report, in display order.
report_columns = [
    "gender", "SeniorCitizen", "Partner", "Dependents", "tenure", "Contract",
    "PaperlessBilling", "PaymentMethod", "PhoneService", "InternetService",
    "OnlineSecurity", "StreamingTV", "MonthlyCharges", "TotalCharges",
    "Churn", "ChurnProbability", "Segment",
]

# Rank all customers by ChurnProbability (highest first), keep the first 25
# rows of that ranking, then narrow to the report columns.
ranked_by_risk = df.sort_values(by="ChurnProbability", ascending=False)
top_high_risk = ranked_by_risk.head(25)[report_columns]

print("Top 25 high-risk customers by probability:")
print(top_high_risk.to_string(index=False))



# Binary target: 1 where Churn is "Yes", 0 for every other value.
churned_mask = df["Churn"] == "Yes"
y = churned_mask.astype(int)

# Model inputs. "Churn", "ChurnProbability" and "Segment" are deliberately
# not part of this list.
feature_cols = [
    "gender", "SeniorCitizen", "Partner", "Dependents",
    "tenure", "Contract", "PaperlessBilling", "PaymentMethod",
    "PhoneService", "InternetService", "OnlineSecurity", "StreamingTV",
    "MonthlyCharges", "TotalCharges",
]
X = df.loc[:, feature_cols].copy()

# 2) Manual column grouping

# Columns that go through the scaler.
numeric_cols = ["tenure", "MonthlyCharges", "TotalCharges", "SeniorCitizen"]

# Contract is encoded with an explicit category order (shortest term first).
ordinal_col = ["Contract"]
ordinal_order = [["Month-to-month", "One year", "Two year"]]

# Whatever is neither numeric nor ordinal is treated as nominal,
# keeping the order the columns have in feature_cols.
nominal_cols = [
    name for name in feature_cols
    if name not in {*numeric_cols, *ordinal_col}
]


# 3) Split BEFORE encoding
# 70/30 train/test partition, stratified on the target, with a fixed seed so
# the same rows land in each part on every run.
split_options = {"test_size": 0.3, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 4) Scale numeric columns manually
# The scaler is fitted on the training rows only and then applied to both
# partitions, so no statistics from the test rows are used.
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])
X_train_num = scaler.transform(X_train[numeric_cols])
X_test_num = scaler.transform(X_test[numeric_cols])


# 5) Ordinal encode Contract
# Categories follow ordinal_order; a value outside that list is encoded as
# unknown_value (-1) at transform time instead of raising.
ordinal_settings = {
    "categories": ordinal_order,
    "handle_unknown": "use_encoded_value",
    "unknown_value": -1,
}
ord_encoder = OrdinalEncoder(**ordinal_settings)
ord_encoder.fit(X_train[ordinal_col])
X_train_ord = ord_encoder.transform(X_train[ordinal_col])
X_test_ord = ord_encoder.transform(X_test[ordinal_col])


# 6) One-Hot encode nominal columns
# Dense output so the result can be stacked with the other NumPy arrays;
# categories not seen during fit are ignored at transform time.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
ohe.fit(X_train[nominal_cols])
X_train_nom = ohe.transform(X_train[nominal_cols])
X_test_nom = ohe.transform(X_test[nominal_cols])


# 7) Combine all processed features manually
# Column order in both matrices: scaled numeric, ordinal Contract, one-hot nominal.
train_parts = (X_train_num, X_train_ord, X_train_nom)
test_parts = (X_test_num, X_test_ord, X_test_nom)
X_train_final = np.hstack(train_parts)
X_test_final = np.hstack(test_parts)

# ---------------------------
# 8) RandomForest classifier
# 500 trees, depth capped at 10, sqrt-sized feature subsets, fixed seed,
# n_jobs=-1 for parallel fitting.
forest_settings = {
    "n_estimators": 500,
    "max_depth": 10,
    "max_features": "sqrt",
    "random_state": 42,
    "n_jobs": -1,
}
clf = RandomForestClassifier(**forest_settings)
clf.fit(X_train_final, y_train)

# 9) Predictions + evaluation

# Hard class labels for the held-out rows (1 = churn, 0 = no churn, as encoded in y).
y_pred = clf.predict(X_test_final)
accuracy = accuracy_score(y_test, y_pred)

# Model-based churn probability score: the probability the classifier assigns
# to class 1 for each held-out customer. The column is looked up through
# clf.classes_ instead of being hard-coded; if class 1 was never seen during
# training, the score falls back to zeros rather than raising.
if 1 in clf.classes_:
    churn_column = list(clf.classes_).index(1)
    y_proba = clf.predict_proba(X_test_final)[:, churn_column]
else:
    y_proba = np.zeros(len(y_pred))

# Two decimals keep the accuracy line consistent with the rounded line below
# and avoid raw float noise such as 78.43333333333334%.
print(f"Accuracy: {accuracy * 100:.2f}%")
# NOTE: this figure is the mean of the dataset's ChurnProbability column,
# not an output of the model.
print(f"Overall churn probability: {round(overall_churn_probability*100,2)}%")
print(f"Mean predicted churn probability (test set): {y_proba.mean() * 100:.2f}%")
print("y_pred: ",y_pred)




Total churn count by Contract:
Contract
Month-to-month    1633
One year           149
Two year            42
Name: Churn, dtype: int64
Top 25 high-risk customers by probability:
gender  SeniorCitizen Partner Dependents  tenure       Contract PaperlessBilling    PaymentMethod PhoneService InternetService OnlineSecurity StreamingTV  MonthlyCharges  TotalCharges Churn  ChurnProbability                Segment
  Male              1      No         No       0 Month-to-month              Yes Electronic check          Yes     Fiber optic             No         Yes           83.41          0.00    No             0.891         New High-Value
  Male              1     Yes         No       2 Month-to-month              Yes Electronic check          Yes     Fiber optic             No         Yes           84.47        165.95   Yes             0.885         New High-Value
Female              1      No         No       0 Month-to-month              Yes Electronic check          Yes     Fiber optic   