 1. Import Libraries

In [1]:
# Basic & Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Preprocessing & Modeling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score

# Imbalanced data
from imblearn.over_sampling import SMOTE

# Bayesian Optimization
from skopt import BayesSearchCV
from skopt.space import Real, Integer

import warnings
warnings.filterwarnings("ignore")


📊 2. Load & Clean Dataset

In [2]:
df = pd.read_csv(r"c:\Users\admin\Downloads\archive (3)\WA_Fn-UseC_-Telco-Customer-Churn\WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Convert TotalCharges to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.dropna(inplace=True)

# Drop customerID
df.drop('customerID', axis=1, inplace=True)

# Binary Mapping
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for col in binary_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0})

# One-hot encode remaining
df = pd.get_dummies(df, drop_first=True)

print("Final Shape:", df.shape)


Final Shape: (7032, 31)


In [3]:
print(df.shape)
print(df.columns.tolist())


(7032, 31)
['SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'Churn', 'gender_Male', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes', 'OnlineBackup_No internet service', 'OnlineBackup_Yes', 'DeviceProtection_No internet service', 'DeviceProtection_Yes', 'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes', 'StreamingMovies_No internet service', 'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']


In [4]:
df

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,gender_Male,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,1,0,1,29.85,29.85,0,False,...,False,False,False,False,False,False,False,False,True,False
1,0,0,0,34,1,0,56.95,1889.50,0,True,...,False,False,False,False,False,True,False,False,False,True
2,0,0,0,2,1,1,53.85,108.15,1,True,...,False,False,False,False,False,False,False,False,False,True
3,0,0,0,45,0,0,42.30,1840.75,0,True,...,True,False,False,False,False,True,False,False,False,False
4,0,0,0,2,1,1,70.70,151.65,1,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,1,24,1,1,84.80,1990.50,0,True,...,True,False,True,False,True,True,False,False,False,True
7039,0,1,1,72,1,1,103.20,7362.90,0,False,...,False,False,True,False,True,True,False,True,False,False
7040,0,1,1,11,0,1,29.60,346.45,0,False,...,False,False,False,False,False,False,False,False,True,False
7041,1,1,0,4,1,1,74.40,306.60,1,True,...,False,False,False,False,False,False,False,False,False,True


🧪 3. Feature Engineering & Splitting

In [5]:
# Features and Target
X = df.drop('Churn', axis=1)
y = df['Churn']

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [6]:
df

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,gender_Male,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,1,0,1,29.85,29.85,0,False,...,False,False,False,False,False,False,False,False,True,False
1,0,0,0,34,1,0,56.95,1889.50,0,True,...,False,False,False,False,False,True,False,False,False,True
2,0,0,0,2,1,1,53.85,108.15,1,True,...,False,False,False,False,False,False,False,False,False,True
3,0,0,0,45,0,0,42.30,1840.75,0,True,...,True,False,False,False,False,True,False,False,False,False
4,0,0,0,2,1,1,70.70,151.65,1,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,1,24,1,1,84.80,1990.50,0,True,...,True,False,True,False,True,True,False,False,False,True
7039,0,1,1,72,1,1,103.20,7362.90,0,False,...,False,False,True,False,True,True,False,True,False,False
7040,0,1,1,11,0,1,29.60,346.45,0,False,...,False,False,False,False,False,False,False,False,True,False
7041,1,1,0,4,1,1,74.40,306.60,1,True,...,False,False,False,False,False,False,False,False,False,True


In [7]:
import pandas as pd

# Assuming X_scaled is your scaled feature array
# Reconstruct DataFrame from scaled X
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
X_scaled_df['Churn'] = y.values  # Add target column

# Save to CSV
X_scaled_df.to_csv("telco_scaled_data.csv", index=False)


⚙️ 4. Modeling: Logistic Regression + Random Forest + XGBoost

In [8]:
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

📈 5. Evaluate All Models

In [9]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(f"Model: {model.__class__.__name__}")
    print(classification_report(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, model.predict_proba(X_test)[:,1]))
    print("-"*50)

models = [logreg, rf, xgb]
for m in models:
    evaluate_model(m, X_test, y_test)


Model: LogisticRegression
              precision    recall  f1-score   support

           0       0.82      0.75      0.78      1037
           1       0.77      0.83      0.80      1029

    accuracy                           0.79      2066
   macro avg       0.79      0.79      0.79      2066
weighted avg       0.79      0.79      0.79      2066

Precision: 0.7670863309352518
Recall: 0.8289601554907677
F1 Score: 0.796823914058851
ROC AUC: 0.8589998060113976
--------------------------------------------------
Model: RandomForestClassifier
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      1037
           1       0.84      0.86      0.85      1029

    accuracy                           0.85      2066
   macro avg       0.85      0.85      0.85      2066
weighted avg       0.85      0.85      0.85      2066

Precision: 0.8439581351094196
Recall: 0.8620019436345967
F1 Score: 0.8528846153846154
ROC AUC: 0.9192510727944576
------------

🎯 6. Bayesian Hyperparameter Optimization (XGBoost)

In [10]:
search_spaces = {
    'learning_rate': Real(0.01, 0.3),
    'n_estimators': Integer(100, 300),
    'max_depth': Integer(3, 10),
    'subsample': Real(0.5, 1.0)
}

opt = BayesSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    search_spaces,
    n_iter=30,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1
)

opt.fit(X_train, y_train)
print("Best Params:", opt.best_params_)


Best Params: OrderedDict([('learning_rate', 0.15061876004031435), ('max_depth', 3), ('n_estimators', 300), ('subsample', 0.9213486630378338)])


👥 7. Customer Segmentation (KMeans)

In [11]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
df['Segment'] = clusters

px.scatter_3d(df, x='MonthlyCharges', y='tenure', z='TotalCharges',
              color='Segment', title="Customer Segmentation (3D View)")


In [12]:
import joblib

# After training your model
import pickle

with open("model.pkl", "wb") as f:
    pickle.dump(models, f)

# Save scaler (for dashboard usewith open("telco_churn_model.pkl", "wb") as f:)
joblib.dump(scaler, 'scaler.pkl')

# Save columns (important for dashboard input processing)
import json
with open('model_columns.json', 'w') as f:
    json.dump(list(X.columns), f)


In [13]:
joblib.dump(X_test, "X_test.pkl")
joblib.dump(y_test, "y_test.pkl")


['y_test.pkl']

In [14]:
with open("model.pkl", "rb") as f:
    model = pickle.load(f)

print("Type of model:", type(model))
print("Is list?", isinstance(model, list))


Type of model: <class 'list'>
Is list? True


In [15]:
model = model[0]  # Extract the real model
