# Model Training & Evaluation – Customer Churn

In [None]:
import sys
# Show the path of the Python interpreter this kernel is running on
# (last expression of the cell, so the notebook displays it).
sys.executable

In [None]:
import os

# Project folder that all relative paths in this notebook ("data/...",
# "models/...", "visuals/...") are resolved against. The original location is
# kept as the default; set the CHURN_PROJECT_DIR environment variable to run
# the notebook from a different machine or checkout without editing this cell.
project_dir = os.environ.get(
    "CHURN_PROJECT_DIR",
    r"C:\Users\MVS Sastri\Desktop\customer churn analytics",
)

# Only change directory when the folder exists, so a missing path produces a
# readable message instead of a FileNotFoundError traceback.
if os.path.isdir(project_dir):
    os.chdir(project_dir)
else:
    print("Project folder not found, staying in the current directory:", project_dir)

# Last expression of the cell: display the working directory actually in effect.
os.getcwd()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
import pandas as pd

# Load the cleaned churn dataset (path is relative to the working directory)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/processed/cleaned_telco_churn.csv")
df.head(n=5)

In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first level
# of each categorical so k levels become k-1 indicator columns.
df_encoded = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Print a concise summary of the encoded dataframe (columns, non-null counts, dtypes)
df_encoded.info()

In [None]:
# Features are every encoded column except the target; the target is "Churn".
X = df_encoded.drop(columns="Churn")
y = df_encoded["Churn"]

# Hold out 20% of the rows for testing. stratify=y keeps the churn / no-churn
# proportions the same in the training and test sets, so the per-class metrics
# computed later are not skewed by an unlucky split. random_state fixes the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Sanity check: every row ended up in exactly one of the two sets.
assert len(X_train) + len(X_test) == len(X)

In [None]:
# Baseline: logistic regression fitted on the unscaled training features.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model

In [None]:
y_pred = model.predict(X_test)

# Score the baseline model on the held-out test set: accuracy first, then the
# confusion matrix, then the classification report.
for heading, metric in (
    ("Accuracy (Logistic Regression):", accuracy_score),
    ("\nConfusion Matrix (Logistic Regression):\n", confusion_matrix),
    ("\nClassification Report (Logistic Regression):\n", classification_report),
):
    print(heading, metric(y_test, y_pred))

In [None]:
# Same logistic regression, but errors on class 1 (churn) are weighted twice as
# heavily as errors on class 0, to push the model toward higher churn recall.
model_recall = LogisticRegression(max_iter=1000, class_weight={0: 1, 1: 2}).fit(
    X_train, y_train
)
model_recall

In [None]:
y_pred_recall = model_recall.predict(X_test)

# Score the class-weighted model on the held-out test set: accuracy first,
# then the confusion matrix, then the classification report.
for heading, metric in (
    ("Accuracy (Recall Model):", accuracy_score),
    ("\nConfusion Matrix (Recall Model):\n", confusion_matrix),
    ("\nClassification Report (Recall Model):\n", classification_report),
):
    print(heading, metric(y_test, y_pred_recall))

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training set only, then apply that same
# transformation to both sets so nothing about the test data leaks into the fit.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Class-weighted logistic regression trained on the standardized features.
# C=1.0 is the regularisation strength, spelled out explicitly.
model_scaled = LogisticRegression(
    max_iter=1000, class_weight={0: 1, 1: 2}, C=1.0
).fit(X_train_scaled, y_train)
model_scaled

In [None]:
y_pred_scaled = model_scaled.predict(X_test_scaled)

# Score the scaled, class-weighted model on the held-out test set: accuracy
# first, then the confusion matrix, then the classification report.
for heading, metric in (
    ("Accuracy (Scaled Model):", accuracy_score),
    ("\nConfusion Matrix (Scaled Model):\n", confusion_matrix),
    ("\nClassification Report (Scaled Model):\n", classification_report),
):
    print(heading, metric(y_test, y_pred_scaled))

In [None]:
from sklearn.ensemble import RandomForestClassifier
#import random forest

In [None]:
# Random forest of 200 trees on the unscaled features, with the same 1:2 class
# weighting in favour of churn; random_state makes the forest reproducible.
rf_model = RandomForestClassifier(
    n_estimators=200, random_state=42, class_weight={0: 1, 1: 2}
).fit(X_train, y_train)
rf_model

In [None]:
y_pred_rf = rf_model.predict(X_test)

# Score the random forest (default decision threshold) on the held-out test
# set: accuracy first, then the confusion matrix, then the classification report.
for heading, metric in (
    ("Accuracy (Random Forest):", accuracy_score),
    ("\nConfusion Matrix (Random Forest):\n", confusion_matrix),
    ("\nClassification Report (Random Forest):\n", classification_report),
):
    print(heading, metric(y_test, y_pred_rf))


In [None]:
# Predicted probability of class 1 (churn) for every test row: predict_proba
# returns one column per class and column index 1 is selected. These
# probabilities are the input to threshold tuning; no threshold is applied here.
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

In [None]:
# The churn probabilities were already computed in the previous cell, so the
# forest is not asked to predict again. Check instead that there is exactly one
# probability per test row before a threshold is applied.
assert len(y_prob_rf) == len(y_test)

In [None]:
import numpy as np

# Lower the decision threshold from the default 0.5 to 0.35: a customer is
# labelled as churn (1) when the predicted churn probability is at least 0.35,
# otherwise as no churn (0).
y_pred_rf_tuned = np.where(y_prob_rf >= 0.35, 1, 0)

In [None]:
# Score the threshold-tuned random forest predictions on the held-out test set:
# accuracy first, then the confusion matrix, then the classification report.
for heading, metric in (
    ("Accuracy (RF Tuned):", accuracy_score),
    ("\nConfusion Matrix (RF Tuned):\n", confusion_matrix),
    ("\nClassification Report (RF Tuned):\n", classification_report),
):
    print(heading, metric(y_test, y_pred_rf_tuned))


In [None]:
# Rebuild the feature matrix (every encoded column except the target). This
# repeats the definition used before the train/test split.
X = df_encoded.drop(columns="Churn")

In [None]:
from sklearn.metrics import classification_report

# One classification report per model, each as a nested dictionary
# (output_dict=True) so individual metrics can be looked up by class label.
report_logreg, report_recall, report_scaled, report_rf_tuned = (
    classification_report(y_test, predictions, output_dict=True)
    for predictions in (y_pred, y_pred_recall, y_pred_scaled, y_pred_rf_tuned)
)

In [None]:
# A notebook cell only renders its last expression, so four separate DataFrame
# expressions would show just the final one. Stack the four reports into a
# single table instead, keyed by model, so every report is visible.
pd.concat(
    {
        "Logistic Regression (Scaled + Weighted)": pd.DataFrame(report_scaled).T,
        "Logistic Regression (Baseline)": pd.DataFrame(report_logreg).T,
        "Logistic Regression (Class-Weighted)": pd.DataFrame(report_recall).T,
        "Random Forest (Tuned)": pd.DataFrame(report_rf_tuned).T,
    },
    names=["model", "label"],
)

In [None]:
import os

# (display name, test-set predictions, classification report dict) for every
# model that goes into the comparison table.
comparison_inputs = [
    ("Logistic Regression (Baseline)", y_pred, report_logreg),
    ("Logistic Regression (Class-Weighted)", y_pred_recall, report_recall),
    ("Logistic Regression (Scaled + Weighted)", y_pred_scaled, report_scaled),
    ("Random Forest (Tuned)", y_pred_rf_tuned, report_rf_tuned),
]

# One row per model: overall accuracy plus precision / recall / F1 for the
# churn class, which the report dictionaries store under the key "1".
model_comparison = pd.DataFrame([
    {
        "model": label,
        "accuracy": accuracy_score(y_test, predictions),
        "precision_churn": report["1"]["precision"],
        "recall_churn": report["1"]["recall"],
        "f1_churn": report["1"]["f1-score"],
    }
    for label, predictions, report in comparison_inputs
])

# Make sure the output folder exists before writing; to_csv does not create
# missing directories and would fail on a fresh checkout.
os.makedirs("models", exist_ok=True)
model_comparison.to_csv("models/model_comparison.csv", index=False)

# Show the comparison table in the notebook as well as saving it.
model_comparison

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion matrix of the scaled, class-weighted logistic regression on the test set
cm = confusion_matrix(y_test, y_pred_scaled)

# Draw onto an explicitly created axes so the title is set on that axes directly
fig, ax = plt.subplots()
ConfusionMatrixDisplay(cm, display_labels=["No Churn", "Churn"]).plot(
    ax=ax, cmap="Blues"
)
ax.set_title("Confusion Matrix – Final Logistic Regression Model")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of churn recall per model, taken from the comparison table
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(model_comparison["model"], model_comparison["recall_churn"])

ax.set_ylabel("Recall (Churn)")
ax.set_xlabel("Model")
ax.set_title("Model Performance Comparison – Churn Recall")
# Long model names: tilt the tick labels so they do not overlap
plt.xticks(rotation=30, ha="right")
# Recall is a proportion, so fix the axis to the full 0-1 range
ax.set_ylim(0, 1)

fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Rank features by the absolute value of their coefficient in the logistic
# regression trained on standardized features, and keep the ten largest.
feature_importance = (
    pd.DataFrame({
        "feature": X.columns,
        "importance": abs(model_scaled.coef_[0]),
    })
    .sort_values(by="importance", ascending=False)
    .head(10)
)

# Output folder for figures; created on first run
os.makedirs("visuals", exist_ok=True)

# Horizontal bar chart of the top ten features
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(feature_importance["feature"], feature_importance["importance"])
ax.set_xlabel("Importance (Absolute Coefficient Value)")
ax.set_ylabel("Feature")
ax.set_title("Top 10 Feature Importance – Logistic Regression")
ax.invert_yaxis()  # largest coefficient at the top
fig.tight_layout()

# Write the figure to disk before showing it
fig.savefig("visuals/feature_importance_logistic.png", dpi=300)
plt.show()

In [None]:
# Rebuild the feature matrix (every encoded column except the target). This
# repeats the earlier definition; NOTE(review): no later cell in this view
# uses it, so this cell looks like a leftover. Confirm before removing.
X = df_encoded.drop(columns="Churn")
