In [3]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score,
                             recall_score, f1_score, matthews_corrcoef)

# STEP 1: Load Data (Ensure file is uploaded to BITS Lab first)
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# STEP 2: Cleaning (Specific to Telco Dataset)
# 'TotalCharges' is parsed as numeric; entries that cannot be parsed are
# coerced to NaN and then replaced with the column median.
# NOTE(review): the median is taken over the whole dataset, i.e. before the
# train/test split, so test rows contribute to the imputed value.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# The customer identifier carries no predictive signal. Reassign instead of
# using inplace=True so the step reads as a plain transformation.
df = df.drop(columns='customerID')

# Encode the target as 1 (churned) / 0 (stayed). Series.map() turns any label
# missing from the dict into NaN without complaint, so fail loudly here rather
# than later inside model.fit().
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
if df['Churn'].isna().any():
    raise ValueError("Unexpected label in 'Churn' column; expected only 'Yes' or 'No'.")

# STEP 3: Feature Engineering (Dummies ensure > 12 features)
# One-hot encode the remaining categorical columns; drop_first=True removes
# one level per category to avoid perfectly collinear dummy columns.
df_final = pd.get_dummies(df, drop_first=True)
X = df_final.drop(columns='Churn')
y = df_final['Churn']

# STEP 4: Split and Scale (Crucial for Logistic/kNN)
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The scaler learns its mean/variance from the training rows only and is then
# applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# STEP 5: Define the 6 Required Models
# Seed for the estimators whose training involves randomness, so that a fresh
# "Restart & Run All" reproduces the same metrics. Same value as the seed used
# for the train/test split.
RANDOM_STATE = 42

# Display name -> unfitted estimator. The display name is also used to derive
# the file name each fitted model is saved under.
models = {
    "Logistic Regression": LogisticRegression(max_iter=5000),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "kNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    # Seed pinned explicitly for consistency with the other tree-based models.
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE),
}

# STEP 6: Train, Evaluate, and Save
# The models below are fitted on standardized features, so the fitted scaler is
# persisted next to them: anything that later loads one of the .pkl models must
# apply the same transformation to its inputs before calling predict().
joblib.dump(scaler, "scaler.pkl")

# One dict of metrics per model; turned into a DataFrame once the loop is done.
performance_report = []

for name, model in models.items():
    # Fit on scaled data
    model.fit(X_train_scaled, y_train)

    # Hard class predictions feed the threshold-based metrics; the probability
    # of class 1 (column 1 of predict_proba) feeds AUC.
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Calculate all 6 metrics on the held-out test split
    metrics = {
        "ML Model Name": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }
    performance_report.append(metrics)

    # Save model file for GitHub 'model/' folder. The file is written to the
    # current working directory, named after the model ("Random Forest" ->
    # "random_forest.pkl").
    filename = f"{name.replace(' ', '_').lower()}.pkl"
    joblib.dump(model, filename)
    print(f"Finished {name} and saved as {filename}")

# STEP 7: Display Comparison Table for your PDF/README
# to_string(index=False) gives a plain-text table that can be pasted as-is.
results_df = pd.DataFrame(performance_report)
print("\n--- FINAL COMPARISON TABLE ---")
print(results_df.to_string(index=False))

Finished Logistic Regression and saved as logistic_regression.pkl
Finished Decision Tree and saved as decision_tree.pkl
Finished kNN and saved as knn.pkl
Finished Naive Bayes and saved as naive_bayes.pkl
Finished Random Forest and saved as random_forest.pkl
Finished XGBoost and saved as xgboost.pkl

--- FINAL COMPARISON TABLE ---
      ML Model Name  Accuracy      AUC  Precision   Recall       F1      MCC
Logistic Regression  0.819730 0.862004   0.683077 0.595174 0.636103 0.519211
      Decision Tree  0.711852 0.632181   0.456233 0.461126 0.458667 0.262356
                kNN  0.770759 0.789473   0.573964 0.520107 0.545710 0.393761
        Naive Bayes  0.665720 0.837725   0.435864 0.892761 0.585752 0.422170
      Random Forest  0.791341 0.837230   0.649057 0.461126 0.539185 0.419267
            XGBoost  0.789212 0.839177   0.628378 0.498660 0.556054 0.425070


In [5]:
import pandas as pd

# Data from your latest execution output
# NOTE: this is a hard-coded snapshot of the metrics printed by the training
# cell; refresh these numbers whenever the models are re-trained.
comparison_data = {
    "ML Model Name": ["Logistic Regression", "Decision Tree", "kNN", "Naive Bayes", "Random Forest", "XGBoost"],
    "Accuracy": [0.819730, 0.711852, 0.770759, 0.665720, 0.791341, 0.789212],
    "AUC": [0.862004, 0.632181, 0.789473, 0.837725, 0.837230, 0.839177],
    "Precision": [0.683077, 0.456233, 0.573964, 0.435864, 0.649057, 0.628378],
    "Recall": [0.595174, 0.461126, 0.520107, 0.892761, 0.461126, 0.498660],
    "F1": [0.636103, 0.458667, 0.545710, 0.585752, 0.539185, 0.556054],
    "MCC": [0.519211, 0.262356, 0.393761, 0.422170, 0.419267, 0.425070]
}

# 1. Generate Comparison Table
results_df = pd.DataFrame(comparison_data)

# Index by model name so the figures quoted in the observations are looked up
# from the table (and rounded consistently) instead of being typed by hand.
scores = results_df.set_index("ML Model Name")

# Guard the superlatives used in the observation text: if the snapshot above is
# updated and a claim stops being true, fail here instead of printing it.
assert scores["Accuracy"].idxmax() == "Logistic Regression"
assert scores["MCC"].idxmax() == "Logistic Regression"
assert scores["AUC"].idxmin() == "Decision Tree"
assert scores["MCC"].idxmin() == "Decision Tree"
assert scores["Recall"].idxmax() == "Naive Bayes"

lr_scores = scores.loc["Logistic Regression"]
dt_scores = scores.loc["Decision Tree"]
nb_scores = scores.loc["Naive Bayes"]
xgb_scores = scores.loc["XGBoost"]

# 2. Define Observations based on your results
# Keyed by model name so each observation is visibly paired with its model.
observations = {
    "Logistic Regression":
        f"Best overall performer with highest Accuracy ({lr_scores['Accuracy']:.3f}) "
        f"and MCC ({lr_scores['MCC']:.3f}).",
    "Decision Tree":
        f"Weakest performance; lowest AUC ({dt_scores['AUC']:.3f}) "
        f"and MCC ({dt_scores['MCC']:.3f}) suggest overfitting.",
    "kNN":
        "Moderate performance; better than Decision Tree but lacks ensemble power.",
    "Naive Bayes":
        f"Highest Recall ({nb_scores['Recall']:.3f}) but very low Precision; "
        "tends to over-predict churn.",
    "Random Forest":
        "Strong ensemble scores; robust but slightly less accurate than Logistic Regression.",
    "XGBoost":
        f"Solid AUC ({xgb_scores['AUC']:.3f}); competitive with Random Forest "
        "but needs more tuning.",
}

observations_data = {
    "ML Model Name": list(observations),
    "Observation about model performance": list(observations.values())
}

obs_df = pd.DataFrame(observations_data)

# Print tables for your README and PDF
print("--- STEP 5: COMPARISON TABLE ---")
print(results_df.to_string(index=False))

print("\n--- STEP 5: OBSERVATION TABLE ---")
print(obs_df.to_string(index=False))

--- STEP 5: COMPARISON TABLE ---
      ML Model Name  Accuracy      AUC  Precision   Recall       F1      MCC
Logistic Regression  0.819730 0.862004   0.683077 0.595174 0.636103 0.519211
      Decision Tree  0.711852 0.632181   0.456233 0.461126 0.458667 0.262356
                kNN  0.770759 0.789473   0.573964 0.520107 0.545710 0.393761
        Naive Bayes  0.665720 0.837725   0.435864 0.892761 0.585752 0.422170
      Random Forest  0.791341 0.837230   0.649057 0.461126 0.539185 0.419267
            XGBoost  0.789212 0.839177   0.628378 0.498660 0.556054 0.425070

--- STEP 5: OBSERVATION TABLE ---
      ML Model Name                                                 Observation about model performance
Logistic Regression               Best overall performer with highest Accuracy (0.819) and MCC (0.519).
      Decision Tree        Weakest performance; lowest AUC (0.632) and MCC (0.262) suggest overfitting.
                kNN           Moderate performance; better than Decision Tree but