# All Models — Training & Evaluation (Excel)

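This notebook trains **all 6 models** (logistic regression, decision tree, KNN, Gaussian naive Bayes, random forest, XGBoost), evaluates each on a held-out test split, and writes the pickled pipelines and metrics under `../models/`.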

In [83]:
# ---- CONFIG: EDIT THESE ----
import os

# Dataset location (.xlsx or .csv). Expressed relative to the current user's
# home directory so that no machine-specific absolute path (or user name) is
# baked into the notebook.
# NOTE(review): assumes the previous hard-coded path was the author's home
# directory; point this elsewhere if the file lives in another location.
DATASET_XLSX_PATH = os.path.expanduser('~/Downloads/Customer Churn.csv')   # e.g., data/Heart_Disease.xlsx
TARGET_COLUMN     = 'Churn'              # e.g., 'target'
RANDOM_STATE      = 42                   # seed shared by the split and the seeded models
TEST_SIZE         = 0.2                  # fraction of rows held out for evaluation
# ----------------------------

import pandas as pd
import numpy as np
import sklearn

# Dataset loader: Excel first (unless the path is clearly a CSV), then CSV
# with a handful of common encodings. Every failed attempt is remembered so
# the final error explains *why* loading failed instead of hiding it.
from pathlib import Path

_df = None
_load_errors = []          # human-readable reason for each failed attempt
_last_error = None         # last exception, chained onto the final ValueError
_is_csv = Path(str(DATASET_XLSX_PATH)).suffix.lower() == '.csv'

if not _is_csv:
    try:
        _df = pd.read_excel(DATASET_XLSX_PATH, engine='openpyxl')
        print(f"[INFO] Loaded Excel file: {DATASET_XLSX_PATH}")
    except Exception as _e:
        _last_error = _e
        _load_errors.append(f"excel: {type(_e).__name__}: {_e}")
        print("[WARN] Excel load failed, trying CSV fallback...")

if _df is None:
    # Label the success message honestly: it is only a "fallback" when the
    # Excel reader was actually tried first.
    _csv_label = 'CSV file' if _is_csv else 'CSV fallback'
    for _enc in ['utf-8','utf-8-sig','latin-1','cp1252']:
        try:
            _df = pd.read_csv(DATASET_XLSX_PATH, encoding=_enc)
            print(f"[INFO] Loaded {_csv_label} with encoding={_enc}")
            break
        except Exception as _e:
            _last_error = _e
            _load_errors.append(f"csv[{_enc}]: {type(_e).__name__}: {_e}")
    if _df is None:
        raise ValueError(
            "Could not load dataset. Check path/format. "
            f"Path: {DATASET_XLSX_PATH!r}. Attempts: " + "; ".join(_load_errors)
        ) from _last_error

df = _df
assert TARGET_COLUMN in df.columns, f"Target '{TARGET_COLUMN}' not in columns: {df.columns.tolist()}"

from sklearn.preprocessing import LabelEncoder

# Encode the target as integers; `le` is kept so predictions can be mapped
# back to the original class labels.
le = LabelEncoder()
y = le.fit_transform(df[TARGET_COLUMN])
X = df.drop(columns=[TARGET_COLUMN])

# Feature columns are routed by dtype: object/category → one-hot encoding,
# numeric → passed through unchanged.
categorical_cols = X.select_dtypes(include=['object','category']).columns.tolist()
numeric_cols     = X.select_dtypes(include=['number']).columns.tolist()

# Any column matched by neither selector (e.g. bool or datetime) is not listed
# in a transformer and would disappear from the model inputs without notice.
# Surface that explicitly instead of letting it happen silently.
_ignored_cols = [c for c in X.columns if c not in categorical_cols and c not in numeric_cols]
if _ignored_cols:
    print(f"[WARN] Columns with unsupported dtypes are not used as features: {_ignored_cols}")

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# OneHotEncoder keyword compatibility: scikit-learn >= 1.2 is given
# `sparse_output`, older releases are given `sparse`.
_maj, _min = (int(_part) for _part in sklearn.__version__.split('.')[:2])
_sparse_kwarg = 'sparse_output' if (_maj, _min) >= (1, 2) else 'sparse'

_ohe_sparse = {'handle_unknown': 'ignore', _sparse_kwarg: True}
_ohe_dense  = {'handle_unknown': 'ignore', _sparse_kwarg: False}


def _build_preprocessor(ohe_kwargs):
    """Return a ColumnTransformer that one-hot encodes the categorical
    columns (using ``ohe_kwargs``) and passes the numeric columns through."""
    return ColumnTransformer([
        ('cat', OneHotEncoder(**ohe_kwargs), categorical_cols),
        ('num', 'passthrough', numeric_cols),
    ])


# Two flavours of the same preprocessing: sparse and dense one-hot output.
preprocessor_sparse = _build_preprocessor(_ohe_sparse)
preprocessor_dense  = _build_preprocessor(_ohe_dense)

from sklearn.model_selection import train_test_split

# Stratified hold-out split, seeded so the same rows land in the test set
# on every run.
_split_parts = train_test_split(
    X,
    y,
    stratify=y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test = _split_parts

import joblib
from pathlib import Path

# Output directory for the pickled pipelines
_models_dir = Path('../models')
_models_dir.mkdir(exist_ok=True)


[WARN] Excel load failed, trying CSV fallback...
[INFO] Loaded CSV fallback with encoding=utf-8


In [84]:
from pathlib import Path

# Export the held-out split (features + target) as a CSV file.
test_df = X_test.copy()

try:
    # Map the integer-encoded targets back to the original class labels
    y_test_raw = le.inverse_transform(y_test)
except Exception as _exc:
    # Best effort: fall back to the encoded values, but say so instead of
    # silently writing a different kind of target column.
    print(f"[WARN] Could not decode target labels ({type(_exc).__name__}: {_exc}); writing encoded values.")
    y_test_raw = y_test

test_df[TARGET_COLUMN] = y_test_raw
test_csv_path  = "../data/test_split.csv"

# to_csv does not create missing directories; make sure ../data exists so the
# cell also works on a fresh checkout.
Path(test_csv_path).parent.mkdir(parents=True, exist_ok=True)
test_df.to_csv(test_csv_path, index=False)

print(f" test split to:\n -{test_csv_path}\nRows: {len(test_df)}")

 test split to:
 -../data/test_split.csv
Rows: 630


In [85]:
import numpy as np
from sklearn.metrics import roc_auc_score

def safe_auc(y_true, y_proba):
    """
    Compute ROC AUC, returning None when it cannot be computed.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_proba : array-like or None
        - 1-D: used directly as the positive-class score.
        - 2-D with two columns: column 1 is used as the positive-class score.
        - 2-D with any other column count: multiclass one-vs-rest AUC,
          weighted average.

    Returns
    -------
    float or None
        The AUC, or None if probabilities are missing, ``y_true`` holds fewer
        than two distinct classes, ``y_proba`` has an unsupported number of
        dimensions, or the computation fails. A failure is reported through a
        ``RuntimeWarning`` rather than being swallowed silently, so a shape or
        label problem is not mistaken for "AUC not applicable".
    """
    import warnings

    # Missing probabilities is an expected case, not an error: no warning.
    if y_proba is None:
        return None

    try:
        if len(np.unique(y_true)) < 2:
            return None

        scores = np.asarray(y_proba)
        if scores.ndim == 1:
            # Already the positive-class probability
            return float(roc_auc_score(y_true, scores))
        if scores.ndim == 2:
            if scores.shape[1] == 2:
                # Probability of the positive class
                return float(roc_auc_score(y_true, scores[:, 1]))
            # Multiclass OvR weighted
            return float(roc_auc_score(y_true, scores, multi_class='ovr', average='weighted'))
        return None
    except Exception as exc:
        # Still best-effort (callers get None), but leave a visible trace.
        warnings.warn(
            f"safe_auc: AUC could not be computed ({type(exc).__name__}: {exc})",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

### Train: logistic_regression

In [86]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone

# Logistic regression on one-hot encoded + passthrough numeric features.
pipe = Pipeline([
    # clone() gives this pipeline its own unfitted copy of the shared
    # ColumnTransformer; fitting a pipeline fits its steps in place, so
    # reusing one instance would tie all pipelines to the same fitted object.
    ('prep', clone(preprocessor_sparse)),
    # with_mean=False: scale only, no centering
    ('scaler', StandardScaler(with_mean=False)),
    ('clf', LogisticRegression(max_iter=200))
])
# fit() returns the pipeline itself, so `clf` and `pipe` are the same object
clf = pipe.fit(X_train, y_train)


In [87]:
import json, joblib
from pathlib import Path

# Persist the fitted pipeline
Path('../models').mkdir(exist_ok=True)
joblib.dump(pipe, '../models/logistic_regression.pkl')

# Score the held-out split
proba = clf.predict_proba(X_test) if hasattr(clf, 'predict_proba') else None
pred = clf.predict(X_test)
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)
auc = safe_auc(y_test, proba)

# Precision / recall / F1 are class-frequency-weighted averages
_weighted = {'average': 'weighted', 'zero_division': 0}
_metrics = {
    'accuracy': float(accuracy_score(y_test, pred)),
    'auc': auc if auc is None else float(auc),
    'precision': float(precision_score(y_test, pred, **_weighted)),
    'recall': float(recall_score(y_test, pred, **_weighted)),
    'f1': float(f1_score(y_test, pred, **_weighted)),
    'mcc': float(matthews_corrcoef(y_test, pred)),
}
print('logistic_regression:', _metrics)

# Create the shared results dict on first use, then record this model
if '_metrics_summary' not in globals():
    _metrics_summary = {}
_metrics_summary['logistic_regression'] = _metrics


logistic_regression: {'accuracy': 0.8968253968253969, 'auc': 0.9207517738591184, 'precision': 0.8920246305418719, 'recall': 0.8968253968253969, 'f1': 0.8821356728194392, 'mcc': 0.5509047964827836}


### Train: decision_tree

In [88]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import clone

# Decision tree on one-hot encoded + passthrough numeric features.
pipe = Pipeline([
    # clone() gives this pipeline its own unfitted copy of the shared
    # ColumnTransformer; fitting a pipeline fits its steps in place, so
    # reusing one instance would tie all pipelines to the same fitted object.
    ('prep', clone(preprocessor_sparse)),
    ('clf', DecisionTreeClassifier(random_state=RANDOM_STATE))
])
# fit() returns the pipeline itself, so `clf` and `pipe` are the same object
clf = pipe.fit(X_train, y_train)


In [89]:
import json, joblib
from pathlib import Path
# Imported here as in the other evaluation cells, so this cell does not depend
# on a different model's cell having been run first.
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)

# Persist the fitted pipeline
Path('../models').mkdir(exist_ok=True)
joblib.dump(pipe, '../models/decision_tree.pkl')

# Score the held-out split
proba = clf.predict_proba(X_test) if hasattr(clf,'predict_proba') else None
pred = clf.predict(X_test)
auc  = safe_auc(y_test, proba)

# Precision / recall / F1 are class-frequency-weighted averages
_metrics = dict(
    accuracy=float(accuracy_score(y_test, pred)),
    auc=None if auc is None else float(auc),
    precision=float(precision_score(y_test, pred, average='weighted', zero_division=0)),
    recall=float(recall_score(y_test, pred, average='weighted', zero_division=0)),
    f1=float(f1_score(y_test, pred, average='weighted', zero_division=0)),
    mcc=float(matthews_corrcoef(y_test, pred))
)
print('decision_tree:', _metrics)

# Create the shared results dict on first use, then record this model
try:
    _metrics_summary
except NameError:
    _metrics_summary={}
_metrics_summary['decision_tree'] = _metrics


decision_tree: {'accuracy': 0.926984126984127, 'auc': 0.8673362628164888, 'precision': 0.9264070603773896, 'recall': 0.926984126984127, 'f1': 0.9266802209605647, 'mcc': 0.7221310923316494}


### Train: knn

In [90]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone

# k-nearest neighbours (k=5) on scaled one-hot + numeric features.
pipe = Pipeline([
    # clone() gives this pipeline its own unfitted copy of the shared
    # ColumnTransformer; fitting a pipeline fits its steps in place, so
    # reusing one instance would tie all pipelines to the same fitted object.
    ('prep', clone(preprocessor_sparse)),
    # with_mean=False: scale only, no centering
    ('scaler', StandardScaler(with_mean=False)),
    ('clf', KNeighborsClassifier(n_neighbors=5))
])
# fit() returns the pipeline itself, so `clf` and `pipe` are the same object
clf = pipe.fit(X_train, y_train)


In [91]:
import json, joblib
from pathlib import Path

# Persist the fitted pipeline
Path('../models').mkdir(exist_ok=True)
joblib.dump(pipe, '../models/knn.pkl')

# Score the held-out split
proba = clf.predict_proba(X_test) if hasattr(clf, 'predict_proba') else None
pred = clf.predict(X_test)
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)
auc = safe_auc(y_test, proba)

# Precision / recall / F1 are class-frequency-weighted averages
_weighted = {'average': 'weighted', 'zero_division': 0}
_metrics = {
    'accuracy': float(accuracy_score(y_test, pred)),
    'auc': auc if auc is None else float(auc),
    'precision': float(precision_score(y_test, pred, **_weighted)),
    'recall': float(recall_score(y_test, pred, **_weighted)),
    'f1': float(f1_score(y_test, pred, **_weighted)),
    'mcc': float(matthews_corrcoef(y_test, pred)),
}
print('knn:', _metrics)

# Create the shared results dict on first use, then record this model
if '_metrics_summary' not in globals():
    _metrics_summary = {}
_metrics_summary['knn'] = _metrics


knn: {'accuracy': 0.9555555555555556, 'auc': 0.9680324906313607, 'precision': 0.9552194348271794, 'recall': 0.9555555555555556, 'f1': 0.9553705692803439, 'mcc': 0.8308909747531027}


### Train: naive_bayes_gaussian

In [92]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Gaussian naive Bayes uses the dense one-hot preprocessor (keeps the pipeline
# picklable and NB-friendly) followed by full standardisation.
_nb_steps = [
    ('prep', preprocessor_dense),
    ('scaler', StandardScaler(with_mean=True)),
    ('clf', GaussianNB()),
]
pipe = Pipeline(_nb_steps)
pipe.fit(X_train, y_train)
# Same object under both names, exactly as `clf = pipe.fit(...)` would give
clf = pipe


In [93]:
import json, joblib
from pathlib import Path

# Persist the fitted pipeline
Path('../models').mkdir(exist_ok=True)
joblib.dump(pipe, '../models/naive_bayes_gaussian.pkl')

# Score the held-out split
proba = clf.predict_proba(X_test) if hasattr(clf, 'predict_proba') else None
pred = clf.predict(X_test)
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)
auc = safe_auc(y_test, proba)

# Precision / recall / F1 are class-frequency-weighted averages
_weighted = {'average': 'weighted', 'zero_division': 0}
_metrics = {
    'accuracy': float(accuracy_score(y_test, pred)),
    'auc': auc if auc is None else float(auc),
    'precision': float(precision_score(y_test, pred, **_weighted)),
    'recall': float(recall_score(y_test, pred, **_weighted)),
    'f1': float(f1_score(y_test, pred, **_weighted)),
    'mcc': float(matthews_corrcoef(y_test, pred)),
}
print('naive_bayes_gaussian:', _metrics)

# Create the shared results dict on first use, then record this model
if '_metrics_summary' not in globals():
    _metrics_summary = {}
_metrics_summary['naive_bayes_gaussian'] = _metrics


naive_bayes_gaussian: {'accuracy': 0.7380952380952381, 'auc': 0.8986094466320454, 'precision': 0.8783399667520114, 'recall': 0.7380952380952381, 'f1': 0.772747360355577, 'mcc': 0.45355276340886874}


### Train: random_forest

In [94]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import clone

# Random forest (200 trees, seeded) on one-hot + passthrough numeric features.
pipe = Pipeline([
    # clone() gives this pipeline its own unfitted copy of the shared
    # ColumnTransformer; fitting a pipeline fits its steps in place, so
    # reusing one instance would tie all pipelines to the same fitted object.
    ('prep', clone(preprocessor_sparse)),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE))
])
# fit() returns the pipeline itself, so `clf` and `pipe` are the same object
clf = pipe.fit(X_train, y_train)


In [95]:
import json, joblib
from pathlib import Path

# Persist the fitted pipeline
Path('../models').mkdir(exist_ok=True)
joblib.dump(pipe, '../models/random_forest.pkl')

# Score the held-out split
proba = clf.predict_proba(X_test) if hasattr(clf, 'predict_proba') else None
pred = clf.predict(X_test)
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)
auc = safe_auc(y_test, proba)

# Precision / recall / F1 are class-frequency-weighted averages
_weighted = {'average': 'weighted', 'zero_division': 0}
_metrics = {
    'accuracy': float(accuracy_score(y_test, pred)),
    'auc': auc if auc is None else float(auc),
    'precision': float(precision_score(y_test, pred, **_weighted)),
    'recall': float(recall_score(y_test, pred, **_weighted)),
    'f1': float(f1_score(y_test, pred, **_weighted)),
    'mcc': float(matthews_corrcoef(y_test, pred)),
}
print('random_forest:', _metrics)

# Create the shared results dict on first use, then record this model
if '_metrics_summary' not in globals():
    _metrics_summary = {}
_metrics_summary['random_forest'] = _metrics


random_forest: {'accuracy': 0.9650793650793651, 'auc': 0.9877209001502787, 'precision': 0.9644559522110541, 'recall': 0.9650793650793651, 'f1': 0.964474738247488, 'mcc': 0.8647556156318591}


### Train: xgboost

In [96]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.base import clone

# Objective and eval metric depend on whether the target is binary or multiclass
n_classes = len(np.unique(y))
objective = 'binary:logistic' if n_classes==2 else 'multi:softprob'
pipe = Pipeline([
    # clone() gives this pipeline its own unfitted copy of the shared
    # ColumnTransformer; fitting a pipeline fits its steps in place, so
    # reusing one instance would tie all pipelines to the same fitted object.
    ('prep', clone(preprocessor_sparse)),
    ('clf', XGBClassifier(
        random_state=RANDOM_STATE,
        n_estimators=300, learning_rate=0.1, max_depth=6,
        subsample=0.9, colsample_bytree=0.9,
        objective=objective,
        eval_metric='logloss' if n_classes==2 else 'mlogloss',
        n_jobs=4
    ))
])
# fit() returns the pipeline itself, so `clf` and `pipe` are the same object
clf = pipe.fit(X_train, y_train)


In [97]:
import json, joblib
from pathlib import Path

# Persist the fitted pipeline
Path('../models').mkdir(exist_ok=True)
joblib.dump(pipe, '../models/xgboost.pkl')

# Score the held-out split
proba = clf.predict_proba(X_test) if hasattr(clf, 'predict_proba') else None
pred = clf.predict(X_test)
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)
auc = safe_auc(y_test, proba)

# Precision / recall / F1 are class-frequency-weighted averages
_weighted = {'average': 'weighted', 'zero_division': 0}
_metrics = {
    'accuracy': float(accuracy_score(y_test, pred)),
    'auc': auc if auc is None else float(auc),
    'precision': float(precision_score(y_test, pred, **_weighted)),
    'recall': float(recall_score(y_test, pred, **_weighted)),
    'f1': float(f1_score(y_test, pred, **_weighted)),
    'mcc': float(matthews_corrcoef(y_test, pred)),
}
print('xgboost:', _metrics)

# Create the shared results dict on first use, then record this model
if '_metrics_summary' not in globals():
    _metrics_summary = {}
_metrics_summary['xgboost'] = _metrics


xgboost: {'accuracy': 0.9603174603174603, 'auc': 0.992181704046111, 'precision': 0.9604878706199461, 'recall': 0.9603174603174603, 'f1': 0.9603984646539552, 'mcc': 0.8508250392273519}


In [98]:
# === SUMMARY OF ALL MODEL METRICS (plain, no CSS) ===
# Builds one table with a row per model, prints it, then reports which model
# scores highest on each metric.

import pandas as pd
import numpy as np

# Ensure metrics summary exists
try:
    _metrics_summary
except NameError:
    raise RuntimeError("ERROR: `_metrics_summary` not found. Run all model training cells first.")

if not _metrics_summary:
    raise RuntimeError("ERROR: No metrics collected. Check that per‑model blocks ran.")

_metric_names = ['accuracy','auc','precision','recall','f1','mcc']

# Convert to a dataframe (one row per model, ordered columns)
metrics_df = pd.DataFrame.from_dict(_metrics_summary, orient='index')
metrics_df = metrics_df[_metric_names]

# Coerce every column to float. This replaces the deprecated
# DataFrame.applymap call and also turns a missing AUC (None) into NaN, so
# rounding and idxmax below always operate on numeric columns.
metrics_df = metrics_df.apply(pd.to_numeric, errors='coerce')

print("=== ALL MODEL METRICS (TEST SPLIT) ===\n")
print(metrics_df.round(4))

# --- Determine best model per metric ---
print("\n=== BEST MODEL PER METRIC ===\n")
best = {}
for metric in _metric_names:
    # Skip metrics for which no model produced a value
    if metrics_df[metric].notna().any():
        idx = metrics_df[metric].idxmax()
        val = metrics_df.loc[idx, metric]
        best[metric] = (idx, val)
        print(f"{metric.upper():10s} → {idx:25s}  ({val:.4f})")
    else:
        print(f"{metric.upper():10s} → No valid values found")

# Optional: also store this table to view later
best_df = pd.DataFrame(best, index=['best_model','value']).T
print("\n=== TABLE OF BEST MODELS ===\n")
print(best_df)

=== ALL MODEL METRICS (TEST SPLIT) ===

                      accuracy     auc  precision  recall      f1     mcc
logistic_regression     0.8968  0.9208     0.8920  0.8968  0.8821  0.5509
decision_tree           0.9270  0.8673     0.9264  0.9270  0.9267  0.7221
naive_bayes_gaussian    0.7381  0.8986     0.8783  0.7381  0.7727  0.4536
random_forest           0.9651  0.9877     0.9645  0.9651  0.9645  0.8648
xgboost                 0.9603  0.9922     0.9605  0.9603  0.9604  0.8508
knn                     0.9556  0.9680     0.9552  0.9556  0.9554  0.8309

=== BEST MODEL PER METRIC ===

ACCURACY   → random_forest              (0.9651)
AUC        → xgboost                    (0.9922)
PRECISION  → random_forest              (0.9645)
RECALL     → random_forest              (0.9651)
F1         → random_forest              (0.9645)
MCC        → random_forest              (0.8648)

=== TABLE OF BEST MODELS ===

              best_model     value
accuracy   random_forest  0.965079
auc            

  metrics_df = metrics_df.applymap(lambda x: float(x) if isinstance(x, (np.floating, np.integer)) else x)


In [99]:
# Persist everything the app needs besides the models: the fitted label
# encoder, the column/class metadata, and the collected metrics.
import joblib, json
from pathlib import Path

Path('../models').mkdir(exist_ok=True)
joblib.dump(le, '../models/label_encoder.pkl')

meta = {
    'target_column': TARGET_COLUMN,
    'class_names': list(map(str, le.classes_)),
    'numeric_columns': list(map(str, numeric_cols)),
    'categorical_columns': list(map(str, categorical_cols)),
}

# Both JSON files are written the same way, so drive them from one table
_json_outputs = (
    ('../models/metadata.json', meta),
    ('../models/metrics_summary.json', _metrics_summary),
)
for _json_path, _payload in _json_outputs:
    with open(_json_path,'w') as f:
        json.dump(_payload, f, indent=2)

print('[DONE] Wrote models/*.pkl, metadata.json, metrics_summary.json')


[DONE] Wrote models/*.pkl, metadata.json, metrics_summary.json
