# Model Comparison with CV, Metric Distributions, and Statistical Tests
We compare a **penalized logistic regression** with a **gradient-boosting model (GBM)** on the breast-cancer dataset, using stratified 5-fold cross-validation (CV):
1. Compute per-fold metrics (ROC-AUC, PR-AUC, Brier)
2. Plot metric distributions
3. Build **OOF predictions** for both models and run **DeLong** test on ROC-AUC
4. Run **paired t-test / Wilcoxon** on per-fold metrics


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

from src.data.loaders import load_classification_breast_cancer
from src.evaluation.stats import delong_roc_test, paired_ttest, wilcoxon_signed
from src.features.pipelines import build_leakage_safe_preprocessor
from src.models.baselines import penalized_logistic
from src.models.gbm import lgbm_classifier, xgb_classifier

# Default figure size (width, height in inches) for plots that do not pass
# their own `figsize`.
plt.rcParams.update({'figure.figsize': (6, 4)})


In [None]:
# Load the data and assemble one preprocessing + model pipeline per candidate.
X, y = load_classification_breast_cancer()
pre = build_leakage_safe_preprocessor(X)
log_reg = penalized_logistic()

# Prefer LightGBM and fall back to XGBoost. The fallback is decided with an
# explicit `is None` check rather than `a or b`: truthiness of an estimator is
# not a reliable availability signal (some estimator classes define __len__).
# NOTE(review): this presumes the factories return None when their backend is
# missing -- confirm in src.models.gbm.
gbm = lgbm_classifier()
if gbm is None:
    gbm = xgb_classifier()

# Pipeline does not copy its steps, so each pipeline gets its own clone of the
# preprocessor. Sharing one instance would let fitting one pipeline overwrite
# the transformer state the other pipeline predicts with.
log_pipe = Pipeline([('prep', clone(pre)), ('model', log_reg)])
gbm_pipe = Pipeline([('prep', clone(pre)), ('model', gbm)]) if gbm is not None else None

# Fixed seed so both models are always compared on the same five folds.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def _fold_metrics(y_true, proba):
    """Return [ROC-AUC, PR-AUC, Brier score] for one validation fold."""
    return [
        roc_auc_score(y_true, proba),
        average_precision_score(y_true, proba),
        brier_score_loss(y_true, proba),
    ]


metrics_log, metrics_gbm = [], []
# Out-of-fold (OOF) probabilities, one slot per sample. NaN-initialised so a
# row that is never assigned cannot pass for a genuine probability of 0.
oof_proba_log = np.full(len(y), np.nan)
oof_proba_gbm = np.full(len(y), np.nan) if gbm_pipe is not None else None

for fold, (tr, va) in enumerate(kf.split(X, y), 1):
    # `tr` / `va` are positional indices, hence `.iloc` on the pandas objects.
    X_tr, X_va = X.iloc[tr], X.iloc[va]
    y_tr, y_va = y.iloc[tr], y.iloc[va]

    # Logistic
    log_pipe.fit(X_tr, y_tr)
    p_log = log_pipe.predict_proba(X_va)[:, 1]  # column 1 = positive class
    oof_proba_log[va] = p_log
    metrics_log.append(_fold_metrics(y_va, p_log))

    # GBM
    if gbm_pipe is not None:
        gbm_pipe.fit(X_tr, y_tr)
        p_gbm = gbm_pipe.predict_proba(X_va)[:, 1]
        oof_proba_gbm[va] = p_gbm
        metrics_gbm.append(_fold_metrics(y_va, p_gbm))

# Rows = folds, columns = [ROC-AUC, PR-AUC, Brier].
metrics_log = np.array(metrics_log)
metrics_gbm = np.array(metrics_gbm) if gbm_pipe is not None else None
metrics_log, metrics_gbm


In [None]:
# 1) Per-fold distributions
# One subplot per metric; each box summarises that metric across the CV folds.
labels = ['ROC-AUC', 'PR-AUC', 'Brier']

# Plot only the models that were actually evaluated, so a missing GBM yields a
# single LogReg box instead of the LogReg data drawn twice.
model_names = ['LogReg']
model_metrics = [metrics_log]
if metrics_gbm is not None:
    model_names.append('GBM')
    model_metrics.append(metrics_gbm)

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for j, ax in enumerate(axes):
    ax.boxplot([m[:, j] for m in model_metrics])
    # Tick labels are set separately because boxplot's `labels=` keyword is
    # deprecated in newer Matplotlib releases; this form works on all versions.
    ax.set_xticklabels(model_names)
    ax.set_title(labels[j])
plt.tight_layout(); plt.show()


In [None]:
# 2) Statistical tests
# Paired comparison of the two models, metric by metric, using the per-fold
# scores as paired observations (same folds for both models).
# NOTE(review): with only five paired folds an exact two-sided Wilcoxon
# signed-rank test cannot produce p < 0.0625 -- confirm which method
# `wilcoxon_signed` uses before reading its p-value at the 0.05 level.
if metrics_gbm is None:
    print('GBM unavailable; only baseline shown.')
else:
    # Transposing yields one row per metric, aligned with `labels`.
    for name, log_scores, gbm_scores in zip(labels, metrics_log.T, metrics_gbm.T):
        t, p = paired_ttest(log_scores, gbm_scores)
        w, pw = wilcoxon_signed(log_scores, gbm_scores)
        print(f"{name}: paired t={t:.3f}, p={p:.3g}; Wilcoxon W={w:.3f}, p={pw:.3g}")


In [None]:
# 3) DeLong on OOF predictions (ROC-AUC)
# Compares the two ROC-AUCs computed on the pooled out-of-fold predictions.
# NOTE(review): the OOF arrays are positionally ordered while `y` is passed
# through as loaded -- presumably `delong_roc_test` treats `y` positionally;
# verify if `y` can carry a non-default index.
if metrics_gbm is None:
    print('GBM unavailable; DeLong not run.')
else:
    delong_result = delong_roc_test(y, oof_proba_log, oof_proba_gbm)
    z, p, auc_log, auc_gbm = delong_result
    print(f"DeLong z={z:.3f}, p={p:.3g}; AUC_log={auc_log:.4f}, AUC_gbm={auc_gbm:.4f}")
