# EDA → Model Selection: Ready-to-Run Evaluation
This notebook demonstrates the corrected, leakage-safe workflow:
1. Load data (no internet needed)
2. Split (stratified if classification)
3. Build leakage-safe Pipelines (missingness flags + ColumnTransformer)
4. Train a baseline (ridge regression or penalized logistic regression)
5. Optional GBM (LightGBM/XGBoost if installed)
6. Diagnostics: ROC/PR & calibration (classification) or residuals (regression)
7. Final refit on train + validation, optional calibration (classification), and test evaluation


In [None]:
# Notebook-wide settings; edit here, then re-run the cells below.
# Problem type: 'classification' or 'regression'.
TASK = 'classification'
# Fold count (presumably for cross-validation - confirm where it is consumed).
N_SPLITS = 5
# Seed passed to the data split and to the model factories.
RANDOM_STATE = 42


In [None]:
import warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.calibration import CalibratedClassifierCV

from src.data.loaders import load_classification_breast_cancer, load_regression_synthetic, split_data
from src.features.pipelines import build_leakage_safe_preprocessor
from src.models.baselines import ridge_regressor, elasticnet_regressor, robust_huber_regressor, penalized_logistic, log1p_wrapper
from src.models.gbm import xgb_regressor, xgb_classifier, lgbm_regressor, lgbm_classifier
from src.evaluation.metrics import classification_metrics, regression_metrics
from src.evaluation.plots import plot_roc_pr, plot_calibration, plot_residuals

# Default size (width, height in inches) for every figure drawn below.
plt.rc('figure', figsize=(6, 4))


In [None]:
# 1) Load dataset and split
# Pick the bundled dataset that matches TASK. An unrecognised TASK is rejected
# up front instead of silently being treated as regression.
if TASK == 'classification':
    X, y = load_classification_breast_cancer()
elif TASK == 'regression':
    X, y = load_regression_synthetic()
else:
    raise ValueError(
        f"TASK must be 'classification' or 'regression', got {TASK!r}"
    )

# `data` exposes X_train/X_valid/X_test and the matching y_* attributes,
# which the remaining cells read.
data = split_data(X, y, task=TASK, random_state=RANDOM_STATE)
data


In [None]:
# 2) Build leakage-safe preprocessor
# Only the training features are handed to the builder; the validation and
# test splits are not passed in at this point.
train_features = data.X_train
pre = build_leakage_safe_preprocessor(train_features)
pre


In [None]:
# 3) Baseline model
# Fit a penalized linear baseline on the training split and score it on the
# validation split. `base_pipe` and `base_scores` are reused by later cells.
from sklearn.base import clone

if TASK == 'classification':
    base = penalized_logistic(random_state=RANDOM_STATE)
else:
    base = ridge_regressor(random_state=RANDOM_STATE)

# Use a private copy of the preprocessor: sharing the same `pre` instance
# between pipelines means refitting one pipeline silently changes the other.
base_pipe = Pipeline([('prep', clone(pre)), ('model', base)])
base_pipe.fit(data.X_train, data.y_train)

if TASK == 'classification':
    # Column 1 holds the probability of the second class in `classes_`.
    y_proba = base_pipe.predict_proba(data.X_valid)[:, 1]
    base_scores = classification_metrics(data.y_valid, y_proba)
else:
    y_pred = base_pipe.predict(data.X_valid)
    base_scores = regression_metrics(data.y_valid, y_pred)

# Jupyter only renders a bare expression when it is the last top-level
# statement of the cell; inside the if/else above it would show nothing.
base_scores


In [None]:
# 4) Optional GBM (LightGBM preferred; fallback to XGBoost). Skips automatically if not installed.
# Fits the first available gradient-boosting model, compares it with the
# baseline on the validation split and keeps the winner in `best_pipe`.
from sklearn.base import clone

best_pipe = base_pipe

# Factories in order of preference. Each is treated as "unavailable" when it
# returns None (presumably because the library is not installed - confirm in
# src.models.gbm).
if TASK == 'classification':
    gbm_factories = (lgbm_classifier, xgb_classifier)
else:
    gbm_factories = (lgbm_regressor, xgb_regressor)

# Explicit None check rather than `a or b`, so the choice never depends on
# the truthiness of an estimator object.
gbm = None
for make_gbm in gbm_factories:
    gbm = make_gbm(random_state=RANDOM_STATE)
    if gbm is not None:
        break

if gbm is not None:
    # Private copy of the preprocessor so this fit cannot alter the one
    # inside the baseline pipeline.
    gbm_pipe = Pipeline([('prep', clone(pre)), ('model', gbm)])
    gbm_pipe.fit(data.X_train, data.y_train)

    if TASK == 'classification':
        y_gbm = gbm_pipe.predict_proba(data.X_valid)[:, 1]
        gbm_scores = classification_metrics(data.y_valid, y_gbm)
    else:
        y_gbm = gbm_pipe.predict(data.X_valid)
        gbm_scores = regression_metrics(data.y_valid, y_gbm)

    display({'baseline': base_scores, 'gbm': gbm_scores})

    # Higher ROC-AUC wins for classification; lower RMSE wins for regression.
    if TASK == 'classification':
        gbm_wins = gbm_scores['roc_auc'] > base_scores['roc_auc']
    else:
        gbm_wins = gbm_scores['rmse'] < base_scores['rmse']
    if gbm_wins:
        best_pipe = gbm_pipe


In [None]:
# 5) Diagnostics
# Plot validation-split diagnostics for the selected pipeline:
# ROC/PR and calibration curves for classification, residuals for regression.
if TASK == 'classification':
    y_proba_valid = best_pipe.predict_proba(data.X_valid)[:, 1]
    plot_roc_pr(data.y_valid, y_proba_valid)
    plot_calibration(data.y_valid, y_proba_valid)
else:
    y_pred_valid = best_pipe.predict(data.X_valid)
    plot_residuals(data.y_valid, y_pred_valid)
plt.show()


In [None]:
# 6) Final fit, optional calibration (classification), and test evaluation
from sklearn.metrics import roc_auc_score

# The final model is trained on train + validation; the test split is only
# used for the scores printed below.
X_full = pd.concat([data.X_train, data.X_valid])
y_full = pd.concat([data.y_train, data.y_valid])

if TASK == 'classification':
    # Only building and fitting the calibrator is guarded. A failure while
    # scoring or plotting must surface as itself, not be reported as a
    # calibration failure followed by a silent refit.
    try:
        # Folds follow the notebook-level N_SPLITS / RANDOM_STATE settings.
        calib_cv = StratifiedKFold(
            n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
        )
        calib = CalibratedClassifierCV(best_pipe, method='isotonic', cv=calib_cv)
        calib.fit(X_full, y_full)
    except Exception as e:
        # Best-effort: fall back to the uncalibrated pipeline.
        warnings.warn(f'Calibration failed: {e}')
        best_pipe.fit(X_full, y_full)
        final_model, label = best_pipe, 'uncalibrated'
    else:
        final_model, label = calib, 'calibrated'

    y_proba_test = final_model.predict_proba(data.X_test)[:, 1]
    print(f'Test ROC-AUC ({label}):', roc_auc_score(data.y_test, y_proba_test))
    if label == 'calibrated':
        plot_calibration(data.y_test, y_proba_test)
else:
    best_pipe.fit(X_full, y_full)
    y_pred_test = best_pipe.predict(data.X_test)
    print('Test scores:', regression_metrics(data.y_test, y_pred_test))
