# In [None]:
# ml_plus_feature_importance.py
# ------------------------------------------------------------------
# One-pass pipeline:
# 1) HPO with Optuna + 5-fold CV (AUC) for 9 models
# 2) 70/30 hold-out evaluation, full metrics, ROC, DeLong vs LGBM
# 3) Refit 3 diverse models with their BEST params:
#       - HistGradientBoostingClassifier  -> "Gradient Boosting"
#       - RandomForestClassifier          -> "Random Forest"
#       - SVC (RBF, probability=True)     -> "Support Vector Machine"
#    Compute permutation importances (ROC–AUC), build cross-model table,
#    plot TOP-50 scatter + PDPs (1D & 2D) on the hold-out set.
# ------------------------------------------------------------------

from __future__ import annotations

import warnings
from pathlib import Path
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# ------------------------ Dependencies ------------------------
import optuna
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score, roc_curve, average_precision_score,
    f1_score, accuracy_score, balanced_accuracy_score,
    confusion_matrix, cohen_kappa_score
)
from sklearn.inspection import permutation_importance, PartialDependenceDisplay

from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    BaggingClassifier, HistGradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from lightgbm import LGBMClassifier, log_evaluation
from xgboost import XGBClassifier
from scipy import stats

# ------------------------- BASIC CONFIG -------------------------
# Input table and the name of its binary target column.
CSV_PATH = "normalised_with_binary_cai.csv"
TARGET_COL = "binary_cai"
POS_LABEL = 1

# Seed shared by splits, samplers and estimators; CV / search budget.
RANDOM_SEED = 12345
N_SPLITS = 5      # folds used by the CV objective
N_TRIALS = 60     # Optuna trials per model
N_JOBS = -1       # forwarded as n_jobs to estimators that accept it

# Output folders: model reports/figures/predictions and feature-importance artifacts.
OUT = Path("ml_one_by_one")
OUT.mkdir(exist_ok=True)
for _subdir in ("reports", "figs", "preds"):
    (OUT / _subdir).mkdir(exist_ok=True)

FI_OUT = Path("feature_importance_outputs")
FI_OUT.mkdir(exist_ok=True)

# ========================= LOAD + SPLIT =========================
df = pd.read_csv(CSV_PATH)
if TARGET_COL not in df.columns:
    raise ValueError(f"{TARGET_COL} not found in data.")

# Target as an integer vector; every other column is a candidate feature.
y = df[TARGET_COL].astype(int).values
X = df.drop(columns=[TARGET_COL])

# Columns are routed by dtype: object/category -> categorical branch,
# numeric/bool -> numeric branch. Anything else is dropped by the transformers.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=[np.number, "bool"]).columns.tolist()


def _make_preprocessor(scale_numeric: bool) -> ColumnTransformer:
    """Build the column transformer shared by all models.

    Numeric columns are median-imputed (and standardised when
    ``scale_numeric`` is true); categorical columns are imputed with the most
    frequent value and one-hot encoded to a dense array.
    """
    numeric_steps = [("imp", SimpleImputer(strategy="median"))]
    if scale_numeric:
        numeric_steps.append(("scaler", StandardScaler(with_mean=True)))
    categorical_steps = [
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
    return ColumnTransformer(
        [
            ("num", Pipeline(numeric_steps), num_cols),
            ("cat", Pipeline(categorical_steps), cat_cols),
        ],
        remainder="drop",
    )


# Preprocessors: unscaled for tree models, standardised for linear/kernel models.
prep_tree = _make_preprocessor(scale_numeric=False)
prep_linear = _make_preprocessor(scale_numeric=True)

# Stratified 70/30 train / hold-out split.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=RANDOM_SEED,
)

# ============================ Utils =============================
def fit_transform_fold(prep: ColumnTransformer, X_train, X_valid):
    """Fit a fresh copy of ``prep`` on the training fold and transform both folds.

    The preprocessor passed in is cloned first, so the caller's (module-level,
    shared) transformer is never fitted or overwritten by a CV fold and the
    returned object really belongs to this fold only.

    Args:
        prep: Unfitted (or fitted) column transformer used as a template.
        X_train: Training rows of the fold; the clone is fitted on these only.
        X_valid: Validation rows of the fold; transformed with the fitted clone.

    Returns:
        tuple: ``(fitted_clone, X_train_transformed, X_valid_transformed)``.
    """
    fold_prep = clone(prep)
    Xt = fold_prep.fit_transform(X_train)
    Xv = fold_prep.transform(X_valid)
    return fold_prep, Xt, Xv

def metrics_from_cm(cm: np.ndarray) -> Dict[str, float]:
    """Derive rate-style metrics from a 2x2 confusion matrix.

    The matrix is flattened and read as ``tn, fp, fn, tp``. A small epsilon is
    added to every denominator, so an empty row/column yields 0 instead of a
    division error.

    Args:
        cm: Confusion matrix with exactly four entries.

    Returns:
        Dict with sensitivity, specificity, predictive values, prevalence,
        detection rate / prevalence and the four raw cell counts.
    """
    tn, fp, fn, tp = cm.ravel()
    eps = 1e-12
    # Same denominator (all samples) is used by the three prevalence-type rates.
    n_all = tp + tn + fp + fn + eps

    report = {}
    report["Sensitivity"] = tp / (tp + fn + eps)
    report["Specificity"] = tn / (tn + fp + eps)
    report["PosPredValue"] = tp / (tp + fp + eps)
    report["NegPredValue"] = tn / (tn + fn + eps)
    report["Prevalence"] = (tp + fn) / n_all
    report["DetectionRate"] = tp / n_all
    report["DetectionPrevalence"] = (tp + fp) / n_all
    report.update({"TN": tn, "FP": fp, "FN": fn, "TP": tp})
    return report

# scores for holdout
def get_proba_fitted(prep: ColumnTransformer, clf, X):
    """Score ``X`` with an already-fitted preprocessor / classifier pair.

    Args:
        prep: Fitted transformer; only its ``transform`` method is used.
        clf: Fitted classifier exposing ``predict_proba`` or ``decision_function``.
        X: Raw feature rows to score.

    Returns:
        Column 1 of ``predict_proba`` when available; otherwise the decision
        values min-max rescaled over the given rows.

    Raises:
        RuntimeError: If the classifier offers neither scoring method.
    """
    X_ = prep.transform(X)
    if not hasattr(clf, "predict_proba"):
        if not hasattr(clf, "decision_function"):
            raise RuntimeError("Classifier has neither predict_proba nor decision_function.")
        margins = clf.decision_function(X_)
        # Rescale margins into [0, 1]; epsilon guards against a constant margin vector.
        return (margins - margins.min()) / (margins.max() - margins.min() + 1e-12)
    return clf.predict_proba(X_)[:, 1]

# ---------- DeLong (paired AUC) ----------
def _compute_midrank(x):
    J = np.argsort(x); Z = x[J]; N = len(x); T = np.zeros(N); i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5 * (i + j - 1) + 1
        i = j
    T2 = np.empty(N); T2[J] = T; return T2

def _fast_delong(pred_sorted_transposed: np.ndarray, label_1_count: int):
    """Compute AUCs and their DeLong covariance from midranks.

    Args:
        pred_sorted_transposed: Array of shape ``(n_models, n_samples)`` whose
            first ``label_1_count`` columns are treated as the positive
            samples and the remaining columns as the negatives.
        label_1_count: Number of positive samples (``m``).

    Returns:
        tuple: ``(aucs, cov)`` - one AUC per model row and the covariance
        estimate ``cov(v01) / m + cov(v10) / n``.
    """
    n_pos = label_1_count
    n_neg = pred_sorted_transposed.shape[1] - n_pos

    def _rank_rows(block):
        # Midranks are computed independently for every model (row).
        return np.apply_along_axis(_compute_midrank, 1, block)

    rank_within_pos = _rank_rows(pred_sorted_transposed[:, :n_pos])
    rank_within_neg = _rank_rows(pred_sorted_transposed[:, n_pos:])
    rank_overall = _rank_rows(pred_sorted_transposed)
    overall_pos = rank_overall[:, :n_pos]
    overall_neg = rank_overall[:, n_pos:]

    aucs = (overall_pos.sum(axis=1) / (n_pos * n_neg)) - (n_pos + 1) / (2 * n_neg)

    # Structural components; only their covariance is used below.
    v01 = (rank_within_pos / n_neg) - (overall_pos / n_neg)
    v10 = 1 - (rank_within_neg / n_pos) + (overall_neg / n_pos)

    cov = np.cov(v01) / n_pos + np.cov(v10) / n_neg
    return aucs, cov

def _auc_cov(y_true: np.ndarray, probs: np.ndarray):
    """Return per-model AUCs and their DeLong covariance matrix.

    ``_fast_delong`` treats the first ``m`` columns of its input as the
    positive samples, so the samples must be ordered *by label* (positives
    first). The earlier version ordered them by the first model's score, which
    silently labelled the ``m`` highest-scored samples as "positives" and
    produced wrong AUCs and covariances.

    Args:
        y_true: Binary labels; assumed to be coded 0 / 1 (the positive count
            is taken as their sum).
        probs: Array of shape ``(n_samples, n_models)`` with one score column
            per model.

    Returns:
        tuple: ``(aucs, cov)`` as returned by ``_fast_delong``.

    Raises:
        ValueError: If only one class is present, in which case the AUC is
            undefined.
    """
    labels = np.asarray(y_true).astype(int)
    n_pos = int(labels.sum())
    if n_pos == 0 or n_pos == labels.size:
        raise ValueError("DeLong test needs at least one positive and one negative sample.")
    # Stable sort on the negated labels puts every positive (1) before every negative (0).
    order = np.argsort(-labels, kind="stable")
    preds = np.asarray(probs)[order].T
    return _fast_delong(preds, n_pos)

def delong_test(y_true: np.ndarray, score_ref: np.ndarray, score_cmp: np.ndarray):
    """Two-sided paired DeLong test comparing the AUCs of two score vectors.

    Args:
        y_true: Binary labels for the evaluated samples.
        score_ref: Scores of the reference model.
        score_cmp: Scores of the model being compared, same sample order.

    Returns:
        tuple: ``(auc_ref, auc_cmp, z, p)`` where ``z`` is the AUC difference
        divided by its estimated standard deviation and ``p`` the two-sided
        normal p-value.
    """
    paired_scores = np.vstack([score_ref, score_cmp]).T
    aucs, cov = _auc_cov(y_true.astype(int), paired_scores)
    auc_ref, auc_cmp = aucs[0], aucs[1]

    # Var(A - B) = Var(A) + Var(B) - 2 Cov(A, B); the floor avoids dividing by zero.
    var = cov[0, 0] + cov[1, 1] - 2 * cov[0, 1]
    std = max(np.sqrt(var), 1e-12)

    z = (auc_ref - auc_cmp) / std
    p = 2 * stats.norm.sf(abs(z))
    return auc_ref, auc_cmp, z, p

# ---------- CV objective ----------
def cv_auc(prep_kind: str, make_clf, params: dict) -> float:
    """Return the mean stratified K-fold ROC-AUC of a candidate on the training split.

    For every fold the preprocessor is fitted on the fold's training rows
    only, a new classifier is built with ``make_clf(params)``, fitted, and
    scored on the fold's validation rows.

    Booster handling avoids fit-time keywords (``verbose`` for LightGBM,
    ``eval_metric`` / ``early_stopping_rounds`` for XGBoost) that recent
    releases of those libraries no longer accept: logging is silenced through
    a LightGBM callback and XGBoost early stopping is configured with
    ``set_params``.

    Args:
        prep_kind: ``"tree"`` selects ``prep_tree``; any other value selects
            ``prep_linear``.
        make_clf: Factory returning an unfitted classifier for ``params``.
        params: Hyper-parameter dict forwarded to ``make_clf``.

    Returns:
        Mean of the per-fold validation AUCs.
    """
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    prep = prep_tree if prep_kind == "tree" else prep_linear  # same for every fold
    fold_aucs = []
    for tr_idx, va_idx in skf.split(X_tr, y_tr):
        Xtr, Xva = X_tr.iloc[tr_idx], X_tr.iloc[va_idx]
        ytr, yva = y_tr[tr_idx], y_tr[va_idx]
        _, Xtr_, Xva_ = fit_transform_fold(prep, Xtr, Xva)
        clf = make_clf(params)
        if isinstance(clf, LGBMClassifier):
            clf.set_params(random_state=RANDOM_SEED, n_jobs=N_JOBS)
            # NOTE(review): the validation fold is only monitored here; no early
            # stopping was configured for LightGBM in the original code either.
            clf.fit(Xtr_, ytr, eval_set=[(Xva_, yva)], eval_metric="auc",
                    callbacks=[log_evaluation(period=0)])
        elif isinstance(clf, XGBClassifier):
            # NOTE(review): early stopping uses the same fold that is scored
            # below, so the fold AUC may be slightly optimistic.
            clf.set_params(random_state=RANDOM_SEED, n_jobs=N_JOBS,
                           eval_metric="auc", early_stopping_rounds=100)
            clf.fit(Xtr_, ytr, eval_set=[(Xva_, yva)], verbose=False)
        else:
            clf.fit(Xtr_, ytr)

        if hasattr(clf, "predict_proba"):
            s = clf.predict_proba(Xva_)[:, 1]
        else:
            d = clf.decision_function(Xva_)
            s = (d - d.min()) / (d.max() - d.min() + 1e-12)
        fold_aucs.append(roc_auc_score(yva, s))
    return float(np.mean(fold_aucs))

# ===================== Search spaces (one Optuna study per model) =====================
def optimize_HGB():
    """Tune ``HistGradientBoostingClassifier`` with a seeded Optuna TPE study.

    Every trial is scored by ``cv_auc`` with the "tree" preprocessing and the
    study maximises that score over ``N_TRIALS`` trials.

    Returns:
        tuple: ``(study, factory)`` where ``factory(params)`` builds an
        unfitted classifier from a trial's parameter dict.
    """
    def build_hgb(p):
        # Translate the short search-space keys into estimator keywords.
        return HistGradientBoostingClassifier(
            learning_rate=p["lr"],
            max_depth=p["max_depth"],
            max_leaf_nodes=p["max_leaf_nodes"],
            min_samples_leaf=p["min_samples_leaf"],
            l2_regularization=p["l2"],
            max_bins=p["max_bins"],
            random_state=RANDOM_SEED,
        )

    def score_trial(trial):
        # Parameters are suggested one by one, in a fixed order.
        lr = trial.suggest_float("lr", 0.01, 0.3, log=True)
        max_depth = trial.suggest_int("max_depth", 2, 20)
        max_leaf_nodes = trial.suggest_int("max_leaf_nodes", 16, 512, log=True)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 10, 200, log=True)
        l2 = trial.suggest_float("l2", 1e-8, 10.0, log=True)
        max_bins = trial.suggest_int("max_bins", 64, 255)
        candidate = {
            "lr": lr,
            "max_depth": max_depth,
            "max_leaf_nodes": max_leaf_nodes,
            "min_samples_leaf": min_samples_leaf,
            "l2": l2,
            "max_bins": max_bins,
        }
        return cv_auc("tree", build_hgb, candidate)

    sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(score_trial, n_trials=N_TRIALS)
    return study, build_hgb

def optimize_LGBM():
    """Tune ``LGBMClassifier`` with a seeded Optuna TPE study.

    Every trial is scored by ``cv_auc`` with the "tree" preprocessing and the
    study maximises that score over ``N_TRIALS`` trials.

    Returns:
        tuple: ``(study, factory)`` where ``factory(params)`` builds an
        unfitted classifier from a trial's parameter dict.
    """
    def build_lgbm(p):
        # Translate the short search-space keys into estimator keywords.
        return LGBMClassifier(
            objective="binary",
            boosting_type="gbdt",
            n_estimators=p["n_estimators"],
            learning_rate=p["lr"],
            num_leaves=p["num_leaves"],
            max_depth=p["max_depth"],
            min_child_samples=p["min_child_samples"],
            subsample=p["subsample"],
            colsample_bytree=p["colsample"],
            reg_lambda=p["l2"],
            reg_alpha=p["l1"],
            random_state=RANDOM_SEED,
            n_jobs=N_JOBS,
        )

    def score_trial(trial):
        # Parameters are suggested one by one, in a fixed order.
        n_estimators = trial.suggest_int("n_estimators", 300, 3000, log=True)
        lr = trial.suggest_float("lr", 0.005, 0.3, log=True)
        num_leaves = trial.suggest_int("num_leaves", 31, 512, log=True)
        max_depth = trial.suggest_int("max_depth", -1, 32)
        min_child_samples = trial.suggest_int("min_child_samples", 10, 200, log=True)
        subsample = trial.suggest_float("subsample", 0.6, 1.0)
        colsample = trial.suggest_float("colsample", 0.6, 1.0)
        l2 = trial.suggest_float("l2", 1e-8, 10.0, log=True)
        l1 = trial.suggest_float("l1", 1e-8, 10.0, log=True)
        candidate = {
            "n_estimators": n_estimators,
            "lr": lr,
            "num_leaves": num_leaves,
            "max_depth": max_depth,
            "min_child_samples": min_child_samples,
            "subsample": subsample,
            "colsample": colsample,
            "l2": l2,
            "l1": l1,
        }
        return cv_auc("tree", build_lgbm, candidate)

    sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(score_trial, n_trials=N_TRIALS)
    return study, build_lgbm

def optimize_ETC():
    """Tune ``ExtraTreesClassifier`` with a seeded Optuna TPE study.

    Every trial is scored by ``cv_auc`` with the "tree" preprocessing and the
    study maximises that score over ``N_TRIALS`` trials.

    Returns:
        tuple: ``(study, factory)`` where ``factory(params)`` builds an
        unfitted classifier from a trial's parameter dict.
    """
    def build_etc(p):
        # Translate the short search-space keys into estimator keywords.
        return ExtraTreesClassifier(
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            max_features=p["max_features"],
            min_samples_split=p["min_split"],
            min_samples_leaf=p["min_leaf"],
            bootstrap=p["bootstrap"],
            random_state=RANDOM_SEED,
            n_jobs=N_JOBS,
        )

    def score_trial(trial):
        # Parameters are suggested one by one, in a fixed order.
        n_estimators = trial.suggest_int("n_estimators", 200, 2000, log=True)
        max_depth = trial.suggest_int("max_depth", 2, 40)
        max_features = trial.suggest_float("max_features", 0.2, 1.0)
        min_split = trial.suggest_int("min_split", 2, 50, log=True)
        min_leaf = trial.suggest_int("min_leaf", 1, 50, log=True)
        bootstrap = trial.suggest_categorical("bootstrap", [False, True])
        candidate = {
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "max_features": max_features,
            "min_split": min_split,
            "min_leaf": min_leaf,
            "bootstrap": bootstrap,
        }
        return cv_auc("tree", build_etc, candidate)

    sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(score_trial, n_trials=N_TRIALS)
    return study, build_etc

def optimize_RF():
    """Tune ``RandomForestClassifier`` with a seeded Optuna TPE study.

    Every trial is scored by ``cv_auc`` with the "tree" preprocessing and the
    study maximises that score over ``N_TRIALS`` trials.

    Returns:
        tuple: ``(study, factory)`` where ``factory(params)`` builds an
        unfitted classifier from a trial's parameter dict.
    """
    def build_rf(p):
        # Translate the short search-space keys into estimator keywords.
        return RandomForestClassifier(
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            max_features=p["max_features"],
            min_samples_split=p["min_split"],
            min_samples_leaf=p["min_leaf"],
            bootstrap=p["bootstrap"],
            random_state=RANDOM_SEED,
            n_jobs=N_JOBS,
        )

    def score_trial(trial):
        # Parameters are suggested one by one, in a fixed order.
        n_estimators = trial.suggest_int("n_estimators", 200, 2000, log=True)
        max_depth = trial.suggest_int("max_depth", 2, 40)
        max_features = trial.suggest_float("max_features", 0.2, 1.0)
        min_split = trial.suggest_int("min_split", 2, 50, log=True)
        min_leaf = trial.suggest_int("min_leaf", 1, 50, log=True)
        bootstrap = trial.suggest_categorical("bootstrap", [True, False])
        candidate = {
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "max_features": max_features,
            "min_split": min_split,
            "min_leaf": min_leaf,
            "bootstrap": bootstrap,
        }
        return cv_auc("tree", build_rf, candidate)

    sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(score_trial, n_trials=N_TRIALS)
    return study, build_rf

def optimize_XGB():
    """Tune ``XGBClassifier`` with a seeded Optuna TPE study.

    Every trial is scored by ``cv_auc`` with the "tree" preprocessing and the
    study maximises that score over ``N_TRIALS`` trials.

    Returns:
        tuple: ``(study, factory)`` where ``factory(params)`` builds an
        unfitted classifier from a trial's parameter dict.
    """
    def build_xgb(p):
        # Translate the short search-space keys into estimator keywords.
        return XGBClassifier(
            objective="binary:logistic",
            eval_metric="auc",
            n_estimators=p["n_estimators"],
            learning_rate=p["lr"],
            max_depth=p["max_depth"],
            subsample=p["subsample"],
            colsample_bytree=p["colsample_bytree"],
            reg_lambda=p["l2"],
            reg_alpha=p["l1"],
            min_child_weight=p["min_child_weight"],
            gamma=p["gamma"],
            random_state=RANDOM_SEED,
            n_jobs=N_JOBS,
            use_label_encoder=False,
        )

    def score_trial(trial):
        # Parameters are suggested one by one, in a fixed order.
        n_estimators = trial.suggest_int("n_estimators", 300, 3000, log=True)
        lr = trial.suggest_float("lr", 0.005, 0.3, log=True)
        max_depth = trial.suggest_int("max_depth", 2, 16)
        subsample = trial.suggest_float("subsample", 0.6, 1.0)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)
        l2 = trial.suggest_float("l2", 1e-8, 10.0, log=True)
        l1 = trial.suggest_float("l1", 1e-8, 10.0, log=True)
        min_child_weight = trial.suggest_float("min_child_weight", 1e-2, 10.0, log=True)
        gamma = trial.suggest_float("gamma", 0.0, 5.0)
        candidate = {
            "n_estimators": n_estimators,
            "lr": lr,
            "max_depth": max_depth,
            "subsample": subsample,
            "colsample_bytree": colsample_bytree,
            "l2": l2,
            "l1": l1,
            "min_child_weight": min_child_weight,
            "gamma": gamma,
        }
        return cv_auc("tree", build_xgb, candidate)

    sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(score_trial, n_trials=N_TRIALS)
    return study, build_xgb

def optimize_LinearSVC():
    """Tune a sigmoid-calibrated ``LinearSVC`` with a seeded Optuna TPE study.

    Every trial is scored by ``cv_auc`` with the "linear" (standardised)
    preprocessing and the study maximises that score over ``N_TRIALS`` trials.

    The base ``LinearSVC`` now receives ``random_state=RANDOM_SEED`` like the
    other seeded models in this file, so repeated runs are reproducible.

    Returns:
        tuple: ``(study, make)`` where ``make(params)`` builds an unfitted
        ``CalibratedClassifierCV`` wrapping the SVM for the given parameters.
    """
    def make(params):
        base = LinearSVC(C=params["C"], tol=params["tol"], max_iter=20000,
                         random_state=RANDOM_SEED)
        # Calibration supplies predict_proba, which LinearSVC itself lacks.
        return CalibratedClassifierCV(base, cv=3, method="sigmoid")

    def objective(trial):
        params = {
            "C": trial.suggest_float("C", 1e-3, 100.0, log=True),
            "tol": trial.suggest_float("tol", 1e-5, 1e-2, log=True),
        }
        return cv_auc("linear", make, params)

    st = optuna.create_study(direction="maximize",
                             sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED))
    st.optimize(objective, n_trials=N_TRIALS)
    return st, make

def optimize_Bagging():
    """Tune ``BaggingClassifier`` with a seeded Optuna TPE study.

    Every trial is scored by ``cv_auc`` with the "tree" preprocessing and the
    study maximises that score over ``N_TRIALS`` trials.

    Returns:
        tuple: ``(study, factory)`` where ``factory(params)`` builds an
        unfitted classifier from a trial's parameter dict.
    """
    def build_bagging(p):
        return BaggingClassifier(
            n_estimators=p["n_estimators"],
            max_samples=p["max_samples"],
            max_features=p["max_features"],
            bootstrap=p["bootstrap"],
            bootstrap_features=p["bootstrap_features"],
            random_state=RANDOM_SEED,
            n_jobs=N_JOBS,
        )

    def score_trial(trial):
        # Parameters are suggested one by one, in a fixed order.
        n_estimators = trial.suggest_int("n_estimators", 50, 1000, log=True)
        max_samples = trial.suggest_float("max_samples", 0.5, 1.0)
        max_features = trial.suggest_float("max_features", 0.5, 1.0)
        bootstrap = trial.suggest_categorical("bootstrap", [True, False])
        bootstrap_features = trial.suggest_categorical("bootstrap_features", [True, False])
        candidate = {
            "n_estimators": n_estimators,
            "max_samples": max_samples,
            "max_features": max_features,
            "bootstrap": bootstrap,
            "bootstrap_features": bootstrap_features,
        }
        return cv_auc("tree", build_bagging, candidate)

    sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(score_trial, n_trials=N_TRIALS)
    return study, build_bagging

def optimize_SVC():
    """Tune an RBF ``SVC`` (with probability estimates) via a seeded Optuna TPE study.

    Every trial is scored by ``cv_auc`` with the "linear" (standardised)
    preprocessing and the study maximises that score over ``N_TRIALS`` trials.

    Returns:
        tuple: ``(study, factory)`` where ``factory(params)`` builds an
        unfitted classifier from a trial's parameter dict.
    """
    def build_svc(p):
        return SVC(
            C=p["C"],
            gamma=p["gamma"],
            kernel="rbf",
            probability=True,
            random_state=RANDOM_SEED,
        )

    def score_trial(trial):
        # Parameters are suggested one by one, in a fixed order.
        c_value = trial.suggest_float("C", 1e-2, 1e3, log=True)
        gamma = trial.suggest_float("gamma", 1e-4, 1e1, log=True)
        return cv_auc("linear", build_svc, {"C": c_value, "gamma": gamma})

    sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(score_trial, n_trials=N_TRIALS)
    return study, build_svc

def optimize_LogReg():
    """Tune ``LogisticRegression`` with a seeded Optuna TPE study.

    Every trial is scored by ``cv_auc`` with the "linear" (standardised)
    preprocessing and the study maximises that score over ``N_TRIALS`` trials.
    Trials that combine the L1 penalty with a solver other than liblinear /
    saga are pruned before any model is fitted.

    The estimator now receives ``random_state=RANDOM_SEED`` like the other
    seeded models in this file, so repeated runs are reproducible.

    Returns:
        tuple: ``(study, make)`` where ``make(params)`` builds an unfitted
        classifier from a trial's parameter dict.
    """
    def make(params):
        return LogisticRegression(
            C=params["C"], penalty=params["penalty"], solver=params["solver"],
            max_iter=20000, n_jobs=N_JOBS, random_state=RANDOM_SEED)

    def objective(trial):
        penalty = trial.suggest_categorical("penalty", ["l2", "l1"])
        solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear", "saga"])
        # Unsupported penalty/solver pairs are discarded instead of raising in fit().
        if penalty == "l1" and solver not in ["liblinear", "saga"]:
            raise optuna.exceptions.TrialPruned()
        params = {
            "C": trial.suggest_float("C", 1e-3, 100.0, log=True),
            "penalty": penalty,
            "solver": solver,
        }
        return cv_auc("linear", make, params)

    st = optuna.create_study(direction="maximize",
                             sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED))
    st.optimize(objective, n_trials=N_TRIALS)
    return st, make

# ======================= RUN EACH MODEL =======================
optimizers = [
    ("HistGradientBoostingClassifier", optimize_HGB, "tree"),
    ("LGBMClassifier",                  optimize_LGBM, "tree"),
    ("ExtraTreesClassifier",            optimize_ETC, "tree"),
    ("RandomForestClassifier",          optimize_RF,  "tree"),
    ("XGBClassifier",                   optimize_XGB, "tree"),
    ("LinearSVC",                       optimize_LinearSVC, "linear"),
    ("BaggingClassifier",               optimize_Bagging, "tree"),
    ("SVC",                             optimize_SVC, "linear"),
    ("LogisticRegression",              optimize_LogReg, "linear"),
]

all_metrics, roc_curves, proba_cache = [], {}, {}
best_params_by_model: Dict[str, dict] = {}
prep_kind_by_model: Dict[str, str] = {}

for model_name, opt_fun, prep_kind in optimizers:
    print(f"\n=== {model_name}: HPO with 5-fold CV ===")
    study, maker = opt_fun()
    best_params = study.best_trial.params
    best_params_by_model[model_name] = best_params
    prep_kind_by_model[model_name] = prep_kind
    pd.DataFrame([best_params]).to_csv(OUT/"reports"/f"{model_name}_best_params.csv", index=False)

    # Fit on training set (with small internal val split for boosters)
    P = prep_tree if prep_kind == "tree" else prep_linear
    P.fit(X_tr); Xtr_ = P.transform(X_tr); Xte_ = P.transform(X_te)
    clf = maker(best_params)

    if isinstance(clf, LGBMClassifier):
        Xtr_i, Xva_i, ytr_i, yva_i = train_test_split(Xtr_, y_tr, test_size=0.1,
                                                      stratify=y_tr, random_state=RANDOM_SEED)
        clf.set_params(random_state=RANDOM_SEED, n_jobs=N_JOBS)
        clf.fit(Xtr_i, ytr_i, eval_set=[(Xva_i, yva_i)], eval_metric="auc", verbose=-1)
    elif isinstance(clf, XGBClassifier):
        Xtr_i, Xva_i, ytr_i, yva_i = train_test_split(Xtr_, y_tr, test_size=0.1,
                                                      stratify=y_tr, random_state=RANDOM_SEED)
        clf.set_params(random_state=RANDOM_SEED, n_jobs=N_JOBS)
        clf.fit(Xtr_i, ytr_i, eval_set=[(Xva_i, yva_i)], eval_metric="auc",
                early_stopping_rounds=100, verbose=False)
    else:
        clf.fit(Xtr_, y_tr)

    s_test = clf.predict_proba(Xte_)[:, 1] if hasattr(clf,"predict_proba") else \
             (lambda d: (d-d.min())/(d.max()-d.min()+1e-12))(clf.decision_function(Xte_))
    y_pred = (s_test >= 0.5).astype(int)

    auc = roc_auc_score(y_te, s_test)
    ap  = average_precision_score(y_te, s_test)
    f1  = f1_score(y_te, y_pred)
    acc = accuracy_score(y_te, y_pred)
    bacc = balanced_accuracy_score(y_te, y_pred)
    kap = cohen_kappa_score(y_te, y_pred)
    cm  = confusion_matrix(y_te, y_pred, labels=[0,1])
    more = metrics_from_cm(cm)
    row = {"Model": model_name, "AUC": auc, "AP": ap, "F1": f1,
           "BalancedAcc": bacc, "Kappa": kap, "Accuracy": acc, **more}
    all_metrics.append(row)

    fpr, tpr, _ = roc_curve(y_te, s_test)
    roc_curves[model_name] = (fpr, tpr, auc)

    proba_cache[model_name] = s_test
    np.save(OUT/"preds"/f"{model_name}_test_scores.npy", s_test)
    print(f"{model_name} | AUC {auc:.4f}  AP {ap:.4f}")

# Save metrics + ROC
# Hold-out metrics for every model, best AUC first.
metrics_df = pd.DataFrame(all_metrics).sort_values("AUC", ascending=False)
metrics_df.to_csv(OUT/"reports"/"metrics_table.csv", index=False)

# One ROC curve per model on a shared square canvas, plus the chance diagonal.
roc_fig, roc_ax = plt.subplots(figsize=(8, 8))
for name, (fpr, tpr, auc) in roc_curves.items():
    roc_ax.plot(fpr, tpr, label=f"{name} [AUC {auc:.3f}]")
roc_ax.plot([0, 1], [0, 1], "k--", lw=1)
roc_ax.set_xlabel("False Positive Rate [1 - Specificity]")
roc_ax.set_ylabel("True Positive Rate [Sensitivity]")
roc_ax.set_title("ROC-AUC Curve (70/30 hold-out)")
roc_ax.legend(loc="lower right", fontsize=8)
roc_fig.tight_layout()
roc_fig.savefig(OUT/"figs"/"roc_auc_holdout.png", dpi=220)
plt.close(roc_fig)

# DeLong vs LGBM
# Every other model's hold-out scores are compared pairwise against LightGBM.
if "LGBMClassifier" not in proba_cache:
    print("Skipped DeLong: LGBMClassifier not available.")
else:
    y_true = y_te.astype(int)
    ref = proba_cache["LGBMClassifier"]
    rows = []
    for name, s in proba_cache.items():
        if name != "LGBMClassifier":
            auc_ref, auc_cmp, z, p = delong_test(y_true, ref, s)
            rows.append({
                "Model": name,
                "AUC_ref(LGBM)": auc_ref,
                "AUC_model": auc_cmp,
                "stat z": z,
                "p-value": p,
            })
    delong_df = pd.DataFrame(rows).sort_values("p-value")
    delong_df.to_csv(OUT/"reports"/"delong_vs_LGBM.csv", index=False)
    print("\nDeLong vs LGBM:\n", delong_df)

print(f"\nAll ML artifacts saved in: {OUT.resolve()}")

# ================== FEATURE IMPORTANCE (driven by HPO outputs) ==================
# Rebuild the 3 diverse models from the BEST params found above, wrap each in a
# Pipeline (preprocessor + estimator), then compute permutation importances.

# Friendly labels mapping
friendly = {
    "HistGradientBoostingClassifier": "Gradient Boosting",
    "RandomForestClassifier": "Random Forest",
    "SVC": "Support Vector Machine"
}

# Pick models that exist in best_params_by_model
fi_models = []
for key in ["HistGradientBoostingClassifier", "RandomForestClassifier", "SVC"]:
    if key not in best_params_by_model:
        continue
    params = best_params_by_model[key]
    kind   = prep_kind_by_model[key]
    # Each pipeline gets its own preprocessor copy so fitted state is never shared.
    prep   = clone(prep_tree if kind == "tree" else prep_linear)

    # The stored dicts use the search-space keys ("lr", "l2", "min_split",
    # "min_leaf", ...), which are NOT estimator keyword names; unpacking them
    # with ** raised TypeError. Map them explicitly, mirroring the factories
    # used during HPO.
    if key == "HistGradientBoostingClassifier":
        est = HistGradientBoostingClassifier(
            learning_rate=params["lr"], max_depth=params["max_depth"],
            max_leaf_nodes=params["max_leaf_nodes"],
            min_samples_leaf=params["min_samples_leaf"],
            l2_regularization=params["l2"], max_bins=params["max_bins"],
            random_state=RANDOM_SEED)
    elif key == "RandomForestClassifier":
        est = RandomForestClassifier(
            n_estimators=params["n_estimators"], max_depth=params["max_depth"],
            max_features=params["max_features"],
            min_samples_split=params["min_split"],
            min_samples_leaf=params["min_leaf"], bootstrap=params["bootstrap"],
            random_state=RANDOM_SEED, n_jobs=N_JOBS)
    else:  # "SVC"
        est = SVC(C=params["C"], gamma=params["gamma"], kernel="rbf",
                  probability=True, random_state=RANDOM_SEED)

    pipe = Pipeline([("prep", prep), ("clf", est)])
    fi_models.append((friendly[key], pipe))

# Fit on train, evaluate AUC on holdout (used as weights)
aucs_fi: Dict[str, float] = {}
fitted_pipes: Dict[str, Pipeline] = {}
for name, pipe in fi_models:
    pipe.fit(X_tr, y_tr)
    if hasattr(pipe.named_steps["clf"], "predict_proba"):
        s = pipe.predict_proba(X_te)[:, 1]
    else:
        # No probabilities available: min-max rescale the decision values.
        margins = pipe.decision_function(X_te)
        s = (margins - margins.min()) / (margins.max() - margins.min() + 1e-12)
    fitted_pipes[name] = pipe
    aucs_fi[name] = roc_auc_score(y_te, s)

# Permutation importances (on original columns via Pipeline)
# Negative mean importances are clipped to 0 and each model's vector is scaled
# so that its largest entry equals 1 (left untouched when everything is 0).
imps: Dict[str, np.ndarray] = {}
feat_names = np.array(X.columns)
for name, pipe in fitted_pipes.items():
    perm = permutation_importance(
        pipe,
        X_te,
        y_te,
        scoring="roc_auc",
        n_repeats=10,
        random_state=RANDOM_SEED,
        n_jobs=-1,
    )
    imp = perm.importances_mean.clip(min=0)
    peak = imp.max()
    imps[name] = imp / peak if peak > 0 else imp

# Build importance table
# Long format: one row per (feature, model).
per_model_frames = []
for model_label, scores in imps.items():
    per_model_frames.append(
        pd.DataFrame({"feature": feat_names, "model": model_label, "importance": scores})
    )
imp_long = pd.concat(per_model_frames, ignore_index=True)

# Plain mean across models.
avg_imp = imp_long.groupby("feature")["importance"].mean()
avg_imp = avg_imp.rename("avg_importance").reset_index()

# AUC-weighted mean: weights are the hold-out AUCs normalised to sum to 1.
w_series = pd.Series(aucs_fi)
w_series = w_series / w_series.sum()
weight_table = w_series.rename("w").rename_axis("model").reset_index()
w = imp_long.merge(weight_table, on="model", how="left")
w["w_imp"] = w["importance"] * w["w"]
wavg_imp = w.groupby("feature")["w_imp"].sum()
wavg_imp = wavg_imp.rename("wavg_importance").reset_index()

# Wide per-model columns joined to both averages, most important first.
per_model_wide = imp_long.pivot(index="feature", columns="model", values="importance").reset_index()
imp_summary = avg_imp.merge(wavg_imp, on="feature")
imp_summary = imp_summary.merge(per_model_wide, on="feature", how="left")
imp_summary = imp_summary.sort_values("avg_importance", ascending=False).reset_index(drop=True)

# Save & plot TOP-50
TOP_K_FEATURES = 50
imp_summary.to_csv(FI_OUT/"feature_importances_all_models.csv", index=False)
top50 = imp_summary.head(TOP_K_FEATURES).copy()
top50.to_csv(FI_OUT/f"feature_importances_top{TOP_K_FEATURES}.csv", index=False)

# Scatter plot: per-model points plus the plain and AUC-weighted averages.
top_long = imp_long[imp_long["feature"].isin(top50["feature"])]
avg_for_plot = top50[["feature","avg_importance"]].assign(model="Average")\
                 .rename(columns={"avg_importance":"importance"})
wavg_for_plot = top50[["feature","wavg_importance"]].assign(model="Average (weighted)")\
                  .rename(columns={"wavg_importance":"importance"})
top_long_plot = pd.concat([top_long, avg_for_plot, wavg_for_plot], ignore_index=True)
cat_order = top50["feature"].tolist()[::-1]
# Feature -> row position, built once instead of a list search per plotted point.
y_index = {feat: pos for pos, feat in enumerate(cat_order)}

palette = {
    "Average": "black",
    "Average (weighted)": "dimgray",
    "Gradient Boosting": "#ff7f0e",
    "Random Forest": "#1f77b4",
    "Support Vector Machine": "#2ca02c",
}

plt.figure(figsize=(9, 11))
for model_name, sub in top_long_plot.groupby("model"):
    y_pos = sub["feature"].map(y_index)
    plt.scatter(sub["importance"], y_pos, s=22, label=model_name,
                color=palette.get(model_name, None), alpha=0.95, edgecolor="none")
plt.yticks(ticks=range(len(cat_order)), labels=cat_order, fontsize=8)
# NOTE(review): cat_order is already reversed and the axis is inverted as well,
# so the two flips cancel and the top-ranked feature ends up at the bottom -
# confirm this is the intended orientation.
plt.gca().invert_yaxis()
plt.xlabel("Importance"); plt.ylabel("Features")
plt.title(f"Feature importances across Models — Top {TOP_K_FEATURES}")
handles, labels = plt.gca().get_legend_handles_labels()
order = ["Average","Average (weighted)","Gradient Boosting","Random Forest","Support Vector Machine"]
# Filter handles AND labels together: previously only the handles were
# filtered, so a missing model shifted every following legend label.
handle_by_label = dict(zip(labels, handles))
shown = [lbl for lbl in order if lbl in handle_by_label]
plt.legend([handle_by_label[lbl] for lbl in shown], shown, loc="center left",
           bbox_to_anchor=(0.72, 0.5), title="Model", frameon=False)
plt.tight_layout(); plt.savefig(FI_OUT/f"feature_importances_top{TOP_K_FEATURES}.png", dpi=220); plt.close()

print(f"[OK] Saved feature-importance plots/tables -> {FI_OUT.resolve()}")

# ===================== PDPs (use best AUC among the three) =====================
best_model_name = max(aucs_fi, key=aucs_fi.get)
best_pipe = fitted_pipes[best_model_name]
print(f"PDPs using: {best_model_name}  AUC={aucs_fi[best_model_name]:.3f}")

# Configure PDP feature pairs (fallback to top-ranked)
PDP_GROUPS: List[List[str]] = [
    ["gg_medie_irrig", "plv_colture"],
    ["sau_irrigata", "volume_acqua_ha"],
    ["share_irrigated", "irr_pioggia"]
]
fallback_feats = top50["feature"].tolist()
all_feats = set(X.columns.tolist())

def choose_pair(pair: List[str]) -> List[str]:
    """Return ``pair`` if both features exist, else the two top-ranked features.

    The fallback list may hold fewer than two names when the data has fewer
    than two usable features; callers must check the length.
    """
    a, b = pair
    if a in all_feats and b in all_feats: return pair
    # fallback to first two distinct features from top50
    out = []
    for f in fallback_feats:
        if f in all_feats and f not in out:
            out.append(f)
        if len(out) == 2: break
    return out

plotted_pairs = set()
for grp in PDP_GROUPS:
    pair = choose_pair(grp)
    if len(pair) < 2:
        # Previously this case crashed while unpacking the pair.
        print(f"[WARN] Skipped PDP for {grp}: fewer than two usable features.")
        continue
    feat_a, feat_b = pair
    if (feat_a, feat_b) in plotted_pairs:
        # Several missing groups fall back to the same pair; drawing it again
        # would only recompute and overwrite the identical figure.
        print(f"[WARN] Skipped PDP for {grp}: pair {feat_a}/{feat_b} already plotted.")
        continue
    plotted_pairs.add((feat_a, feat_b))

    # Partial dependence is evaluated on the hold-out rows, as stated in the
    # file header (the full data set, including training rows, was used before).
    fig, axes = plt.subplots(1, 3, figsize=(12, 3.7))
    PartialDependenceDisplay.from_estimator(best_pipe, X_te, [feat_a], ax=axes[0], kind="average", grid_resolution=30)
    axes[0].set_ylabel("Target variable")
    PartialDependenceDisplay.from_estimator(best_pipe, X_te, [feat_b], ax=axes[1], kind="average", grid_resolution=30)
    axes[1].set_ylabel("Target variable")
    PartialDependenceDisplay.from_estimator(best_pipe, X_te, [(feat_a, feat_b)], ax=axes[2], kind="average", grid_resolution=25)
    fig.suptitle(f"Features: {feat_a} and {feat_b}", y=1.03, fontsize=12)
    plt.tight_layout()
    out_png = FI_OUT/f"pdp_{feat_a}__{feat_b}.png"
    plt.savefig(out_png, dpi=220, bbox_inches="tight"); plt.close(fig)
    print(f"[OK] Saved PDP: {out_png.name}")
