In [None]:

# --- 2. Fonctions de Diagnostic (Utilitaires) ---

def get_scores_for_auc(pipe, X):
    """Return continuous scores for ``X`` to feed an AUC computation.

    Args:
        pipe: Fitted estimator or pipeline.
        X: Samples to score.

    Returns:
        Column 1 of ``pipe.predict_proba(X)`` when ``pipe`` exposes
        ``predict_proba``; otherwise the output of
        ``pipe.decision_function(X)`` when that is available; otherwise
        ``None``.
    """
    # Probabilities take precedence over raw decision values.
    if hasattr(pipe, "predict_proba"):
        class_probabilities = pipe.predict_proba(X)
        return class_probabilities[:, 1]

    if hasattr(pipe, "decision_function"):
        return pipe.decision_function(X)

    # No scoring method available on this estimator.
    return None

def compute_metrics_safe(y_true, y_pred, y_score):
    """Compute the metrics dictionary used in the final model summary.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_score: Continuous scores for the positive class, or ``None`` when
            the model exposes no scoring method.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``recall``, ``precision`` and
        ``roc_auc``. F1, recall and precision fall back to 0 on zero division.
        ``roc_auc`` is the placeholder ``0`` (not a measured value) when it
        cannot be computed: no scores were supplied, or ``y_true`` contains
        fewer than two distinct classes.
    """
    # ROC-AUC is undefined when only one class is present in y_true; guard it
    # here so the helper stays usable on degenerate folds instead of failing.
    auc_is_defined = y_score is not None and len(set(y_true)) > 1
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_true, y_score) if auc_is_defined else 0,
    }

# --- 3. Data preparation ---

# Columns routed to the numeric branch of the preprocessing ColumnTransformer.
num_features = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]
# Columns routed to the categorical branch of the preprocessing ColumnTransformer.
cat_features = [
    'anaemia',
    'diabetes',
    'high_blood_pressure',
    'sex',
    'smoking',
]
# Label column to predict.
target = 'DEATH_EVENT'

# Remove exact duplicate rows and renumber the index from 0.
df_clean = (
    df
    .drop_duplicates()
    .reset_index(drop=True)
)
# Separate the label from the feature columns.
y = df_clean[target]
X = df_clean.drop(columns=[target])

# --- 4. Preprocessing pipelines ---

# Numeric columns: median imputation followed by standardisation.
_numeric_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
_categorical_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Apply each sub-pipeline to its own column group.
preprocessor = ColumnTransformer(transformers=[
    ("num", _numeric_steps, num_features),
    ("cat", _categorical_steps, cat_features),
])

# Stratified 60/20/20 split: hold out 20% as the test set, then take 25% of
# the remaining 80% (i.e. 20% of the total) as the validation set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    stratify=y_temp,
    random_state=42,
)

# Negative-to-positive ratio on the training set, passed to XGBoost and
# LightGBM as scale_pos_weight to compensate for class imbalance.
_n_negative = (y_train == 0).sum()
_n_positive = (y_train == 1).sum()
scale_pos = _n_negative / _n_positive

# --- 5. Definition of the 8 models ---

# Insertion order is the order in which the models are evaluated.
models_dict = {
    "LogisticRegression": LogisticRegression(
        class_weight="balanced",
        max_iter=2000,
    ),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    # probability=True makes predict_proba available on the SVC.
    "SVC": SVC(
        probability=True,
        class_weight="balanced",
        random_state=42,
    ),
    "RandomForest": RandomForestClassifier(
        class_weight="balanced",
        random_state=42,
    ),
    "ExtraTrees": ExtraTreesClassifier(
        class_weight="balanced",
        random_state=42,
    ),
    "GradBoost": GradientBoostingClassifier(random_state=42),
    # The two boosters below are weighted with the train-set class ratio.
    "XGBoost": XGBClassifier(
        scale_pos_weight=scale_pos,
        eval_metric="logloss",
        random_state=42,
    ),
    "LightGBM": LGBMClassifier(
        scale_pos_weight=scale_pos,
        random_state=42,
        verbose=-1,
    ),
}

# --- 6. Baseline evaluation with plots ---

# Output directory for the ROC and confusion-matrix images.
os.makedirs("figs_baseline", exist_ok=True)
# One metrics dict per model, consumed by the final summary.
records = []

for name, model in models_dict.items():
    # Build the pipeline. SMOTE is inserted only for KNN and GradBoost, the
    # two models defined without a class-weight / scale_pos_weight setting.
    # NOTE: the same `preprocessor` object is shared by every pipeline and is
    # refitted on X_train at each iteration.
    steps = [("pre", preprocessor)]
    if name in ("KNN", "GradBoost"):
        steps.append(("smote", SMOTE(random_state=42)))
    steps.append(("clf", model))

    pipe = ImbPipeline(steps=steps)

    # Training. Only the fit is timed (pipeline construction excluded), using
    # perf_counter, which is monotonic and meant for measuring durations.
    t0 = time.perf_counter()
    pipe.fit(X_train, y_train)
    t1 = time.perf_counter() - t0

    # Evaluation on the validation split.
    y_pred = pipe.predict(X_val)
    y_score = get_scores_for_auc(pipe, X_val)

    m = compute_metrics_safe(y_val, y_pred, y_score)
    m.update({"model": name, "train_time_sec": round(t1, 3)})
    records.append(m)

    # Save the ROC curve (skipped when the model provides no scores).
    if y_score is not None:
        fpr, tpr, _ = roc_curve(y_val, y_score)
        roc_fig = plt.figure(figsize=(5, 4))
        plt.plot(fpr, tpr, label=f"AUC={m['roc_auc']:.3f}", lw=2)
        plt.plot([0, 1], [0, 1], "k--")  # chance-level diagonal
        plt.title(f"ROC - {name}")
        plt.legend()
        plt.savefig(f"figs_baseline/roc_{name}.png")
        plt.show()
        # Release the figure explicitly so figures do not accumulate across
        # iterations when running outside an inline notebook backend.
        plt.close(roc_fig)

    # Save the confusion matrix.
    cm = confusion_matrix(y_val, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f"CM - {name}")
    plt.savefig(f"figs_baseline/cm_{name}.png")
    plt.show()
    # Same clean-up for the figure created by ConfusionMatrixDisplay.plot().
    plt.close(disp.figure_)

# --- 7. Final summary ---

# One row per model, ordered by validation ROC-AUC (best first).
baseline_results = (
    pd.DataFrame(records)
    .sort_values(by="roc_auc", ascending=False)
)
print("\n=== CLASSEMENT DES MODÈLES (Basé sur l'AUC) ===")
display(baseline_results)

# Keep the three best-ranked models for the next stage.
top3 = baseline_results["model"].iloc[:3].tolist()
print("Top-3 (val, AUC→):", top3)

# NOTE(review): despite the name, the values are the estimator objects taken
# from models_dict, not the ImbPipeline objects (with preprocessing / SMOTE)
# built in the evaluation loop — confirm this is what downstream code expects.
top3_pipelines = {}
for _model_name in top3:
    top3_pipelines[_model_name] = models_dict[_model_name]