## IMPORTS and UTILS

In [1]:
print("hi")

import os, math
import pandas as pd
import subprocess
import re
import numpy as np
import json
import csv

from joblib import Parallel, delayed
BW_FOLDER = "BS"


import matplotlib.pyplot as plt
import seaborn as sns


import os

from scripts import combat_info
from scripts import combat_quick_apply
from scripts import combat_quick_QC
from robust_evaluation_tools.robust_utils import get_site, robust_text, rwp_text, get_camcan_file, get_diseases, get_metrics, add_nb_patients_and_diseased
from robust_evaluation_tools.robust_harmonization import fit, apply, visualize_harmonization, QC, compare_with_compilation, create_presentation, compare_distances, compare_with_compilation_var
from robust_evaluation_tools.synthectic_sites_generations import generate_sites
from robust_evaluation_tools.robust_outlier_detection import z_score_detection, flag_sid
from robust_evaluation_tools.robust_MLP import predict_malades_MLP

# Root folder of this experiment; the loaders below read the per-run CSV
# compilations from "<MAINFOLDER>/PROCESS/...".
MAINFOLDER = "RESULTS/MAE_TEST"
# Sub-folder path for the synthetic sites, derived from MAINFOLDER.
SYNTHETIC_SITES = f"{MAINFOLDER}/SYNTHETIC_SITES"

# Sub-folder paths for analysis outputs and, beneath it, the MAE plots.
ANALYSIS_FOLDER = f"{MAINFOLDER}/ANALYSIS"
MAE_PLOT_FOLDER = f"{ANALYSIS_FOLDER}/MAE_PLOTS"

hi


## EXECUTOR

In [2]:
# Experiment configuration shared by the cells below.
harmonization_method= "classic"
SYNTHETIC_SITES_VERSION = "v1"

# Metric list comes from the project helper; its contents are not visible here.
metrics = get_metrics()
#diseases = get_diseases(True)
diseases = ["ASTMIX", "AD", "SCHZ", "TBI"]
robust_methods = ["Z_SCORE_MAD", 'Z_SCORE_IQR',"IQR",'MAD','MMS', 'VS','FLIP', 'Z_SCORE', ]
#robust_methods = ["MMS","IQR",'MAD', 'VS', 'VS2', 'TOP30', 'FLIP']
#'Z_SCORE'


sample_sizes = [30,100,150]  # Candidate sample sizes (overridden by the next line)
sample_sizes = [100]  # Sample sizes actually used
disease_ratios = [0.03, 0.1, 0.3, 0.5, 0.7, 0.8]   # Proportions of diseased subjects to test
num_tests = 20  # Number of tests to run for each combination
n_jobs=-1

# for disease in diseases:
#     generate_sites_for_disease(
#         disease, SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION, sample_sizes, disease_ratios, num_tests, n_jobs
#     )

In [3]:
def load_mae_or_maev_compilations(mainfolder, diseases, sample_sizes, disease_ratios, num_tests, mae_or_maev='mae'):
    """Gather the per-run compilation CSVs of one error measure.

    Every combination of disease, sample size, disease ratio and test index
    maps to the run folder
    ``<mainfolder>/PROCESS/<disease>/<size>_<int(ratio*100)>/<test index>``.
    In each folder the files ``<mae_or_maev>_compilation_test.csv`` and
    ``<mae_or_maev>_compilation_train.csv`` are read when they exist; missing
    files are skipped without error.

    Returns:
        ``(df_test, df_train)``: the concatenation (fresh 0..n-1 index) of all
        test files and of all train files found. A split with no file at all
        yields an empty ``DataFrame``.
    """
    run_dirs = [
        os.path.join(mainfolder, "PROCESS", disease,
                     f"{size}_{int(ratio*100)}", str(test_idx))
        for disease in diseases
        for size in sample_sizes
        for ratio in disease_ratios
        for test_idx in range(num_tests)
    ]

    def _collect(split):
        # Read this split's CSV from every run folder that has one.
        frames = []
        for run_dir in run_dirs:
            csv_path = os.path.join(run_dir, f"{mae_or_maev}_compilation_{split}.csv")
            if os.path.isfile(csv_path):
                frames.append(pd.read_csv(csv_path))
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)

    return _collect("test"), _collect("train")

In [4]:
def load_compilation(mae_or_maev: str,
                     split: str,
                     *,
                     mainfolder: str,
                     diseases: list[str],
                     sample_sizes: list[int],
                     disease_ratios: list[float],
                     num_tests: int) -> pd.DataFrame:
    """Return the concatenated compilation table for one measure and one split.

    Args:
        mae_or_maev: Error measure to load; one of ``"mae"``, ``"maev"``,
            ``"smape"`` or ``"std_mae"``. It is the prefix of the CSV file
            names looked up by ``load_mae_or_maev_compilations``.
        split: ``"test"`` or ``"train"``.
        mainfolder: Root folder passed through to the loader.
        diseases: Disease names to iterate over.
        sample_sizes: Sample sizes to iterate over.
        disease_ratios: Disease ratios to iterate over (fractions such as
            ``0.1``; the loader turns them into integer percentages).
        num_tests: Number of test repetitions per combination.

    Returns:
        The table of the requested split (empty if no file was found).

    Raises:
        ValueError: If ``mae_or_maev`` or ``split`` is not an accepted value.
    """
    allowed_measures = ("mae", "maev", "smape", "std_mae")
    if mae_or_maev not in allowed_measures:
        # The message lists every accepted value, not just the first two.
        raise ValueError(
            "mae_or_maev doit être l'une des valeurs suivantes : "
            + ", ".join(repr(measure) for measure in allowed_measures)
        )
    if split not in {"test", "train"}:
        raise ValueError("split doit être 'test' ou 'train'")

    df_test, df_train = load_mae_or_maev_compilations(
        mainfolder,
        diseases,
        sample_sizes,
        disease_ratios,
        num_tests,
        mae_or_maev=mae_or_maev
    )
    return df_test if split == "test" else df_train

In [5]:
# load_mae_or_maev_compilations walks the PROCESS tree once and returns both
# splits as (test, train). Going through load_compilation separately for
# "train" and "test" would read every CSV twice and discard half of it, so
# each measure is loaded with a single call here.
mae_compilation_test_all, mae_compilation_train_all = load_mae_or_maev_compilations(
    MAINFOLDER,
    diseases,
    sample_sizes,
    disease_ratios,
    num_tests,
    mae_or_maev="mae",
)

smape_compilation_test_all, smape_compilation_train_all = load_mae_or_maev_compilations(
    MAINFOLDER,
    diseases,
    sample_sizes,
    disease_ratios,
    num_tests,
    mae_or_maev="smape",
)

std_mae_compilation_test_all, std_mae_compilation_train_all = load_mae_or_maev_compilations(
    MAINFOLDER,
    diseases,
    sample_sizes,
    disease_ratios,
    num_tests,
    mae_or_maev="std_mae",
)

In [6]:
def transformer_df_large_en_long(df_large):
    """Reshape a wide compilation table into long format.

    The columns ``site``, ``method``, ``robust_method``, ``disease`` and
    ``metric`` are kept as identifiers. Every other column is unpivoted into
    two columns: ``bundle`` (the former column name) and ``mae`` (its value).

    Rows whose ``robust_method`` is ``'No'`` get the value of their
    ``method`` column copied into ``robust_method``.
    """
    id_columns = ["site", "method", "robust_method", "disease", "metric"]
    value_columns = list(
        filter(lambda name: name not in id_columns, df_large.columns)
    )

    df_long = df_large.melt(
        id_vars=id_columns,
        value_vars=value_columns,
        var_name="bundle",
        value_name="mae",
    )

    # Rows without a robust variant are labelled by their harmonization method.
    without_robust = df_long['robust_method'] == 'No'
    df_long.loc[without_robust, 'robust_method'] = df_long.loc[without_robust, 'method']
    return df_long

In [7]:
def compute_metrics(pivot, methods, eps=1e-7):
    """Score each method against the ``NoRobust`` and ``hc`` columns of *pivot*.

    Rows are split into three cases:

    - Case 1: ``hc < NoRobust``. Score is ``1 - (E_m - hc) / (NoRobust - hc)``.
    - Case 2: ``NoRobust < hc``. Score is
      ``1 - (E_m - NoRobust) / (hc - NoRobust)``.
    - Case 3: ``|hc - NoRobust| < eps`` (near-equal). The relative change
      ``(NoRobust - E_m) / NoRobust`` is examined instead.

    For cases 1 and 2 a negative score counts as a failure. ``%fail`` is the
    share of failing rows among the rows of that case; ``mean_gain`` and
    ``mean_loss`` average the non-negative and negative scores respectively.

    Returns:
        ``(summary, info)``: ``summary`` has one row per entry of *methods*;
        ``info`` is a Series giving the percentage of rows in each case and
        the total row count.
    """
    no_robust_err = pivot["NoRobust"]
    hc_err = pivot["hc"]

    tied = (hc_err - no_robust_err).abs() < eps
    hc_lower = (hc_err < no_robust_err) & ~tied
    hc_higher = (no_robust_err < hc_err) & ~tied

    def _directional_stats(method_err, target, gap, in_case):
        # Score the rows of one case, then split into gains and losses.
        score = pd.Series(np.nan, index=pivot.index)
        score[in_case] = 1 - (method_err[in_case] - target[in_case]) / gap[in_case]

        gains = score[score >= 0]
        losses = score[score < 0]
        if in_case.any():
            fail_pct = len(losses) / in_case.sum() * 100
        else:
            fail_pct = np.nan
        return score, fail_pct, gains.mean(), losses.mean()

    records = []
    for name in methods:
        method_err = pivot[name]

        # Case 1: the target is hc, the gap is NoRobust - hc.
        score_c1, fail_c1, gain_c1, loss_c1 = _directional_stats(
            method_err, hc_err, no_robust_err - hc_err, hc_lower)

        # Case 2: the target is NoRobust, the gap is hc - NoRobust.
        score_c2, fail_c2, gain_c2, loss_c2 = _directional_stats(
            method_err, no_robust_err, hc_err - no_robust_err, hc_higher)

        # Case 3: relative change with respect to NoRobust on tied rows.
        rel_change = (no_robust_err - method_err) / no_robust_err
        improved = rel_change[tied & (rel_change > 0)]
        worsened = rel_change[tied & (rel_change < 0)]
        if tied.any():
            pct_improved = len(improved) / tied.sum() * 100
            pct_worsened = len(worsened) / tied.sum() * 100
        else:
            pct_improved = np.nan
            pct_worsened = np.nan

        # Overall mean of the case-1 and case-2 scores (NaN rows are ignored).
        overall_mean = pd.concat([score_c1, score_c2]).mean()

        records.append({
            "method": name,
            "%fail_c1": fail_c1,
            "mean_gain_c1": gain_c1,
            "mean_loss_c1": loss_c1,
            "%fail_c2": fail_c2,
            "mean_gain_c2": gain_c2,
            "mean_loss_c2": loss_c2,
            "%eq_improve": pct_improved,
            "mean_eq_gain": improved.mean(),
            "%eq_worsen": pct_worsened,
            "mean_eq_loss": worsened.mean(),
            "score_mean_total": overall_mean,
        })

    summary = pd.DataFrame(records)
    info = pd.Series({
        "Lignes cas1 (%)": hc_lower.mean()*100,
        "Lignes cas2 (%)": hc_higher.mean()*100,
        "Lignes cas3 (%)": tied.mean()*100,
        "Total lignes": len(pivot)
    })
    return summary, info


In [8]:
def plot_mae_mean_all_ratios(
        pivot_df, sample_size,
        directory, dataset_type, Y="MAE"):
    """Save a bar chart with one overall mean per method for one sample size.

    Rows of ``pivot_df`` whose ``num_patients`` index level equals
    ``sample_size`` are kept; all diseases, metrics, bundles and disease
    ratios among them are pooled. Each method column contributes one bar:
    the mean of its non-NaN values.

    The figure is written to
    ``<directory>/<Y>_PLOTS_MEAN/ALL_DISEASES_ALL_METRICS_ALL_RATIOS/<sample_size>/``
    as ``<Y>_mean_all_ratios_<dataset_type>.png``.
    """
    # Keep only the rows for the requested number of patients.
    patient_counts = pivot_df.index.get_level_values("num_patients")
    table = pivot_df.loc[patient_counts == sample_size].reset_index()

    # Any column that is not an identifier is treated as a method column.
    id_columns = ['site', 'disease', 'metric', 'bundle',
                  'num_patients', 'disease_ratio', 'num_diseased']
    method_names = [name for name in table.columns if name not in id_columns]

    # 'hc' and 'NoRobust' come first, the rest keep their column order.
    leading = [name for name in ('hc', 'NoRobust') if name in method_names]
    plot_order = leading + [name for name in method_names if name not in leading]

    # Fixed colours for the two references, viridis shades for the others.
    palette = {'hc': 'green', 'NoRobust': 'red'}
    others = [name for name in plot_order if name not in palette]
    for name, shade in zip(others, sns.color_palette("viridis", len(others))):
        palette[name] = shade

    heights = [table[name].dropna().mean() for name in plot_order]
    slots = np.arange(len(plot_order))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(
        slots, heights,
        width=0.7,
        color=[palette[name] for name in plot_order],
        edgecolor='black',
    )

    ax.set_xticks(slots)
    ax.set_xticklabels(plot_order, rotation=45, ha='right')
    ax.set_ylabel(f"Moyenne de {Y}")
    title_top = f"Moyenne de {Y} - toutes maladies, métriques et ratios confondus\n"
    title_bottom = f"Nb patients: {sample_size}   |   Dataset: {dataset_type}"
    ax.set_title(title_top + title_bottom)
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    plt.tight_layout()

    target_dir = os.path.join(
        directory,
        f"{Y}_PLOTS_MEAN",
        "ALL_DISEASES_ALL_METRICS_ALL_RATIOS",
        str(sample_size),
    )
    os.makedirs(target_dir, exist_ok=True)
    target_file = os.path.join(target_dir, f"{Y}_mean_all_ratios_{dataset_type}.png")
    plt.savefig(target_file, bbox_inches="tight")
    plt.close()

In [9]:
def plot_mae_mean_all_diseases_metrics(
        pivot_df, sample_size,
        directory, dataset_type, Y="MAE"):
    """Save a grouped bar chart of per-method means against disease ratio.

    Rows of ``pivot_df`` whose ``num_patients`` index level equals
    ``sample_size`` are kept; every disease, metric and bundle among them is
    pooled. For each ``disease_ratio`` value, one bar per method column shows
    the mean of that column's non-NaN values (no standard deviation).

    The figure is written to
    ``<directory>/<Y>_PLOTS_MEAN/ALL_DISEASES_ALL_METRICS/<sample_size>/``
    as ``<Y>_mean_all_diseases_metrics_<dataset_type>.png``.
    """
    # Restrict to the requested number of patients.
    n_patients = pivot_df.index.get_level_values("num_patients")
    rows = pivot_df.loc[n_patients == sample_size].reset_index()

    # Method columns are whatever is left once identifiers are removed.
    non_method = ['site', 'disease', 'metric', 'bundle',
                  'num_patients', 'disease_ratio', 'num_diseased']
    candidates = [name for name in rows.columns if name not in non_method]

    # References first, then the remaining methods in column order.
    series_order = [name for name in ['hc', 'NoRobust'] if name in candidates]
    series_order += [name for name in candidates if name not in series_order]

    colour_of = {'hc': 'green', 'NoRobust': 'red'}
    unassigned = [name for name in series_order if name not in colour_of]
    colour_of.update(zip(unassigned,
                         sns.color_palette("viridis", len(unassigned))))

    # One group of bars per ratio; each method gets an equal slot in it.
    ratio_levels = sorted(rows["disease_ratio"].unique())
    centres = np.arange(len(ratio_levels))
    group_width = .8
    slot = group_width / len(series_order)

    fig, ax = plt.subplots(figsize=(14, 7))

    for rank, name in enumerate(series_order):
        heights = []
        for level in ratio_levels:
            at_level = rows.loc[rows["disease_ratio"] == level, name]
            heights.append(at_level.dropna().mean())

        ax.bar(
            centres - group_width / 2 + (rank + .5) * slot,
            heights,
            width=slot * .9,
            color=colour_of[name],
            edgecolor='black',
            label=name,
        )

    ax.set_xlabel("Pourcentage de patients malades")
    ax.set_ylabel(f"Moyenne de {Y}")
    ax.set_title(
        f"Moyenne de {Y} (toutes maladies et métriques confondues)\n"
        f"Nb patients : {sample_size}   |   Dataset : {dataset_type}"
    )
    ax.set_xticks(centres)
    ax.set_xticklabels(ratio_levels)
    ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    plt.tight_layout()

    target_dir = os.path.join(directory, f"{Y}_PLOTS_MEAN",
                              "ALL_DISEASES_ALL_METRICS", str(sample_size))
    os.makedirs(target_dir, exist_ok=True)
    target_file = os.path.join(
        target_dir, f"{Y}_mean_all_diseases_metrics_{dataset_type}.png")
    plt.savefig(target_file, bbox_inches="tight")
    plt.close()


In [10]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def plot_rank(
        pivot_df, sample_size, disease, metric,
        directory, dataset_type, Y="MAE"):
    """Save a grouped bar chart of per-method means for one disease and metric.

    Rows of ``pivot_df`` are selected on the ``num_patients``, ``disease`` and
    ``metric`` index levels; all bundles among them are pooled. For each
    ``disease_ratio`` value, one bar per method column shows the mean of that
    column's non-NaN values.

    The figure is written to
    ``<directory>/<Y>_PLOTS_MEAN/<disease>/<sample_size>/`` as
    ``<Y>_<metric>_mean_<dataset_type>.png``.
    """
    # Select the rows of the requested (sample size, disease, metric) cell.
    levels = pivot_df.index
    wanted = (
        (levels.get_level_values("num_patients") == sample_size) &
        (levels.get_level_values("disease") == disease) &
        (levels.get_level_values("metric") == metric)
    )
    rows = pivot_df.loc[wanted].reset_index()

    # Method columns: everything except the identifier columns.
    identifiers = ['site', 'disease', 'metric', 'bundle',
                   'num_patients', 'disease_ratio', 'num_diseased']
    method_cols = [col for col in rows.columns if col not in identifiers]

    # References first, then the other methods in their column order.
    draw_order = [col for col in ['hc', 'NoRobust'] if col in method_cols]
    draw_order += [col for col in method_cols if col not in draw_order]

    shades = {'hc': 'green', 'NoRobust': 'red'}
    to_colour = [col for col in draw_order if col not in shades]
    shades.update(zip(to_colour,
                      sns.color_palette("viridis", len(to_colour))))

    # Layout of the bar groups along the x axis.
    ratio_values = sorted(rows["disease_ratio"].unique())
    group_centres = np.arange(len(ratio_values))
    group_width = .8
    bar_slot = group_width / len(draw_order)

    fig, ax = plt.subplots(figsize=(14, 7))

    for slot_idx, col in enumerate(draw_order):
        bar_heights = [
            rows.loc[rows["disease_ratio"] == ratio, col].dropna().mean()
            for ratio in ratio_values
        ]
        bar_x = group_centres - group_width / 2 + (slot_idx + .5) * bar_slot
        ax.bar(
            bar_x, bar_heights,
            width=bar_slot * .9,
            color=shades[col],
            edgecolor='black',
            label=col,
        )

    ax.set_xlabel("Pourcentage de patients malades")
    ax.set_ylabel(f"Moyenne de {Y}")
    ax.set_title(
        f"Moyenne de {Y}, tous bundles confondus\n"
        f"Maladie : {disease}   |   Metric : {metric}\n"
        f"Nb patients : {sample_size}   |   Dataset : {dataset_type}"
    )
    ax.set_xticks(group_centres)
    ax.set_xticklabels(ratio_values)

    ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    plt.tight_layout()

    save_dir = os.path.join(directory, f"{Y}_PLOTS_MEAN", disease,
                            str(sample_size))
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f"{Y}_{metric}_mean_{dataset_type}.png")
    plt.savefig(save_path, bbox_inches="tight")
    plt.close()


In [11]:
def plot_rank_all_metrics(
        pivot_df, sample_size, disease,
        directory, dataset_type, Y="MAE"):
    """Save a grouped bar chart of per-method means for one disease.

    Rows of ``pivot_df`` are selected on the ``num_patients`` and ``disease``
    index levels only, so all metrics and all bundles are pooled. For each
    ``disease_ratio`` value, one bar per method column shows the mean of that
    column's non-NaN values.

    The figure is written to
    ``<directory>/<Y>_PLOTS_MEAN/<disease>/<sample_size>/`` as
    ``<Y>_all_metrics_mean_<dataset_type>.png``.
    """
    # Select on sample size and disease; the metric level is not filtered.
    levels = pivot_df.index
    wanted = (
        (levels.get_level_values("num_patients") == sample_size) &
        (levels.get_level_values("disease") == disease)
    )
    rows = pivot_df.loc[wanted].reset_index()

    # Method columns: everything except the identifier columns.
    identifiers = ['site', 'disease', 'metric', 'bundle',
                   'num_patients', 'disease_ratio', 'num_diseased']
    method_cols = [col for col in rows.columns if col not in identifiers]

    # References first, then the other methods in their column order.
    draw_order = [col for col in ['hc', 'NoRobust'] if col in method_cols]
    draw_order += [col for col in method_cols if col not in draw_order]

    shades = {'hc': 'green', 'NoRobust': 'red'}
    to_colour = [col for col in draw_order if col not in shades]
    shades.update(zip(to_colour,
                      sns.color_palette("viridis", len(to_colour))))

    # Layout of the bar groups along the x axis.
    ratio_values = sorted(rows["disease_ratio"].unique())
    group_centres = np.arange(len(ratio_values))
    group_width = .8
    bar_slot = group_width / len(draw_order)

    fig, ax = plt.subplots(figsize=(14, 7))

    for slot_idx, col in enumerate(draw_order):
        bar_heights = []
        for ratio in ratio_values:
            values = rows.loc[rows["disease_ratio"] == ratio, col]
            bar_heights.append(values.dropna().mean())

        bar_x = group_centres - group_width / 2 + (slot_idx + .5) * bar_slot
        ax.bar(
            bar_x, bar_heights,
            width=bar_slot * .9,
            color=shades[col],
            edgecolor='black',
            label=col,
        )

    ax.set_xlabel("Pourcentage de patients malades")
    ax.set_ylabel(f"Moyenne de {Y} (toutes métriques)")
    ax.set_title(
        f"Moyenne de {Y}, tous bundles et métriques confondus\n"
        f"Maladie : {disease}   |   Nb patients : {sample_size}   |   Dataset : {dataset_type}"
    )
    ax.set_xticks(group_centres)
    ax.set_xticklabels(ratio_values)

    ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    plt.tight_layout()

    save_dir = os.path.join(directory, f"{Y}_PLOTS_MEAN", disease,
                            str(sample_size))
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f"{Y}_all_metrics_mean_{dataset_type}.png")
    plt.savefig(save_path, bbox_inches="tight")
    plt.close()

In [12]:
def plot_mae_all_bundles_pivot(
        pivot_df, sample_size, disease, metric,
        directory, dataset_type, Y="MAE", bundle=None,
    ):
    """Save grouped box plots of each method's values against disease ratio.

    Rows of ``pivot_df`` are selected on the ``num_patients``, ``disease`` and
    ``metric`` index levels. When ``bundle`` is given, the ``bundle`` level is
    filtered too: by membership for a list/tuple/set, by equality otherwise.
    For each ``disease_ratio`` value, one box per method column shows the
    distribution of that column's non-NaN values (outliers hidden).

    The figure is written under
    ``<directory>/<Y>_PLOTS_NEW/<disease>/<sample_size>/``, with an extra
    ``<metric>`` sub-folder when ``bundle`` is given, as
    ``<Y>_<metric>_<bundle or 'all_bundles'>_boxplot_<dataset_type>.png``.
    """
    levels = pivot_df.index
    keep = (
        (levels.get_level_values("num_patients") == sample_size) &
        (levels.get_level_values("disease") == disease) &
        (levels.get_level_values("metric") == metric)
    )

    # Optional restriction to one bundle or to a collection of bundles.
    several_bundles = isinstance(bundle, (list, tuple, set))
    if bundle is not None:
        bundle_level = levels.get_level_values("bundle")
        if several_bundles:
            keep = keep & bundle_level.isin(bundle)
        else:
            keep = keep & (bundle_level == bundle)

    rows = pivot_df.loc[keep].reset_index()

    # Method columns: everything except the identifier columns.
    identifiers = ['site', 'disease', 'metric', 'bundle',
                   'num_patients', 'disease_ratio', 'num_diseased']
    method_cols = [name for name in rows.columns if name not in identifiers]

    # References first, then the other methods in their column order.
    draw_order = [name for name in ['hc', 'NoRobust'] if name in method_cols]
    draw_order += [name for name in method_cols if name not in draw_order]

    # Fixed colours for the references that are present, viridis for the rest.
    shades = {
        name: colour
        for name, colour in (('hc', 'green'), ('NoRobust', 'red'))
        if name in draw_order
    }
    to_colour = [name for name in draw_order if name not in shades]
    shades.update(zip(to_colour, sns.color_palette("viridis", len(to_colour))))

    # Layout of the box groups along the x axis.
    ratio_values = sorted(rows["disease_ratio"].unique())
    group_centres = np.arange(len(ratio_values))
    group_width = .8
    box_slot = group_width / len(draw_order)

    fig, ax = plt.subplots(figsize=(14, 7))

    for slot_idx, name in enumerate(draw_order):
        samples = [
            rows.loc[rows["disease_ratio"] == ratio, name].dropna().values
            for ratio in ratio_values
        ]
        # A method with no value at any ratio has nothing to draw.
        if all(len(sample) == 0 for sample in samples):
            continue

        ax.boxplot(
            samples,
            positions=group_centres - group_width / 2 + (slot_idx + .5) * box_slot,
            widths=box_slot * .8,
            patch_artist=True,
            showfliers=False,
            boxprops=dict(facecolor=shades[name],
                          edgecolor=shades[name]),
            medianprops=dict(color='black'),
        )

    ax.set_xlabel("Pourcentage de patients malades")
    ax.set_ylabel(Y)

    if several_bundles:
        bundle_str = ", ".join(bundle)
    else:
        bundle_str = bundle
    if bundle is not None:
        scope = f", bundle : {bundle_str}"
    else:
        scope = ", tous bundles confondus"
    ax.set_title(
        f"{Y} d’harmonisation"
        + scope
        + f"\nMaladie : {disease}   |   Metric : {metric}"
        + f"\nNb patients : {sample_size}   |   Dataset : {dataset_type}"
    )
    ax.set_xticks(group_centres)
    ax.set_xticklabels(ratio_values)

    # Boxes carry no labels, so the legend is built from proxy lines.
    legend_entries = [
        plt.Line2D([0], [0], color=shades[name], lw=3, label=name)
        for name in draw_order
    ]
    ax.legend(handles=legend_entries, loc="upper left", bbox_to_anchor=(1, 1))
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    plt.tight_layout()

    save_dir = os.path.join(directory, f"{Y}_PLOTS_NEW", disease, str(sample_size))
    if bundle is not None:
        save_dir = os.path.join(save_dir, metric)
        bundle_suffix = bundle_str.replace(" ", "_")
    else:
        bundle_suffix = "all_bundles"
    os.makedirs(save_dir, exist_ok=True)
    save_name = f"{Y}_{metric}_{bundle_suffix}_boxplot_{dataset_type}.png"
    plt.savefig(os.path.join(save_dir, save_name), bbox_inches="tight")
    plt.close()

In [13]:
def plot_mae_each_bundle(
        pivot_df, sample_size, disease, metric,
        directory, dataset_type, Y="MAE",
    ):
    """Draw one box-plot figure per bundle for a sample size, disease and metric.

    The distinct values of the ``bundle`` index level are collected among the
    rows matching ``sample_size``, ``disease`` and ``metric``, and
    ``plot_mae_all_bundles_pivot`` is called once for each of them.
    """
    levels = pivot_df.index
    selected = (
        (levels.get_level_values("num_patients") == sample_size) &
        (levels.get_level_values("disease") == disease) &
        (levels.get_level_values("metric") == metric)
    )
    bundle_names = pivot_df.loc[selected].index.get_level_values("bundle").unique()

    for bundle_name in bundle_names:
        plot_mae_all_bundles_pivot(
            pivot_df, sample_size, disease, metric,
            directory, dataset_type, Y=Y, bundle=bundle_name,
        )

In [14]:

def plot_mae_all_bundles_pivot_3_way(
        pivot_df, sample_size, disease, metric,
        directory, dataset_type, Y="MAE"):
    """Save three stacked box-plot panels split by how ``hc`` compares to ``NoRobust``.

    Rows of ``pivot_df`` are selected on the ``num_patients``, ``disease`` and
    ``metric`` index levels (all bundles pooled), then each row is assigned to
    one of three groups:

    - ``HC < NoRobust``
    - ``HC = NoRobust`` (within ``np.isclose`` tolerance)
    - ``HC > NoRobust``

    Each panel shows, per ``disease_ratio`` value, one box per method column
    for the rows of its group; the panel title gives the group's share of
    rows. All panels use the same x axis.

    The figure is written to
    ``<directory>/<Y>_PLOTS_NEW/<disease>/<sample_size>/`` as
    ``<Y>_<metric>_all_bundles_boxplot_split_<dataset_type>_3_way.png``.

    Requires ``pivot_df`` to contain both the ``hc`` and ``NoRobust`` columns.
    """
    # 1) Base filter
    df_filt = pivot_df.loc[
        (pivot_df.index.get_level_values("num_patients") == sample_size) &
        (pivot_df.index.get_level_values("disease")      == disease) &
        (pivot_df.index.get_level_values("metric")       == metric)
    ].reset_index()

    # 2) Method columns: everything except the identifier columns
    mae_cols = [c for c in df_filt.columns if c not in
                ['site','disease','metric','bundle',
                 'num_patients','disease_ratio','num_diseased']]

    # 3) Order + colours (references first, fixed colours for them)
    ordered_cols = [c for c in ['hc', 'NoRobust'] if c in mae_cols]
    ordered_cols += [c for c in mae_cols if c not in ordered_cols]

    col_colors = {'hc': 'green', 'NoRobust': 'red'}
    remaining = [c for c in ordered_cols if c not in col_colors]
    col_colors.update(dict(zip(remaining,
                               sns.color_palette("viridis", len(remaining)))))

    # 4) Row classification.
    # np.select keeps the FIRST matching condition, so the near-equality test
    # must come before the strict comparison. Otherwise a row where hc exceeds
    # NoRobust by a rounding error lands in 'hc_greater' while the mirrored
    # row lands in 'hc_equal'.
    df_filt['hc_vs_no'] = np.select(
        [
            np.isclose(df_filt['hc'], df_filt['NoRobust']),
            df_filt['hc'] > df_filt['NoRobust'],
        ],
        ['hc_equal', 'hc_greater'],
        default='hc_smaller'
    )

    # 5) Share of rows in each group, in percent
    pct_map = (df_filt['hc_vs_no']
               .value_counts(normalize=True)
               .mul(100).round(1)
               .to_dict())

    label_map = {
        'hc_greater': 'HC > NoRobust',
        'hc_equal'  : 'HC = NoRobust',
        'hc_smaller': 'HC < NoRobust'
    }

    groups = ['hc_smaller', 'hc_equal', 'hc_greater']  # panel order, top to bottom
    titles = [f"{label_map[g]} ({pct_map.get(g, 0.0):.1f} %)" for g in groups]

    # 6) Ratios are taken from the whole selection so every panel shares the
    #    same x positions, even when a group has no row at some ratio.
    ratios_all = sorted(df_filt["disease_ratio"].unique())
    x = np.arange(len(ratios_all))
    g_width   = .8
    n_methods = len(ordered_cols)
    box_w     = g_width / n_methods

    # 7) One panel per group
    fig, axes = plt.subplots(nrows=3, sharex=True, figsize=(14, 18))

    for ax, grp, grp_title in zip(axes, groups, titles):
        sub = df_filt[df_filt['hc_vs_no'] == grp]

        for i_m, col in enumerate(ordered_cols):
            # One sample per ratio, possibly empty
            data = [
                sub[sub["disease_ratio"] == r][col].dropna().values
                for r in ratios_all
            ]
            if not any(len(d) for d in data):
                continue  # nothing to draw for this method in this group

            pos = x - g_width / 2 + (i_m + .5) * box_w
            ax.boxplot(
                data,
                positions=pos,
                widths=box_w * .8,
                patch_artist=True,
                showfliers=False,
                boxprops=dict(facecolor=col_colors[col],
                              edgecolor=col_colors[col]),
                medianprops=dict(color='black')
            )

        # Zero line and panel labels
        ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
        ax.set_ylabel(Y)
        ax.set_title(grp_title, loc='left', fontsize=12)

        # Same x axis on every panel; tick labels are forced visible because
        # sharex=True would otherwise hide them on the upper panels.
        ax.set_xticks(x)
        ax.set_xticklabels(ratios_all)
        ax.set_xlabel("Pourcentage de patients malades")
        ax.tick_params(axis='x', labelbottom=True)

        # Legend on the first panel only (proxy lines, boxes have no labels)
        if grp == groups[0]:
            handles = [plt.Line2D([0], [0], color=col_colors[c], lw=3, label=c)
                       for c in ordered_cols]
            ax.legend(handles=handles, loc="upper left", bbox_to_anchor=(1, 1))

        # Identical x extent on every panel. Skipped when the selection is
        # empty, where x has no element and x[0] would raise IndexError.
        if len(x):
            ax.set_xlim(x[0] - 0.5, x[-1] + 0.5)

    # 8) Figure title
    fig.suptitle(
        f"{Y} d’harmonisation, tous bundles confondus\n"
        f"Maladie: {disease} | Metric: {metric}\n"
        f"Nb patients: {sample_size} | Dataset: {dataset_type}",
        fontsize=14
    )

    # 9) Save (rect leaves room for the suptitle)
    plt.tight_layout(rect=[0, 0, 1, 0.93])
    out_dir = os.path.join(directory, f"{Y}_PLOTS_NEW", disease,
                           str(sample_size))
    os.makedirs(out_dir, exist_ok=True)
    fname = f"{Y}_{metric}_all_bundles_boxplot_split_{dataset_type}_3_way.png"
    plt.savefig(os.path.join(out_dir, fname), bbox_inches="tight")
    plt.close()


In [15]:
def rank_methods_per_row(pivot_df):
    """Rank the numeric columns of ``pivot_df`` against each other, row by row.

    Only numeric columns (the methods) are ranked; non-numeric columns are
    left out of the result. Within a row the smallest value gets rank 1 and
    ties share the lowest rank (``method='min'``).

    Parameters
    ----------
    pivot_df : pandas.DataFrame
        Wide table with one numeric column per method.

    Returns
    -------
    pandas.DataFrame
        Same index as ``pivot_df``, one column per numeric input column.
        Ranks are plain integers when every cell could be ranked; if some
        cells are NaN (no value for that method on that row) the ranks use
        the nullable ``Int64`` dtype and those cells are ``<NA>``.
    """
    # Keep only the numeric columns (the methods).
    method_cols = pivot_df.select_dtypes(include='number').columns

    # Row-wise ranking; 1 = smallest value.
    ranks = pivot_df[method_cols].rank(axis=1, method='min', ascending=True)

    # NaN inputs produce NaN ranks, which a plain int cast cannot represent.
    if ranks.isna().any().any():
        return ranks.astype("Int64")
    return ranks.astype(int)

## ANALYSIS

In [16]:
# Long-format columns: site, disease, n_patients, ratio, metric, bundle, robust_method, mae
mae_compilation_train_all_long = transformer_df_large_en_long(mae_compilation_train_all)
mae_compilation_test_all_long  = transformer_df_large_en_long(mae_compilation_test_all)
smape_compilation_test_all_long  = transformer_df_large_en_long(smape_compilation_test_all)
smape_compilation_train_all_long = transformer_df_large_en_long(smape_compilation_train_all)
std_mae_compilation_train_all_long = transformer_df_large_en_long(std_mae_compilation_train_all)
std_mae_compilation_test_all_long  = transformer_df_large_en_long(std_mae_compilation_test_all)

# Methods kept for the analysis; shared by the MAE, SMAPE and STD_MAE tables.
_ANALYSIS_METHODS = [
    'NoRobust',
    'hc',
    'raw',
    "IQR",
    "MAD",
    "VS",
    "Z_SCORE",
    "Z_SCORE_MAD",
    "MLP2_ALL_5",
    "SN",
    "QN",
    "LOF",
    "MLP2_ALL_6",
    "MLP2_ALL_5_MAD",
    "MLP2_ALL_6_MAD",
    "MLP2_ALL_9",
]

# Row identity of the pivot tables (one column per robust method).
_PIVOT_INDEX = ['site', 'disease', 'metric', 'bundle',
                'num_patients', 'disease_ratio', 'num_diseased']


def _find_sites_with_nan(long_df):
    """Return the unique ``site`` values whose rows contain at least one NaN."""
    return (
        long_df
          .groupby("site")
          .filter(lambda g: g.isna().any().any())   # True if the group holds a NaN
          ["site"]
          .unique()
    )


def _pivot_by_method(long_df):
    """Pivot a long table to one column per robust method (values from 'mae')."""
    return long_df.pivot_table(
        index=_PIVOT_INDEX,
        columns='robust_method',
        values='mae',
        aggfunc='first'   # or 'mean' if several identical rows can exist
    )


# ---------------------------------------------------------------- MAE
df_long = add_nb_patients_and_diseased(mae_compilation_train_all_long)
df_long = df_long[df_long["robust_method"].isin(_ANALYSIS_METHODS)]
print(df_long.columns)

# 1) Sites to exclude
sites_with_nan = _find_sites_with_nan(df_long)

# 2) Statistics
n_nan_sites   = len(sites_with_nan)
n_total_sites = df_long["site"].nunique()

print(f"Sites exclus pour NaN : {n_nan_sites} / {n_total_sites}")
if n_nan_sites:
    print("Liste :", list(sites_with_nan))

# 3) Filter the table used downstream
df_long = df_long[~df_long["site"].isin(sites_with_nan)].copy()

pivot_df = _pivot_by_method(df_long)
diff_df = pivot_df.sub(pivot_df['NoRobust'], axis=0)   # difference to NoRobust
ranked = rank_methods_per_row(pivot_df)

# ---------------------------------------------------------------- SMAPE
df_long = add_nb_patients_and_diseased(smape_compilation_train_all_long)
df_long = df_long[df_long["robust_method"].isin(_ANALYSIS_METHODS)]

sites_with_nan_smape = _find_sites_with_nan(df_long)

n_nan_sites_smape   = len(sites_with_nan_smape)
n_total_sites_smape = df_long["site"].nunique()

print(f"Sites exclus pour NaN : {n_nan_sites_smape} / {n_total_sites_smape}")
if n_nan_sites_smape:
    print("Liste :", list(sites_with_nan_smape))

# Use the SMAPE-specific list: the MAE list was applied here by mistake.
df_long = df_long[~df_long["site"].isin(sites_with_nan_smape)].copy()

pivot_df_smape = _pivot_by_method(df_long)
diff_df_smape = pivot_df_smape.sub(pivot_df_smape['NoRobust'], axis=0)
ranked_smape = rank_methods_per_row(pivot_df_smape)

# ---------------------------------------------------------------- STD_MAE
df_long = add_nb_patients_and_diseased(std_mae_compilation_train_all_long)
df_long = df_long[df_long["robust_method"].isin(_ANALYSIS_METHODS)]

sites_with_nan_std = _find_sites_with_nan(df_long)

n_nan_sites_std   = len(sites_with_nan_std)
n_total_sites_std = df_long["site"].nunique()

print(f"Sites exclus pour NaN : {n_nan_sites_std} / {n_total_sites_std}")
if n_nan_sites_std:
    print("Liste :", list(sites_with_nan_std))

# Use the STD_MAE-specific list: the MAE list was applied here by mistake.
df_long = df_long[~df_long["site"].isin(sites_with_nan_std)].copy()

pivot_df_std = _pivot_by_method(df_long)
diff_df_std = pivot_df_std.sub(pivot_df_std['NoRobust'], axis=0)
ranked_std = rank_methods_per_row(pivot_df_std)

Index(['site', 'method', 'robust_method', 'disease', 'metric', 'bundle', 'mae',
       'num_patients', 'disease_ratio', 'num_diseased'],
      dtype='object')
Sites exclus pour NaN : 0 / 480
Sites exclus pour NaN : 0 / 480
Sites exclus pour NaN : 0 / 480


## EXEC PLOTS MAE


In [17]:
# STD_MAE plots. Drop the 'raw' and 'FLIP' columns (when present) first.
pivot_df_std = pivot_df_std.drop(columns=['raw', 'FLIP'], errors='ignore')

# Overview plots: one task per sample size.
tasks = [(pivot_df_std, n_subjects, MAE_PLOT_FOLDER, "train", "STD_MAE")
         for n_subjects in sample_sizes]
for plotter in (plot_mae_mean_all_diseases_metrics, plot_mae_mean_all_ratios):
    Parallel(n_jobs=-1)(delayed(plotter)(*args) for args in tasks)

# Rank plots over all metrics: one task per (disease, sample size).
tasks = [(pivot_df_std, n_subjects, dis, MAE_PLOT_FOLDER, "train", "STD_MAE")
         for dis in diseases
         for n_subjects in sample_sizes]
Parallel(n_jobs=-1)(delayed(plot_rank_all_metrics)(*args) for args in tasks)

# Per-metric plots: one task per (disease, sample size, metric).
tasks = [(pivot_df_std, n_subjects, dis, met, MAE_PLOT_FOLDER, "train", "STD_MAE")
         for dis in diseases
         for n_subjects in sample_sizes
         for met in metrics]
for plotter in (plot_rank, plot_mae_all_bundles_pivot):
    Parallel(n_jobs=-1)(delayed(plotter)(*args) for args in tasks)

# Kept as the closing bare expression so the notebook echoes its result list.
Parallel(n_jobs=-1)(delayed(plot_mae_each_bundle)(*args) for args in tasks)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [18]:
# STD_MAE_DIFF plots. Drop the 'raw' and 'FLIP' columns (when present) first.
diff_df_std = diff_df_std.drop(columns=['raw', 'FLIP'], errors='ignore')

# One task per (disease, sample size, metric).
tasks = [(diff_df_std, n_subjects, dis, met, MAE_PLOT_FOLDER, "train", "STD_MAE_DIFF")
         for dis in diseases
         for n_subjects in sample_sizes
         for met in metrics]

# Disabled variant, kept for reference:
# Parallel(n_jobs=1)(
#     delayed(plot_mae_all_bundles_pivot_3_way)(*task) for task in tasks
# )

Parallel(n_jobs=-1)(delayed(plot_mae_all_bundles_pivot)(*args) for args in tasks)

# Kept as the closing bare expression so the notebook echoes its result list.
Parallel(n_jobs=-1)(delayed(plot_mae_each_bundle)(*args) for args in tasks)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [19]:
# SMAPE plots. Drop the 'raw' and 'FLIP' columns (when present) first.
pivot_df_smape = pivot_df_smape.drop(columns=['raw', 'FLIP'], errors='ignore')

# Overview plots: one task per sample size.
tasks = [(pivot_df_smape, n_subjects, MAE_PLOT_FOLDER, "train", "SMAPE")
         for n_subjects in sample_sizes]
for plotter in (plot_mae_mean_all_diseases_metrics, plot_mae_mean_all_ratios):
    Parallel(n_jobs=-1)(delayed(plotter)(*args) for args in tasks)

# Rank plots over all metrics: one task per (disease, sample size).
tasks = [(pivot_df_smape, n_subjects, dis, MAE_PLOT_FOLDER, "train", "SMAPE")
         for dis in diseases
         for n_subjects in sample_sizes]
Parallel(n_jobs=-1)(delayed(plot_rank_all_metrics)(*args) for args in tasks)

# Per-metric plots: one task per (disease, sample size, metric).
tasks = [(pivot_df_smape, n_subjects, dis, met, MAE_PLOT_FOLDER, "train", "SMAPE")
         for dis in diseases
         for n_subjects in sample_sizes
         for met in metrics]
for plotter in (plot_rank, plot_mae_all_bundles_pivot):
    Parallel(n_jobs=-1)(delayed(plotter)(*args) for args in tasks)

# Kept as the closing bare expression so the notebook echoes its result list.
Parallel(n_jobs=-1)(delayed(plot_mae_each_bundle)(*args) for args in tasks)


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [20]:
# SMAPE_DIFF plots. Drop the 'raw' and 'FLIP' columns (when present) first.
diff_df_smape = diff_df_smape.drop(columns=['raw', 'FLIP'], errors='ignore')

# One task per (disease, sample size, metric).
tasks = [(diff_df_smape, n_subjects, dis, met, MAE_PLOT_FOLDER, "train", "SMAPE_DIFF")
         for dis in diseases
         for n_subjects in sample_sizes
         for met in metrics]

# Disabled variant, kept for reference:
# Parallel(n_jobs=1)(
#     delayed(plot_mae_all_bundles_pivot_3_way)(*task) for task in tasks
# )

Parallel(n_jobs=-1)(delayed(plot_mae_all_bundles_pivot)(*args) for args in tasks)

# Kept as the closing bare expression so the notebook echoes its result list.
Parallel(n_jobs=-1)(delayed(plot_mae_each_bundle)(*args) for args in tasks)


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [21]:
# MAE plots. Drop the 'raw' and 'FLIP' columns (when present) first.
pivot_df = pivot_df.drop(columns=['raw', 'FLIP'], errors='ignore')

# Overview plots: one task per sample size.
tasks = [(pivot_df, n_subjects, MAE_PLOT_FOLDER, "train", "MAE")
         for n_subjects in sample_sizes]
for plotter in (plot_mae_mean_all_diseases_metrics, plot_mae_mean_all_ratios):
    Parallel(n_jobs=-1)(delayed(plotter)(*args) for args in tasks)

# Rank plots over all metrics: one task per (disease, sample size).
tasks = [(pivot_df, n_subjects, dis, MAE_PLOT_FOLDER, "train", "MAE")
         for dis in diseases
         for n_subjects in sample_sizes]
Parallel(n_jobs=-1)(delayed(plot_rank_all_metrics)(*args) for args in tasks)

# Per-metric plots: one task per (disease, sample size, metric).
tasks = [(pivot_df, n_subjects, dis, met, MAE_PLOT_FOLDER, "train", "MAE")
         for dis in diseases
         for n_subjects in sample_sizes
         for met in metrics]
for plotter in (plot_rank, plot_mae_all_bundles_pivot):
    Parallel(n_jobs=-1)(delayed(plotter)(*args) for args in tasks)

# Kept as the closing bare expression so the notebook echoes its result list.
Parallel(n_jobs=-1)(delayed(plot_mae_each_bundle)(*args) for args in tasks)


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [22]:
# MAE_DIFF plots. Drop the 'raw' and 'FLIP' columns (when present) first.
diff_df = diff_df.drop(columns=['raw', 'FLIP'], errors='ignore')

# One task per (disease, sample size, metric).
tasks = [(diff_df, n_subjects, dis, met, MAE_PLOT_FOLDER, "train", "MAE_DIFF")
         for dis in diseases
         for n_subjects in sample_sizes
         for met in metrics]

# Disabled variant, kept for reference:
# Parallel(n_jobs=-1)(
#     delayed(plot_mae_all_bundles_pivot_3_way)(*task) for task in tasks
# )

# Kept as the closing bare expression so the notebook echoes its result list.
Parallel(n_jobs=-1)(delayed(plot_mae_all_bundles_pivot)(*args) for args in tasks)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [23]:
# RANK plots. Drop the 'raw' column (when present) first.
ranked = ranked.drop(columns='raw', errors='ignore')

# Overview plots: one task per sample size.
tasks = [(ranked, n_subjects, MAE_PLOT_FOLDER, "train", "RANK")
         for n_subjects in sample_sizes]
for plotter in (plot_mae_mean_all_ratios, plot_mae_mean_all_diseases_metrics):
    Parallel(n_jobs=-1)(delayed(plotter)(*args) for args in tasks)

# Rank plots over all metrics: one task per (disease, sample size).
tasks = [(ranked, n_subjects, dis, MAE_PLOT_FOLDER, "train", "RANK")
         for dis in diseases
         for n_subjects in sample_sizes]
Parallel(n_jobs=-1)(delayed(plot_rank_all_metrics)(*args) for args in tasks)

# Per-metric rank plots: one task per (disease, sample size, metric).
tasks = [(ranked, n_subjects, dis, met, MAE_PLOT_FOLDER, "train", "RANK")
         for dis in diseases
         for n_subjects in sample_sizes
         for met in metrics]

# Kept as the closing bare expression so the notebook echoes its result list.
Parallel(n_jobs=-1)(delayed(plot_rank)(*args) for args in tasks)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

## BEST/WORST

In [None]:
def get_best_worst_cases(df, top_n=1):
    """Collect, for every method column, its ``top_n`` largest and smallest values.

    Every column of ``df`` that is not a context column ('site', 'disease',
    'metric', 'bundle', 'num_patients', 'disease_ratio', 'num_diseased',
    'raw') is treated as a method. The selection is made over ALL rows of
    ``df`` for each method; no per-disease or per-ratio grouping is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table holding the context columns plus one numeric column per
        method.
    top_n : int, default 1
        Number of rows to keep per method on each side.

    Returns
    -------
    (worst_df, best_df) : tuple of pandas.DataFrame
        ``worst_df`` holds the highest values and ``best_df`` the lowest.
        Both have the columns 'disease', 'num_patients', 'disease_ratio',
        'metric', 'bundle', 'site', 'method', 'error'. Both are empty when
        ``df`` has no method column.
    """
    # Purely contextual columns; everything else is a method.
    context_cols = [
        'site', 'disease', 'metric', 'bundle',
        'num_patients', 'disease_ratio', 'num_diseased', 'raw'
    ]
    report_cols = ['disease', 'num_patients', 'disease_ratio',
                   'metric', 'bundle', 'site']

    method_cols = [col for col in df.columns if col not in context_cols]

    # pd.concat refuses an empty list, so return empty frames explicitly.
    if not method_cols:
        empty = pd.DataFrame(columns=report_cols + ['method', 'error'])
        return empty, empty.copy()

    worst_rows = []
    best_rows = []
    for method in method_cols:
        # Select straight from df: no need to copy the whole frame per method.
        for bucket, pick in ((worst_rows, df.nlargest), (best_rows, df.nsmallest)):
            picked = pick(top_n, method)
            bucket.append(
                picked[report_cols].assign(method=method,
                                           error=picked[method].values)
            )

    worst_df = pd.concat(worst_rows, ignore_index=True)
    best_df = pd.concat(best_rows, ignore_index=True)

    return worst_df, best_df

# Ten highest / ten lowest STD_MAE differences to NoRobust for each method column.
worst_df, best_df = get_best_worst_cases(diff_df_std.reset_index(), top_n=10)

In [None]:
# Keep only the cases built from 100 patients.
worst_df = worst_df.loc[worst_df['num_patients'] == 100]
best_df = best_df.loc[best_df['num_patients'] == 100]

# Only the closing expression is echoed by the notebook.
best_df

In [None]:
import os
from PIL import Image
from pathlib import Path
def combine_imgs(outdir, site, harmonization_method, metric, bundle, test_idx, error, method="Robust", delete_originals=True):
    """Paste four per-site PNG plots found in ``outdir`` into one 2x2 image.

    Layout: top-left = NoRobust raw, top-right = NoRobust harmonized,
    bottom-left = harmonized with ``method`` (the HC harmonized plot when
    ``method == 'hc'``), bottom-right = HC raw.

    Parameters
    ----------
    outdir : str or Path
        Folder holding the source PNGs; the combined image is written there.
    site, harmonization_method, metric, bundle : str
        Used to build the source file names (underscores are removed from
        ``bundle``).
    test_idx : str
        Prefix of the combined file name.
    error : float or str
        Formatted with two decimals in the combined file name.
    method : str, default "Robust"
        Method name; ``'hc'`` is looked up under the "NoRobust" name.
    delete_originals : bool, default True
        Remove the source PNGs (and the related unused ones) afterwards.

    Returns
    -------
    Path
        Path of the combined PNG.
    """
    # 'hc' plots are produced under the NoRobust name.
    method_dir = "NoRobust" if method == 'hc' else method
    outdir = Path(outdir)
    suf = f"{metric}_{bundle.replace('_', '')}"

    method_harmonized = f"{site}_{method_dir}_{harmonization_method}_{suf}_{site}_{method_dir}.png"
    hc_harmonized = f"{site}_HC_{harmonization_method}_{suf}_{site}_HC.png"
    files = {
        "tl": f"{site}_No_Robust_raw_{suf}_{site}_No_Robust.png",
        "tr": f"{site}_No_Robust_{harmonization_method}_{suf}_{site}_No_Robust.png",
        "bl": method_harmonized,
        "br": f"{site}_HC_raw_{suf}_{site}_HC.png"
    }
    # Plots generated alongside but not shown in the grid.
    others = [f"{site}_{method_dir}_raw_{suf}_{site}_{method_dir}.png", hc_harmonized]
    if method == "hc":
        # For 'hc' the bottom-left panel is the HC harmonized plot; the
        # method-named duplicate is discarded right away.
        others.append(method_harmonized)
        files["bl"] = hc_harmonized
        (outdir / method_harmonized).unlink(missing_ok=True)

    imgs = []
    try:
        for name in files.values():
            imgs.append(Image.open(outdir / name))
        w, h = imgs[0].size
        cmb = Image.new("RGB", (2 * w, 2 * h), "white")
        for img, corner in zip(imgs, ((0, 0), (w, 0), (0, h), (w, h))):
            cmb.paste(img, corner)
        out = outdir / f"{test_idx}_{method}_{float(error):.2f}_{site}_combined_{suf}.png"
        cmb.save(out)
    finally:
        # Close whatever was opened, even if an open/paste/save failed.
        for im in imgs:
            im.close()

    if delete_originals:
        for f in list(files.values()) + others:
            (outdir / f).unlink(missing_ok=True)
    return out


In [None]:
def VIZ(metric, bundle, site, method, error,
        disease, num_patients, disease_ratio):
    """Produce the comparison plots for one (site, metric, bundle, method) case.

    Steps: copy the site's train / ground-truth CSVs to temporary files
    (without the 'model' and 'harmonization' columns), plot the NoRobust,
    ``method`` and HC harmonizations, refit on a flagged copy of the train
    data, delete the temporary files and merge the plots with
    ``combine_imgs``.

    The parameters match the columns of the records this function is called
    with (``VIZ(**record)``). Reads the module-level ``harmonization_method``,
    ``BW_FOLDER`` and ``ANALYSIS_FOLDER``.

    Returns None. Prints a message and returns early when the train CSV is
    missing or empty.
    """
    # Harmonization Control ('hc') is stored under the NoRobust name.
    method_dir = "NoRobust" if method == 'hc' else method

    test_index = site.split("_")[-1]
    ratio_tag = f"{num_patients}_{disease_ratio}"
    file_stem = f"{num_patients}_{disease_ratio}_{test_index}"
    error_tag = str(error)

    process_dir = os.path.join("RESULTS/MAE_TEST", 'PROCESS', disease,
                               ratio_tag, f"{test_index}", metric)
    dir_robust = os.path.join(process_dir, method)
    dir_norobust = os.path.join(process_dir, "NoRobust")
    dir_hc = os.path.join(process_dir, "hc")
    directory_site = os.path.join("RESULTS/MAE_TEST/SYNTHETIC_SITES/v1", disease,
                                  ratio_tag, f"{test_index}")
    train_file_name = os.path.join(directory_site, f"train_{file_stem}_{metric}.csv")
    gt_file_name = os.path.join(directory_site, f"gt_train_{file_stem}_{metric}.csv")
    # Temporary copies, made unique per case through the error value.
    train_file_name_2 = os.path.join(directory_site, f"train_{file_stem}_{metric}_{error_tag}.csv")
    gt_file_name_2 = os.path.join(directory_site, f"gt_train_{file_stem}_{metric}_{error_tag}.csv")
    all_metrics_file = os.path.join(directory_site, f"train_{file_stem}_all.csv")

    # Without the train file nothing below can work (df_train would be
    # unbound), so a missing file is skipped like an empty one.
    if not os.path.isfile(train_file_name) or os.path.getsize(train_file_name) == 0:
        print("Fichier vide ou manquant :", train_file_name)
        return

    df_train = pd.read_csv(train_file_name)
    df_gt = pd.read_csv(gt_file_name)
    df_train = df_train.drop(columns=['model', 'harmonization'], errors='ignore')
    df_gt = df_gt.drop(columns=['model', 'harmonization'], errors='ignore')
    df_train['site'] = site
    df_train.to_csv(train_file_name_2, index=False)
    df_gt['site'] = site
    df_gt.to_csv(gt_file_name_2, index=False)

    # Harmonized outputs; the HC file also carries the ".NoRobust" tag.
    rwp_suffix = rwp_text(False)
    robust_file = os.path.join(
        dir_robust,
        f"{site}.{metric}.{harmonization_method}.{method_dir}.{rwp_suffix}.csv")
    norobust_file = os.path.join(
        dir_norobust,
        f"{site}.{metric}.{harmonization_method}.NoRobust.{rwp_suffix}.csv")
    hc_file = os.path.join(
        dir_hc,
        f"{site}.{metric}.{harmonization_method}.NoRobust.{rwp_suffix}.csv")
    ref_data_file = get_camcan_file(metric)

    if BW_FOLDER in ["BEST", "WORST"]:
        outdir = os.path.join(ANALYSIS_FOLDER, BW_FOLDER, method, str(num_patients))
    else:
        outdir = os.path.join(ANALYSIS_FOLDER, BW_FOLDER, disease,
                              ratio_tag, metric, bundle)

    visualize_harmonization(train_file_name_2, norobust_file, ref_data_file, outdir, bundles = bundle, title=f"{site}_No_Robust")
    visualize_harmonization(train_file_name_2, robust_file, ref_data_file, outdir, bundles = bundle, title=f"{site}_{method}")
    visualize_harmonization(gt_file_name_2, hc_file, ref_data_file, outdir, bundles = bundle, title=f"{site}_HC")

    # Flag subjects according to the detection scheme named in `method`
    # (substring match, so e.g. "Z_SCORE_MAD" also triggers "Z_SCORE").
    if "Z_SCORE" in method:
        z_score_sids = z_score_detection(all_metrics_file)
        df_train = flag_sid(df_train, z_score_sids, "Z_SCORE")
    for mlp_tag, threshold in (("MLP2_ALL_5", 0.5), ("MLP2_ALL_6", 0.6), ("MLP2_ALL_9", 0.9)):
        if mlp_tag in method:
            mlp_sids = predict_malades_MLP(all_metrics_file, "mlp2_ALL", threshold=threshold)
            df_train = flag_sid(df_train, mlp_sids, mlp_tag)

    df_train['site'] = site + "viz"
    df_train['error'] = error
    df_train['nasty_bundle'] = bundle
    # Make sure the target folder exists before writing the temp CSV.
    os.makedirs(outdir, exist_ok=True)
    temp_file = os.path.join(outdir, f"temp_{error_tag}.csv")
    df_train.to_csv(temp_file, index=False)

    output_model_filename = fit(temp_file, ref_data_file, metric, harmonization_method, method_dir, False, outdir, False)

    # Remove every intermediate file, including the ground-truth copy that
    # used to be left behind.
    outlier_file = os.path.join(outdir, f"outliers_{site}viz_{method}_NoRWP.csv")
    leftovers = [temp_file, train_file_name_2, gt_file_name_2, outlier_file]
    if output_model_filename:
        leftovers.append(output_model_filename)
    for path in leftovers:
        if os.path.isfile(path):
            os.remove(path)

    combine_imgs(outdir, site, harmonization_method, metric, bundle, test_index, str(error), method)

    

    

In [None]:
# BW_FOLDER = "WORST"
# Parallel(n_jobs=1)(delayed(VIZ)(**r) for r in worst_df.to_dict("records"))

In [None]:
# BW_FOLDER = "BEST"
# Parallel(n_jobs=1)(delayed(VIZ)(**r) for r in best_df.to_dict("records"))

In [None]:
def filter_errors(
    df,
    metric=None,
    disease=None,
    bundle=None,
    num_patients=None,
    disease_ratio=None,
    method=None,
):
    """Select the rows of ``df`` matching every criterion that is not None.

    Each keyword names the column it is compared against with ``==``; a
    keyword left at None is ignored.

    Returns the matching rows as a DataFrame with a fresh 0..n-1 index.
    """
    criteria = {
        "metric": metric,
        "disease": disease,
        "bundle": bundle,
        "num_patients": num_patients,
        "disease_ratio": disease_ratio,
        "method": method,
    }

    keep = pd.Series(True, index=df.index)
    for column, wanted in criteria.items():
        if wanted is None:
            continue
        keep = keep & (df[column] == wanted)

    return df.loc[keep].reset_index(drop=True)

In [None]:
def long_format_all_errors(df):
    """Melt the wide table (one column per method) into a long table.

    Every column that is not a context column becomes a
    ('method', 'error') pair; all values are kept.

    Returns a DataFrame with the columns
        ['site', 'disease', 'metric', 'bundle',
         'num_patients', 'disease_ratio', 'method', 'error']
    """
    # Context columns are carried over unchanged.
    id_columns = [
        'site', 'disease', 'metric', 'bundle',
        'num_patients', 'disease_ratio', 'num_diseased'
    ]
    # Everything else is a method column.
    value_columns = [name for name in df.columns if name not in id_columns]

    melted = pd.melt(
        df,
        id_vars=id_columns,
        value_vars=value_columns,
        var_name='method',
        value_name='error',
    )
    # 'num_diseased' is not kept in the long table.
    melted = melted.drop(columns=['num_diseased'])
    return melted.reset_index(drop=True)
# Long-format view of the STD_MAE differences (one row per case and method).
long_allMethos = long_format_all_errors(diff_df_std.reset_index())

In [None]:
# Visualize one isolated configuration; VIZ reads BW_FOLDER to pick its output folder.
BW_FOLDER = "ISOLATE"

subset = filter_errors(
    long_allMethos,
    disease="AD",
    metric="fw",
    bundle="mni_OPT_L",
    method="MAD",
    num_patients=100,
    disease_ratio=30,
)

# Sequential run (n_jobs=1), one VIZ call per selected row.
Parallel(n_jobs=1)(delayed(VIZ)(**record) for record in subset.to_dict("records"))

In [None]:
# Columns that are not candidate methods.
context_c = ['site', 'disease', 'metric', 'bundle',
             'num_patients', 'disease_ratio', 'num_diseased',
             "raw", "NoRobust"]

# Method columns found in the SMAPE difference table ...
methodi = [name for name in diff_df_smape.columns if name not in context_c]
# ... immediately replaced by a hand-picked selection.
methodi = ["hc", "Z_SCORE", "MAD", "VS2"]

In [None]:
# for bund in ["mni_STT_L", "mni_AC", "mni_SLF_R"]:
#     for meth in methodi:
#         # Filtre le DataFrame pour la méthode en cours
#         BW_FOLDER = "ISOLATE"
#         long_allMethos
#         subset = filter_errors(
#             long_allMethos,
#             metric="fa",
#             disease="AD",
#             bundle=bund,
#             num_patients=100,
#             disease_ratio=30,
#             method=meth
#         )
#         subset = subset.sort_values("site").head(5)
#         Parallel(n_jobs=1)(delayed(VIZ)(**r) for r in subset.to_dict("records"))


## MAE vs DISTANCE 

In [None]:

def tracer_corr_par_ratio(
    df_dist,
    df_errors,
    methods,
    disease_ratios,
    out_root="PLOTS_CORRELATION",
    distance_col="d_cohen"
):
    """Plot, per disease ratio and method, mean error against a distance.

    For each ratio the rows of ``df_errors`` whose 'disease_ratio' equals the
    ratio expressed as an integer percentage are averaged per
    (disease, metric, bundle), merged with ``df_dist`` on those three keys
    and drawn as a regression plot saved to ``<out_root>/<pct>pc/<method>.png``.
    Pearson and Spearman correlations are shown in the title.

    Parameters
    ----------
    df_dist : pandas.DataFrame
        Needs 'disease', 'metric', 'bundle' and ``distance_col``.
    df_errors : pandas.DataFrame
        Needs 'disease', 'metric', 'bundle', 'disease_ratio' and one column
        per method.
    methods : iterable of str
        Method columns to plot; names absent from ``df_errors`` are skipped.
    disease_ratios : iterable of float
        Ratios as fractions (e.g. 0.3 for 30 %).
    out_root : str
        Root output folder (created if needed).
    distance_col : str
        Column of ``df_dist`` used on the x axis.
    """
    for ratio in disease_ratios:
        # round(), not int(): 0.29 * 100 is 28.999999999999996 in floating point.
        ratio_pct = int(round(ratio * 100))
        df_ratio = df_errors[df_errors["disease_ratio"] == ratio_pct]
        out_dir = os.path.join(out_root, f"{ratio_pct}pc")
        os.makedirs(out_dir, exist_ok=True)

        for meth in methods:
            # Same guard as tracer_corr_par_ratio_brut: ignore unknown methods.
            if meth not in df_ratio.columns:
                continue
            col_err = f"{meth}_error"
            df_mean = df_ratio.groupby(["disease", "metric", "bundle"], as_index=False).agg(**{col_err: (meth, "mean")})
            merged = df_dist.merge(df_mean, on=["disease", "metric", "bundle"], how="inner")
            if merged.empty:
                continue

            pearson = merged[distance_col].corr(merged[col_err], method="pearson")
            spearman = merged[distance_col].corr(merged[col_err], method="spearman")

            plt.figure(figsize=(6, 5))
            sns.regplot(data=merged, x=distance_col, y=col_err, scatter_kws={"alpha": 0.7})
            plt.title(f"{meth} | disease ratio {ratio_pct}%\nPearson={pearson:.2f}, Spearman={spearman:.2f}")
            plt.xlabel(distance_col)
            plt.ylabel(col_err)
            plt.grid(True)
            plt.tight_layout()

            file_path = os.path.join(out_dir, f"{meth}.png")
            plt.savefig(file_path, dpi=300)
            plt.close()

In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns

def tracer_corr_par_ratio_brut(
    df_dist,
    df_errors,
    methods,
    disease_ratios,
    out_root="PLOTS_CORRELATION_RAW",
    distance_col="d_cohen",
):
    """Plot, per disease ratio and method, raw (unaveraged) errors against a distance.

    For each ratio the rows of ``df_errors`` whose 'disease_ratio' equals the
    ratio expressed as an integer percentage are merged with ``df_dist`` on
    (disease, metric, bundle) and drawn as a regression plot saved to
    ``<out_root>/<pct>pc/<method>.png``. Pearson and Spearman correlations
    are shown in the title. Methods absent from ``df_errors`` are skipped.

    Parameters
    ----------
    df_dist : pandas.DataFrame
        Needs 'disease', 'metric', 'bundle' and ``distance_col``.
    df_errors : pandas.DataFrame
        Needs 'disease', 'metric', 'bundle', 'disease_ratio' and one column
        per method.
    methods : iterable of str
        Method columns to plot.
    disease_ratios : iterable of float
        Ratios as fractions (e.g. 0.3 for 30 %).
    out_root : str
        Root output folder (created if needed).
    distance_col : str
        Column of ``df_dist`` used on the x axis.
    """
    keys = ["disease", "metric", "bundle"]

    for ratio in disease_ratios:
        # round(), not int(): 0.29 * 100 is 28.999999999999996 in floating point.
        pct = int(round(ratio * 100))
        df_ratio = df_errors[df_errors["disease_ratio"] == pct]
        out_dir = os.path.join(out_root, f"{pct}pc")
        os.makedirs(out_dir, exist_ok=True)

        for meth in methods:
            if meth not in df_ratio.columns:
                continue
            merged = df_dist.merge(df_ratio[keys + [meth]], on=keys)
            if merged.empty:
                continue

            pearson = merged[distance_col].corr(merged[meth], "pearson")
            spearman = merged[distance_col].corr(merged[meth], "spearman")

            plt.figure(figsize=(6, 5))
            sns.regplot(data=merged, x=distance_col, y=meth, scatter_kws={"alpha": 0.7})
            plt.title(f"{meth} | disease ratio {pct}%\nPearson={pearson:.2f}, Spearman={spearman:.2f}")
            plt.xlabel(distance_col)
            plt.ylabel(meth)
            plt.grid(True)
            plt.tight_layout()

            plt.savefig(os.path.join(out_dir, f"{meth}.png"), dpi=300)
            plt.close()

In [None]:
# Distances between distributions, one row per (disease, metric, bundle).
df_dist = pd.read_csv("RESULTS/DISTRIBUTION_ANALYSIS/distance_metrics_results.csv")
methodi = [name for name in diff_df_smape.columns if name not in context_c]

# Mean-based plots first, then raw plots; for each, SMAPE_DIFF then STD_MAE_DIFF.
for plot_corr, folder in ((tracer_corr_par_ratio, "PLOTS_CORRELATION_MEAN"),
                          (tracer_corr_par_ratio_brut, "PLOTS_CORRELATION_RAW")):
    base_out_dir = os.path.join(MAINFOLDER, folder)
    for errors_table, label in ((diff_df_smape, "SMAPE_DIFF"),
                                (diff_df_std, "STD_MAE_DIFF")):
        plot_corr(
            df_dist=df_dist,
            df_errors=errors_table.reset_index(),
            methods=methodi,
            disease_ratios=disease_ratios,
            out_root=os.path.join(base_out_dir, label),
        )



## MEILLEURS PIRES

In [None]:
def moyenne_par_disease_metric_bundle(df, colonnes_valeurs, disease_ratio=None):
    """Average value columns per (disease, metric, bundle) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``disease``, ``metric``, ``bundle`` and the
        requested value columns; ``disease_ratio`` is only needed when a
        filter is requested.
    colonnes_valeurs : str or list of str
        Name(s) of the column(s) to average. A single name is accepted.
    disease_ratio : float, optional
        When given, only rows whose ``disease_ratio`` is numerically close to
        this value are kept before averaging.

    Returns
    -------
    pandas.DataFrame
        One row per (disease, metric, bundle) with the group keys as regular
        columns followed by the averaged value columns.
    """
    if isinstance(colonnes_valeurs, str):
        colonnes_valeurs = [colonnes_valeurs]
    if disease_ratio is not None:
        # Tolerant comparison: callers build the ratio as `fraction * 100`,
        # and e.g. 0.07 * 100 == 7.000000000000001, which an exact `==`
        # against a stored 7.0 would silently miss.
        ratio_values = df["disease_ratio"].to_numpy(dtype=float)
        df = df[np.isclose(ratio_values, disease_ratio)]
    return (
        df.groupby(["disease", "metric", "bundle"])[colonnes_valeurs]
          .mean()
          .reset_index()
    )
def meilleurs_par_methode(df, methods, top=1, ratios=None):
    """Print and return the largest group means for each method and ratio.

    For every method column and every disease ratio, the values are averaged
    per (disease, metric, bundle) with ``moyenne_par_disease_metric_bundle``
    and the ``top`` largest means are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one column per method plus ``disease``, ``metric``,
        ``bundle`` and ``disease_ratio``.
    methods : iterable of str
        Method columns to report; names absent from ``df`` are skipped
        without printing a header.
    top : int, default 1
        Number of groups to keep per (method, ratio).
    ratios : iterable of float, optional
        Disease ratios expressed as fractions; each one is multiplied by 100
        before being matched against ``disease_ratio``. Defaults to the
        notebook-level ``disease_ratios``.

    Returns
    -------
    pandas.DataFrame
        The printed rows, with columns ``method``, ``disease_ratio``,
        ``disease``, ``metric``, ``bundle`` and ``value``.
    """
    if ratios is None:
        ratios = disease_ratios  # notebook-level default
    records = []
    for meth in methods:
        # Checked once, before the header, so a missing method produces no
        # output at all (same behaviour as meilleurs_par_methode_site).
        if meth not in df.columns:
            continue
        print(f"\n========== {meth} =========")
        for d_r in ratios:
            d_r = d_r * 100
            best = (
                moyenne_par_disease_metric_bundle(df, meth, d_r)
                .sort_values(meth, ascending=False)
                .head(top)
            )
            print(f"\n== {d_r} ==")
            for _, row in best.iterrows():
                print(f"{row['disease']} | {row['metric']} | {row['bundle']} : {row[meth]:.4f}")
                records.append({
                    "method": meth,
                    "disease_ratio": d_r,
                    "disease": row["disease"],
                    "metric": row["metric"],
                    "bundle": row["bundle"],
                    "value": row[meth],
                })
    return pd.DataFrame(
        records,
        columns=["method", "disease_ratio", "disease", "metric", "bundle", "value"],
    )

def meilleurs_par_methode_site(df, methods, top=1, ratios=None):
    """Print and return the largest per-site values for each method and ratio.

    For every method column and every disease ratio, the rows of that ratio
    are sorted by decreasing method value, reduced to one row per
    (disease, site, metric, bundle) - the one with the largest value - and
    the first ``top`` rows are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one column per method plus ``disease``, ``site``,
        ``metric``, ``bundle`` and ``disease_ratio``.
    methods : iterable of str
        Method columns to report; names absent from ``df`` are skipped.
    top : int, default 1
        Number of rows to keep per (method, ratio).
    ratios : iterable of float, optional
        Disease ratios expressed as fractions; each one is multiplied by 100
        before being matched against ``disease_ratio``. Defaults to the
        notebook-level ``disease_ratios``.

    Returns
    -------
    pandas.DataFrame
        The printed rows, with columns ``method``, ``disease_ratio``,
        ``disease``, ``site``, ``metric``, ``bundle`` and ``value``.
    """
    if ratios is None:
        ratios = disease_ratios  # notebook-level default
    key_cols = ["disease", "site", "metric", "bundle"]
    records = []
    for meth in methods:
        if meth not in df.columns:
            continue
        print(f"\n========== {meth} =========")
        ratio_values = df["disease_ratio"].to_numpy(dtype=float)
        for d_r in ratios:
            d_r = d_r * 100
            # Tolerant comparison: `fraction * 100` is not always exact
            # (0.07 * 100 == 7.000000000000001), so `==` could match nothing.
            sub = df[np.isclose(ratio_values, d_r)]
            if sub.empty:
                continue
            best = (
                sub.sort_values(meth, ascending=False)
                   .drop_duplicates(key_cols)
                   .head(top)
            )
            print(f"\n== {d_r:.0f}% ==")
            for _, row in best.iterrows():
                print(f"{row['disease']} | {row['site']} | {row['metric']} | {row['bundle']} : {row[meth]:.4f}")
                records.append({
                    "method": meth,
                    "disease_ratio": d_r,
                    "disease": row["disease"],
                    "site": row["site"],
                    "metric": row["metric"],
                    "bundle": row["bundle"],
                    "value": row[meth],
                })
    return pd.DataFrame(
        records,
        columns=["method", "disease_ratio", "disease", "site", "metric", "bundle", "value"],
    )

In [None]:
# Columns that are not method results and must be left out when listing the
# method columns below.
context_c = [
    'site',
    'disease',
    'metric',
    'bundle',
    'num_patients',
    'disease_ratio',
    'num_diseased',
    "raw",
    "NoRobust",
]

# Method names come from the SMAPE table; the ranking itself is run on the
# STD table with the 3 largest groups per method and ratio.
methodi = [col for col in diff_df_smape.columns if col not in context_c]
top_df = meilleurs_par_methode(
    diff_df_std.reset_index(),
    methodi,
    top=3,
)

In [None]:
# Same ranking on the STD table, but keeping the site in the grouping key.
top_df = meilleurs_par_methode_site(
    diff_df_std.reset_index(),
    methodi,
    top=3,
)

## Stats

In [None]:
# =============================================================
# 0. BASE CONFIGURATION
# =============================================================
# Working table: the pivoted STD frame with its index levels turned back
# into regular columns.
df = pivot_df_std.reset_index()
folder = os.path.join(ANALYSIS_FOLDER, 'METHODS_STATS')

# Descriptive columns (not method results).
site_cols = [
    'site', 'disease', 'metric', 'bundle',
    'num_patients', 'disease_ratio', 'num_diseased',
]
# Reference columns each method is compared against.
refs = ['NoRobust', 'hc']
# Whatever remains is treated as a method column.
method_cols = [
    col for col in df.columns
    if col not in site_cols and col not in refs
]

# Legend labels and colours for scenarios 1-4, in scenario order.
LABELS4 = [
    'M > HC et No',
    'No > M > HC',
    'HC > M > No',
    'HC et No > M',
]
COLORS4 = ['red', 'green', 'yellow', 'pink']
# Scenario 5 is the union of scenarios 2, 3 and 4.
LABEL5 = '2+3+4'
COLOR5 = 'purple'


# =============================================================
# 1. CLASSIFY & SUMMARY
# =============================================================
def classify(row, meth):
    """Return the scenario id (0-4) of one row for the method ``meth``.

    The method value is compared with the row's ``hc`` and ``NoRobust``
    values:

    * 1 - strictly above both references;
    * 2 - between them with ``hc <= value <= NoRobust``;
    * 3 - between them with ``NoRobust <= value <= hc``;
    * 4 - strictly below both references;
    * 0 - none of the comparisons holds (e.g. a NaN is involved).

    The tests are tried in that order, so ties with a reference land in
    scenario 2 or 3.
    """
    value = row[meth]
    ref_hc = row['hc']
    ref_nr = row['NoRobust']

    if value > ref_hc and value > ref_nr:
        scenario = 1
    elif ref_hc <= value <= ref_nr:
        scenario = 2
    elif ref_nr <= value <= ref_hc:
        scenario = 3
    elif value < ref_hc and value < ref_nr:
        scenario = 4
    else:
        scenario = 0
    return scenario

def summarize(df_sub, meths):
    """Tabulate scenario frequencies and mean differences per method/ratio.

    Every row of ``df_sub`` is labelled with ``classify`` for each method in
    ``meths``. Then, inside each ``disease_ratio`` group, the function stores
    for scenarios 1 to 4:

    * ``s{k}_perc``: percentage of the group's rows that fall in scenario k;
    * ``s{k}_mean_diff``: mean over those rows of ``method - NoRobust``
      (scenarios 1, 2, 4) or ``method - hc`` (scenario 3).

    Scenario 5 pools scenarios 2, 3 and 4, each row contributing the
    difference that belongs to its own scenario. A scenario without rows
    gets ``0`` as percentage and ``None`` as mean difference.

    Returns a DataFrame indexed by ``(method, disease_ratio)``.
    """
    records = []
    for meth in meths:
        scenario = df_sub.apply(classify, axis=1, meth=meth)
        delta_nr = df_sub[meth] - df_sub['NoRobust']   # used by scenarios 1, 2, 4
        delta_hc = df_sub[meth] - df_sub['hc']         # used by scenario 3

        for ratio, group in df_sub.groupby('disease_ratio'):
            in_group = scenario.index.isin(group.index)
            scen_g = scenario[in_group]
            nr_g = delta_nr[in_group]
            hc_g = delta_hc[in_group]
            group_size = len(group)

            record = {'method': meth, 'disease_ratio': ratio}

            # Scenarios 1 to 4
            for s in (1, 2, 3, 4):
                hits = scen_g == s
                count = hits.sum()
                if count:
                    source = hc_g if s == 3 else nr_g
                    record[f's{s}_perc'] = 100 * count / group_size
                    record[f's{s}_mean_diff'] = source[hits].mean()
                else:
                    record[f's{s}_perc'] = 0
                    record[f's{s}_mean_diff'] = None

            # Scenario 5 = union of 2, 3 and 4
            pooled = scen_g.isin([2, 3, 4])
            pooled_count = pooled.sum()
            if pooled_count:
                # Per-row difference: hc-based for scenario 3, NoRobust-based
                # otherwise; only the pooled rows enter the mean.
                per_row = nr_g.where(scen_g != 3, hc_g)
                record['s5_perc'] = 100 * pooled_count / group_size
                record['s5_mean_diff'] = per_row[pooled].mean()
            else:
                record['s5_perc'] = 0
                record['s5_mean_diff'] = None

            records.append(record)

    summary = pd.DataFrame(records)
    return summary.set_index(['method', 'disease_ratio'])


# =============================================================
# 2. PLOTS (param include_s5)
# =============================================================
def stacked_perc(df_plot, title, out_path, include_s5=False):
    """Save a stacked bar chart of the scenario percentages.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        Must contain ``s1_perc`` ... ``s4_perc`` (and ``s5_perc`` when
        ``include_s5`` is true); one bar is drawn per row.
    title : str
        Figure title.
    out_path : str
        Destination image path, written at 150 dpi.
    include_s5 : bool, default False
        Also stack the scenario 5 column.
    """
    cols = ['s1_perc', 's2_perc', 's3_perc', 's4_perc']
    lbls = list(LABELS4)
    cols_col = list(COLORS4)
    if include_s5:
        cols.append('s5_perc')
        lbls.append(LABEL5)
        cols_col.append(COLOR5)

    ax = (
        df_plot[cols]
        .rename(columns=dict(zip(cols, lbls)))
        .plot(kind='bar', stacked=True, color=cols_col,
              edgecolor='black', linewidth=0.5, figsize=(10, 6))
    )
    ax.set_ylabel('% des cas')
    # The rows are not always disease ratios (the function is also called
    # with frames indexed by method), so the axis label follows the index
    # name and only falls back to the historical 'disease_ratio'.
    ax.set_xlabel(df_plot.index.name or 'disease_ratio')
    ax.set_title(title)
    ax.legend(lbls, bbox_to_anchor=(1.02, 1), loc='upper left',
              title='Scénarios')

    # Work on the figure that owns the axes instead of pyplot's implicit
    # "current figure".
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    print('Sauvé :', out_path)

def bar_mean(df_plot, title, out_path, include_s5=False):
    """Save a grouped bar chart of the per-scenario mean differences.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        Must contain ``s1_mean_diff`` ... ``s4_mean_diff`` (and
        ``s5_mean_diff`` when ``include_s5`` is true); one bar group is
        drawn per row.
    title : str
        Figure title.
    out_path : str
        Destination image path, written at 150 dpi.
    include_s5 : bool, default False
        Also draw the scenario 5 column.
    """
    cols = ['s1_mean_diff', 's2_mean_diff', 's3_mean_diff', 's4_mean_diff']
    lbls = list(LABELS4)
    cols_col = list(COLORS4)
    if include_s5:
        cols.append('s5_mean_diff')
        lbls.append(LABEL5)
        cols_col.append(COLOR5)

    # Mean-diff columns are filled with None when a scenario has no rows, so
    # a column can end up as all-None object dtype. Such a column is expected
    # to be skipped by the pandas plotting backend as non-numeric, which
    # would shift the positional colours and legend labels; coercing to
    # numeric keeps every column (as NaN) and the legend aligned.
    data = df_plot[cols].apply(pd.to_numeric, errors='coerce')

    ax = (
        data.rename(columns=dict(zip(cols, lbls)))
        .plot(kind='bar', color=cols_col, edgecolor='black',
              linewidth=0.5, figsize=(10, 6))
    )
    ax.set_ylabel('Différence moyenne')
    # The rows are not always disease ratios, so the axis label follows the
    # index name and only falls back to the historical 'disease_ratio'.
    ax.set_xlabel(df_plot.index.name or 'disease_ratio')
    ax.set_title(title)
    ax.legend(lbls, bbox_to_anchor=(1.02, 1), loc='upper left',
              title='Scénarios')

    # Work on the figure that owns the axes instead of pyplot's implicit
    # "current figure".
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    print('Sauvé :', out_path)


# =============================================================
# 3. RUN: GLOBAL + PER RATIO + PER METHOD + METHOD x DISEASE
# =============================================================
os.makedirs(folder, exist_ok=True)
smry_all = summarize(df, method_cols)

# -- A) all ratios together: per-method mean of the summary (no scenario 5)
dir_ratio = os.path.join(folder, 'SCENARIOS_PROPORTIONS')
os.makedirs(dir_ratio, exist_ok=True)
stacked_perc(
    smry_all.groupby('method').mean(),
    'Tous ratios',
    os.path.join(dir_ratio, 'scenarios_ALL.png'),
)

# -- B) one chart per disease_ratio (still without scenario 5)
for ratio in smry_all.index.get_level_values('disease_ratio').unique():
    sub = smry_all.xs(ratio, level='disease_ratio')
    # Dots in the ratio would read as a file extension, hence the underscore.
    f = 'scenarios_ratio_' + str(ratio).replace(".", "_") + '.png'
    stacked_perc(sub, f'ratio {ratio}', os.path.join(dir_ratio, f))

# -- C) METHOD_STATS: one pair of charts per method
#       (scenario 5 appears in the mean-difference chart only)
dir_method = os.path.join(folder, 'METHOD_STATS')
os.makedirs(dir_method, exist_ok=True)
for meth in method_cols:
    sm = smry_all.xs(meth, level='method')
    stacked_perc(
        sm,
        f'Scénarios – {meth}',
        os.path.join(dir_method, f'{meth}_scenario_perc.png'),
        include_s5=False,
    )
    bar_mean(
        sm,
        f'Différence moyenne – {meth}',
        os.path.join(dir_method, f'{meth}_mean_diff.png'),
        include_s5=True,
    )

# -- D) METHOD_STATS_BY_DISEASE: same pair of charts per (method, disease),
#       with the summary recomputed on that disease's rows only
dir_md_dis = os.path.join(folder, 'METHOD_STATS_BY_DISEASE')
for meth in method_cols:
    # Path components are restricted to letters, digits, '_', '-' and '.'.
    safe_m = re.sub(r'[^A-Za-z0-9_\-\.]', '_', meth)
    for dis in df['disease'].dropna().unique():
        subset = df[df['disease'] == dis]
        sm = summarize(subset, [meth]).xs(meth, level='method')
        if sm.empty:
            continue
        safe_d = re.sub(r'[^A-Za-z0-9_\-\.]', '_', dis)
        outdir = os.path.join(dir_md_dis, safe_m, safe_d)
        os.makedirs(outdir, exist_ok=True)
        stacked_perc(
            sm,
            f'% Scénarios – {meth} – {dis}',
            os.path.join(outdir, f'{safe_m}_{safe_d}_scenario_perc.png'),
            include_s5=False,
        )
        bar_mean(
            sm,
            f'Mean diff – {meth} – {dis}',
            os.path.join(outdir, f'{safe_m}_{safe_d}_mean_diff.png'),
            include_s5=True,
        )
