# Notebook 6 - Analyse des résultats MAE
Ce notebook reprend le code de `ANALYSIS _MAE_SIMPLY` pour visualiser les métriques 
issues du Notebook 4 (`4-MAE_robust_pipeline.ipynb`). Les graphiques sont écrits dans 
`RESULTS/MAE_TEST/ANALYSIS_classic`.


In [35]:
import os, math
import pandas as pd
import subprocess
import re
import numpy as np
import json
import csv
from pathlib import Path

from joblib import Parallel, delayed
BW_FOLDER = "BS"


import matplotlib.pyplot as plt
import seaborn as sns



from scripts import combat_info
from scripts import combat_quick_apply
from scripts import combat_quick_QC
from robust_evaluation_tools.robust_utils import get_site, robust_text, rwp_text, get_camcan_file, get_diseases, get_metrics, add_nb_patients_and_diseased
from robust_evaluation_tools.robust_harmonization import fit, apply, visualize_harmonization, QC, compare_with_compilation, create_presentation, compare_distances, compare_with_compilation_var
from robust_evaluation_tools.synthectic_sites_generations import generate_sites
from robust_evaluation_tools.robust_outlier_detection import z_score_detection, flag_sid
from robust_evaluation_tools.robust_MLP import predict_malades_MLP

MAINFOLDER = "RESULTS/MAE_TEST"
SYNTHETIC_SITES = f"{MAINFOLDER}/SYNTHETIC_SITES"
harmonization_method= "classic"
ANALYSIS_FOLDER = f"{MAINFOLDER}/ANALYSIS/{harmonization_method}"
MAE_PLOT_FOLDER = f"{ANALYSIS_FOLDER}/MAE_PLOTS"

## EXECUTOR

In [36]:
SYNTHETIC_SITES_VERSION = "v1"

metrics = get_metrics()

# Paramètres contrôlant l'analyse (laisser à None pour autodétection)
diseases = ["ALL"]  # mettre None pour prendre toutes les maladies disponibles
sample_sizes = None  # déduire automatiquement les tailles disponibles
disease_ratios = None  # déduire automatiquement les ratios disponibles
num_tests = None  # déduire automatiquement le nombre de répétitions
n_jobs = -1


In [37]:
def infer_results_grid(mainfolder, harmonization_method, diseases=None):
    """Explore les résultats générés par le Notebook 4 et
    renvoie (diseases, sample_sizes, disease_ratios, num_tests).
    """
    base_dir = Path(mainfolder) / f"PROCESS" / harmonization_method
    if not base_dir.exists():
        raise FileNotFoundError(f"{base_dir} introuvable.")

    if diseases:
        disease_candidates = diseases
    else:
        disease_candidates = sorted(p.name for p in base_dir.iterdir() if p.is_dir())

    detected_diseases = []
    sample_sizes = set()
    disease_ratios = set()
    inferred_num_tests = 0

    for disease in disease_candidates:
        disease_dir = base_dir / disease
        if not disease_dir.is_dir():
            continue
        detected_diseases.append(disease)
        for size_ratio_dir in disease_dir.iterdir():
            if not size_ratio_dir.is_dir() or '_' not in size_ratio_dir.name:
                continue
            size_part, ratio_part = size_ratio_dir.name.split('_', 1)
            try:
                sample_size_val = int(size_part)
                disease_ratio_val = int(ratio_part) / 100
            except ValueError:
                continue
            sample_sizes.add(sample_size_val)
            disease_ratios.add(disease_ratio_val)

            run_ids = [
                int(run_dir.name)
                for run_dir in size_ratio_dir.iterdir()
                if run_dir.is_dir() and run_dir.name.isdigit()
            ]
            if run_ids:
                inferred_num_tests = max(inferred_num_tests, max(run_ids) + 1, len(run_ids))

    detected_diseases = sorted(set(detected_diseases))
    if not detected_diseases or not sample_sizes or not disease_ratios or inferred_num_tests == 0:
        raise ValueError(
            f"Impossible de déduire les paramètres depuis {base_dir}."
        )
    return detected_diseases, sorted(sample_sizes), sorted(disease_ratios), inferred_num_tests


In [38]:
def load_mae_or_maev_compilations(mainfolder, diseases, sample_sizes, disease_ratios, num_tests, mae_or_maev='mae'):
    tests, trains = [], []
    for d in diseases:
        for s in sample_sizes:
            for r in disease_ratios:
                for i in range(num_tests):
                    base = os.path.join(mainfolder, f"PROCESS", harmonization_method, d, f"{s}_{int(r*100)}", str(i))
                    test_path  = os.path.join(base, f"{mae_or_maev}_compilation_test.csv")
                    train_path = os.path.join(base, f"{mae_or_maev}_compilation_train.csv")
                    if os.path.isfile(test_path):
                        tests.append(pd.read_csv(test_path))
                    if os.path.isfile(train_path):
                        trains.append(pd.read_csv(train_path))
    df_test  = pd.concat(tests,  ignore_index=True) if tests  else pd.DataFrame()
    df_train = pd.concat(trains, ignore_index=True) if trains else pd.DataFrame()
    return df_test, df_train

In [39]:
def load_compilation(mae_or_maev: str,
                     split: str,
                     *,
                     mainfolder: str,
                     diseases: list[str],
                     sample_sizes: list[int],
                     disease_ratios: list[int],
                     num_tests: int) -> pd.DataFrame:
    if mae_or_maev not in {"mae", "maev", "smape", "std_mae"}:
        raise ValueError("mae_or_maev doit être 'mae' ou 'maev'")
    if split not in {"test", "train"}:
        raise ValueError("split doit être 'test' ou 'train'")

    df_test, df_train = load_mae_or_maev_compilations(
        mainfolder,
        diseases,
        sample_sizes,
        disease_ratios,
        num_tests,
        mae_or_maev=mae_or_maev
    )
    return df_test if split == "test" else df_train

In [40]:
detected_diseases, detected_sample_sizes, detected_disease_ratios, detected_num_tests = infer_results_grid(
    MAINFOLDER, harmonization_method, diseases if diseases is not None else None
)

if diseases is None:
    diseases = detected_diseases
else:
    missing = sorted(set(diseases) - set(detected_diseases))
    if missing:
        print(f"Pas de données pour : {missing}")
    diseases = [d for d in diseases if d in detected_diseases]
    if not diseases:
        raise ValueError("Aucune maladie valide trouvée dans les résultats.")

sample_sizes = detected_sample_sizes if sample_sizes is None else sample_sizes
disease_ratios = detected_disease_ratios if disease_ratios is None else disease_ratios
num_tests = detected_num_tests if num_tests is None else num_tests

print(f"Maladies analysées : {diseases}")
print(f"Tailles d'échantillon : {sample_sizes}")
print(f"Ratios de malades : {disease_ratios}")
print(f"Nombre de répétitions : {num_tests}")

mae_compilation_train_all = load_compilation("mae", "train",
                      mainfolder=MAINFOLDER,
                      diseases=diseases,
                      sample_sizes=sample_sizes,
                      disease_ratios=disease_ratios,
                      num_tests=num_tests)
mae_compilation_test_all = load_compilation("mae", "test",
                      mainfolder=MAINFOLDER,
                      diseases=diseases,
                      sample_sizes=sample_sizes,
                      disease_ratios=disease_ratios,
                      num_tests=num_tests)

smape_compilation_train_all = load_compilation("smape", "train",
                      mainfolder=MAINFOLDER,
                      diseases=diseases,
                      sample_sizes=sample_sizes,
                      disease_ratios=disease_ratios,
                      num_tests=num_tests)
smape_compilation_test_all = load_compilation("smape", "test",
                      mainfolder=MAINFOLDER,
                      diseases=diseases,
                      sample_sizes=sample_sizes,
                      disease_ratios=disease_ratios,
                      num_tests=num_tests)

std_mae_compilation_train_all = load_compilation("std_mae", "train",
                      mainfolder=MAINFOLDER,
                      diseases=diseases,
                      sample_sizes=sample_sizes,
                      disease_ratios=disease_ratios,
                      num_tests=num_tests)
std_mae_compilation_test_all = load_compilation("std_mae", "test",
                      mainfolder=MAINFOLDER,
                      diseases=diseases,
                      sample_sizes=sample_sizes,
                      disease_ratios=disease_ratios,
                      num_tests=num_tests)

Maladies analysées : ['ALL']
Tailles d'échantillon : [100]
Ratios de malades : [0.03, 0.1, 0.3, 0.5, 0.7, 0.8]
Nombre de répétitions : 40


In [41]:
def transformer_df_large_en_long(df_large):

    context_cols = ["site", "method", "robust_method", "disease", "metric"]
    bundle_cols = [col for col in df_large.columns if col not in context_cols]

    df_long = df_large.melt(
        id_vars=context_cols,
        value_vars=bundle_cols,
        var_name="bundle",
        value_name="mae"
    )
    df_long.loc[df_long['robust_method'] == 'No', 'robust_method'] = df_long.loc[df_long['robust_method'] == 'No', 'method']


    return df_long

In [42]:
def plot_mae_mean_all_ratios(
        pivot_df, sample_size,
        directory, dataset_type, Y="MAE"):
    """
    Moyenne du MAE pour chaque méthode en combinant:
      • toutes les maladies
      • toutes les métriques
      • toutes les valeurs de disease_ratio
    On filtre uniquement par num_patients et on trace une barre par méthode.
    """
    # 1) Filtre uniquement sur le nombre de patients
    df_filt = pivot_df.loc[
        pivot_df.index.get_level_values("num_patients") == sample_size
    ].reset_index()

    # 2) Colonnes de méthodes (numériques)
    mae_cols = [c for c in df_filt.columns if c not in
                ['site','disease','metric','bundle',
                 'num_patients','disease_ratio','num_diseased']]

    # 3) Ordre et couleurs
    first = [c for c in ['HC', 'NO_FILTERING'] if c in mae_cols]
    remaining = [c for c in mae_cols if c not in first]
    non_mlp = [c for c in remaining if not c.startswith('MLP')]
    mlp_last = [c for c in remaining if c.startswith('MLP')]
    ordered_cols = first + non_mlp + mlp_last

    col_colors = {'HC': 'green', 'NO_FILTERING': 'red'}
    remaining = [c for c in ordered_cols if c not in col_colors]
    col_colors.update(dict(zip(remaining,
                               sns.color_palette("viridis", len(remaining)))))

    # 4) Moyenne globale pour chaque méthode
    means = [df_filt[col].dropna().mean() for col in ordered_cols]

    x = np.arange(len(ordered_cols))
    bar_w = 0.7

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(
        x, means,
        width=bar_w,
        color=[col_colors[c] for c in ordered_cols],
        edgecolor='black'
    )

    # 5) Mise en forme
    ax.set_xticks(x)
    ax.set_xticklabels(ordered_cols, rotation=45, ha='right')
    ax.set_ylabel(f"{Y}")
    ax.set_title(
        f"{Y}\n"
        f"Nb patients: {sample_size}   |   Dataset: {dataset_type}"
    )
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    plt.tight_layout()

    # 6) Sauvegarde
    out_dir = os.path.join(directory, f"{Y}_PLOTS_MEAN",
                           "ALL_DISEASES_ALL_METRICS_ALL_RATIOS",
                           str(sample_size))
    os.makedirs(out_dir, exist_ok=True)
    fname = f"{Y}_mean_all_ratios_{dataset_type}.png"
    plt.savefig(os.path.join(out_dir, fname), bbox_inches="tight")
    plt.close()

In [43]:
def plot_mae_mean_all_diseases_metrics(
        pivot_df, sample_size,
        directory, dataset_type, Y="MAE"):
    """
    Affiche la moyenne du MAE (pas d’écart‑type) pour chaque méthode
    en fonction de disease_ratio, en combinant:
        • toutes les maladies
        • toutes les métriques
        • tous les bundles
    """
    # 1) Filtre: seulement le nombre de patients
    df_filt = pivot_df.loc[
        pivot_df.index.get_level_values("num_patients") == sample_size
    ].reset_index()

    # 2) Colonnes de méthodes (numériques)
    mae_cols = [c for c in df_filt.columns if c not in
                ['site','disease','metric','bundle',
                 'num_patients','disease_ratio','num_diseased']]

    # 3) Ordre + couleurs
    first = [c for c in ['HC', 'NO_FILTERING'] if c in mae_cols]
    remaining = [c for c in mae_cols if c not in first]
    non_mlp = [c for c in remaining if not c.startswith('MLP')]
    mlp_last = [c for c in remaining if c.startswith('MLP')]
    ordered_cols = first + non_mlp + mlp_last

    col_colors = {'HC': 'green', 'NO_FILTERING': 'red'}
    remaining = [c for c in ordered_cols if c not in col_colors]
    col_colors.update(dict(zip(remaining,
                               sns.color_palette("viridis", len(remaining)))))

    # 4) X‑ticks (ratios)
    ratios = sorted(df_filt["disease_ratio"].unique())
    x = np.arange(len(ratios))
    g_width   = .8
    n_methods = len(ordered_cols)
    bar_w     = g_width / n_methods

    fig, ax = plt.subplots(figsize=(14, 7))

    for i_m, col in enumerate(ordered_cols):
        means = [
            df_filt[df_filt["disease_ratio"] == r][col].dropna().mean()
            for r in ratios
        ]

        pos = x - g_width / 2 + (i_m + .5) * bar_w
        ax.bar(
            pos, means,
            width=bar_w * .9,
            color=col_colors[col],
            edgecolor='black',
            label=col
        )

    # 5) Mise en forme
    ax.set_xlabel("Proportion of patients")
    ax.set_ylabel(f"{Y}")
    ax.set_title(
        f"{Y}\n"
        f"Nb patients : {sample_size}   |   Dataset : {dataset_type}"
    )
    ax.set_xticks(x)
    ax.set_xticklabels(ratios)
    ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    plt.tight_layout()

    # 6) Sauvegarde
    out_dir = os.path.join(directory, f"{Y}_PLOTS_MEAN",
                           f"ALL_DISEASES_ALL_METRICS", str(sample_size))
    os.makedirs(out_dir, exist_ok=True)
    fname = f"{Y}_mean_all_diseases_metrics_{dataset_type}.png"
    plt.savefig(os.path.join(out_dir, fname), bbox_inches="tight")
    plt.close()


In [44]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


def _plot_rank_barplot(
        pivot_df, sample_size, disease, directory,
        dataset_type, Y="MAE", metric=None, aggregate_metrics=False):
    """Plot helper reused by plot_rank and plot_rank_all_metrics.
    Set aggregate_metrics to True to combine every metric at once.
    """
    cond = (
        (pivot_df.index.get_level_values("num_patients") == sample_size) &
        (pivot_df.index.get_level_values("disease")      == disease)
    )

    if metric is not None:
        cond &= pivot_df.index.get_level_values("metric") == metric

    df_filt = pivot_df.loc[cond].reset_index()
    if df_filt.empty:
        return

    mae_cols = [c for c in df_filt.columns if c not in
                ['site','disease','metric','bundle',
                 'num_patients','disease_ratio','num_diseased']]

    first = [c for c in ['HC', 'NO_FILTERING'] if c in mae_cols]
    remaining = [c for c in mae_cols if c not in first]
    non_mlp = [c for c in remaining if not c.startswith('MLP')]
    mlp_last = [c for c in remaining if c.startswith('MLP')]
    ordered_cols = first + non_mlp + mlp_last

    if not ordered_cols:
        return

    col_colors = {'HC': 'green', 'NO_FILTERING': 'red'}
    extra_cols = [c for c in ordered_cols if c not in col_colors]
    if extra_cols:
        palette = sns.color_palette("viridis", len(extra_cols))
        col_colors.update(dict(zip(extra_cols, palette)))

    ratios = sorted(df_filt["disease_ratio"].unique())
    x = np.arange(len(ratios))
    g_width   = .8
    n_methods = len(ordered_cols)
    bar_w     = g_width / n_methods

    fig, ax = plt.subplots(figsize=(14, 7))

    for i_m, col in enumerate(ordered_cols):
        values_per_ratio = [
            df_filt[df_filt["disease_ratio"] == r][col].dropna().values
            for r in ratios
        ]
        means = np.array([
            vals.mean() if len(vals) else np.nan
            for vals in values_per_ratio
        ], dtype=float)
        stds = np.array([
            vals.std(ddof=0) if len(vals) else 0.0
            for vals in values_per_ratio
        ], dtype=float)

        pos = x - g_width / 2 + (i_m + .5) * bar_w
        ax.bar(
            pos, means,
            width=bar_w * .9,
            color=col_colors[col],
            edgecolor='black',
            label=col
        )
        ax.errorbar(
            pos, means,
            yerr=stds,
            fmt='none',
            ecolor='black',
            elinewidth=1,
            capsize=3
        )

    ax.set_xlabel("Proportion of patients")
    ylabel = f"{Y}"
    ax.set_ylabel(ylabel)

    if aggregate_metrics:
        ax.set_title(
            f"Overall Performance Across All Metrics and Bundles"
        )
        fname = f"{Y}_all_metrics_mean_{dataset_type}.png"
    else:
        ax.set_title(
            f"Overall {metric.upper()} Performance Across Bundles"
        )
        fname = f"{Y}_{metric}_mean_{dataset_type}.png"

    ax.set_xticks(x)
    ax.set_xticklabels(ratios)

    #ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    plt.tight_layout()
    out_dir = os.path.join(directory, f"{Y}_PLOTS_MEAN", disease,
                           str(sample_size))
    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(os.path.join(out_dir, fname), bbox_inches="tight")
    plt.close()



def plot_rank(
        pivot_df, sample_size, disease, metric,
        directory, dataset_type, Y="MAE"):
    """Trace la MEAN Y pour une maladie donnée et une métrique.
    """
    _plot_rank_barplot(
        pivot_df, sample_size, disease, directory, dataset_type,
        Y=Y, metric=metric, aggregate_metrics=False
    )


In [45]:
def plot_rank_all_metrics(
        pivot_df, sample_size, disease,
        directory, dataset_type, Y="MAE"):
    """Trace la MEAN Y en agrégeant toutes les métriques.
    """
    _plot_rank_barplot(
        pivot_df, sample_size, disease, directory, dataset_type,
        Y=Y, metric=None, aggregate_metrics=True
    )


In [46]:
def plot_mae_all_bundles_pivot(
        pivot_df, sample_size, disease, metric,
        directory, dataset_type, Y="MAE", bundle=None,
    ):
    # Construit la condition de filtre principale
    cond = (
        (pivot_df.index.get_level_values("num_patients") == sample_size) &
        (pivot_df.index.get_level_values("disease")      == disease) &
        (pivot_df.index.get_level_values("metric")       == metric)
    )

    # Ajoute le filtre bundle si demandé
    if bundle is not None:
        if isinstance(bundle, (list, tuple, set)):
            cond &= pivot_df.index.get_level_values("bundle").isin(bundle)
        else:
            cond &= pivot_df.index.get_level_values("bundle") == bundle

    df_filt = pivot_df.loc[cond].reset_index()

    # Colonnes MAE (toutes les numériques)
    mae_cols = [c for c in df_filt.columns if c not in
                ['site','disease','metric','bundle',
                 'num_patients','disease_ratio','num_diseased']]

    # Forcer l’ordre désiré
    first = [col for col in ['HC', 'NO_FILTERING'] if col in mae_cols]
    remaining = [c for c in mae_cols if c not in first]
    non_mlp = [c for c in remaining if not c.startswith('MLP')]
    mlp_last = [c for c in remaining if c.startswith('MLP')]
    ordered_cols = first + non_mlp + mlp_last

    # Couleurs
    col_colors = {}
    if 'HC' in ordered_cols:
        col_colors['HC'] = 'green'
    if 'NO_FILTERING' in ordered_cols:
        col_colors['NO_FILTERING'] = 'red'

    remaining = [c for c in ordered_cols if c not in col_colors]
    pal = sns.color_palette("viridis", len(remaining))
    col_colors.update(dict(zip(remaining, pal)))

    # Préparation des x‑ticks
    ratios = sorted(df_filt["disease_ratio"].unique())
    x = np.arange(len(ratios))
    g_width = .8
    n_methods = len(ordered_cols)
    box_w = g_width / n_methods

    fig, ax = plt.subplots(figsize=(14, 7))

    for i_m, col in enumerate(ordered_cols):
        data = [
            df_filt[df_filt["disease_ratio"] == r][col].dropna().values
            for r in ratios
        ]
        if not any(len(d) for d in data):
            continue

        pos = x - g_width / 2 + (i_m + .5) * box_w
        ax.boxplot(
            data,
            positions=pos,
            widths=box_w * .8,
            patch_artist=True,
            showfliers=False,
            boxprops=dict(facecolor=col_colors[col],
                          edgecolor=col_colors[col]),
            medianprops=dict(color='black')
        )

    # Libellés et titre
    ax.set_xlabel("Proportion of patients")
    ax.set_ylabel(Y)
    bundle_str = ", ".join(bundle) if isinstance(bundle, (list, tuple, set)) else bundle
    ax.set_title(
        f"{metric.upper()} Evaluation for {bundle_str.upper()} Bundle"
    )
    ax.set_xticks(x)
    ax.set_xticklabels(ratios)

    handles = [plt.Line2D([0], [0], color=col_colors[c], lw=3, label=c)
               for c in ordered_cols]
    # ax.legend(handles=handles, loc="upper left", bbox_to_anchor=(1, 1))
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    plt.tight_layout()

    # Dossier et nom de fichier
    out_dir = os.path.join(directory, f"{Y}_PLOTS_NEW", disease, str(sample_size))
    if bundle is not None:
        out_dir = os.path.join(out_dir,metric)
    os.makedirs(out_dir, exist_ok=True)
    bundle_suffix = bundle_str.replace(" ", "_") if bundle is not None else "all_bundles"
    plt.savefig(os.path.join(out_dir, f"{Y}_{metric}_{bundle_suffix}_boxplot_{dataset_type}.png"),
                bbox_inches="tight")
    plt.close()

In [47]:
def plot_mae_each_bundle(
        pivot_df, sample_size, disease, metric,
        directory, dataset_type, Y="MAE",
    ):
    # Liste des bundles présents pour le cas demandé
    bundles = pivot_df.loc[
        (pivot_df.index.get_level_values("num_patients") == sample_size) &
        (pivot_df.index.get_level_values("disease")      == disease) &
        (pivot_df.index.get_level_values("metric")       == metric)
    ].index.get_level_values("bundle").unique()

    for b in bundles:
        plot_mae_all_bundles_pivot(
            pivot_df, sample_size, disease, metric,
            directory, dataset_type, Y=Y, bundle=b,
        )

In [48]:
def rank_methods_per_row(pivot_df):
    """
    Retourne un DataFrame des mêmes dimensions que pivot_df
    où chaque cellule contient le rang (1 = meilleur, N = pire)
    calculé ligne par ligne.

    Les colonnes non numériques (s’il y en a) sont ignorées.
    """
    # On ne garde que les colonnes numériques (les méthodes)
    method_cols = pivot_df.select_dtypes(include='number').columns

    # Ranking ligne par ligne
    rank_df = (pivot_df[method_cols]
               .rank(axis=1, method='min', ascending=True)  # 1 = plus petit
               .astype(int))

    # Si tu veux conserver l’index multi‑index d’origine, c’est déjà le cas.
    # Si tu veux rajouter d’autres colonnes (non numériques) à côté :
    # return pivot_df.drop(columns=method_cols).join(rank_df)

    return rank_df

## ANALYSIS

In [49]:
mae_compilation_train_all_long = transformer_df_large_en_long(mae_compilation_train_all)
mae_compilation_test_all_long  = transformer_df_large_en_long(mae_compilation_test_all)
smape_compilation_test_all_long  = transformer_df_large_en_long(smape_compilation_test_all)
smape_compilation_train_all_long = transformer_df_large_en_long(smape_compilation_train_all)
std_mae_compilation_train_all_long = transformer_df_large_en_long(std_mae_compilation_train_all)
std_mae_compilation_test_all_long  = transformer_df_large_en_long(std_mae_compilation_test_all)
ROBUST_METHODS = [
    'NoRobust',
    'hc',
    'raw',
    'MAD',
    'IQR',
    'SN',
    'QN',
    'MMS',
    'VS',
    'FLIP',
    'G_ZS',
    'G_MAD',
    'MLP7_ALL'
]

df_long = mae_compilation_train_all_long
df_long = add_nb_patients_and_diseased(df_long)
df_long = df_long[df_long["robust_method"].isin(ROBUST_METHODS)]
print(df_long.columns)
# Normalize MLP *_ALL_9 labels to "MLP"
df_long['robust_method'] = (
    df_long['robust_method']
    .astype(str)
    .replace({k: 'MLP' for k in ['MLP7_ALL']})
)

print("robust_method unique:", df_long['robust_method'].unique())
# ⬛️ CELLULE 3‑bis — Filtrer les sites contenant des NaN
# --------------------------------------------------------

# 1)  Identifie les sites à exclure
sites_with_nan = (
    df_long
      .groupby("site")
      .filter(lambda g: g.isna().any().any())   # True si NaN dans le groupe
      ["site"]
      .unique()
)

# 2)  Statistiques
n_nan_sites   = len(sites_with_nan)
n_total_sites = df_long["site"].nunique()

print(f"Sites exclus pour NaN : {n_nan_sites} / {n_total_sites}")
if n_nan_sites:
    print("Liste :", list(sites_with_nan))

# 3)  Filtre le DataFrame pour la suite
df_long = df_long[~df_long["site"].isin(sites_with_nan)].copy()

pivot_df = (
    df_long
    .pivot_table(
        index=['site', 'disease', 'metric', 'bundle',
               'num_patients', 'disease_ratio', 'num_diseased'],
        columns='robust_method',
        values='mae',
        aggfunc='first'   # ou 'mean' si tu peux avoir plusieurs lignes identiques
    )
)

diff_df = pivot_df.sub(pivot_df['NoRobust'], axis=0)

ranked = rank_methods_per_row(pivot_df)

df_long = smape_compilation_train_all_long
df_long = add_nb_patients_and_diseased(df_long)
df_long = df_long[df_long["robust_method"].isin(ROBUST_METHODS)]

# 1)  Identifie les sites à exclure
sites_with_nan_smape = (
    df_long
      .groupby("site")
      .filter(lambda g: g.isna().any().any())   # True si NaN dans le groupe
      ["site"]
      .unique()
)

# 2)  Statistiques
n_nan_sites_smape   = len(sites_with_nan_smape)
n_total_sites_smape = df_long["site"].nunique()

print(f"Sites exclus pour NaN : {n_nan_sites_smape} / {n_total_sites_smape}")
if n_nan_sites_smape:
    print("Liste :", list(sites_with_nan_smape))

# 3)  Filtre le DataFrame pour la suite
df_long = df_long[~df_long["site"].isin(sites_with_nan)].copy()

pivot_df_smape = (
    df_long
    .pivot_table(
        index=['site', 'disease', 'metric', 'bundle',
               'num_patients', 'disease_ratio', 'num_diseased'],
        columns='robust_method',
        values='mae',
        aggfunc='first'   # ou 'mean' si tu peux avoir plusieurs lignes identiques
    )
)

diff_df_smape= pivot_df_smape.sub(pivot_df_smape['NoRobust'], axis=0)

ranked_smape = rank_methods_per_row(pivot_df_smape)


df_long = std_mae_compilation_train_all_long
df_long = add_nb_patients_and_diseased(df_long)
df_long = df_long[df_long["robust_method"].isin(ROBUST_METHODS)]


# 1)  Identifie les sites à exclure
sites_with_nan_std = (
    df_long
      .groupby("site")
      .filter(lambda g: g.isna().any().any())   # True si NaN dans le groupe
      ["site"]
      .unique()
)

# 2)  Statistiques
n_nan_sites_std   = len(sites_with_nan_std)
n_total_sites_std = df_long["site"].nunique()

print(f"Sites exclus pour NaN : {n_nan_sites_std} / {n_total_sites_std}")
if n_nan_sites_std:
    print("Liste :", list(sites_with_nan_std))

# 3)  Filtre le DataFrame pour la suite
df_long = df_long[~df_long["site"].isin(sites_with_nan)].copy()

pivot_df_std = (
    df_long
    .pivot_table(
        index=['site', 'disease', 'metric', 'bundle',
               'num_patients', 'disease_ratio', 'num_diseased'],
        columns='robust_method',
        values='mae',
        aggfunc='first'   # ou 'mean' si tu peux avoir plusieurs lignes identiques
    )
)

diff_df_std= pivot_df_std.sub(pivot_df_std['NoRobust'], axis=0)

ranked_std = rank_methods_per_row(pivot_df_std)

Index(['site', 'method', 'robust_method', 'disease', 'metric', 'bundle', 'mae',
       'num_patients', 'disease_ratio', 'num_diseased'],
      dtype='object')
robust_method unique: ['NoRobust' 'hc' 'raw' 'IQR' 'MAD' 'SN' 'QN' 'MMS' 'VS' 'FLIP' 'G_ZS'
 'G_MAD' 'MLP']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_long['robust_method'] = (


Sites exclus pour NaN : 0 / 240
Sites exclus pour NaN : 0 / 240
Sites exclus pour NaN : 0 / 240


In [50]:
# Renommer directement pivot_df_std : 'hc' -> 'HC_ONLY', 'NoRobust' -> 'All patients'
try:
    mapping = {}
    if 'hc' in pivot_df_std.columns:
        mapping['hc'] = 'HC'
    if 'NoRobust' in pivot_df_std.columns:
        mapping['NoRobust'] = 'NO_FILTERING'
    if 'MLP7_ALL' in pivot_df_std.columns:
        mapping['MLP7_ALL'] = 'MLP'
    if mapping:
        pivot_df_std.rename(columns=mapping, inplace=True)
except NameError:
    print("pivot_df_std n'est pas défini dans cet environnement.")


## EXEC PLOTS MAE


In [51]:
# Crée les tâches pour chaque combinaison
pivot_df_std = pivot_df_std.drop(columns=['raw', 'FLIP'], errors='ignore')

tasks = [
    (pivot_df_std, sample_size, MAE_PLOT_FOLDER, "train", "STD_MAE")
    for sample_size  in sample_sizes
]

# Mean All disease All Metrics
Parallel(n_jobs=-1)(
    delayed(plot_mae_mean_all_diseases_metrics)(*task) for task in tasks
)

# Mean All disease All Metrics all ratios
Parallel(n_jobs=-1)(
    delayed(plot_mae_mean_all_ratios)(*task) for task in tasks
)

tasks = [
    (pivot_df_std, sample_size, disease, MAE_PLOT_FOLDER, "train", "STD_MAE")
    for disease      in diseases
    for sample_size  in sample_sizes
]
# Mean All Metrics per disease
Parallel(n_jobs=-1)(
    delayed(plot_rank_all_metrics)(*task) for task in tasks
)
tasks = [
    (pivot_df_std, sample_size, disease, metric, MAE_PLOT_FOLDER, "train", "STD_MAE")
    for disease      in diseases
    for sample_size  in sample_sizes
    for metric       in metrics
]

# Mean per disease per metric all bundles
Parallel(n_jobs=-1)(
    delayed(plot_rank)(*task) for task in tasks
)

# New per disease per metric all bundles
# Parallel(n_jobs=-1)(
#     delayed(plot_mae_all_bundles_pivot)(*task) for task in tasks
# )
# Exécution parallèle
Parallel(n_jobs=-1)(
    delayed(plot_mae_each_bundle)(*task) for task in tasks
)

[None, None, None, None, None, None, None, None, None, None]