## IMPORTS and UTILS

In [None]:
print("hi")

import os, math
import pandas as pd
import subprocess
import re
import numpy as np
import json
import csv

from joblib import Parallel, delayed



import matplotlib.pyplot as plt
import seaborn as sns


import os

from scripts import combat_info
from scripts import combat_quick_apply
from scripts import combat_quick_QC
from robust_evaluation_tools.robust_utils import get_site, robust_text, rwp_text, get_camcan_file, get_diseases, get_metrics, add_nb_patients_and_diseased
from robust_evaluation_tools.robust_harmonization import fit, apply, visualize_harmonization, QC, compare_with_compilation, create_presentation, compare_distances, compare_with_compilation_var
from robust_evaluation_tools.synthectic_sites_generations import generate_sites
from robust_evaluation_tools.robust_outlier_detection import z_score_detection, flag_sid

# Root results folder; the two paths below are derived from it.
MAINFOLDER = "RESULTS/MAE_TEST"
# Location of the synthetic sites (presumably written by generate_sites — TODO confirm).
SYNTHETIC_SITES = f"{MAINFOLDER}/SYNTHETIC_SITES"

# Location for analysis outputs.
ANALYSIS_FOLDER = f"{MAINFOLDER}/ANALYSIS"

# Method labels considered by the analysis.
# NOTE(review): "No"/"raw" look like the non-robust baselines — confirm against the harmonization code.
robust_methods_for_analysis = ["No","raw", "IQR",'MAD','MMS', 'VS', 'VS2', 'FLIP', 'Z_SCORE', 'Z_SCORE_IQR', 'Z_SCORE_MAD']

## EXECUTOR

In [None]:
# Experiment configuration shared by the cells below.
harmonization_method= "classic"
SYNTHETIC_SITES_VERSION = "v2"

# Metric names come from the project helper; the disease list is hard-coded
# (the helper-based alternative is kept commented out).
metrics = get_metrics()
#diseases = get_diseases(True)
diseases = ["ASTMIX", "AD", "SCHZ", "TBI"]
# Robust methods evaluated in this run (an earlier selection is kept commented out).
robust_methods = ["Z_SCORE_MAD", 'Z_SCORE_IQR',"IQR",'MAD','MMS', 'VS','FLIP', 'Z_SCORE', ]
#robust_methods = ["MMS","IQR",'MAD', 'VS', 'VS2', 'TOP30', 'FLIP']
#'Z_SCORE'


sample_sizes = [30, 40, 100]  # Sample sizes to evaluate
disease_ratios = [0.03, 0.1, 0.3, 0.5]  # Fractions of diseased patients to evaluate
num_tests = 12  # Number of repetitions for each (size, ratio) combination
# Only referenced by the commented-out call below; presumably a joblib-style worker count (-1 = all cores) — TODO confirm.
n_jobs=-1

# Site generation is disabled here; the loaders below read results already on disk.
# for disease in diseases:
#     generate_sites_for_disease(
#         disease, SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION, sample_sizes, disease_ratios, num_tests, n_jobs
#     )

In [None]:
def load_mae_or_maev_compilations(mainfolder, diseases, sample_sizes, disease_ratios, num_tests, mae_or_maev='mae'):
    """
    Gather the per-run compilation CSVs into one test frame and one train frame.

    Each run lives in ``<mainfolder>/PROCESS/<disease>/<size>_<ratio %>/<run>``
    and may contain ``<mae_or_maev>_compilation_test.csv`` and
    ``<mae_or_maev>_compilation_train.csv``. Missing files are skipped.

    Returns ``(df_test, df_train)``; a side with no file at all is an empty
    DataFrame.
    """
    # NOTE(review): int() truncates, so a ratio such as 0.29 maps to "28".
    # Kept as is because the folder names are produced elsewhere — confirm.
    run_dirs = [
        os.path.join(mainfolder, "PROCESS", disease, f"{size}_{int(ratio*100)}", str(run))
        for disease in diseases
        for size in sample_sizes
        for ratio in disease_ratios
        for run in range(num_tests)
    ]

    def _stack(split):
        # Read every existing CSV of the given split, in run order.
        frames = []
        for run_dir in run_dirs:
            csv_path = os.path.join(run_dir, f"{mae_or_maev}_compilation_{split}.csv")
            if os.path.isfile(csv_path):
                frames.append(pd.read_csv(csv_path))
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)

    return _stack("test"), _stack("train")

In [None]:
def load_compilation(mae_or_maev: str,
                     split: str,
                     *,
                     mainfolder: str,
                     diseases: list[str],
                     sample_sizes: list[int],
                     disease_ratios: list[float],
                     num_tests: int) -> pd.DataFrame:
    """
    Load the concatenated compilation table for one error measure and one split.

    Parameters
    ----------
    mae_or_maev : one of ``"mae"``, ``"maev"`` or ``"smape"``; used as the
        prefix of the CSV file names.
    split : ``"test"`` or ``"train"``.
    mainfolder, diseases, sample_sizes, disease_ratios, num_tests :
        forwarded unchanged to ``load_mae_or_maev_compilations``.

    Returns
    -------
    The test or the train DataFrame returned by
    ``load_mae_or_maev_compilations``.

    Raises
    ------
    ValueError
        If ``mae_or_maev`` or ``split`` is not one of the accepted values.
    """
    if mae_or_maev not in {"mae", "maev", "smape"}:
        # The message now lists every accepted value ('smape' was missing).
        raise ValueError("mae_or_maev doit être 'mae', 'maev' ou 'smape'")
    if split not in {"test", "train"}:
        raise ValueError("split doit être 'test' ou 'train'")

    # The loader always reads both splits; only the requested one is returned.
    df_test, df_train = load_mae_or_maev_compilations(
        mainfolder,
        diseases,
        sample_sizes,
        disease_ratios,
        num_tests,
        mae_or_maev=mae_or_maev
    )
    return df_test if split == "test" else df_train

In [None]:
# Load the four concatenated tables used by the analysis cells:
# MAE and SMAPE, each for the train and the test split, all with the same
# experiment configuration.
mae_compilation_train_all = load_compilation("mae", "train",
                      mainfolder=MAINFOLDER,
                      diseases=diseases,
                      sample_sizes=sample_sizes,
                      disease_ratios=disease_ratios,
                      num_tests=num_tests)
mae_compilation_test_all = load_compilation("mae", "test",
                      mainfolder=MAINFOLDER,
                      diseases=diseases,
                      sample_sizes=sample_sizes,
                      disease_ratios=disease_ratios,
                      num_tests=num_tests)

# Same two loads for the SMAPE files.
smape_compilation_train_all = load_compilation("smape", "train",
                      mainfolder=MAINFOLDER,
                      diseases=diseases,
                      sample_sizes=sample_sizes,
                      disease_ratios=disease_ratios,
                      num_tests=num_tests)
smape_compilation_test_all = load_compilation("smape", "test",
                      mainfolder=MAINFOLDER,
                      diseases=diseases,
                      sample_sizes=sample_sizes,
                      disease_ratios=disease_ratios,
                      num_tests=num_tests)

In [None]:
def transformer_df_large_en_long(df_large):
    """
    Reshape a wide compilation table into long format.

    The columns ``site``, ``method``, ``robust_method``, ``disease`` and
    ``metric`` are kept as identifiers; every other column is unpivoted into
    a ``bundle`` (former column name) / ``mae`` (value) pair. Rows whose
    ``robust_method`` is ``"No"`` get their ``method`` value copied into
    ``robust_method``.
    """
    id_columns = ["site", "method", "robust_method", "disease", "metric"]
    value_columns = [name for name in df_large.columns if name not in id_columns]

    melted = pd.melt(
        df_large,
        id_vars=id_columns,
        value_vars=value_columns,
        var_name="bundle",
        value_name="mae",
    )

    # Rows without a robust method are labelled by their harmonization method.
    not_robust = melted['robust_method'] == 'No'
    melted.loc[not_robust, 'robust_method'] = melted.loc[not_robust, 'method']
    return melted

In [None]:
def compute_metrics(pivot, methods, eps=1e-7):
    """
    Summarise how each method column compares with ``hc`` and ``NoRobust``.

    Rows of ``pivot`` are split into three cases:

    - case 1: ``hc < NoRobust``  -> score ``1 - (E_m - hc) / (NoRobust - hc)``
    - case 2: ``NoRobust < hc``  -> score ``1 - (E_m - NoRobust) / (hc - NoRobust)``
    - case 3: ``|hc - NoRobust| < eps`` -> relative change
      ``(NoRobust - E_m) / NoRobust``

    Returns ``(summary, info)``: ``summary`` has one row per method with the
    share of negative scores and the mean positive / negative score for
    cases 1 and 2, the share and mean of improving / worsening rows for
    case 3, and the mean score over cases 1 and 2 together; ``info`` is a
    Series with the percentage of rows in each case and the row count.
    """
    baseline = pivot["NoRobust"]
    reference = pivot["hc"]

    tied = (reference - baseline).abs() < eps
    hc_lower = (reference < baseline) & ~tied
    baseline_lower = (baseline < reference) & ~tied

    def _scores_towards(candidate, best, worst, rows):
        # 1 when the candidate matches the lower reference, 0 when it matches
        # the higher one; NaN outside ``rows``.
        scores = pd.Series(np.nan, index=pivot.index)
        gap = worst - best
        scores[rows] = 1 - (candidate[rows] - best[rows]) / gap[rows]
        return scores

    def _share(subset, rows):
        # Percentage of the rows of a case that fall in ``subset``.
        if not rows.any():
            return np.nan
        return len(subset) / rows.sum() * 100

    records = []
    for name in methods:
        candidate = pivot[name]

        case1 = _scores_towards(candidate, reference, baseline, hc_lower)
        case2 = _scores_towards(candidate, baseline, reference, baseline_lower)
        case1_neg = case1[case1 < 0]
        case2_neg = case2[case2 < 0]

        # Case 3: relative change with respect to NoRobust (which is ~ hc here).
        rel_change = (baseline - candidate) / baseline
        tied_better = rel_change[tied & (rel_change > 0)]
        tied_worse = rel_change[tied & (rel_change < 0)]

        records.append({
            "method": name,

            # Case 1
            "%fail_c1": _share(case1_neg, hc_lower),
            "mean_gain_c1": case1[case1 >= 0].mean(),
            "mean_loss_c1": case1_neg.mean(),

            # Case 2
            "%fail_c2": _share(case2_neg, baseline_lower),
            "mean_gain_c2": case2[case2 >= 0].mean(),
            "mean_loss_c2": case2_neg.mean(),

            # Case 3
            "%eq_improve": _share(tied_better, tied),
            "mean_eq_gain": tied_better.mean(),
            "%eq_worsen": _share(tied_worse, tied),
            "mean_eq_loss": tied_worse.mean(),

            # Overall (cases 1 and 2 pooled)
            "score_mean_total": pd.concat([case1, case2]).mean(),
        })

    summary = pd.DataFrame(records)

    info = pd.Series({
        "Lignes cas1 (%)": hc_lower.mean()*100,
        "Lignes cas2 (%)": baseline_lower.mean()*100,
        "Lignes cas3 (%)": tied.mean()*100,
        "Total lignes": len(pivot)
    })

    return summary, info


In [None]:
def plot_mae_mean_all_ratios(
        pivot_df, sample_size,
        directory, dataset_type, Y="MAE"):
    """
    Save a bar chart with one bar per method column.

    Each bar is the mean of that column over every row whose
    ``num_patients`` index level equals ``sample_size`` (diseases, metrics,
    bundles and disease ratios are pooled). ``hc`` and ``NoRobust`` are drawn
    first, in green and red; the other columns use a viridis palette.

    The PNG goes to
    ``<directory>/<Y>_PLOTS_MEAN/ALL_DISEASES_ALL_METRICS_ALL_RATIOS/<sample_size>/``.
    Nothing is returned.
    """
    index_fields = ('site', 'disease', 'metric', 'bundle',
                    'num_patients', 'disease_ratio', 'num_diseased')
    fixed_colors = {'hc': 'green', 'NoRobust': 'red'}

    # Keep the requested sample size only, then flatten the index.
    patient_counts = pivot_df.index.get_level_values("num_patients")
    table = pivot_df.loc[patient_counts == sample_size].reset_index()

    # Every column that is not an index field is treated as a method.
    candidates = [name for name in table.columns if name not in index_fields]
    pinned = [name for name in ('hc', 'NoRobust') if name in candidates]
    methods = pinned + [name for name in candidates if name not in pinned]

    others = [name for name in methods if name not in fixed_colors]
    palette = sns.color_palette("viridis", len(others))
    colors = {**fixed_colors, **dict(zip(others, palette))}

    # One overall mean per method, NaN ignored.
    heights = []
    for name in methods:
        heights.append(table[name].dropna().mean())
    slots = np.arange(len(methods))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(
        slots, heights,
        width=0.7,
        color=[colors[name] for name in methods],
        edgecolor='black'
    )

    ax.set_title(
        f"Moyenne de {Y} - toutes maladies, métriques et ratios confondus\n"
        f"Nb patients: {sample_size}   |   Dataset: {dataset_type}"
    )
    ax.set_ylabel(f"Moyenne de {Y}")
    ax.set_xticks(slots)
    ax.set_xticklabels(methods, rotation=45, ha='right')
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    fig.tight_layout()

    target_dir = os.path.join(
        directory,
        f"{Y}_PLOTS_MEAN",
        "ALL_DISEASES_ALL_METRICS_ALL_RATIOS",
        str(sample_size),
    )
    os.makedirs(target_dir, exist_ok=True)
    target = os.path.join(target_dir, f"{Y}_mean_all_ratios_{dataset_type}.png")
    fig.savefig(target, bbox_inches="tight")
    plt.close(fig)

In [None]:
def plot_mae_mean_all_diseases_metrics(
        pivot_df, sample_size,
        directory, dataset_type, Y="MAE"):
    """
    Save a grouped bar chart of the mean of each method column per disease ratio.

    Rows are restricted to the ``num_patients`` index level equal to
    ``sample_size``; diseases, metrics and bundles are pooled. There is one
    group of bars per ``disease_ratio`` value and one bar per method
    (``hc`` and ``NoRobust`` first, in green and red). Means only, no spread.

    The PNG goes to
    ``<directory>/<Y>_PLOTS_MEAN/ALL_DISEASES_ALL_METRICS/<sample_size>/``.
    Nothing is returned.
    """
    index_fields = ('site', 'disease', 'metric', 'bundle',
                    'num_patients', 'disease_ratio', 'num_diseased')
    fixed_colors = {'hc': 'green', 'NoRobust': 'red'}

    patient_counts = pivot_df.index.get_level_values("num_patients")
    table = pivot_df.loc[patient_counts == sample_size].reset_index()

    # Every column that is not an index field is treated as a method.
    candidates = [name for name in table.columns if name not in index_fields]
    pinned = [name for name in ('hc', 'NoRobust') if name in candidates]
    methods = pinned + [name for name in candidates if name not in pinned]

    others = [name for name in methods if name not in fixed_colors]
    palette = sns.color_palette("viridis", len(others))
    colors = {**fixed_colors, **dict(zip(others, palette))}

    # One group per ratio; the group width is shared evenly by the methods.
    ratios = sorted(table["disease_ratio"].unique())
    centers = np.arange(len(ratios))
    group_width = .8
    slot_width = group_width / len(methods)

    fig, ax = plt.subplots(figsize=(14, 7))

    for rank, name in enumerate(methods):
        heights = []
        for ratio in ratios:
            values = table.loc[table["disease_ratio"] == ratio, name]
            heights.append(values.dropna().mean())

        offsets = centers - group_width / 2 + (rank + .5) * slot_width
        ax.bar(
            offsets, heights,
            width=slot_width * .9,
            color=colors[name],
            edgecolor='black',
            label=name
        )

    ax.set_title(
        f"Moyenne de {Y} (toutes maladies et métriques confondues)\n"
        f"Nb patients : {sample_size}   |   Dataset : {dataset_type}"
    )
    ax.set_xlabel("Pourcentage de patients malades")
    ax.set_ylabel(f"Moyenne de {Y}")
    ax.set_xticks(centers)
    ax.set_xticklabels(ratios)
    ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    fig.tight_layout()

    target_dir = os.path.join(
        directory,
        f"{Y}_PLOTS_MEAN",
        "ALL_DISEASES_ALL_METRICS",
        str(sample_size),
    )
    os.makedirs(target_dir, exist_ok=True)
    target = os.path.join(target_dir, f"{Y}_mean_all_diseases_metrics_{dataset_type}.png")
    fig.savefig(target, bbox_inches="tight")
    plt.close(fig)


In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def plot_rank(
        pivot_df, sample_size, disease, metric,
        directory, dataset_type, Y="MAE"):
    """
    Save a grouped bar chart of the mean of each method column per disease ratio,
    for one sample size, one disease and one metric (all bundles pooled).

    Despite its name, the function plots column means, not ranks, unless the
    caller passes a table of ranks. ``hc`` and ``NoRobust`` are drawn first,
    in green and red; the other columns use a viridis palette.

    The PNG goes to ``<directory>/<Y>_PLOTS_MEAN/<disease>/<sample_size>/``.
    Nothing is returned.
    """
    index_fields = ('site', 'disease', 'metric', 'bundle',
                    'num_patients', 'disease_ratio', 'num_diseased')
    fixed_colors = {'hc': 'green', 'NoRobust': 'red'}

    # Select the rows of the requested (sample size, disease, metric) combination.
    levels = pivot_df.index
    wanted = (
        (levels.get_level_values("num_patients") == sample_size)
        & (levels.get_level_values("disease") == disease)
        & (levels.get_level_values("metric") == metric)
    )
    table = pivot_df.loc[wanted].reset_index()

    # Every column that is not an index field is treated as a method.
    candidates = [name for name in table.columns if name not in index_fields]
    pinned = [name for name in ('hc', 'NoRobust') if name in candidates]
    methods = pinned + [name for name in candidates if name not in pinned]

    others = [name for name in methods if name not in fixed_colors]
    palette = sns.color_palette("viridis", len(others))
    colors = {**fixed_colors, **dict(zip(others, palette))}

    # One group per ratio; the group width is shared evenly by the methods.
    ratios = sorted(table["disease_ratio"].unique())
    centers = np.arange(len(ratios))
    group_width = .8
    slot_width = group_width / len(methods)

    fig, ax = plt.subplots(figsize=(14, 7))

    for rank, name in enumerate(methods):
        heights = []
        for ratio in ratios:
            values = table.loc[table["disease_ratio"] == ratio, name]
            heights.append(values.dropna().mean())

        offsets = centers - group_width / 2 + (rank + .5) * slot_width
        ax.bar(
            offsets, heights,
            width=slot_width * .9,
            color=colors[name],
            edgecolor='black',
            label=name
        )

    ax.set_title(
        f"Moyenne de {Y}, tous bundles confondus\n"
        f"Maladie : {disease}   |   Metric : {metric}\n"
        f"Nb patients : {sample_size}   |   Dataset : {dataset_type}"
    )
    ax.set_xlabel("Pourcentage de patients malades")
    ax.set_ylabel(f"Moyenne de {Y}")
    ax.set_xticks(centers)
    ax.set_xticklabels(ratios)

    ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    fig.tight_layout()

    target_dir = os.path.join(directory, f"{Y}_PLOTS_MEAN", disease, str(sample_size))
    os.makedirs(target_dir, exist_ok=True)
    target = os.path.join(target_dir, f"{Y}_{metric}_mean_{dataset_type}.png")
    fig.savefig(target, bbox_inches="tight")
    plt.close(fig)


In [None]:
def plot_rank_all_metrics(
        pivot_df, sample_size, disease,
        directory, dataset_type, Y="MAE"):
    """
    Save a grouped bar chart of the mean of each method column per disease ratio,
    for one sample size and one disease, pooling every metric and bundle.

    ``hc`` and ``NoRobust`` are drawn first, in green and red; the other
    columns use a viridis palette.

    The PNG goes to ``<directory>/<Y>_PLOTS_MEAN/<disease>/<sample_size>/``.
    Nothing is returned.
    """
    index_fields = ('site', 'disease', 'metric', 'bundle',
                    'num_patients', 'disease_ratio', 'num_diseased')
    fixed_colors = {'hc': 'green', 'NoRobust': 'red'}

    # Select the rows of the requested (sample size, disease) pair; no metric filter.
    levels = pivot_df.index
    wanted = (
        (levels.get_level_values("num_patients") == sample_size)
        & (levels.get_level_values("disease") == disease)
    )
    table = pivot_df.loc[wanted].reset_index()

    # Every column that is not an index field is treated as a method.
    candidates = [name for name in table.columns if name not in index_fields]
    pinned = [name for name in ('hc', 'NoRobust') if name in candidates]
    methods = pinned + [name for name in candidates if name not in pinned]

    others = [name for name in methods if name not in fixed_colors]
    palette = sns.color_palette("viridis", len(others))
    colors = {**fixed_colors, **dict(zip(others, palette))}

    # One group per ratio; the group width is shared evenly by the methods.
    ratios = sorted(table["disease_ratio"].unique())
    centers = np.arange(len(ratios))
    group_width = .8
    slot_width = group_width / len(methods)

    fig, ax = plt.subplots(figsize=(14, 7))

    for rank, name in enumerate(methods):
        heights = []
        for ratio in ratios:
            values = table.loc[table["disease_ratio"] == ratio, name]
            heights.append(values.dropna().mean())

        offsets = centers - group_width / 2 + (rank + .5) * slot_width
        ax.bar(
            offsets, heights,
            width=slot_width * .9,
            color=colors[name],
            edgecolor='black',
            label=name
        )

    ax.set_title(
        f"Moyenne de {Y}, tous bundles et métriques confondus\n"
        f"Maladie : {disease}   |   Nb patients : {sample_size}   |   Dataset : {dataset_type}"
    )
    ax.set_xlabel("Pourcentage de patients malades")
    ax.set_ylabel(f"Moyenne de {Y} (toutes métriques)")
    ax.set_xticks(centers)
    ax.set_xticklabels(ratios)

    ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    fig.tight_layout()

    target_dir = os.path.join(directory, f"{Y}_PLOTS_MEAN", disease, str(sample_size))
    os.makedirs(target_dir, exist_ok=True)
    target = os.path.join(target_dir, f"{Y}_all_metrics_mean_{dataset_type}.png")
    fig.savefig(target, bbox_inches="tight")
    plt.close(fig)

In [None]:
def plot_mae_all_bundles_pivot(
        pivot_df, sample_size, disease, metric,
        directory, dataset_type, Y="MAE",
    ):
    """
    Save grouped box plots of each method column per disease ratio,
    for one sample size, one disease and one metric (all bundles pooled).

    There is one group per ``disease_ratio`` value and one box per method;
    outliers are not drawn. A method with no value at any ratio is not drawn
    but still appears in the legend. ``hc`` and ``NoRobust`` come first, in
    green and red; the other columns use a viridis palette.

    The PNG goes to ``<directory>/<Y>_PLOTS_NEW/<disease>/<sample_size>/``.
    Nothing is returned.
    """
    index_fields = ('site', 'disease', 'metric', 'bundle',
                    'num_patients', 'disease_ratio', 'num_diseased')

    # Select the rows of the requested (sample size, disease, metric) combination.
    levels = pivot_df.index
    wanted = (
        (levels.get_level_values("num_patients") == sample_size)
        & (levels.get_level_values("disease") == disease)
        & (levels.get_level_values("metric") == metric)
    )
    table = pivot_df.loc[wanted].reset_index()

    # Every column that is not an index field is treated as a method.
    candidates = [name for name in table.columns if name not in index_fields]
    pinned = [name for name in ('hc', 'NoRobust') if name in candidates]
    methods = pinned + [name for name in candidates if name not in pinned]

    # Fixed colours for the two references (when present), palette for the rest.
    colors = {
        name: shade
        for name, shade in (('hc', 'green'), ('NoRobust', 'red'))
        if name in methods
    }
    others = [name for name in methods if name not in colors]
    colors.update(zip(others, sns.color_palette("viridis", len(others))))

    # One group per ratio; the group width is shared evenly by the methods.
    ratios = sorted(table["disease_ratio"].unique())
    centers = np.arange(len(ratios))
    group_width = .8
    slot_width = group_width / len(methods)

    fig, ax = plt.subplots(figsize=(14, 7))

    for rank, name in enumerate(methods):
        samples = []
        for ratio in ratios:
            values = table.loc[table["disease_ratio"] == ratio, name]
            samples.append(values.dropna().values)
        if all(len(sample) == 0 for sample in samples):
            # Nothing to draw for this method.
            continue

        ax.boxplot(
            samples,
            positions=centers - group_width / 2 + (rank + .5) * slot_width,
            widths=slot_width * .8,
            patch_artist=True,
            showfliers=False,
            boxprops=dict(facecolor=colors[name],
                          edgecolor=colors[name]),
            medianprops=dict(color='black')
        )

    ax.set_title(
        f"{Y} d’harmonisation, tous bundles confondus\n"
        f"Maladie : {disease}   |   Metric : {metric}\n"
        f"Nb patients : {sample_size}   |   Dataset : {dataset_type}"
    )
    ax.set_xlabel("Pourcentage de patients malades")
    ax.set_ylabel(Y)
    ax.set_xticks(centers)
    ax.set_xticklabels(ratios)

    # Box plots carry no labels, so the legend is built from proxy lines.
    legend_entries = []
    for name in methods:
        legend_entries.append(plt.Line2D([0], [0], color=colors[name], lw=3, label=name))
    ax.legend(handles=legend_entries, loc="upper left", bbox_to_anchor=(1, 1))
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    fig.tight_layout()

    target_dir = os.path.join(directory, f"{Y}_PLOTS_NEW", disease, str(sample_size))
    os.makedirs(target_dir, exist_ok=True)
    target = os.path.join(target_dir, f"{Y}_{metric}_all_bundles_boxplot_{dataset_type}.png")
    fig.savefig(target, bbox_inches="tight")
    plt.close(fig)


In [None]:

def plot_mae_all_bundles_pivot_3_way(
        pivot_df, sample_size, disease, metric,
        directory, dataset_type, Y="MAE"):
    """
    Save three stacked box-plot panels of each method column per disease ratio,
    one panel per relation between ``hc`` and ``NoRobust``.

    Rows of the requested (sample size, disease, metric) combination are
    split into ``hc < NoRobust``, ``hc = NoRobust`` (within ``np.isclose``
    tolerance) and ``hc > NoRobust``. Each panel title shows the share of
    rows in that group. All panels use the same x positions.

    The PNG goes to ``<directory>/<Y>_PLOTS_NEW/<disease>/<sample_size>/``.
    Nothing is returned, and nothing is written when no row matches the
    filter.
    """
    # 1) Base filter
    df_filt = pivot_df.loc[
        (pivot_df.index.get_level_values("num_patients") == sample_size) &
        (pivot_df.index.get_level_values("disease")      == disease) &
        (pivot_df.index.get_level_values("metric")       == metric)
    ].reset_index()

    # Without rows there is no ratio axis (x[0] below would raise IndexError).
    if df_filt.empty:
        return

    # 2) Method columns: everything that is not an index field
    mae_cols = [c for c in df_filt.columns if c not in
                ['site','disease','metric','bundle',
                 'num_patients','disease_ratio','num_diseased']]

    # 3) Order + colours (references first, palette for the others)
    ordered_cols = [c for c in ['hc', 'NoRobust'] if c in mae_cols]
    ordered_cols += [c for c in mae_cols if c not in ordered_cols]

    col_colors = {'hc': 'green', 'NoRobust': 'red'}
    remaining = [c for c in ordered_cols if c not in col_colors]
    col_colors.update(dict(zip(remaining,
                               sns.color_palette("viridis", len(remaining)))))

    # 4) Row classification. np.select keeps the first matching condition,
    #    so the tolerance test must come first: otherwise a tiny positive
    #    difference is filed under "greater" while a tiny negative one is
    #    filed under "equal".
    df_filt['hc_vs_no'] = np.select(
        [
            np.isclose(df_filt['hc'], df_filt['NoRobust']),
            df_filt['hc'] > df_filt['NoRobust']
        ],
        ['hc_equal', 'hc_greater'],
        default='hc_smaller'
    )

    # 5) Share of rows per group, in percent
    pct_map = (df_filt['hc_vs_no']
               .value_counts(normalize=True)
               .mul(100).round(1)
               .to_dict())

    label_map = {
        'hc_greater': 'HC > NoRobust',
        'hc_equal'  : 'HC = NoRobust',
        'hc_smaller': 'HC < NoRobust'
    }

    groups = ['hc_smaller', 'hc_equal', 'hc_greater']  # panel order, top to bottom
    titles = [f"{label_map[g]} ({pct_map.get(g, 0.0):.1f} %)" for g in groups]

    # 6) Ratios taken from the whole selection so every panel shares the same x axis
    ratios_all = sorted(df_filt["disease_ratio"].unique())
    x = np.arange(len(ratios_all))
    g_width   = .8
    n_methods = len(ordered_cols)
    box_w     = g_width / n_methods

    # 7) Sub-plots
    fig, axes = plt.subplots(nrows=3, sharex=True, figsize=(14, 18))

    for ax, grp, grp_title in zip(axes, groups, titles):
        sub = df_filt[df_filt['hc_vs_no'] == grp]

        # Draw each method
        for i_m, col in enumerate(ordered_cols):
            # One sample per ratio, possibly empty, to keep positions aligned
            data = [
                sub[sub["disease_ratio"] == r][col].dropna().values
                for r in ratios_all
            ]
            if not any(len(d) for d in data):
                continue  # nothing to draw for this method

            pos = x - g_width / 2 + (i_m + .5) * box_w
            ax.boxplot(
                data,
                positions=pos,
                widths=box_w * .8,
                patch_artist=True,
                showfliers=False,
                boxprops=dict(facecolor=col_colors[col],
                              edgecolor=col_colors[col]),
                medianprops=dict(color='black')
            )

        # Zero line and labels
        ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
        ax.set_ylabel(Y)
        ax.set_title(grp_title, loc='left', fontsize=12)

        # Same x axis on every panel; tick labels forced on despite sharex
        ax.set_xticks(x)
        ax.set_xticklabels(ratios_all)
        ax.set_xlabel("Pourcentage de patients malades")
        ax.tick_params(axis='x', labelbottom=True)

        # Legend on the first panel only (proxy lines, box plots carry no labels)
        if grp == groups[0]:
            handles = [plt.Line2D([0], [0], color=col_colors[c], lw=3, label=c)
                       for c in ordered_cols]
            ax.legend(handles=handles, loc="upper left", bbox_to_anchor=(1, 1))

        # Same horizontal extent on every panel
        ax.set_xlim(x[0] - 0.5, x[-1] + 0.5)

    # 8) Figure title
    fig.suptitle(
        f"{Y} d’harmonisation, tous bundles confondus\n"
        f"Maladie: {disease} | Metric: {metric}\n"
        f"Nb patients: {sample_size} | Dataset: {dataset_type}",
        fontsize=14
    )

    # 9) Save (the rect leaves room for the figure title)
    fig.tight_layout(rect=[0, 0, 1, 0.93])
    out_dir = os.path.join(directory, f"{Y}_PLOTS_NEW", disease,
                           str(sample_size))
    os.makedirs(out_dir, exist_ok=True)
    fname = f"{Y}_{metric}_all_bundles_boxplot_split_{dataset_type}_3_way.png"
    fig.savefig(os.path.join(out_dir, fname), bbox_inches="tight")
    plt.close(fig)


In [None]:
def rank_methods_per_row(pivot_df):
    """
    Rank the numeric columns of ``pivot_df`` row by row.

    Rank 1 is the smallest value of the row; ties share the lowest rank
    (``method='min'``). Non-numeric columns are dropped, so the result has
    the same index as ``pivot_df`` but only its numeric columns.

    Returns
    -------
    DataFrame of integer ranks. When a row has a missing value, its rank is
    ``<NA>`` and the result uses the nullable ``Int64`` dtype; a plain
    integer cast would raise in that case.
    """
    # Only the numeric columns (the methods) take part in the ranking.
    method_cols = pivot_df.select_dtypes(include='number').columns

    # Row-wise ranking; NaN values keep a NaN rank.
    ranks = pivot_df[method_cols].rank(axis=1, method='min', ascending=True)

    if ranks.isna().to_numpy().any():
        # Plain int cannot hold missing ranks.
        return ranks.astype("Int64")
    return ranks.astype(int)

## ANALYSIS

In [None]:
# Long-format layout: site, disease, n_patients, ratio, metric, bundle, robust_method, mae
# (the value column is named "mae" for the SMAPE tables as well).
mae_compilation_train_all_long = transformer_df_large_en_long(mae_compilation_train_all)
mae_compilation_test_all_long  = transformer_df_large_en_long(mae_compilation_test_all)
smape_compilation_test_all_long  = transformer_df_large_en_long(smape_compilation_test_all)
smape_compilation_train_all_long = transformer_df_large_en_long(smape_compilation_train_all)


# MAE analysis runs on the train split.
# Presumably the helper adds num_patients / disease_ratio / num_diseased,
# which the pivot below uses as index fields — TODO confirm.
df_long = mae_compilation_train_all_long
df_long = add_nb_patients_and_diseased(df_long)
print(df_long.columns)

# Cell 3-bis — drop the sites that contain NaN
# --------------------------------------------------------

# 1)  Identify the sites to exclude
sites_with_nan = (
    df_long
      .groupby("site")
      .filter(lambda g: g.isna().any().any())   # True when the group holds a NaN
      ["site"]
      .unique()
)

# 2)  Statistics
n_nan_sites   = len(sites_with_nan)
n_total_sites = df_long["site"].nunique()

print(f"Sites exclus pour NaN : {n_nan_sites} / {n_total_sites}")
if n_nan_sites:
    print("Liste :", list(sites_with_nan))

# 3)  Filter the DataFrame for the rest of the analysis
df_long = df_long[~df_long["site"].isin(sites_with_nan)].copy()

# One row per (site, disease, metric, bundle, size, ratio), one column per method.
pivot_df = (
    df_long
    .pivot_table(
        index=['site', 'disease', 'metric', 'bundle',
               'num_patients', 'disease_ratio', 'num_diseased'],
        columns='robust_method',
        values='mae',
        aggfunc='first'   # or 'mean' if several identical rows can occur
    )
)

# Difference of every method with respect to the NoRobust column.
diff_df = pivot_df.sub(pivot_df['NoRobust'], axis=0)

# Row-wise rank of every method (1 = smallest value).
ranked = rank_methods_per_row(pivot_df)

# SMAPE analysis on the train split, same steps as for MAE.
# Presumably the helper adds num_patients / disease_ratio / num_diseased,
# which the pivot below uses as index fields — TODO confirm.
df_long = smape_compilation_train_all_long
df_long = add_nb_patients_and_diseased(df_long)

# 1)  Identify the sites to exclude
sites_with_nan_smape = (
    df_long
      .groupby("site")
      .filter(lambda g: g.isna().any().any())   # True when the group holds a NaN
      ["site"]
      .unique()
)

# 2)  Statistics
n_nan_sites_smape   = len(sites_with_nan_smape)
n_total_sites_smape = df_long["site"].nunique()

print(f"Sites exclus pour NaN : {n_nan_sites_smape} / {n_total_sites_smape}")
if n_nan_sites_smape:
    print("Liste :", list(sites_with_nan_smape))

# 3)  Filter the DataFrame for the rest of the analysis.
#     Uses the SMAPE exclusion list computed above; the previous code reused
#     the MAE list, so sites with NaN only in the SMAPE table were kept.
df_long = df_long[~df_long["site"].isin(sites_with_nan_smape)].copy()

# One row per (site, disease, metric, bundle, size, ratio), one column per method.
# The value column is named 'mae' even though it holds SMAPE values here.
pivot_df_smape = (
    df_long
    .pivot_table(
        index=['site', 'disease', 'metric', 'bundle',
               'num_patients', 'disease_ratio', 'num_diseased'],
        columns='robust_method',
        values='mae',
        aggfunc='first'   # or 'mean' if several identical rows can occur
    )
)

# Difference of every method with respect to the NoRobust column.
diff_df_smape= pivot_df_smape.sub(pivot_df_smape['NoRobust'], axis=0)

# Row-wise rank of every method (1 = smallest value).
ranked_smape = rank_methods_per_row(pivot_df_smape)

## Pas très utile


In [None]:
# Flatten the SMAPE difference table (index levels become columns) and show its columns.
icd = diff_df_smape.reset_index(inplace=False)
icd.columns

In [None]:
# Mean rank of every method per (disease, sample size, ratio, metric) combination.
group_cols = ['disease', 'num_patients', 'disease_ratio', 'metric']
mean_ranks = (ranked
              .groupby(group_cols)
              .mean()          # mean rank per method
              .round(2)        # rounded to 2 decimals
              .reset_index())

In [None]:
# Mean ranks restricted to the 100-patient sample size.
subset_100 = mean_ranks[mean_ranks['num_patients'] == 100]
subset_100

In [None]:
# Flag every row with exactly one outcome: hc ~ NoRobust, hc < NoRobust or hc > NoRobust.
# The tolerance test is evaluated first and excluded from the two strict
# comparisons; otherwise a near-equal row is counted both as "equal" and as
# "better"/"worse", and the three percentages can add up to more than 100.
hc_equal_mask = np.isclose(pivot_df['hc'], pivot_df['NoRobust'])
df_tmp = pivot_df.assign(
    hc_better = (pivot_df['hc'] < pivot_df['NoRobust']) & ~hc_equal_mask,
    hc_worse  = (pivot_df['hc'] > pivot_df['NoRobust']) & ~hc_equal_mask,
    hc_equal  = hc_equal_mask
)

# Percentage of rows in each outcome per (disease, sample size, ratio) combination
pct_table = (
    df_tmp
      .groupby(["disease", "num_patients", "disease_ratio"])
      .apply(lambda g: pd.Series({
          "hc_better_pct": 100 * g["hc_better"].mean(),
          "hc_worse_pct":  100 * g["hc_worse"].mean(),
          "hc_equal_pct":  100 * g["hc_equal"].mean(),
      }))
      .reset_index()
      .round(2)
)

print(pct_table)


In [None]:
import numpy as np
import pandas as pd

def percent_tables_by_combo(pivot_df):
    """
    Percentage of rows on which each method is strictly below 'NoRobust'.

    The percentages are computed for every (disease, num_patients,
    disease_ratio) combination, once on all rows of the combination
    ('Overall') and once per scenario, where the scenario of a row is decided
    by comparing its 'hc' value with its 'NoRobust' value.

    Parameters
    ----------
    pivot_df : pandas.DataFrame
        One column per method; must contain 'hc' and 'NoRobust'. Its index
        must expose the levels 'disease', 'num_patients' and 'disease_ratio'.

    Returns
    -------
    pandas.DataFrame
        Columns: disease, num_patients, disease_ratio, scenario, method,
        pct_better. pct_better is the percentage (rounded to one decimal) of
        rows of the subset for which method < NoRobust. An empty table with
        these columns is returned when there is nothing to summarise.

    Raises
    ------
    ValueError
        If 'hc' or 'NoRobust' is missing from the columns of pivot_df.
    """
    group_levels = ['disease', 'num_patients', 'disease_ratio']
    out_columns = group_levels + ['scenario', 'method', 'pct_better']

    # Both reference columns are required
    if 'hc' not in pivot_df.columns or 'NoRobust' not in pivot_df.columns:
        raise ValueError("Il faut les colonnes 'hc' et 'NoRobust' dans pivot_df.")

    # Nothing to summarise: return a correctly-shaped empty table instead of
    # failing later when sorting on columns that do not exist.
    if pivot_df.empty:
        return pd.DataFrame(columns=out_columns)

    # Scenario of each row; '<' takes precedence over the isclose() test
    scen = np.select(
        [
            pivot_df['hc'] < pivot_df['NoRobust'],
            np.isclose(pivot_df['hc'], pivot_df['NoRobust'])
        ],
        ['hc_smaller', 'hc_equal'],
        default='hc_greater'
    )
    df_tmp = pivot_df.copy()
    df_tmp['scenario'] = scen

    # Every column except the baseline is compared against it
    methods = [c for c in pivot_df.columns if c != 'NoRobust']

    rows = []
    for combo_vals, grp in df_tmp.groupby(level=group_levels):
        combo_dict = dict(zip(group_levels, combo_vals))

        # 'Overall' uses the whole group, then one subset per scenario
        subsets = [('Overall', grp)]
        subsets += [
            (scen_key, grp[grp['scenario'] == scen_key])
            for scen_key in ('hc_smaller', 'hc_equal', 'hc_greater')
        ]

        for scen_key, sub in subsets:
            if sub.empty:
                continue
            for m in methods:
                pct = 100 * (sub[m] < sub['NoRobust']).mean()
                rows.append({**combo_dict,
                             'scenario': scen_key,
                             'method': m,
                             'pct_better': round(pct, 1)})

    # groupby drops NaN keys, so a non-empty input can still yield no group
    if not rows:
        return pd.DataFrame(columns=out_columns)

    result = (pd.DataFrame(rows)
                .sort_values(group_levels + ['scenario', 'pct_better'],
                             ascending=[True, True, True, True, False])
                .reset_index(drop=True))

    # Human-readable scenario labels
    scen_rename = {'hc_smaller': 'HC < NoRobust',
                   'hc_equal'  : 'HC = NoRobust',
                   'hc_greater': 'HC > NoRobust'}
    result['scenario'] = result['scenario'].replace(scen_rename)

    return result
tables = percent_tables_by_combo(pivot_df)

# One row per (disease, num_patients, disease_ratio, method), one column per scenario
wide = tables.pivot_table(
    index=['disease', 'num_patients', 'disease_ratio', 'method'],
    columns='scenario',
    values='pct_better',
).reset_index()

# Sort for readability: within each combination, highest 'Overall' first
wide = wide.sort_values(
    by=['disease', 'num_patients', 'disease_ratio', 'Overall'],
    ascending=[True, True, True, False],
)

# Keep the table under an explicit name and print it
pourcentage_method_meilleur_NoRobust = wide
print(pourcentage_method_meilleur_NoRobust)

In [None]:
# Show only the rows with num_patients == 100
pourcentage_method_meilleur_NoRobust_100 = pourcentage_method_meilleur_NoRobust.loc[
    pourcentage_method_meilleur_NoRobust["num_patients"] == 100
]
print(pourcentage_method_meilleur_NoRobust_100)

In [None]:
# Build the plotting tasks for every combination (SMAPE table).
# The 'raw' and 'FLIP' columns are dropped when present.
pivot_df_smape = pivot_df_smape.drop(columns=['raw', 'FLIP'], errors='ignore')

# One task per sample size
tasks = [
    (pivot_df_smape, sample_size, ANALYSIS_FOLDER, "train", "SMAPE")
    for sample_size  in sample_sizes
]

Parallel(n_jobs=-1)(
    delayed(plot_mae_mean_all_diseases_metrics)(*task) for task in tasks
)

Parallel(n_jobs=-1)(
    delayed(plot_mae_mean_all_ratios)(*task) for task in tasks
)


# One task per (disease, sample size)
tasks = [
    (pivot_df_smape, sample_size, disease, ANALYSIS_FOLDER, "train", "SMAPE")
    for disease      in diseases
    for sample_size  in sample_sizes
]

Parallel(n_jobs=-1)(
    delayed(plot_rank_all_metrics)(*task) for task in tasks
)

# One task per (disease, sample size, metric)
tasks = [
    (pivot_df_smape, sample_size, disease, metric, ANALYSIS_FOLDER, "train", "SMAPE")
    for disease      in diseases
    for sample_size  in sample_sizes
    for metric       in metrics
]

# Parallel execution
Parallel(n_jobs=-1)(
    delayed(plot_rank)(*task) for task in tasks
)

# Parallel execution. The per-bundle plots are submitted a single time:
# sending the very same task list again would presumably only regenerate the
# same figures.
Parallel(n_jobs=-1)(
    delayed(plot_mae_all_bundles_pivot)(*task) for task in tasks
)


In [None]:
# Build one task per (disease, sample size, metric) for the SMAPE difference
# table; the 'raw' and 'FLIP' columns are dropped when present.
diff_df_smape = diff_df_smape.drop(columns=['raw', 'FLIP'], errors='ignore')

tasks = [
    (diff_df_smape, sample_size, disease, metric, ANALYSIS_FOLDER, "train", "SMAPE_DIFF")
    for disease in diseases
    for sample_size in sample_sizes
    for metric in metrics
]

# Run plot_mae_all_bundles_pivot_3_way over every task in parallel
Parallel(n_jobs=-1)(delayed(plot_mae_all_bundles_pivot_3_way)(*args) for args in tasks)

# Run plot_mae_all_bundles_pivot over every task in parallel
Parallel(n_jobs=-1)(delayed(plot_mae_all_bundles_pivot)(*args) for args in tasks)

In [None]:
# Build the plotting tasks for every combination (MAE table).
# The 'raw' and 'FLIP' columns are dropped when present.
pivot_df = pivot_df.drop(columns=['raw', 'FLIP'], errors='ignore')

# One task per sample size
tasks = [
    (pivot_df, sample_size, ANALYSIS_FOLDER, "train", "MAE")
    for sample_size  in sample_sizes
]

Parallel(n_jobs=-1)(
    delayed(plot_mae_mean_all_diseases_metrics)(*task) for task in tasks
)

Parallel(n_jobs=-1)(
    delayed(plot_mae_mean_all_ratios)(*task) for task in tasks
)


# One task per (disease, sample size)
tasks = [
    (pivot_df, sample_size, disease, ANALYSIS_FOLDER, "train", "MAE")
    for disease      in diseases
    for sample_size  in sample_sizes
]

Parallel(n_jobs=-1)(
    delayed(plot_rank_all_metrics)(*task) for task in tasks
)

# One task per (disease, sample size, metric)
tasks = [
    (pivot_df, sample_size, disease, metric, ANALYSIS_FOLDER, "train", "MAE")
    for disease      in diseases
    for sample_size  in sample_sizes
    for metric       in metrics
]

# Parallel execution
Parallel(n_jobs=-1)(
    delayed(plot_rank)(*task) for task in tasks
)

# Parallel execution. The per-bundle plots are submitted a single time:
# sending the very same task list again would presumably only regenerate the
# same figures.
Parallel(n_jobs=-1)(
    delayed(plot_mae_all_bundles_pivot)(*task) for task in tasks
)


In [None]:
# Build one task per (disease, sample size, metric) for the MAE difference
# table; the 'raw' and 'FLIP' columns are dropped when present.
diff_df = diff_df.drop(columns=['raw', 'FLIP'], errors='ignore')

tasks = [
    (diff_df, sample_size, disease, metric, ANALYSIS_FOLDER, "train", "MAE_DIFF")
    for disease in diseases
    for sample_size in sample_sizes
    for metric in metrics
]

# Run plot_mae_all_bundles_pivot_3_way over every task in parallel
Parallel(n_jobs=-1)(delayed(plot_mae_all_bundles_pivot_3_way)(*args) for args in tasks)

# Run plot_mae_all_bundles_pivot over every task in parallel
Parallel(n_jobs=-1)(delayed(plot_mae_all_bundles_pivot)(*args) for args in tasks)

In [None]:
# Build the plotting tasks for the rank table; the 'raw' column is dropped
# when present.
ranked = ranked.drop(columns='raw', errors='ignore')

# One task per sample size
tasks = [
    (ranked, sample_size, ANALYSIS_FOLDER, "train", "RANK")
    for sample_size in sample_sizes
]

Parallel(n_jobs=-1)(delayed(plot_mae_mean_all_ratios)(*args) for args in tasks)

Parallel(n_jobs=-1)(delayed(plot_mae_mean_all_diseases_metrics)(*args) for args in tasks)

# One task per (disease, sample size)
tasks = [
    (ranked, sample_size, disease, ANALYSIS_FOLDER, "train", "RANK")
    for disease in diseases
    for sample_size in sample_sizes
]

Parallel(n_jobs=-1)(delayed(plot_rank_all_metrics)(*args) for args in tasks)

# One task per (disease, sample size, metric)
tasks = [
    (ranked, sample_size, disease, metric, ANALYSIS_FOLDER, "train", "RANK")
    for disease in diseases
    for sample_size in sample_sizes
    for metric in metrics
]

# Parallel execution
Parallel(n_jobs=-1)(delayed(plot_rank)(*args) for args in tasks)

In [None]:
# ⬛️ CELL 4 — performance per (site, disease), with the Case1/Case2 split
summary_site_disease = []   # per-method statistics
info_site_disease    = []   # per-group info returned by compute_metrics

for (site, disease), grp in df_long.groupby(["site", "disease"]):
    pivot = grp.pivot_table(
        index=["metric", "bundle"],
        columns="robust_method",
        values="mae",
    )

    # Both reference columns are required; otherwise skip this group
    if not {"NoRobust", "hc"} <= set(pivot.columns):
        continue

    methods = [c for c in pivot.columns if c not in ("NoRobust", "hc")]

    # Case split is delegated to compute_metrics
    summary, info = compute_metrics(pivot, methods)

    # Tag the results with the group identifiers
    summary.insert(0, "site", site)
    summary.insert(1, "disease", disease)
    info["site"] = site
    info["disease"] = disease

    summary_site_disease.append(summary)
    info_site_disease.append(info)

# Concatenate the per-group results
perf_site_disease = pd.concat(summary_site_disease, ignore_index=True)
scenario_overview = pd.DataFrame(info_site_disease)

# Quick preview
display(perf_site_disease.head())
display(scenario_overview.head())


In [None]:
# ⬛️ CELL 5 — performance per (disease, n_patients, ratio), with the Case1/Case2 split
summary_disease_N_ratio = []   # per-method statistics
info_disease_N_ratio    = []   # per-combination info returned by compute_metrics

group_cols = ["disease", "num_patients", "disease_ratio"]

for key, grp in df_long.groupby(group_cols):
    pivot = grp.pivot_table(
        index=["metric", "bundle"],
        columns="robust_method",
        values="mae",
    )

    # Both reference columns are required; otherwise skip this combination
    if not {"NoRobust", "hc"} <= set(pivot.columns):
        continue

    methods = [c for c in pivot.columns if c not in ("NoRobust", "hc")]

    # Case split is delegated to compute_metrics
    summary, info = compute_metrics(pivot, methods)

    # Tag the results with the group identifiers, in group_cols order
    for i, col in enumerate(group_cols):
        summary.insert(i, col, key[i])
        info[col] = key[i]

    summary_disease_N_ratio.append(summary)
    info_disease_N_ratio.append(info)

# Concatenate the per-combination results
perf_disease_N_ratio = pd.concat(summary_disease_N_ratio, ignore_index=True)
scenario_disease_N_ratio = pd.DataFrame(info_disease_N_ratio)

# Quick preview
display(perf_disease_N_ratio.head())
display(scenario_disease_N_ratio.head())


In [None]:
# ⬛️ CELL 5‑bis — performance per (disease, n_patients, ratio, metric), with the Case1/Case2/Case3 split
#  ⇢  Variables and outputs use distinct names so this cell can coexist with
#     the (disease, n_patients, ratio) cell.

summary_disease_N_ratio_metric = []   # per-method statistics, per metric
info_disease_N_ratio_metric    = []   # per-combination info returned by compute_metrics

group_cols_metric = ["disease", "num_patients", "disease_ratio", "metric"]

for key, grp in df_long.groupby(group_cols_metric):
    # key = (disease, N, ratio, metric); the metric is fixed inside a group,
    # so the pivot is indexed by bundle only.
    pivot_m = grp.pivot_table(
        index="bundle",
        columns="robust_method",
        values="mae",
    )

    # Both reference columns are required; otherwise skip this combination
    if not {"NoRobust", "hc"} <= set(pivot_m.columns):
        continue

    methods_m = [c for c in pivot_m.columns if c not in ("NoRobust", "hc")]

    # Case split is delegated to compute_metrics
    summary_m, info_m = compute_metrics(pivot_m, methods_m)

    # Tag the results with (disease, N, ratio, metric)
    for i, col in enumerate(group_cols_metric):
        summary_m.insert(i, col, key[i])
        info_m[col] = key[i]

    summary_disease_N_ratio_metric.append(summary_m)
    info_disease_N_ratio_metric.append(info_m)

# Concatenate the per-combination results
perf_disease_N_ratio_metric = pd.concat(summary_disease_N_ratio_metric, ignore_index=True)
scenario_disease_N_ratio_metric = pd.DataFrame(info_disease_N_ratio_metric)

# Quick preview
display(perf_disease_N_ratio_metric.head())
display(scenario_disease_N_ratio_metric.head())


In [None]:
# ⬛️ CELL 6 — performance per n_patients (all diseases/ratios pooled), with the Case1/Case2 split
summary_by_N = []     # per-method statistics
info_by_N    = []     # per-N info returned by compute_metrics

for n, grp in df_long.groupby("num_patients"):
    pivot = grp.pivot_table(
        index=["metric", "bundle"],
        columns="robust_method",
        values="mae",
    )

    # Both reference columns are required; otherwise skip this sample size
    if not {"NoRobust", "hc"} <= set(pivot.columns):
        continue

    methods = [c for c in pivot.columns if c not in ("NoRobust", "hc")]

    # Case split is delegated to compute_metrics
    summary, info = compute_metrics(pivot, methods)

    summary.insert(0, "n_patients", n)
    info["n_patients"] = n

    summary_by_N.append(summary)
    info_by_N.append(info)

# Concatenate the per-N results
perf_by_N = pd.concat(summary_by_N, ignore_index=True)
scenario_by_N = pd.DataFrame(info_by_N)

# Quick preview
display(perf_by_N.head())
display(scenario_by_N.head())


In [None]:
# ⬛️ CELL 6 bis — performance per (disease, n_patients), with the Case1/Case2 split
summary_by_N_disease = []   # per-method statistics
info_by_N_disease    = []   # per-group info returned by compute_metrics

# Iterate over every (disease, n_patients) pair
for (disease, n), grp in df_long.groupby(["disease", "num_patients"]):
    pivot = grp.pivot_table(
        index=["metric", "bundle"],
        columns="robust_method",
        values="mae",
    )

    # Both NoRobust and hc are required; otherwise skip this pair
    if not {"NoRobust", "hc"} <= set(pivot.columns):
        continue

    methods = [c for c in pivot.columns if c not in ("NoRobust", "hc")]

    # compute_metrics is expected to return (summary_df, info_dict)
    summary_df, info_dict = compute_metrics(pivot, methods)

    # Tag the results with disease and n_patients
    summary_df.insert(0, "disease", disease)
    summary_df.insert(1, "n_patients", n)

    info_dict["disease"] = disease
    info_dict["n_patients"] = n

    summary_by_N_disease.append(summary_df)
    info_by_N_disease.append(info_dict)

# Concatenate the per-pair results
perf_by_N_disease = pd.concat(summary_by_N_disease, ignore_index=True)
scenario_by_N_disease = pd.DataFrame(info_by_N_disease)

# Quick preview
display(perf_by_N_disease.head())
display(scenario_by_N_disease.head())

In [None]:
# 1) Output folder for the score-overview CSV files
OUTPUT_DIR = os.path.join(ANALYSIS_FOLDER, "SCORE_OVERVIEW")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 2) Small helper
def save_df(df, filename):
    """Write ``df`` as CSV (without its index) to OUTPUT_DIR/filename and print the path."""
    target = os.path.join(OUTPUT_DIR, filename)
    df.to_csv(target, index=False)
    print(" •", target)

# 3) Exports
print("CSV sauvegardés :")

# per (site, disease)
save_df(perf_site_disease, "perf_site_disease.csv")
save_df(scenario_overview, "scenario_site_disease.csv")

# per (disease, N, ratio) — metrics pooled
save_df(perf_disease_N_ratio, "perf_disease_N_ratio.csv")
save_df(scenario_disease_N_ratio, "scenario_disease_N_ratio.csv")

# per n_patients — every context pooled
save_df(perf_by_N, "perf_by_N.csv")
save_df(scenario_by_N, "scenario_by_N.csv")

# per (disease, N, ratio, metric)
save_df(perf_disease_N_ratio_metric, "perf_disease_N_ratio_metric.csv")
save_df(scenario_disease_N_ratio_metric, "scenario_disease_N_ratio_metric.csv")

# per (disease, N), with the Case1/Case2 split
save_df(perf_by_N_disease, "perf_by_N_disease.csv")
save_df(scenario_by_N_disease, "scenario_by_N_disease.csv")

In [None]:
# Flatten the index of the difference table, then keep the 100-patient rows
diff_df_100 = diff_df.reset_index()
diff_df_100 = diff_df_100.loc[diff_df_100['num_patients'] == 100]
diff_df_100


## BEST/WORST

In [None]:
def get_best_worst_cases(df, top_n=2):
    """
    Extract, per group and per method, the rows with the largest and the
    smallest values.

    Rows are grouped on (disease, num_patients, disease_ratio, metric). Every
    column of ``df`` that is not a context column is treated as a method.

    Parameters
    ----------
    df : pandas.DataFrame
        Flat table containing the context columns (at least 'site', 'disease',
        'metric', 'bundle', 'num_patients', 'disease_ratio') plus one column
        per method.
    top_n : int, default 2
        Number of rows kept per (group, method) in each table.

    Returns
    -------
    (worst_df, best_df) : tuple of pandas.DataFrame
        worst_df holds the top_n highest values, best_df the top_n lowest.
        Both have the columns disease, num_patients, disease_ratio, metric,
        bundle, site, method, error. Empty tables with these columns are
        returned when there is no group or no method column.
    """
    # Purely contextual columns (never treated as a method)
    context_cols = [
        'site', 'disease', 'metric', 'bundle',
        'num_patients', 'disease_ratio', 'num_diseased', 'raw', 'hc', 'NoRobust'
    ]
    out_cols = ['disease', 'num_patients', 'disease_ratio',
                'metric', 'bundle', 'site', 'method', 'error']

    # Every other column is a robust method
    method_cols = [col for col in df.columns if col not in context_cols]

    def _tag(selected, method):
        # Only the few selected rows are copied, not the whole group
        return selected.assign(error=selected[method], method=method)[out_cols]

    worst_rows = []
    best_rows = []

    grouped = df.groupby(['disease', 'num_patients', 'disease_ratio', 'metric'])
    for _, gdf in grouped:
        for method in method_cols:
            worst_rows.append(_tag(gdf.nlargest(top_n, method), method))
            best_rows.append(_tag(gdf.nsmallest(top_n, method), method))

    # pd.concat refuses an empty list: return empty, correctly-shaped tables
    if not worst_rows:
        return pd.DataFrame(columns=out_cols), pd.DataFrame(columns=out_cols)

    worst_df = pd.concat(worst_rows, ignore_index=True)
    best_df = pd.concat(best_rows, ignore_index=True)

    return worst_df, best_df

# Two highest and two lowest SMAPE differences per group and per method;
# the index levels are turned into columns before the call.
worst_df, best_df = get_best_worst_cases(diff_df_smape.reset_index(), top_n=2)

In [None]:
# Restrict both tables to the rows with num_patients == 100
worst_df = worst_df.loc[worst_df['num_patients'] == 100]
worst_df

best_df = best_df.loc[best_df['num_patients'] == 100]
best_df

In [None]:
import os
from PIL import Image
from pathlib import Path
def combine_imgs(outdir, site, harmonization_method, metric, bundle, error, method="Robust", delete_originals=True):
    """
    Assemble four PNG figures found in ``outdir`` into a single 2x2 image.

    Layout: top-left "No_Robust raw", top-right "No_Robust <harmonization>",
    bottom-left "<method> <harmonization>", bottom-right "HC <harmonization>".
    The tile size is taken from the first image.

    Parameters
    ----------
    outdir : str or Path
        Folder holding the input figures; the combined image is written there.
    site, harmonization_method, metric, bundle, method : str
        Used to build the input and output file names (underscores are
        removed from ``bundle``).
    error : float or str
        Value convertible with float(); formatted with two decimals in the
        output file name.
    delete_originals : bool, default True
        When true, remove the four inputs plus the "<method>_raw" and
        "HC_raw" figures (files that do not exist are ignored).

    Returns
    -------
    Path
        Path of the combined image.
    """
    outdir = Path(outdir)
    suf = f"{metric}_{bundle.replace('_', '')}.png"
    files = {
        "tl": f"{site}_No_Robust_raw_{suf}",
        "tr": f"{site}_No_Robust_{harmonization_method}_{suf}",
        "bl": f"{site}_{method}_{harmonization_method}_{suf}",
        "br": f"{site}_HC_{harmonization_method}_{suf}"
    }

    imgs = []
    try:
        # Open one by one so that already-opened images are closed in
        # `finally` even if a later file is missing or unreadable.
        for fname in files.values():
            imgs.append(Image.open(outdir / fname))

        w, h = imgs[0].size
        cmb = Image.new("RGB", (2*w, 2*h), "white")
        for im, corner in zip(imgs, [(0, 0), (w, 0), (0, h), (w, h)]):
            cmb.paste(im, corner)

        out = outdir / f"{method}_{float(error):.2f}_{site}_combined_{suf}"
        cmb.save(out)
    finally:
        for im in imgs:
            im.close()

    if delete_originals:
        leftovers = list(files.values()) + [f"{site}_{method}_raw_{suf}", f"{site}_HC_raw_{suf}"]
        for fname in leftovers:
            (outdir / fname).unlink(missing_ok=True)
    return out


In [210]:
def VIZ(metric, bundle, site, method, error,
        disease, num_patients, disease_ratio):
    """
    Produce the comparison figures for one (site, metric, bundle, method) case
    and merge them into a single image.

    Steps: load the synthetic-site train CSV, write a cleaned copy, call
    visualize_harmonization for the NoRobust, robust and HC harmonized files,
    refit the robust method on a temporary copy of the data, remove the
    intermediate files, then call combine_imgs.

    The parameter names match the columns of the tables produced by
    get_best_worst_cases, so a row can be passed as ``VIZ(**row)``.

    Parameters
    ----------
    metric, bundle, site, method, disease : str
    error : float
        Used in temporary file names and in the combined image name.
    num_patients, disease_ratio
        Used to build the folder and file names.

    Returns
    -------
    None
        Returns early, after printing a message, when the train file is
        missing or empty.
    """
    # NOTE(review): figures are always written under the 'BEST' folder, also
    # when the rows come from the "worst" table — confirm this is intended.
    test_index = site.split("_")[-1]
    process_dir = os.path.join("RESULTS/MAE_TEST", 'PROCESS', disease,
                               f"{num_patients}_{disease_ratio}",
                               f"{test_index}", metric)
    dir_robust = os.path.join(process_dir, method)
    dir_norobust = os.path.join(process_dir, "NoRobust")
    dir_hc = os.path.join(process_dir, "hc")
    directory_site = os.path.join("RESULTS/MAE_TEST/SYNTHETIC_SITES/v2", disease, f"{num_patients}_{disease_ratio}", f"{test_index}")
    train_file_name = os.path.join(directory_site, f"train_{num_patients}_{disease_ratio}_{test_index}_{metric}.csv")
    train_file_name_2 = os.path.join(directory_site, f"train_{num_patients}_{disease_ratio}_{test_index}_{metric}_{str(error)}.csv")

    # Without a usable train file nothing below can work (df_train would be
    # undefined and the intermediate CSV would not exist): stop here.
    if not os.path.isfile(train_file_name) or os.path.getsize(train_file_name) == 0:
        print("Fichier vide ou manquant :", train_file_name)
        return

    # Cleaned copy of the train data: drop 'model' / 'harmonization' when
    # present and force the site name.
    df_train = pd.read_csv(train_file_name)
    df_train = df_train.drop(columns=['model', 'harmonization'], errors='ignore')
    df_train['site'] = site
    df_train.to_csv(train_file_name_2, index=False)

    robust_file = os.path.join(
        dir_robust,
        f"{site}.{metric}.{harmonization_method}.{method}.{rwp_text(False)}.csv",
    )
    norobust_file = os.path.join(
        dir_norobust,
        f"{site}.{metric}.{harmonization_method}.NoRobust.{rwp_text(False)}.csv",
    )
    hc_file = os.path.join(
        dir_hc,
        f"{site}.{metric}.{harmonization_method}.NoRobust.{rwp_text(False)}.csv",
    )
    ref_data_file = get_camcan_file(metric)
    outdir = os.path.join(ANALYSIS_FOLDER, 'BEST', method, str(num_patients))

    temp_file = os.path.join(outdir, f"temp_{str(error)}.csv")
    outlier_file = os.path.join(outdir, f"outliers_{site}viz_{method}_NoRWP.csv")
    output_model_filename = None

    try:
        visualize_harmonization(train_file_name_2, norobust_file, ref_data_file, outdir, bundles=bundle, title=f"{site}_No_Robust")
        visualize_harmonization(train_file_name_2, robust_file, ref_data_file, outdir, bundles=bundle, title=f"{site}_{method}")
        visualize_harmonization(train_file_name_2, hc_file, ref_data_file, outdir, bundles=bundle, title=f"{site}_HC")

        if "Z_SCORE" in method:
            z_score_sids = z_score_detection(os.path.join(
                directory_site, f"train_{num_patients}_{disease_ratio}_{test_index}_all.csv"))
            df_train = flag_sid(df_train, z_score_sids, "Z_SCORE")

        # Temporary copy, under a distinct site name, used for the refit
        df_train['site'] = site + "viz"
        df_train['error'] = error
        df_train['nasty_bundle'] = bundle
        df_train.to_csv(temp_file, index=False)

        output_model_filename = fit(temp_file, ref_data_file, metric, harmonization_method, method, False, outdir, False)
    finally:
        # Remove the intermediate files even when a step above fails
        for leftover in (temp_file, train_file_name_2, output_model_filename, outlier_file):
            if leftover and os.path.isfile(leftover):
                os.remove(leftover)

    combine_imgs(outdir, site, harmonization_method, metric, bundle, str(error), method)

    

    

In [None]:
# One VIZ call per row of worst_df (n_jobs=1)
Parallel(n_jobs=1)(
    delayed(VIZ)(**record) for record in worst_df.to_dict(orient="records")
)

In [211]:
# One VIZ call per row of best_df (n_jobs=1)
Parallel(n_jobs=1)(
    delayed(VIZ)(**record) for record in best_df.to_dict(orient="records")
)

Removing outliers with method FLIP
bundle is  mni_IIT_mask_skeletonFA
FLIPPERRRRRRRR
Removing outliers with method FLIP
bundle is  mni_IIT_mask_skeletonFA
FLIPPERRRRRRRR
Removing outliers with method IQR
bundle is  mni_F_L_R
Removing outliers with method IQR
bundle is  mni_VOF_L
Removing outliers with method MAD
bundle is  mni_VOF_L
Removing outliers with method MAD
bundle is  mni_F_L_R
Removing outliers with method MMS
bundle is  mni_VOF_L
Removing outliers with method MMS
bundle is  mni_ILF_L
Removing outliers with method VS
bundle is  mni_VOF_L
Removing outliers with method VS
bundle is  mni_ML_R
Removing outliers with method Z_SCORE
bundle is  mni_VOF_L
Removing outliers with method Z_SCORE
bundle is  mni_OR_R
Removing outliers with method Z_SCORE_IQR
bundle is  mni_VOF_L
Removing outliers with method Z_SCORE_IQR
bundle is  mni_F_L_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method Z_SCORE_MAD
bundle is  mni_VOF_L
Removing outliers with method Z_SCORE_MAD
bundle is  mni_F_L_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method FLIP
bundle is  mni_IIT_mask_skeletonFA
FLIPPERRRRRRRR
Removing outliers with method FLIP
bundle is  mni_IFOF_L
FLIPPERRRRRRRR
Removing outliers with method IQR
bundle is  mni_F_L_R
Removing outliers with method IQR
bundle is  mni_MdLF_L


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method MAD
bundle is  mni_F_L_R
Removing outliers with method MAD
bundle is  mni_F_L_R
Removing outliers with method MMS
bundle is  mni_ML_L
Removing outliers with method MMS
bundle is  mni_F_L_R
Removing outliers with method VS
bundle is  mni_ML_L
Removing outliers with method VS
bundle is  mni_F_L_R
Removing outliers with method Z_SCORE
bundle is  mni_F_L_R
Removing outliers with method Z_SCORE
bundle is  mni_F_L_R
Removing outliers with method Z_SCORE_IQR
bundle is  mni_F_L_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method Z_SCORE_IQR
bundle is  mni_MdLF_L
Removing outliers with method Z_SCORE_MAD
bundle is  mni_F_L_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method Z_SCORE_MAD
bundle is  mni_F_L_R
Removing outliers with method FLIP
bundle is  mni_IIT_mask_skeletonFA
FLIPPERRRRRRRR
Removing outliers with method FLIP
bundle is  mni_IIT_mask_skeletonFA
FLIPPERRRRRRRR
Removing outliers with method IQR
bundle is  mni_AF_L
Removing outliers with method IQR
bundle is  mni_SLF_L
Removing outliers with method MAD
bundle is  mni_VOF_L
Removing outliers with method MAD
bundle is  mni_MdLF_L
Removing outliers with method MMS
bundle is  mni_IFOF_R
Removing outliers with method MMS
bundle is  mni_OR_R
Removing outliers with method VS
bundle is  mni_OR_R
Removing outliers with method VS
bundle is  mni_IFOF_R
Removing outliers with method Z_SCORE
bundle is  mni_AF_L
Removing outliers with method Z_SCORE
bundle is  mni_IFOF_L
Removing outliers with method Z_SCORE_IQR
bundle is  mni_AF_L
Removing outliers with method Z_SCORE_IQR
bundle is  mni_SLF_L
Removing outliers with method Z_SCORE_MAD
bundle is  mni_AF_L
Removing outliers with m

  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method MAD
bundle is  mni_CST_L


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method MMS
bundle is  mni_MdLF_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method MMS
bundle is  mni_PPT_R
Removing outliers with method VS
bundle is  mni_MdLF_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method VS
bundle is  mni_CCMid


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method Z_SCORE
bundle is  mni_AST_L
Removing outliers with method Z_SCORE
bundle is  mni_AC
Removing outliers with method Z_SCORE_IQR
bundle is  mni_AC
Removing outliers with method Z_SCORE_IQR
bundle is  mni_CC
Removing outliers with method Z_SCORE_MAD
bundle is  mni_AC
Removing outliers with method Z_SCORE_MAD
bundle is  mni_AC
Removing outliers with method FLIP
bundle is  mni_IIT_mask_skeletonFA
FLIPPERRRRRRRR
Removing outliers with method FLIP
bundle is  mni_IIT_mask_skeletonFA
FLIPPERRRRRRRR
Removing outliers with method IQR
bundle is  mni_PPT_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method IQR
bundle is  mni_CST_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method MAD
bundle is  mni_CST_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method MAD
bundle is  mni_CST_L


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method MMS
bundle is  mni_SLF_L


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method MMS
bundle is  mni_SLF_R
Removing outliers with method VS
bundle is  mni_AF_L


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method VS
bundle is  mni_UF_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method Z_SCORE
bundle is  mni_AST_L
Removing outliers with method Z_SCORE
bundle is  mni_PPT_R
Removing outliers with method Z_SCORE_IQR
bundle is  mni_PPT_R
Removing outliers with method Z_SCORE_IQR
bundle is  mni_AST_R
Removing outliers with method Z_SCORE_MAD
bundle is  mni_PPT_R
Removing outliers with method Z_SCORE_MAD
bundle is  mni_AST_R
Removing outliers with method FLIP
bundle is  mni_STT_L
FLIPPERRRRRRRR
Removing outliers with method FLIP
bundle is  mni_STT_L
FLIPPERRRRRRRR
Removing outliers with method IQR
bundle is  mni_C_L
Removing outliers with method IQR
bundle is  mni_AF_R
Removing outliers with method MAD
bundle is  mni_C_L
Removing outliers with method MAD
bundle is  mni_AF_R
Removing outliers with method MMS
bundle is  mni_AF_R
Removing outliers with method MMS
bundle is  mni_C_L
Removing outliers with method VS
bundle is  mni_C_L
Removing outliers with method VS
bundle is  mni_AF_R
Removing outliers with method Z_SCORE
bundle is  mni_VOF_L
Rem

  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method Z_SCORE_MAD
bundle is  mni_VOF_L
Removing outliers with method Z_SCORE_MAD
bundle is  mni_AF_R
Removing outliers with method FLIP
bundle is  mni_FPT_L
FLIPPERRRRRRRR
Removing outliers with method FLIP
bundle is  mni_IIT_mask_skeletonFA
FLIPPERRRRRRRR
Removing outliers with method IQR
bundle is  mni_IFOF_R
Removing outliers with method IQR
bundle is  mni_AF_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method MAD
bundle is  mni_AF_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method MAD
bundle is  mni_IFOF_R
Removing outliers with method MMS
bundle is  mni_VOF_L
Removing outliers with method MMS
bundle is  mni_ILF_L
Removing outliers with method VS
bundle is  mni_VOF_L
Removing outliers with method VS
bundle is  mni_ILF_L
Removing outliers with method Z_SCORE
bundle is  mni_IFOF_R
Removing outliers with method Z_SCORE
bundle is  mni_AF_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method Z_SCORE_IQR
bundle is  mni_IFOF_R
Removing outliers with method Z_SCORE_IQR
bundle is  mni_AF_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method Z_SCORE_MAD
bundle is  mni_IFOF_R
Removing outliers with method Z_SCORE_MAD
bundle is  mni_AF_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method FLIP
bundle is  mni_UF_R
FLIPPERRRRRRRR
Removing outliers with method FLIP
bundle is  mni_UF_L
FLIPPERRRRRRRR
Removing outliers with method IQR
bundle is  mni_MdLF_R
Removing outliers with method IQR
bundle is  mni_IFOF_R
Removing outliers with method MAD
bundle is  mni_MdLF_R
Removing outliers with method MAD
bundle is  mni_IFOF_R
Removing outliers with method MMS
bundle is  mni_STT_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method MMS
bundle is  mni_SCP


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method VS
bundle is  mni_SCP
Removing outliers with method VS
bundle is  mni_STT_R
Removing outliers with method Z_SCORE
bundle is  mni_MdLF_R
Removing outliers with method Z_SCORE
bundle is  mni_F_L_R
Removing outliers with method Z_SCORE_IQR
bundle is  mni_MdLF_R
Removing outliers with method Z_SCORE_IQR
bundle is  mni_IFOF_R
Removing outliers with method Z_SCORE_MAD
bundle is  mni_MdLF_R
Removing outliers with method Z_SCORE_MAD
bundle is  mni_IFOF_R
Removing outliers with method FLIP
bundle is  mni_FPT_L
FLIPPERRRRRRRR
Removing outliers with method FLIP
bundle is  mni_SCP
FLIPPERRRRRRRR
Removing outliers with method IQR
bundle is  mni_IFOF_R
Removing outliers with method IQR
bundle is  mni_CC_ForcepsMinor
Removing outliers with method MAD
bundle is  mni_IFOF_R
Removing outliers with method MAD
bundle is  mni_AF_R


  sns.kdeplot(df_b_after[df_b_after['disease'] != 'HC'][y_column], label='Malades', ax=axes[1], linewidth=2)


Removing outliers with method MMS
bundle is  mni_VOF_L
Removing outliers with method MMS
bundle is  mni_PPT_L
Removing outliers with method VS
bundle is  mni_VOF_L
Removing outliers with method VS
bundle is  mni_PPT_L
Removing outliers with method Z_SCORE
bundle is  mni_IFOF_R
Removing outliers with method Z_SCORE
bundle is  mni_CC_ForcepsMinor
Removing outliers with method Z_SCORE_IQR
bundle is  mni_IFOF_R
Removing outliers with method Z_SCORE_IQR
bundle is  mni_CC_ForcepsMinor
Removing outliers with method Z_SCORE_MAD
bundle is  mni_IFOF_R
Removing outliers with method Z_SCORE_MAD
bundle is  mni_CC_ForcepsMinor
Removing outliers with method FLIP
bundle is  mni_UF_L
FLIPPERRRRRRRR
Removing outliers with method FLIP
bundle is  mni_AC
FLIPPERRRRRRRR
Removing outliers with method IQR
bundle is  mni_AC
Removing outliers with method IQR
bundle is  mni_IFOF_R
Removing outliers with method MAD
bundle is  mni_AC
Removing outliers with method MAD
bundle is  mni_IFOF_R
Removing outliers with me

Traceback (most recent call last):
  File "/home/local/USHERBROOKE/davy3001/Documents/COMBAT/Jodoin/Combat_robust/scripts/combat_quick_fit.py", line 29, in <module>
    from clinical_combat.utils.robust import remove_outliers
  File "/home/local/USHERBROOKE/davy3001/Documents/COMBAT/Jodoin/Combat_robust/clinical_combat/utils/robust.py", line 6, in <module>
    import seaborn as sns
  File "/home/local/USHERBROOKE/davy3001/Documents/COMBAT/Jodoin/Combat_robust/.robust/lib/python3.10/site-packages/seaborn/__init__.py", line 2, in <module>
    from .rcmod import *  # noqa: F401,F403
  File "/home/local/USHERBROOKE/davy3001/Documents/COMBAT/Jodoin/Combat_robust/.robust/lib/python3.10/site-packages/seaborn/rcmod.py", line 5, in <module>
    from . import palettes
  File "/home/local/USHERBROOKE/davy3001/Documents/COMBAT/Jodoin/Combat_robust/.robust/lib/python3.10/site-packages/seaborn/palettes.py", line 9, in <module>
    from .utils import desaturate, get_color_cycle
  File "/home/local/US

KeyboardInterrupt: 