# Notebook 6 - Analyse des résultats MAE
Ce notebook reprend le code de `ANALYSIS _MAE_SIMPLY` pour visualiser les métriques 
issues du Notebook 4 (`4-MAE_robust_pipeline.ipynb`). Les graphiques sont écrits dans 
`RESULTS/MAE_TEST/ANALYSIS_classic`.


In [1]:
import os, math
import pandas as pd
import subprocess
import re
import numpy as np
import json
import csv
from pathlib import Path

from joblib import Parallel, delayed
BW_FOLDER = "BS"


import matplotlib.pyplot as plt
import seaborn as sns



from scripts import combat_info
from scripts import combat_quick_apply
from scripts import combat_quick_QC
from robust_evaluation_tools.robust_utils import get_site, robust_text, rwp_text, get_camcan_file, get_diseases, get_metrics, add_nb_patients_and_diseased
from robust_evaluation_tools.robust_harmonization import fit, apply, visualize_harmonization, QC, compare_with_compilation, create_presentation, compare_distances, compare_with_compilation_var
from robust_evaluation_tools.synthectic_sites_generations import generate_sites
from robust_evaluation_tools.robust_outlier_detection import z_score_detection, flag_sid
from robust_evaluation_tools.robust_MLP import predict_malades_MLP

# Root folder under which every path below is built.
MAINFOLDER = "RESULTS/MAE_TEST"
# Path of the synthetic-sites sub-folder inside MAINFOLDER.
SYNTHETIC_SITES = f"{MAINFOLDER}/SYNTHETIC_SITES"
# Harmonization method name; it is embedded in the analysis folder name below
# and read as a global by the loading helpers of this notebook.
harmonization_method= "classic"
# Analysis folder for the selected method, and its sub-folder for MAE plots.
ANALYSIS_FOLDER = f"{MAINFOLDER}/ANALYSIS_{harmonization_method}"
MAE_PLOT_FOLDER = f"{ANALYSIS_FOLDER}/MAE_PLOTS"

## EXECUTOR

In [2]:
SYNTHETIC_SITES_VERSION = "v1"

# NOTE(review): get_metrics comes from robust_evaluation_tools.robust_utils;
# what it returns is not visible from this notebook.
metrics = get_metrics()

# Parameters controlling the analysis (leave as None for auto-detection)
diseases = ["ALL"]  # set to None to take every available disease
sample_sizes = None  # infer the available sample sizes automatically
disease_ratios = None  # infer the available ratios automatically
num_tests = None  # infer the number of repetitions automatically
n_jobs = -1


In [3]:
def infer_results_grid(mainfolder, harmonization_method, diseases=None):
    """Scan the result tree on disk and report the experiment grid it contains.

    The tree is expected to look like
    ``<mainfolder>/PROCESS_<harmonization_method>/<disease>/<size>_<percent>/<run>``
    where ``<size>`` and ``<percent>`` are integers and ``<run>`` is a
    directory whose name is made of digits.

    Args:
        mainfolder: Root folder of the results.
        harmonization_method: Method name used in the ``PROCESS_*`` folder.
        diseases: Disease folder names to look at. When empty or ``None``,
            every sub-directory of the ``PROCESS_*`` folder is considered.

    Returns:
        A tuple ``(diseases, sample_sizes, disease_ratios, num_tests)``; the
        first three are sorted lists (ratios are ``percent / 100``), the last
        is the largest of "highest run index + 1" and "number of run folders"
        over all ``<size>_<percent>`` folders.

    Raises:
        FileNotFoundError: The ``PROCESS_*`` folder does not exist.
        ValueError: No disease, size, ratio or run folder could be found.
    """
    root = Path(mainfolder) / f"PROCESS_{harmonization_method}"
    if not root.exists():
        raise FileNotFoundError(f"{root} introuvable.")

    # An explicit, non-empty list wins; otherwise take every sub-directory.
    candidates = diseases or sorted(
        entry.name for entry in root.iterdir() if entry.is_dir()
    )

    found_diseases = set()
    found_sizes = set()
    found_ratios = set()
    run_count = 0

    for name in candidates:
        disease_path = root / name
        if not disease_path.is_dir():
            continue
        found_diseases.add(name)

        for cell_dir in disease_path.iterdir():
            if not cell_dir.is_dir():
                continue
            size_txt, separator, percent_txt = cell_dir.name.partition('_')
            if not separator:
                continue
            try:
                size = int(size_txt)
                ratio = int(percent_txt) / 100
            except ValueError:
                # Folder name is not "<int>_<int>": not part of the grid.
                continue
            found_sizes.add(size)
            found_ratios.add(ratio)

            run_indices = [
                int(child.name)
                for child in cell_dir.iterdir()
                if child.is_dir() and child.name.isdigit()
            ]
            if run_indices:
                # Cover both gaps in the numbering and densely numbered runs.
                run_count = max(run_count, max(run_indices) + 1, len(run_indices))

    if not (found_diseases and found_sizes and found_ratios and run_count):
        raise ValueError(
            f"Impossible de déduire les paramètres depuis {root}."
        )
    return sorted(found_diseases), sorted(found_sizes), sorted(found_ratios), run_count


In [4]:
def load_mae_or_maev_compilations(mainfolder, diseases, sample_sizes, disease_ratios, num_tests, mae_or_maev='mae', method=None):
    """Load and concatenate every per-run compilation CSV of the grid.

    For each combination of disease, sample size, disease ratio and run index
    the function looks for
    ``<mainfolder>/PROCESS_<method>/<disease>/<size>_<percent>/<run>/<mae_or_maev>_compilation_{test,train}.csv``
    and reads the files that exist. Missing files are skipped silently.

    Args:
        mainfolder: Root folder of the results.
        diseases: Disease folder names.
        sample_sizes: Sample sizes (first part of the ``<size>_<percent>`` folder).
        disease_ratios: Disease ratios expressed as fractions (0.3 -> ``_30``).
        num_tests: Number of run indices to probe (``0 .. num_tests - 1``).
        mae_or_maev: Prefix of the compilation file names.
        method: Harmonization method used in the ``PROCESS_*`` folder name.
            When ``None`` the notebook-level ``harmonization_method`` is used.

    Returns:
        ``(df_test, df_train)``; each is the concatenation of the files found,
        or an empty ``DataFrame`` when none was found.
    """
    # Only touch the notebook global when the caller did not choose a method.
    method = harmonization_method if method is None else method

    tests, trains = [], []
    for d in diseases:
        for s in sample_sizes:
            for r in disease_ratios:
                # Folders are named in whole percent. Round instead of
                # truncating: r * 100 is often just below the integer
                # (0.29 * 100 == 28.999999999999996), and int() alone would
                # point at the wrong folder and silently load nothing.
                percent = int(round(r * 100))
                for i in range(num_tests):
                    base = os.path.join(mainfolder, f"PROCESS_{method}", d, f"{s}_{percent}", str(i))
                    test_path = os.path.join(base, f"{mae_or_maev}_compilation_test.csv")
                    train_path = os.path.join(base, f"{mae_or_maev}_compilation_train.csv")
                    if os.path.isfile(test_path):
                        tests.append(pd.read_csv(test_path))
                    if os.path.isfile(train_path):
                        trains.append(pd.read_csv(train_path))
    df_test = pd.concat(tests, ignore_index=True) if tests else pd.DataFrame()
    df_train = pd.concat(trains, ignore_index=True) if trains else pd.DataFrame()
    return df_test, df_train

In [5]:
def load_compilation(mae_or_maev: str,
                     split: str,
                     *,
                     mainfolder: str,
                     diseases: list[str],
                     sample_sizes: list[int],
                     disease_ratios: list[float],
                     num_tests: int) -> pd.DataFrame:
    """Return the concatenated compilation table for one metric and one split.

    Thin validating wrapper around ``load_mae_or_maev_compilations``. Note
    that the underlying loader reads both the test and the train files; only
    the requested split is returned.

    Args:
        mae_or_maev: Compilation kind: 'mae', 'maev', 'smape' or 'std_mae'.
        split: 'test' or 'train'.
        mainfolder: Root folder of the results.
        diseases: Disease folder names.
        sample_sizes: Sample sizes of the grid.
        disease_ratios: Disease ratios as fractions (e.g. 0.3 for 30 %).
        num_tests: Number of run indices to probe.

    Returns:
        The concatenated DataFrame of the requested split (empty when no file
        was found).

    Raises:
        ValueError: ``mae_or_maev`` or ``split`` is not an accepted value.
    """
    if mae_or_maev not in {"mae", "maev", "smape", "std_mae"}:
        # The message lists every accepted value, not only the first two.
        raise ValueError("mae_or_maev doit être 'mae', 'maev', 'smape' ou 'std_mae'")
    if split not in {"test", "train"}:
        raise ValueError("split doit être 'test' ou 'train'")

    df_test, df_train = load_mae_or_maev_compilations(
        mainfolder,
        diseases,
        sample_sizes,
        disease_ratios,
        num_tests,
        mae_or_maev=mae_or_maev
    )
    return df_test if split == "test" else df_train

In [6]:
# Discover which diseases / sample sizes / ratios / repetitions exist on disk.
# `diseases` is forwarded as-is: None (or an empty list) means "scan everything".
(
    detected_diseases,
    detected_sample_sizes,
    detected_disease_ratios,
    detected_num_tests,
) = infer_results_grid(MAINFOLDER, harmonization_method, diseases)

if diseases is not None:
    # Keep only the requested diseases that actually have results on disk.
    missing = sorted(set(diseases) - set(detected_diseases))
    if missing:
        print(f"Pas de données pour : {missing}")
    diseases = [d for d in diseases if d in detected_diseases]
    if not diseases:
        raise ValueError("Aucune maladie valide trouvée dans les résultats.")
else:
    diseases = detected_diseases

# Any parameter left to None in the configuration cell takes the detected value.
if sample_sizes is None:
    sample_sizes = detected_sample_sizes
if disease_ratios is None:
    disease_ratios = detected_disease_ratios
if num_tests is None:
    num_tests = detected_num_tests

print(f"Maladies analysées : {diseases}")
print(f"Tailles d'échantillon : {sample_sizes}")
print(f"Ratios de malades : {disease_ratios}")
print(f"Nombre de répétitions : {num_tests}")

# One table per (metric kind, split), each covering the whole resolved grid.
mae_compilation_train_all = load_compilation(
    "mae", "train",
    mainfolder=MAINFOLDER, diseases=diseases, sample_sizes=sample_sizes,
    disease_ratios=disease_ratios, num_tests=num_tests,
)
mae_compilation_test_all = load_compilation(
    "mae", "test",
    mainfolder=MAINFOLDER, diseases=diseases, sample_sizes=sample_sizes,
    disease_ratios=disease_ratios, num_tests=num_tests,
)

smape_compilation_train_all = load_compilation(
    "smape", "train",
    mainfolder=MAINFOLDER, diseases=diseases, sample_sizes=sample_sizes,
    disease_ratios=disease_ratios, num_tests=num_tests,
)
smape_compilation_test_all = load_compilation(
    "smape", "test",
    mainfolder=MAINFOLDER, diseases=diseases, sample_sizes=sample_sizes,
    disease_ratios=disease_ratios, num_tests=num_tests,
)

std_mae_compilation_train_all = load_compilation(
    "std_mae", "train",
    mainfolder=MAINFOLDER, diseases=diseases, sample_sizes=sample_sizes,
    disease_ratios=disease_ratios, num_tests=num_tests,
)
std_mae_compilation_test_all = load_compilation(
    "std_mae", "test",
    mainfolder=MAINFOLDER, diseases=diseases, sample_sizes=sample_sizes,
    disease_ratios=disease_ratios, num_tests=num_tests,
)

Maladies analysées : ['ALL']
Tailles d'échantillon : [100]
Ratios de malades : [0.03, 0.1, 0.3, 0.5, 0.7, 0.8]
Nombre de répétitions : 40


In [7]:
def transformer_df_large_en_long(df_large):
    """Reshape a wide compilation table (one column per bundle) to long format.

    The columns "site", "method", "robust_method", "disease" and "metric" are
    kept as identifiers; every other column is unpivoted into a "bundle" /
    "mae" pair (the value column is called "mae" whatever the metric kind).
    Rows whose "robust_method" is 'No' get their "method" value copied into
    "robust_method".

    Args:
        df_large: Wide DataFrame containing the five identifier columns.

    Returns:
        A new long-format DataFrame; ``df_large`` itself is not modified.
    """
    id_columns = ["site", "method", "robust_method", "disease", "metric"]
    value_columns = list(filter(lambda name: name not in id_columns, df_large.columns))

    long_df = pd.melt(
        df_large,
        id_vars=id_columns,
        value_vars=value_columns,
        var_name="bundle",
        value_name="mae",
    )

    # Without a robust variant, the plain method name serves as the label.
    not_robust = long_df['robust_method'] == 'No'
    long_df.loc[not_robust, 'robust_method'] = long_df.loc[not_robust, 'method']
    return long_df

## ANALYSIS

In [8]:

# Long-format versions of the std_mae tables. Only the train one is used
# below; the test one is not referenced again in the visible part of the notebook.
std_mae_compilation_train_all_long = transformer_df_large_en_long(std_mae_compilation_train_all)
std_mae_compilation_test_all_long  = transformer_df_large_en_long(std_mae_compilation_test_all)
# Labels of the "robust_method" column that are kept for the analysis.
ROBUST_METHODS = [
    'NoRobust',
    'hc',
    'raw',
    'MAD',
    'IQR',
    'SN',
    'QN',
    'MMS',
    'VS',
    'FLIP',
    'G_ZS',
    'G_MAD',
    'MLP7_ALL'
]


df_long = std_mae_compilation_train_all_long
# NOTE(review): presumably adds the num_patients / disease_ratio / num_diseased
# columns used as pivot index below — confirm in robust_utils.
df_long = add_nb_patients_and_diseased(df_long)
df_long = df_long[df_long["robust_method"].isin(ROBUST_METHODS)]


# 1)  Identify the sites to exclude
sites_with_nan_std = (
    df_long
      .groupby("site")
      .filter(lambda g: g.isna().any().any())   # True if the group contains a NaN
      ["site"]
      .unique()
)

# 2)  Statistics
n_nan_sites_std   = len(sites_with_nan_std)
n_total_sites_std = df_long["site"].nunique()

print(f"Sites exclus pour NaN : {n_nan_sites_std} / {n_total_sites_std}")
if n_nan_sites_std:
    print("Liste :", list(sites_with_nan_std))

# 3)  Filter the DataFrame for the following steps
df_long = df_long[~df_long["site"].isin(sites_with_nan_std)].copy()

# One row per (site, disease, metric, bundle, ...), one column per robust method.
pivot_df_std = (
    df_long
    .pivot_table(
        index=['site', 'disease', 'metric', 'bundle',
               'num_patients', 'disease_ratio', 'num_diseased'],
        columns='robust_method',
        values='mae',
        aggfunc='first'   # or 'mean' if several identical rows can occur
    )
)

# Row-wise difference of every method column to the 'NoRobust' column
# (the 'NoRobust' column itself becomes zero wherever it is not NaN).
diff_df_std= pivot_df_std.sub(pivot_df_std['NoRobust'], axis=0)

# ranked_std = rank_methods_per_row(pivot_df_std)

Sites exclus pour NaN : 0 / 240


In [11]:
# Rename pivot_df_std columns in place, for those that are present:
# 'hc' -> 'HC', 'NoRobust' -> 'NO_FILTERING', 'MLP7_ALL' -> 'MLP'.
# If pivot_df_std has not been created yet, only a message is printed.
try:
    mapping = {}
    mapping.update(
        (old_label, new_label)
        for old_label, new_label in (
            ('hc', 'HC'),
            ('NoRobust', 'NO_FILTERING'),
            ('MLP7_ALL', 'MLP'),
        )
        if old_label in pivot_df_std.columns
    )
    if mapping:
        pivot_df_std.rename(columns=mapping, inplace=True)
except NameError:
    print("pivot_df_std n'est pas défini dans cet environnement.")


## EXEC PLOTS MAE


In [20]:
# NOTE(review): this bare expression (and `mean_diff` two lines below) is not
# the last statement of the cell, so it has no effect when the cell is run.
diff_df_std
# Average the differences over every index level except metric and bundle.
mean_diff = diff_df_std.groupby(level=['metric', 'bundle']).mean()
mean_diff
# Look for the columns whose name contains 'MLP7_ALL' (case-sensitive substring
# test), then keep the 50 smallest values of the first one.
# NOTE(review): the result is named top5_min but nsmallest is called with 50.
mlp_cols = [c for c in mean_diff.columns if 'MLP7_ALL' in str(c)]
if not mlp_cols:
    raise KeyError(f"Aucune colonne 'MLP7_ALL' trouvée. Colonnes disponibles: {list(mean_diff.columns)}")
mlp_col = mlp_cols[0]
top5_min = mean_diff[mlp_col].nsmallest(50).reset_index().rename(columns={mlp_col: 'mlp_diff'})
top5_min

Unnamed: 0,metric,bundle,mlp_diff
0,afd,mni_IFOF_L,-0.08765
1,afd,mni_AF_R,-0.08741
2,fat,mni_AST_L,-0.086328
3,fat,mni_FPT_R,-0.086125
4,afd,mni_CC_ForcepsMinor,-0.083895
5,fat,mni_VOF_R,-0.08319
6,afd,mni_IFOF_R,-0.082037
7,fat,mni_AST_R,-0.081313
8,fat,mni_AF_R,-0.080171
9,fat,mni_FPT_L,-0.080026


In [22]:
# Filter diff_df_std on metric='fa', bundle='mni_AST_R', disease_ratio==50,
# then print the site(s) where the MLP column reaches its minimum.
# The MLP column is the first one whose name contains 'MLP7_ALL' or starts
# with 'MLP' (the prefix test is case-insensitive).
mlp_cols = [c for c in diff_df_std.columns if 'MLP7_ALL' in str(c) or str(c).upper().startswith('MLP')]
if not mlp_cols:
    raise KeyError(f"Aucune colonne MLP trouvée. Colonnes disponibles: {list(diff_df_std.columns)}")
mlp_col = mlp_cols[0]

# Boolean row mask built from three levels of the MultiIndex.
idx = diff_df_std.index
mask = (
    (idx.get_level_values('metric') == 'fa') &
    (idx.get_level_values('bundle') == 'mni_AST_R') &
    (idx.get_level_values('disease_ratio') == 50)
)

subset = diff_df_std.loc[mask, mlp_col]
if subset.empty:
    print("Aucune ligne trouvée pour metric='fa', bundle='mni_AST_R', disease_ratio=50")
else:
    min_mlp = subset.min()
    # show the site(s) corresponding to the minimum
    # NOTE(review): min_idx is computed but not used in the visible code.
    min_idx = subset.idxmin()
    sites = subset[subset == min_mlp].index.get_level_values('site').unique()
    print(list(sites))

['ALL_100_patients_50_percent_18']
