# Notebook 10 - Visualisation des harmonisations robustes

Ce notebook automatise la création des figures issues de `scripts/combat_visualize_harmonization.py` pour un bundle, une métrique et une méthode d'harmonisation donnés. Il parcourt les dossiers `<RESULTS_ROOT>/PROCESS_<methode>` (par exemple `RESULTS/MAE_THRESHOLDS/PROCESS_<methode>` ; `RESULTS_ROOT` est défini dans la cellule de paramètres) pour chaque site synthétique, relance la visualisation pour toutes les méthodes robustes choisies, puis stocke les images dans un dossier par site.

1. Éditer les paramètres dans la cellule suivante (métrique, bundle, filtrage des sites et liste des méthodes robustes à comparer ; inclure `raw`, `hc` et `NoRobust` pour les références).
2. Lancer la préparation qui inventorie les sites disponibles et construit le plan d'exécution.
3. Activer `RUN_VISUALIZATION` pour déclencher la génération ; le nom de chaque image produite contient le nom de la méthode robuste.


In [6]:
import csv
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Optional, Sequence

import pandas as pd

from robust_evaluation_tools.robust_utils import get_camcan_file
from robust_evaluation_tools.robust_harmonization import visualize_harmonization

# Result folders and harmonization method to analyze.
# discover_sites() looks under RESULTS_ROOT / "PROCESS_<HARMONIZATION_METHOD>".
RESULTS_ROOT = Path("RESULTS/MAE_TEST")
HARMONIZATION_METHOD = "classic"

# Optional filtering of the synthetic sites (None disables the filter)
DISEASES: Optional[Sequence[str]] = ["ALL"]
SAMPLE_SIZES: Optional[Sequence[int]] = None  # Example: [100]
DISEASE_RATIO: Optional[float] = 50  # Example: 0.1 or 10 for 10%
TEST_INDICES: Optional[Sequence[int]] = None  # Example: [0, 1, 2]

# Target metric and bundle
METRIC = "fa"
BUNDLE = "mni_IIT_mask_skeletonFA"

# Robust methods to visualize (include raw/hc/NoRobust as references)
ROBUST_METHODS: Sequence[str] = [
    "raw",
    "hc",
    "NoRobust",
    "IQR",
    "MAD",
]

TARGET_SPLIT = "train"  # train or test
OUTPUT_ROOT = Path("RESULTS/HARMONIZATION_VISUALS")
# When True, a job is skipped if a matching PNG already exists in its output folder.
SKIP_EXISTING = True
# When False, the plan is built but the generation loop only prints a reminder.
RUN_VISUALIZATION = True


In [7]:
@dataclass
class SiteRun:
    """Description of one synthetic site found on disk for a given metric."""

    # Name of the disease directory the site was found under.
    disease: str
    # Sample size parsed from the "<size>_<ratio>" directory name.
    sample_size: int
    # Disease ratio (integer percentage) parsed from the same directory name.
    ratio_pct: int
    # Index of the repetition (numeric directory name).
    test_index: int
    # Metric the files belong to (e.g. "fa").
    metric: str
    # Site identifier, read from the "site" column of the train CSV.
    site_name: str
    # Directory holding the raw CSVs and one sub-folder per robust method.
    metric_dir: Path
    # Raw CSV per split ("train"/"test"); None when the split has no file.
    raw_files: Dict[str, Optional[Path]]

    def raw_path(self, split: str) -> Path:
        """Return the raw (non-harmonized) CSV registered for ``split``.

        Raises:
            FileNotFoundError: if no file is registered for ``split`` or the
                registered file no longer exists on disk.
        """
        candidate = self.raw_files.get(split)
        if candidate is not None and candidate.exists():
            return candidate
        raise FileNotFoundError(
            f"Impossible de trouver le fichier {split} pour {self.site_name} ({self.metric})."
        )


def _normalize_filter(values: Optional[Sequence], kind: str) -> Optional[set]:
    if values is None:
        return None
    normalized = set()
    for value in values:
        if value is None:
            continue
        if kind == "size":
            normalized.add(int(value))
        elif kind == "ratio":
            if isinstance(value, str):
                normalized.add(int(value))
            elif isinstance(value, float):
                normalized.add(int(round(value * 100)))
            else:
                normalized.add(int(value))
        else:
            normalized.add(int(value))
    return normalized


def _extract_site_name(raw_file: Path) -> str:
    with raw_file.open(newline="") as handle:
        reader = csv.DictReader(handle)
        first_row = next(reader, None)
    if not first_row or "site" not in first_row:
        raise ValueError(f"Impossible d'extraire le site depuis {raw_file}.")
    return str(first_row["site"])


def discover_sites(
    results_root: Path,
    harmonization_method: str,
    metric: str,
    diseases: Optional[Sequence[str]] = None,
    sample_sizes: Optional[Sequence[int]] = None,
    disease_ratios: Optional[Sequence[float]] = None,
    test_indices: Optional[Sequence[int]] = None,
) -> List[SiteRun]:
    """Inventory the synthetic sites available for one harmonization method.

    The expected layout is::

        <results_root>/PROCESS_<method>/<disease>/<size>_<ratio>/<index>/<metric>/
            train_<size>_<ratio>_<index>_<metric>.csv
            test_<size>_<ratio>_<index>_<metric>.csv   (optional)

    Directories that do not follow this layout are skipped rather than
    aborting the whole scan.

    Args:
        results_root: Root folder of the pipeline results.
        harmonization_method: Suffix of the ``PROCESS_*`` folder to scan.
        metric: Metric sub-folder to look for under each repetition.
        diseases: Disease folder names to keep; None or empty keeps all.
        sample_sizes: Sample sizes to keep; None or empty keeps all.
        disease_ratios: Ratios to keep (normalized by ``_normalize_filter``);
            None or empty keeps all.
        test_indices: Repetition indices to keep; None or empty keeps all.

    Returns:
        One ``SiteRun`` per repetition that has a train CSV, ordered by the
        sorted directory paths.

    Raises:
        FileNotFoundError: if the ``PROCESS_<method>`` folder does not exist.
        ValueError: if a train CSV has no usable ``site`` column (raised by
            ``_extract_site_name``).
    """
    base_dir = results_root / f"PROCESS_{harmonization_method}"
    if not base_dir.is_dir():
        raise FileNotFoundError(f"{base_dir} est introuvable: lancer d'abord le pipeline MAE.")

    disease_filter = set(diseases) if diseases else None
    size_filter = _normalize_filter(sample_sizes, "size")
    ratio_filter = _normalize_filter(disease_ratios, "ratio")
    test_filter = _normalize_filter(test_indices, "test")

    def _subdirs(parent: Path) -> List[Path]:
        # Sorted so the discovery order is reproducible across runs.
        return sorted(p for p in parent.iterdir() if p.is_dir())

    runs: List[SiteRun] = []
    for disease_dir in _subdirs(base_dir):
        disease = disease_dir.name
        if disease_filter and disease not in disease_filter:
            continue
        for size_dir in _subdirs(disease_dir):
            size_txt, _, ratio_txt = size_dir.name.partition("_")
            # Only "<digits>_<digits>" folders are sites. Checking before
            # int() avoids a crash on stray folders and avoids accepting
            # names such as "100_5_0" (int("5_0") is 50).
            if not (size_txt.isdecimal() and ratio_txt.isdecimal()):
                continue
            size_val, ratio_val = int(size_txt), int(ratio_txt)
            if size_filter and size_val not in size_filter:
                continue
            if ratio_filter and ratio_val not in ratio_filter:
                continue
            for test_dir in _subdirs(size_dir):
                # isdecimal() guarantees that int() below cannot fail.
                if not test_dir.name.isdecimal():
                    continue
                test_idx = int(test_dir.name)
                if test_filter and test_idx not in test_filter:
                    continue
                metric_dir = test_dir / metric
                if not metric_dir.is_dir():
                    continue
                file_suffix = f"{size_val}_{ratio_val}_{test_idx}_{metric}.csv"
                train_path = metric_dir / f"train_{file_suffix}"
                if not train_path.is_file():
                    # The train CSV is mandatory: it provides the site name.
                    continue
                test_path = metric_dir / f"test_{file_suffix}"
                runs.append(
                    SiteRun(
                        disease=disease,
                        sample_size=size_val,
                        ratio_pct=ratio_val,
                        test_index=test_idx,
                        metric=metric,
                        site_name=_extract_site_name(train_path),
                        metric_dir=metric_dir,
                        raw_files={
                            "train": train_path,
                            "test": test_path if test_path.is_file() else None,
                        },
                    )
                )
    return runs


def select_harmonized_file(site_run: SiteRun, method_label: str, split: str) -> Path:
    """Pick the CSV to visualize for one site, method and split.

    Args:
        site_run: Site whose files are searched.
        method_label: ``"raw"`` returns the raw CSV of the site; any other
            label is the name of a sub-folder of ``site_run.metric_dir``.
        split: ``"test"`` looks for ``<site>_test.<metric>.*.csv``; any other
            value looks for ``<site>.<metric>.*.csv``.

    Returns:
        The matching file. ``*.model.csv`` files are ignored; names containing
        ``NoRWP`` are preferred, remaining ties are broken alphabetically.

    Raises:
        FileNotFoundError: if the raw file, the method folder or a matching
            harmonized file is missing.
    """
    if method_label == "raw":
        return site_run.raw_path(split)

    method_dir = site_run.metric_dir / method_label
    if not method_dir.is_dir():
        raise FileNotFoundError(f"{method_dir} est introuvable.")

    split_tag = "_test" if split == "test" else ""
    matches = []
    for entry in method_dir.glob(f"{site_run.site_name}{split_tag}.{site_run.metric}.*.csv"):
        if entry.name.endswith(".model.csv") or not entry.is_file():
            continue
        matches.append(entry)
    if not matches:
        raise FileNotFoundError(f"Aucun fichier harmonise pour {site_run.site_name} ({method_label}).")

    # False sorts before True, so names containing "NoRWP" win.
    return min(matches, key=lambda entry: ("NoRWP" not in entry.name, entry.name))


def build_jobs(
    site_runs: List[SiteRun],
    methods: Sequence[str],
    split: str,
    bundle: str,
    output_root: Path,
    harmonization_method: str,
) -> List[Dict[str, object]]:
    """Build the list of visualization jobs, one per (site, method) pair.

    For every pair the harmonized CSV is read, the ``model`` and
    ``harmonization`` columns are removed when present, and the cleaned copy
    is written to ``<site_dir>/data/``; the job points at that cleaned copy.
    Pairs whose input files are missing or unreadable are reported with a
    ``[skip]`` line and left out of the plan.

    Args:
        site_runs: Sites to plan for.
        methods: Method labels to visualize for each site.
        split: Split to visualize (``"train"`` or ``"test"``).
        bundle: Bundle name; only used to name folders and files here.
        output_root: Root folder receiving one sub-folder per site.
        harmonization_method: Harmonization method, used in the folder name.

    Returns:
        A list of dicts with the keys ``site``, ``disease``, ``sample_size``,
        ``ratio_pct``, ``test_index``, ``method``, ``split``, ``raw_file``,
        ``harm_file``, ``out_dir`` and ``stub``.
    """
    jobs: List[Dict[str, object]] = []
    for site in site_runs:
        # Use the metric carried by the site itself instead of a notebook
        # global, so the folder name always matches the data being plotted.
        base_output = output_root / f"{site.metric}_{bundle}_{harmonization_method}"
        site_dir = base_output / site.site_name
        site_dir.mkdir(parents=True, exist_ok=True)
        data_dir = site_dir / "data"
        for method_label in methods:
            stub = f"{site.site_name}_{method_label}_{bundle}_{split}"
            try:
                # Check the raw file first so nothing is written for a job
                # that would be skipped anyway.
                raw_file = site.raw_path(split)
                source_file = select_harmonized_file(site, method_label, split)
                try:
                    harmonized = pd.read_csv(source_file)
                except Exception as exc:
                    # Any read failure becomes a skip; keep the original cause.
                    raise ValueError(f"Impossible de lire {source_file}: {exc}") from exc

                cleaned = harmonized.drop(columns=["model", "harmonization"], errors="ignore")
                data_dir.mkdir(parents=True, exist_ok=True)
                harm_file = data_dir / f"{stub}.harm.cleaned.csv"
                cleaned.to_csv(harm_file, index=False)
            except (FileNotFoundError, ValueError) as exc:
                print(f"[skip] {site.site_name} - {method_label}: {exc}")
                continue
            jobs.append(
                {
                    "site": site.site_name,
                    "disease": site.disease,
                    "sample_size": site.sample_size,
                    "ratio_pct": site.ratio_pct,
                    "test_index": site.test_index,
                    "method": method_label,
                    "split": split,
                    "raw_file": raw_file,
                    "harm_file": harm_file,
                    "out_dir": site_dir,
                    "stub": stub,
                }
            )
    return jobs


In [8]:
# Planning phase: inventory the sites, build the job list and preview it.
# A single ratio is configured, but discover_sites() expects a sequence.
ratio_filter = [DISEASE_RATIO] if DISEASE_RATIO is not None else None

site_runs = discover_sites(
    RESULTS_ROOT,
    HARMONIZATION_METHOD,
    METRIC,
    diseases=DISEASES,
    sample_sizes=SAMPLE_SIZES,
    disease_ratios=ratio_filter,
    test_indices=TEST_INDICES,
)
print(f"{len(site_runs)} site(s) trouves pour {METRIC}.")

jobs = build_jobs(
    site_runs,
    ROBUST_METHODS,
    TARGET_SPLIT,
    BUNDLE,
    OUTPUT_ROOT,
    HARMONIZATION_METHOD,
)
print(f"{len(jobs)} visualisation(s) planifiees.")

# Summary table: the job fields copied as-is, plus the file name only
# (not the full path) of the harmonized CSV.
_summary_keys = ("site", "method", "split", "sample_size", "ratio_pct", "test_index")
jobs_df = pd.DataFrame(
    [
        {
            **{key: job[key] for key in _summary_keys},
            "harmonized_file": job["harm_file"].name,
        }
        for job in jobs
    ]
)
jobs_df.head(20)


40 site(s) trouves pour fa.
200 visualisation(s) planifiees.


Unnamed: 0,site,method,split,sample_size,ratio_pct,test_index,harmonized_file
0,ALL_100_patients_50_percent_0,raw,train,100,50,0,ALL_100_patients_50_percent_0_raw_mni_IIT_mask...
1,ALL_100_patients_50_percent_0,hc,train,100,50,0,ALL_100_patients_50_percent_0_hc_mni_IIT_mask_...
2,ALL_100_patients_50_percent_0,NoRobust,train,100,50,0,ALL_100_patients_50_percent_0_NoRobust_mni_IIT...
3,ALL_100_patients_50_percent_0,IQR,train,100,50,0,ALL_100_patients_50_percent_0_IQR_mni_IIT_mask...
4,ALL_100_patients_50_percent_0,MAD,train,100,50,0,ALL_100_patients_50_percent_0_MAD_mni_IIT_mask...
5,ALL_100_patients_50_percent_1,raw,train,100,50,1,ALL_100_patients_50_percent_1_raw_mni_IIT_mask...
6,ALL_100_patients_50_percent_1,hc,train,100,50,1,ALL_100_patients_50_percent_1_hc_mni_IIT_mask_...
7,ALL_100_patients_50_percent_1,NoRobust,train,100,50,1,ALL_100_patients_50_percent_1_NoRobust_mni_IIT...
8,ALL_100_patients_50_percent_1,IQR,train,100,50,1,ALL_100_patients_50_percent_1_IQR_mni_IIT_mask...
9,ALL_100_patients_50_percent_1,MAD,train,100,50,1,ALL_100_patients_50_percent_1_MAD_mni_IIT_mask...


In [9]:
# Execution phase: run every planned job unless generation is disabled or
# there is nothing to do.
if not RUN_VISUALIZATION:
    print("RUN_VISUALIZATION est desactive. Basculez le a True pour lancer la generation.")
elif not jobs:
    print("Aucune visualisation a executer.")
else:
    # Reference file shared by all jobs, fetched once.
    ref_file = get_camcan_file(METRIC, cleaned=True)
    for job in jobs:
        site_label, method_label = job["site"], job["method"]
        # A job counts as done when a PNG containing its stub already exists
        # in the output folder (presumably written by visualize_harmonization
        # - TODO confirm the naming against that helper).
        already_done = (
            list(job["out_dir"].glob(f"*{job['stub']}*.png")) if SKIP_EXISTING else []
        )
        if already_done:
            print(f"[skip] {site_label} - {method_label} ({already_done[0].name})")
            continue
        print(f"[run] {site_label} - {method_label}")
        try:
            visualize_harmonization(
                job["raw_file"].as_posix(),
                job["harm_file"].as_posix(),
                ref_file,
                job["out_dir"].as_posix(),
                bundles=BUNDLE,
                title=job["stub"],
            )
        except Exception as exc:
            # Best effort: one failing figure must not stop the other jobs.
            print(f"[error] {site_label} - {method_label}: {exc}")


[run] ALL_100_patients_50_percent_0 - raw
[run] ALL_100_patients_50_percent_0 - hc
[run] ALL_100_patients_50_percent_0 - NoRobust
[run] ALL_100_patients_50_percent_0 - IQR
[run] ALL_100_patients_50_percent_0 - MAD
[run] ALL_100_patients_50_percent_1 - raw
[run] ALL_100_patients_50_percent_1 - hc
[run] ALL_100_patients_50_percent_1 - NoRobust
[run] ALL_100_patients_50_percent_1 - IQR
[run] ALL_100_patients_50_percent_1 - MAD
[run] ALL_100_patients_50_percent_10 - raw
[run] ALL_100_patients_50_percent_10 - hc
[run] ALL_100_patients_50_percent_10 - NoRobust
[run] ALL_100_patients_50_percent_10 - IQR
[run] ALL_100_patients_50_percent_10 - MAD
[run] ALL_100_patients_50_percent_11 - raw
[run] ALL_100_patients_50_percent_11 - hc
[run] ALL_100_patients_50_percent_11 - NoRobust
[run] ALL_100_patients_50_percent_11 - IQR
[run] ALL_100_patients_50_percent_11 - MAD
[run] ALL_100_patients_50_percent_12 - raw
[run] ALL_100_patients_50_percent_12 - hc
[run] ALL_100_patients_50_percent_12 - NoRobust
[