# Compilation split (all / mlp / harmonized) par patient

On coupe les compilations en deux moitiés en garantissant qu'un même `sid` 
(patient) reste toujours dans la même moitié pour **toutes** les métriques.


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from IPython.display import display
from robust_evaluation_tools.synthectic_sites_generations import augment_df

# Main parameters
METHOD_TAG = "classic"
STRAT_COL = "disease"
RANDOM_STATE = 13
# Total number of copies (original included) used as the augmentation default.
AUGMENT_COPIES = 5

# Directory layout: <processed>/compilation/<method>/{all,mlp,harmonized}
PROCESSED_ROOT = Path("DONNES", "processed")
COMPILATION_DIR = PROCESSED_ROOT.joinpath("compilation", METHOD_TAG)
ALL_DIR = COMPILATION_DIR.joinpath("all")
MLP_DIR = COMPILATION_DIR.joinpath("mlp")
HARMONIZED_DIR = COMPILATION_DIR.joinpath("harmonized")
MASTER_FILE = ALL_DIR.joinpath("compilation.all_metrics.csv.gz")
MASTER_FILE_CAMCAN = ALL_DIR.joinpath("compilation.all_metrics.with_camcan.csv.gz")

# Make sure the input folder and both output folders exist before any I/O.
for path in (ALL_DIR, MLP_DIR, HARMONIZED_DIR):
    path.mkdir(parents=True, exist_ok=True)

print(f"Input (all): {ALL_DIR}")
print(f"MLP split -> {MLP_DIR}")
print(f"Harmonized split -> {HARMONIZED_DIR}")


In [None]:
def disease_ratio(df, strat_col=STRAT_COL):
    """Return the relative frequency of each value found in ``strat_col``.

    Args:
        df: DataFrame containing a ``strat_col`` column.
        strat_col: Name of the column to summarise.

    Returns:
        dict mapping each value of the column to its share of the counted
        rows, rounded to 4 decimals, with keys in sorted order. Missing
        values are not counted (``value_counts`` default).
    """
    shares = df[strat_col].value_counts(normalize=True)
    ordered = shares.sort_index()
    return ordered.round(4).to_dict()


def build_sid_split(master_path=MASTER_FILE, strat_col=STRAT_COL):
    """Split the sids of a master file into two halves, stratified by ``strat_col``.

    Every ``strat_col`` group is shuffled with the same ``RANDOM_STATE`` seed
    and cut in the middle. When a group has an odd size, the extra sid goes
    alternately to the first and to the second half, starting with the first.

    Args:
        master_path: Path of the CSV holding at least ``sid`` and ``strat_col``.
        strat_col: Column used to stratify the split.

    Returns:
        Tuple ``(base, mlp_sids, harmonized_sids)`` where ``base`` is the
        de-duplicated ``sid`` / ``strat_col`` table and the other two are
        disjoint sets of sids.

    Raises:
        FileNotFoundError: ``master_path`` does not exist.
        ValueError: a sid ended up in both halves (it appears under more than
            one ``strat_col`` value).
    """
    if not master_path.exists():
        raise FileNotFoundError(f"Master file introuvable: {master_path}")

    pairs = pd.read_csv(master_path, usecols=["sid", strat_col])
    base = pairs.drop_duplicates()

    first_half, second_half = [], []
    extra_goes_first = True
    # NOTE(review): groupby skips rows whose strat_col is missing (pandas
    # default), so those sids land in neither set - confirm this is intended.
    for _, members in base.groupby(strat_col):
        order = members.sample(frac=1, random_state=RANDOM_STATE)["sid"]
        cut, leftover = divmod(len(order), 2)
        if leftover:
            # Odd group: hand the spare sid to each half in turn.
            cut += int(extra_goes_first)
            extra_goes_first = not extra_goes_first
        first_half.extend(order.iloc[:cut])
        second_half.extend(order.iloc[cut:])

    mlp_sids = set(first_half)
    harmonized_sids = set(second_half)
    overlap = mlp_sids.intersection(harmonized_sids)
    if overlap:
        raise ValueError(
            f"Les sids apparaissent dans les deux splits: {sorted(list(overlap))[:5]}"
        )
    return base, mlp_sids, harmonized_sids


def split_and_save(input_path, mlp_sids, harmonized_sids, master_ratio):
    """Cut one compilation file along the master sid split and write both parts.

    An ``old_site`` column is added when absent, copied from ``site`` or,
    failing that, ``source_site``. The rows whose sid is in ``mlp_sids`` are
    written to ``MLP_DIR`` and those in ``harmonized_sids`` to
    ``HARMONIZED_DIR``, both under the input file name and gzip-compressed.

    Args:
        input_path: Path of the CSV file to split.
        mlp_sids: Set of sids assigned to the MLP half.
        harmonized_sids: Set of sids assigned to the harmonized half.
        master_ratio: Ratio dict of the master file, copied into the summary.

    Returns:
        A summary dict (row counts and ``STRAT_COL`` ratios of the input and
        of each half), or ``None`` when the file lacks the ``sid`` or
        ``STRAT_COL`` column and is skipped.
    """
    df = pd.read_csv(input_path)

    if "old_site" not in df.columns:
        # First available fallback column, in priority order.
        site_source = next(
            (col for col in ("site", "source_site") if col in df.columns), None
        )
        if site_source is None:
            print(
                f"[warn] {input_path.name} : colonne 'site' absente, impossible de renseigner old_site"
            )
        else:
            df["old_site"] = df[site_source]

    # Guard clauses: without these columns the file cannot be split.
    if "sid" not in df.columns:
        print(f"[skip] {input_path.name} : colonne 'sid' absente")
        return None
    if STRAT_COL not in df.columns:
        print(f"[skip] {input_path.name} : colonne '{STRAT_COL}' absente")
        return None

    # Sids unknown to the master split are reported and end up in neither output.
    extra_sids = set(df["sid"].unique()).difference(mlp_sids, harmonized_sids)
    if extra_sids:
        print(f"[warn] {input_path.name}: {len(extra_sids)} sid hors master split")

    in_mlp = df["sid"].isin(mlp_sids)
    in_harmonized = df["sid"].isin(harmonized_sids)
    mlp_df = df.loc[in_mlp]
    harmonized_df = df.loc[in_harmonized]

    out_name = input_path.name
    for folder, part in ((MLP_DIR, mlp_df), (HARMONIZED_DIR, harmonized_df)):
        part.to_csv(folder / out_name, index=False, compression="gzip")

    return {
        "file": out_name,
        "rows_in": len(df),
        "rows_mlp": len(mlp_df),
        "rows_harmonized": len(harmonized_df),
        "ratio_master": master_ratio,
        "ratio_in": disease_ratio(df),
        "ratio_mlp": disease_ratio(mlp_df),
        "ratio_harmonized": disease_ratio(harmonized_df),
    }


In [None]:
# Build the master sid splits (with / without CamCAN).
# Each config pairs one master file with a predicate selecting the compilation
# files that are cut using that master's split.
split_configs = [
    {
        "name": "standard",
        "master_path": MASTER_FILE,
        "file_filter": lambda path: "with_camcan" not in path.name,
    },
    {
        "name": "with_camcan",
        "master_path": MASTER_FILE_CAMCAN,
        "file_filter": lambda path: "with_camcan" in path.name,
    },
]

# One summary dict per file actually split, across all configs.
summaries = []
for config in split_configs:
    master_path = config["master_path"]
    # A missing master only disables its own config; the other one still runs.
    if not master_path.exists():
        print(f"[skip] Master file introuvable pour {config['name']}: {master_path}")
        continue

    # The sid sets are recomputed (and the globals overwritten) for each config.
    master_df, MLP_SIDS, HARMONIZED_SIDS = build_sid_split(master_path=master_path)
    master_ratio = disease_ratio(master_df)
    print(
        f"[{config['name']}] Master sids -> MLP: {len(MLP_SIDS)} | Harmonized: {len(HARMONIZED_SIDS)}"
    )
    print(f"[{config['name']}] Ratios master (disease): {master_ratio}")

    # Files of the input folder that belong to this config, in name order.
    compilation_files = [
        path
        for path in sorted(ALL_DIR.glob("compilation.*.csv.gz"))
        if config["file_filter"](path)
    ]

    print(f"[{config['name']}] {len(compilation_files)} fichiers à couper")
    for file_path in compilation_files:
        summary = split_and_save(file_path, MLP_SIDS, HARMONIZED_SIDS, master_ratio)
        # split_and_save returns None for skipped files; keep only real summaries.
        if summary:
            summary["split"] = config["name"]
            summaries.append(summary)

# Tabular recap of every split file.
summary_df = pd.DataFrame(summaries)
display(summary_df)


In [None]:
# Augmenter les datasets splittés en garantissant des deltas cohérents par sid

def build_sid_augmentations(sids, copies=AUGMENT_COPIES, random_state=RANDOM_STATE):
    """Draw one -1/+1 offset per sid and per augmented copy.

    The sids are visited in sorted order so that, for a given seed and a
    given collection of sids, every sid always receives the same offsets.

    Args:
        sids: Iterable of sortable sid values.
        copies: Total number of copies, original included; ``copies - 1``
            offsets are drawn per sid.
        random_state: Seed of the random generator.

    Returns:
        dict mapping each sid to an array of ``copies - 1`` values taken
        from ``{-1, 1}``.
    """
    rng = np.random.default_rng(random_state)
    n_augmented = copies - 1
    sid_aug_map = {}
    for sid in sorted(sids):
        sid_aug_map[sid] = rng.choice([-1, 1], size=n_augmented)
    return sid_aug_map


def apply_aug_with_mapping(df, sid_aug_map, copies=AUGMENT_COPIES):
    """Stack ``copies - 1`` perturbed copies of ``df`` below the original rows.

    For the copy number ``k`` (1-based):

    * ``age`` is shifted by ``sid_aug_map[sid][k - 1]``; a sid absent from
      the map keeps its age (shift of 0);
    * ``sid`` is converted to ``str`` and suffixed with ``_aug{k}``;
    * ``mean``, when the column exists, is multiplied by ``1 + noise`` where
      ``noise`` is drawn per row from ``{-0.02, -0.01, 0.01, 0.02}`` by a
      generator seeded with ``RANDOM_STATE + k``.

    The age shifts are looked up straight from the ``sid`` column rather than
    through ``DataFrame.iterrows()``: ``iterrows`` builds one Series per row
    and upcasts mixed numeric rows to a common dtype, which is slow and can
    turn integer ``age`` / ``sid`` values into floats.

    Args:
        df: DataFrame with at least the ``sid`` and ``age`` columns.
        sid_aug_map: Mapping sid -> sequence of at least ``copies - 1`` shifts.
        copies: Total number of copies, original included.

    Returns:
        A new DataFrame: the original rows followed by each augmented copy,
        with a fresh integer index.

    Raises:
        KeyError: ``df`` has no ``sid`` or ``age`` column.
    """

    def shift_for(sid, position):
        # Unknown sids are left untouched instead of raising.
        deltas = sid_aug_map.get(sid)
        return 0 if deltas is None else deltas[position]

    sids = df["sid"]
    ages = df["age"]
    augmented = [df]
    for copy_idx in range(1, copies):
        position = copy_idx - 1
        # One shift per row, in row order; added to the column in a single step.
        shifts = np.array([shift_for(sid, position) for sid in sids])

        temp = df.copy()
        temp["age"] = ages + shifts
        temp["sid"] = temp["sid"].astype(str) + f"_aug{copy_idx}"
        if "mean" in temp.columns:
            # Seed depends on the copy only, so the noise is reproducible.
            noise = np.random.default_rng(RANDOM_STATE + copy_idx).choice(
                [-0.02, -0.01, 0.01, 0.02], size=len(temp)
            )
            temp["mean"] = temp["mean"] * (1 + noise)
        augmented.append(temp)
    return pd.concat(augmented, ignore_index=True)


def augment_folder_consistent(src_dir, copies=AUGMENT_COPIES, master_filename="compilation.all_metrics.csv.gz"):
    """Augment every ``*.csv.gz`` file of a folder with the same per-sid shifts.

    The age shifts are drawn once from the sids of the reference master file
    and reused for every file, so a given sid gets the same shifts in all of
    them. The results are written under the same file names into the sibling
    folder ``<src_dir name>_AUG_<copies>``.

    Args:
        src_dir: Folder holding the files to augment.
        copies: Total number of copies per file, original included.
        master_filename: Name, inside ``src_dir``, of the file whose sids
            define the augmentation map.

    Returns:
        None. When the master file is missing, a ``[skip]`` message is
        printed and nothing is created or written.
    """
    src_dir = Path(src_dir)

    master_path = src_dir / master_filename
    if not master_path.exists():
        print(f"[skip] master de référence introuvable dans {src_dir}: {master_filename}")
        return

    # Only the sid column is needed to build the per-sid shifts.
    master_sids = pd.read_csv(master_path, usecols=["sid"])["sid"].unique()
    sid_aug_map = build_sid_augmentations(master_sids, copies=copies)

    # Created only once we know there is something to write, so that a
    # skipped run does not leave an empty folder behind.
    dst_dir = src_dir.parent / f"{src_dir.name}_AUG_{copies}"
    dst_dir.mkdir(parents=True, exist_ok=True)

    written = 0
    for file_path in sorted(src_dir.glob("*.csv.gz")):
        df = pd.read_csv(file_path)
        # Sids absent from the master keep their age in every copy; report
        # them instead of letting that pass silently.
        unmapped = set(df["sid"].unique()).difference(sid_aug_map)
        if unmapped:
            print(
                f"[warn] {file_path.name}: {len(unmapped)} sid hors master d'augmentation (age non modifié)"
            )
        aug_df = apply_aug_with_mapping(df, sid_aug_map, copies=copies)
        # The glob only matches *.csv.gz, so the output is always gzip.
        aug_df.to_csv(dst_dir / file_path.name, index=False, compression="gzip")
        written += 1
    print(f"[augment] {written} fichiers écrits dans {dst_dir}")

# Augment both halves with the default number of copies; each call writes
# into a sibling "<folder>_AUG_<copies>" directory.
augment_folder_consistent(MLP_DIR)
augment_folder_consistent(HARMONIZED_DIR)
