# Multi-site Compilation Builder

Select eligible site-level CSV files, harmonize them against CamCAN, and export compilation tables.


In [1]:
from pathlib import Path

import pandas as pd
from IPython.display import display

from robust_evaluation_tools.robust_utils import get_metrics
from robust_evaluation_tools.robust_harmonization import fit, apply

# --- Harmonization settings --------------------------------------------------
# Forwarded as-is to the fit()/apply() calls made in build_metric_compilation.
HARMONIZATION_METHOD = "classic"
ROBUST_MODE = "No"
RWP_MODE = False
INCLUDE_HC = True
# Method name with spaces turned into underscores, used as a directory name.
METHOD_TAG = "_".join(HARMONIZATION_METHOD.split(" "))

# --- Site eligibility thresholds ---------------------------------------------
# Default minimum subject counts used by site_has_required_subjects.
MIN_HC_SUBJECTS = 8
MIN_SICK_SUBJECTS = 5

# --- Input locations ---------------------------------------------------------
RAW_SITES_DIR = Path("DONNES", "raw")
RAW_CAMCAN_DIR = Path("DONNES", "raw_CAMCAN")

# --- Output locations (everything below lives under PROCESSED_ROOT) ----------
PROCESSED_ROOT = Path("DONNES", "processed")
CLEAN_CAMCAN_DIR = PROCESSED_ROOT.joinpath("CamCAN_clean")
COMPILATION_DIR = PROCESSED_ROOT.joinpath("compilation", METHOD_TAG, "all")
MODELS_DIR = PROCESSED_ROOT.joinpath("models", METHOD_TAG)
HARMONIZED_SITES_DIR = PROCESSED_ROOT.joinpath("harmonized_sites", METHOD_TAG)

# Create every output directory up front so later cells can write freely.
for path in (
    PROCESSED_ROOT,
    CLEAN_CAMCAN_DIR,
    COMPILATION_DIR,
    MODELS_DIR,
    HARMONIZED_SITES_DIR,
):
    path.mkdir(parents=True, exist_ok=True)


In [2]:
def get_camcan_metric_path(metric, cleaned=False):
    """Build the path of the CamCAN reference CSV for ``metric``.

    Args:
        metric: Metric name embedded in the file name.
        cleaned: When truthy, point at the ``clean`` file inside
            ``CLEAN_CAMCAN_DIR``; otherwise at the ``raw`` file inside
            ``RAW_CAMCAN_DIR``.

    Returns:
        The constructed path. Its existence is not checked here.
    """
    if cleaned:
        directory, suffix = CLEAN_CAMCAN_DIR, "clean"
    else:
        directory, suffix = RAW_CAMCAN_DIR, "raw"
    return directory / f"CamCAN.{metric}.{suffix}.csv.gz"

def iter_site_files(metric):
    """Return the sorted raw site files found for ``metric``.

    Files are looked up in ``RAW_SITES_DIR / metric`` and must match
    ``*.<metric>.raw.csv.gz``. An empty list is returned when that directory
    does not exist.
    """
    metric_dir = RAW_SITES_DIR / metric
    if metric_dir.exists():
        matches = list(metric_dir.glob(f"*.{metric}.raw.csv.gz"))
        matches.sort()
        return matches
    return []

def normalize_handedness_file(file_path):
    """Rewrite boolean ``handedness`` values of a CSV file as integer codes.

    ``False`` / ``"False"`` become ``1`` and ``True`` / ``"True"`` become
    ``2``; every other value is kept as it is. The file is overwritten in
    place (gzip-compressed when its name ends with ``.gz``), and only when at
    least one value actually needs converting.

    Args:
        file_path: Path (``str`` or ``Path``) of the CSV file to normalize.

    Returns:
        ``file_path`` itself, so the call can be chained.
    """
    string_codes = {"False": 1, "True": 2}

    def handedness_code(value):
        """Return the integer code for a boolean-like value, else ``None``."""
        # Check for real booleans by type: in Python ``True == 1`` and
        # ``False == 0``, so equality-based matching cannot reliably tell a
        # boolean apart from a numeric code that is already normalized.
        if pd.api.types.is_bool(value):
            return 2 if value else 1
        if isinstance(value, str):
            return string_codes.get(value)
        return None

    df = pd.read_csv(file_path)
    if "handedness" not in df.columns:
        return file_path

    original_values = list(df["handedness"])
    codes = [handedness_code(value) for value in original_values]
    if all(code is None for code in codes):
        # Nothing to convert: leave the file on disk untouched.
        return file_path

    df["handedness"] = [
        value if code is None else code
        for value, code in zip(original_values, codes)
    ]
    compression = "gzip" if str(file_path).endswith(".gz") else None
    df.to_csv(file_path, index=False, compression=compression)
    return file_path


def summarize_site_subjects(file_path):
    """Count the distinct subjects of a site file.

    Only the ``sid`` and ``disease`` columns are read. A subject counts as
    healthy when it has a row whose ``disease`` equals ``"HC"``; the sick
    count is the total minus the healthy count.

    Returns:
        A ``(nb_total, nb_hc, nb_sick)`` tuple.
    """
    subject_rows = pd.read_csv(file_path, usecols=["sid", "disease"]).drop_duplicates()
    is_healthy = subject_rows["disease"] == "HC"

    nb_total = subject_rows["sid"].nunique()
    nb_hc = subject_rows.loc[is_healthy, "sid"].nunique()
    return nb_total, nb_hc, nb_total - nb_hc


def site_has_required_subjects(file_path, min_hc=MIN_HC_SUBJECTS, min_sick=MIN_SICK_SUBJECTS):
    """Tell whether a site file has enough healthy and sick subjects.

    Args:
        file_path: Site CSV file passed on to ``summarize_site_subjects``.
        min_hc: Minimum number of healthy-control subjects required.
        min_sick: Minimum number of sick subjects required.

    Returns:
        ``(is_valid, nb_total, nb_hc, nb_sick)`` where ``is_valid`` is true
        only when both thresholds are met.
    """
    nb_total, nb_hc, nb_sick = summarize_site_subjects(file_path)
    enough_hc = nb_hc >= min_hc
    enough_sick = nb_sick >= min_sick
    return (enough_hc and enough_sick), nb_total, nb_hc, nb_sick


def _harmonize_site(site_file, ref_file, metric):
    """Fit then apply the harmonization for one site and load the result."""
    model_dir = MODELS_DIR / metric
    model_dir.mkdir(parents=True, exist_ok=True)
    clean_dir = HARMONIZED_SITES_DIR / metric
    clean_dir.mkdir(parents=True, exist_ok=True)

    # fit()/apply() come from robust_evaluation_tools; their return values are
    # used here as the model path and the harmonized file path respectively.
    model_path = fit(
        str(site_file),
        str(ref_file),
        metric,
        HARMONIZATION_METHOD,
        ROBUST_MODE,
        RWP_MODE,
        str(model_dir),
        INCLUDE_HC,
    )
    harmonized_file = apply(
        str(site_file),
        model_path,
        metric,
        HARMONIZATION_METHOD,
        ROBUST_MODE,
        RWP_MODE,
        str(clean_dir),
    )
    return pd.read_csv(normalize_handedness_file(harmonized_file))


def build_metric_compilation(metric):
    """Harmonize every eligible site of ``metric`` and export the compilation.

    Each raw site file is handedness-normalized in place, checked against the
    subject thresholds and, when eligible, harmonized against the cleaned
    CamCAN reference. When at least one site is selected, two files are
    written to ``COMPILATION_DIR``: ``compilation.<metric>.csv.gz`` (sites
    only) and ``compilation.<metric>.with_camcan.csv.gz`` (sites + CamCAN).

    Returns:
        ``None`` when the cleaned CamCAN file is missing or no site file is
        found. Otherwise ``(metric_df, summary_df, camcan_df)``: the
        concatenated harmonized sites (an empty frame if no site was
        selected), one summary row per site file, and the CamCAN table tagged
        with ``source_site == "CamCAN"``.
    """
    ref_file = get_camcan_metric_path(metric, cleaned=True)
    if not ref_file.exists():
        return None
    ref_file = normalize_handedness_file(ref_file)
    camcan_df = pd.read_csv(ref_file)
    camcan_df["source_site"] = "CamCAN"

    site_files = iter_site_files(metric)
    if not site_files:
        return None

    site_rows, compiled_frames = [], []
    for candidate in site_files:
        site_file = normalize_handedness_file(candidate)
        # Site name is whatever precedes ".<metric>.raw" in the file name.
        site_name, _, _ = site_file.name.partition(f".{metric}.raw")
        is_valid, nb_total, nb_hc, nb_sick = site_has_required_subjects(site_file)

        # Every site is reported, whether or not it ends up being selected.
        site_rows.append(
            dict(
                metric=metric,
                site=site_name,
                nb_total=nb_total,
                nb_hc=nb_hc,
                nb_sick=nb_sick,
                selected=bool(is_valid),
                source_path=str(site_file),
            )
        )

        if is_valid:
            harmonized_df = _harmonize_site(site_file, ref_file, metric)
            harmonized_df["source_site"] = site_name
            compiled_frames.append(harmonized_df)

    summary_df = pd.DataFrame(site_rows)

    if compiled_frames:
        metric_df = pd.concat(compiled_frames, ignore_index=True)
        metric_df.to_csv(
            COMPILATION_DIR / f"compilation.{metric}.csv.gz",
            index=False,
            compression="gzip",
        )

        metric_with_camcan = pd.concat([metric_df, camcan_df], ignore_index=True)
        metric_with_camcan.to_csv(
            COMPILATION_DIR / f"compilation.{metric}.with_camcan.csv.gz",
            index=False,
            compression="gzip",
        )
    else:
        metric_df = pd.DataFrame()

    return metric_df, summary_df, camcan_df


In [3]:
def _add_metric_bundle(frame):
    """Add the ``<metric>_<bundle>`` key column to ``frame`` in place."""
    frame["metric_bundle"] = frame["metric"] + "_" + frame["bundle"]
    return frame


# Per-metric results, accumulated and then merged into the global tables.
compilation_frames = []
compilation_frames_with_camcan = []
site_level_reports = []

for metric in get_metrics():
    result = build_metric_compilation(metric)
    if result is None:
        # No CamCAN reference or no site file for this metric.
        continue
    metric_df, summary_df, camcan_df = result
    site_level_reports.append(summary_df)

    if metric_df.empty:
        # No site selected: only CamCAN goes into the "with CamCAN" table.
        compilation_frames_with_camcan.append(_add_metric_bundle(camcan_df.copy()))
        continue

    compilation_frames.append(_add_metric_bundle(metric_df))
    metric_with_camcan = pd.concat([metric_df, camcan_df], ignore_index=True)
    compilation_frames_with_camcan.append(_add_metric_bundle(metric_with_camcan))

# Sites-only table across all metrics.
compilation_all_metrics = pd.DataFrame()
if compilation_frames:
    compilation_all_metrics = pd.concat(compilation_frames, ignore_index=True)
    all_metrics_path = COMPILATION_DIR / "compilation.all_metrics.csv.gz"
    compilation_all_metrics.to_csv(all_metrics_path, index=False, compression="gzip")

# Sites + CamCAN table across all metrics.
compilation_all_metrics_with_camcan = pd.DataFrame()
if compilation_frames_with_camcan:
    compilation_all_metrics_with_camcan = pd.concat(
        compilation_frames_with_camcan, ignore_index=True
    )
    all_metrics_with_camcan_path = COMPILATION_DIR / "compilation.all_metrics.with_camcan.csv.gz"
    compilation_all_metrics_with_camcan.to_csv(
        all_metrics_with_camcan_path, index=False, compression="gzip"
    )

# One row per (metric, site) file that was examined.
if site_level_reports:
    site_summary_df = pd.concat(site_level_reports, ignore_index=True)
else:
    site_summary_df = pd.DataFrame()


In [4]:
# Quick visual check of the site selection and of the compiled table.
print("Site selection summary (first rows):")
display(site_summary_df.head(20))

if compilation_all_metrics.empty:
    print("No compilation generated yet. Ensure CamCAN clean files and site CSVs exist before rerunning.")
else:
    print("Compilation preview:")
    display(compilation_all_metrics.head())


Site selection summary (first rows):


Unnamed: 0,metric,site,nb_total,nb_hc,nb_sick,selected,source_path
0,ad,35343,164,96,68,True,DONNES/raw/ad/35343.ad.raw.csv.gz
1,ad,35426,87,23,64,True,DONNES/raw/ad/35426.ad.raw.csv.gz
2,ad,43zR_iAwl9Ck,93,23,70,True,DONNES/raw/ad/43zR_iAwl9Ck.ad.raw.csv.gz
3,ad,47k_ouILA7fA,2,2,0,False,DONNES/raw/ad/47k_ouILA7fA.ad.raw.csv.gz
4,ad,4N..m6cr3es6,10,3,7,False,DONNES/raw/ad/4N..m6cr3es6.ad.raw.csv.gz
5,ad,4RYQhNAwMfVs,23,23,0,False,DONNES/raw/ad/4RYQhNAwMfVs.ad.raw.csv.gz
6,ad,4b4Ef0_uu0Hg,24,6,18,False,DONNES/raw/ad/4b4Ef0_uu0Hg.ad.raw.csv.gz
7,ad,4mg_jmm4yTS2,109,4,105,False,DONNES/raw/ad/4mg_jmm4yTS2.ad.raw.csv.gz
8,ad,4pGtp5fZYgoo,66,32,34,True,DONNES/raw/ad/4pGtp5fZYgoo.ad.raw.csv.gz
9,ad,4tAi.VgKuNm2,12,6,6,False,DONNES/raw/ad/4tAi.VgKuNm2.ad.raw.csv.gz


Compilation preview:


Unnamed: 0,sid,bundle,metric,mean,site,age,sex,handedness,disease,harmonization,model,source_site,metric_bundle
0,sub-10228,mni_AC,ad,0.001151,35343,40.0,2,1,HC,classic,35343.ad.classic.NoRobust.NoRWP,35343,ad_mni_AC
1,sub-10891,mni_AC,ad,0.001145,35343,21.0,2,1,HC,classic,35343.ad.classic.NoRobust.NoRWP,35343,ad_mni_AC
2,sub-10460,mni_AC,ad,0.001123,35343,40.0,1,1,HC,classic,35343.ad.classic.NoRobust.NoRWP,35343,ad_mni_AC
3,sub-10855,mni_AC,ad,0.001165,35343,47.0,1,1,HC,classic,35343.ad.classic.NoRobust.NoRWP,35343,ad_mni_AC
4,sub-10325,mni_AC,ad,0.001113,35343,29.0,2,1,HC,classic,35343.ad.classic.NoRobust.NoRWP,35343,ad_mni_AC


In [5]:
# Render the complete site selection summary (one row per metric/site file).
site_summary_df

Unnamed: 0,metric,site,nb_total,nb_hc,nb_sick,selected,source_path
0,ad,35343,164,96,68,True,DONNES/raw/ad/35343.ad.raw.csv.gz
1,ad,35426,87,23,64,True,DONNES/raw/ad/35426.ad.raw.csv.gz
2,ad,43zR_iAwl9Ck,93,23,70,True,DONNES/raw/ad/43zR_iAwl9Ck.ad.raw.csv.gz
3,ad,47k_ouILA7fA,2,2,0,False,DONNES/raw/ad/47k_ouILA7fA.ad.raw.csv.gz
4,ad,4N..m6cr3es6,10,3,7,False,DONNES/raw/ad/4N..m6cr3es6.ad.raw.csv.gz
...,...,...,...,...,...,...,...
789,rdt,adni_94_Siemens_3T,7,7,0,False,DONNES/raw/rdt/adni_94_Siemens_3T.rdt.raw.csv.gz
790,rdt,adni_98_GE_3T,50,27,23,True,DONNES/raw/rdt/adni_98_GE_3T.rdt.raw.csv.gz
791,rdt,adni_99_GE_3T,22,14,8,True,DONNES/raw/rdt/adni_99_GE_3T.rdt.raw.csv.gz
792,rdt,adni_9_GE_3T,9,7,2,False,DONNES/raw/rdt/adni_9_GE_3T.rdt.raw.csv.gz
