# Harmonisation ComBat multi-méthodes

Ce notebook lance le script `combat_quick.py` pour harmoniser un site cible
par rapport à un site de référence via ComBat Classic, Clinic, CovBat et ComBat-GAM.
Chaque méthode est exécutée deux fois : sans filtrage robuste (`robust='No'`)
et avec `robust='MLP4_ALL_9'`. Ajustez les chemins dans la première cellule
lorsque vous changez de jeu de données.


In [None]:
from pathlib import Path

# === User-adjustable parameters ===
# Repository root: the current working directory, or its parent when the
# notebook is started from inside a folder named `notebooks`.
REPO_ROOT = Path().resolve()
REPO_ROOT = REPO_ROOT.parent if REPO_ROOT.name == 'notebooks' else REPO_ROOT

# Input table for the reference site.
REFERENCE_DATA = REPO_ROOT.joinpath('DONNES', 'CamCAN', 'CamCAN.afd.raw.csv.gz')
# TODO: point this path at the site to harmonise
MOVING_DATA = REPO_ROOT.joinpath('A_REMPLACER', 'site_a_harmoniser.csv.gz')

# Every method/robust combination gets its own sub-folder under OUTPUT_ROOT.
OUTPUT_ROOT = REPO_ROOT.joinpath('RESULTS', 'combat_notebook')
METHODS = ['classic', 'clinic', 'covbat', 'gam']
ROBUST_OPTIONS = ['No', 'MLP4_ALL_9']
EXTRA_FIT_ARGS = []  # Example: ['--ignore_sex']
DRY_RUN = False  # When True, commands are printed but not executed



In [None]:
import os
import sys
import pandas as pd

# Fail early, pointing back at the parameter cell, when an input table is missing.
if not MOVING_DATA.exists():
    raise FileNotFoundError(f'Fichier MOVING_DATA introuvable: {MOVING_DATA}. Modifiez la cellule precedente.')
if not REFERENCE_DATA.exists():
    raise FileNotFoundError(f'Fichier REFERENCE_DATA introuvable: {REFERENCE_DATA}. Modifiez la cellule precedente.')

OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

# Prepend the repository root to PYTHONPATH unless it is already listed.
env_path = os.environ.get('PYTHONPATH')
paths = env_path.split(os.pathsep) if env_path else []
if str(REPO_ROOT) not in paths:
    if env_path:
        os.environ['PYTHONPATH'] = os.pathsep.join((str(REPO_ROOT), env_path))
    else:
        os.environ['PYTHONPATH'] = str(REPO_ROOT)

# Read at most 100 rows of each table; the site and metric labels are taken
# from the first row of the `site` / `metric` columns.
mov_sample = pd.read_csv(MOVING_DATA, nrows=100)
ref_sample = pd.read_csv(REFERENCE_DATA, nrows=100)
MOVING_SITE = str(mov_sample['site'].iloc[0])
REFERENCE_SITE = str(ref_sample['site'].iloc[0])
METRIC = str(mov_sample['metric'].iloc[0])
print(f'Site cible    : {MOVING_SITE}')
print(f'Site reference: {REFERENCE_SITE}')
print(f'Metrique      : {METRIC}')

# The samples were only needed to extract the labels above.
del mov_sample, ref_sample



In [None]:
import subprocess
import shlex
from datetime import datetime


def slugify(text):
    """Return ``str(text)`` with each non-alphanumeric character replaced by ``_``.

    Alphanumeric characters (as reported by ``str.isalnum``) are kept as-is,
    so the result has the same length as ``str(text)``.
    """
    pieces = []
    for char in str(text):
        pieces.append(char if char.isalnum() else '_')
    return ''.join(pieces)


def _env_with_repo():
    """Return a copy of the process environment with REPO_ROOT on PYTHONPATH.

    The repository root is prepended to PYTHONPATH unless it is already one
    of its entries. ``os.environ`` itself is not modified.
    """
    env = dict(os.environ)
    repo = str(REPO_ROOT)
    current = env.get('PYTHONPATH')
    entries = current.split(os.pathsep) if current else []
    if repo in entries:
        return env
    # An unset or empty PYTHONPATH is replaced outright; otherwise prepend.
    env['PYTHONPATH'] = os.pathsep.join([repo, current]) if current else repo
    return env


def run_harmonization(method, robust_tag, overwrite=True, extra_fit_args=None, dry_run=None):
    """Run ``scripts/combat_quick.py`` for one method / robust combination.

    Args:
        method: Value passed to the script's ``--method`` option.
        robust_tag: Value passed to ``--robust``; any case variant of ``'no'``
            is normalised to ``'No'``.
        overwrite: When true, ``-f`` is appended to the command.
        extra_fit_args: Extra command-line arguments appended verbatim;
            defaults to the module-level ``EXTRA_FIT_ARGS``.
        dry_run: When true, the command is printed but not executed;
            defaults to the module-level ``DRY_RUN``.

    Returns:
        A dict with keys ``method``, ``robust``, ``model_path``,
        ``results_path`` and ``output_dir`` describing where outputs are
        expected.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status (``check=True``).
    """
    dry_run = DRY_RUN if dry_run is None else dry_run
    extra_fit_args = EXTRA_FIT_ARGS if extra_fit_args is None else extra_fit_args

    robust_cli = 'No' if str(robust_tag).lower() == 'no' else robust_tag
    robust_slug = slugify(robust_cli)
    method_slug = slugify(method)

    # The output folder is created even in dry-run mode.
    combo_dir = OUTPUT_ROOT / f'{method_slug}_robust_{robust_slug}'
    combo_dir.mkdir(parents=True, exist_ok=True)

    moving_slug = slugify(MOVING_SITE)
    suffix = f"{METRIC}.{method_slug}.robust-{robust_slug}"
    model_name = f"{moving_slug}-{slugify(REFERENCE_SITE)}.{suffix}.model.csv"
    results_name = f"{moving_slug}.{suffix}.res.csv.gz"

    script_path = REPO_ROOT / 'scripts' / 'combat_quick.py'
    quick_cmd = [sys.executable, str(script_path), str(REFERENCE_DATA), str(MOVING_DATA)]
    quick_cmd += ['--method', method]
    quick_cmd += ['--out_dir', str(combo_dir)]
    quick_cmd += ['--output_model_filename', model_name]
    quick_cmd += ['--output_results_filename', results_name]
    quick_cmd += ['--robust', robust_cli]
    quick_cmd += ['-v', 'INFO']
    if overwrite:
        quick_cmd += ['-f']
    if extra_fit_args:
        quick_cmd += list(extra_fit_args)

    env = _env_with_repo()

    # Echo a shell-quoted version of the command for the notebook log.
    stamp = datetime.now().isoformat(timespec='seconds')
    print(f"[{stamp}] {method}/{robust_cli} - combat_quick")
    print(' ', ' '.join(map(shlex.quote, quick_cmd)))
    if not dry_run:
        subprocess.run(quick_cmd, check=True, env=env)

    return dict(
        method=method,
        robust=robust_cli,
        model_path=combo_dir / model_name,
        results_path=combo_dir / results_name,
        output_dir=combo_dir,
    )




In [None]:
# Run every method / robust combination. A failed subprocess does not stop the
# sweep: it is recorded as a row with empty paths and an `error` message.
runs = []
for method in METHODS:
    for robust in ROBUST_OPTIONS:
        print('=' * 80)
        print(f'Lancement {method} / {robust}')
        try:
            summary = run_harmonization(method, robust)
        except subprocess.CalledProcessError as exc:
            print(f'Execution echouee pour {method}/{robust}: {exc}')
            summary = dict(
                method=method,
                robust=robust,
                model_path=None,
                results_path=None,
                output_dir=None,
                error=str(exc),
            )
        runs.append(summary)

# One row per combination; left as the last expression so the notebook shows it.
runs_df = pd.DataFrame(runs)
runs_df



## Charger les données harmonisées

La cellule suivante montre comment charger les fichiers produits si vous souhaitez
les manipuler directement dans le notebook. Commentez-la ou adaptez-la selon vos
besoins.


In [None]:
# Load each result file into a dict keyed by (method, robust).
# Rows without a results path are skipped.
harmonized_data = {}
for row in runs_df.dropna(subset=['results_path']).itertuples():
    path = Path(row.results_path)
    if not path.exists():
        # A path was recorded but no file is present (for example after a dry run).
        print(f'Aucun fichier trouve pour {row.method}/{row.robust}: {path}')
        continue
    harmonized_data[(row.method, row.robust)] = pd.read_csv(path)

list(harmonized_data)

