# CamCAN QC and Cleaning Pipeline

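Identify extreme CamCAN subjects for each metric (per-bundle z-score on covariate-adjusted values), pool the flagged subjects across all metrics, and regenerate the cleaned CSV exports used downstream.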


In [1]:
from pathlib import Path

import pandas as pd
from IPython.display import display

from robust_evaluation_tools.robust_utils import get_metrics, remove_covariates_effects

# Directory the raw per-metric CamCAN exports are read from.
RAW_CAMCAN_DIR = Path("DONNES") / "raw_CAMCAN"
# Root for derived data; the cleaned CamCAN exports get their own subfolder.
PROCESSED_ROOT = Path("DONNES") / "processed"
CLEAN_CAMCAN_DIR = PROCESSED_ROOT / "CamCAN_clean"

# Create the output folders up front (idempotent); the raw input folder is
# deliberately not created here.
for path in [PROCESSED_ROOT, CLEAN_CAMCAN_DIR]:
    path.mkdir(parents=True, exist_ok=True)

# A subject is flagged when |z| exceeds this value in at least one bundle.
Z_THRESHOLD = 3.0
# Set to True to rebuild the cleaned exports even if they already exist.
FORCE_REPROCESS_CAMCAN = False


In [2]:
def get_camcan_metric_path(metric, cleaned=False):
    """Return the location of the CamCAN CSV export for ``metric``.

    Args:
        metric: Metric name embedded in the file name.
        cleaned: When true, point at the cleaned export under
            ``CLEAN_CAMCAN_DIR``; otherwise at the raw export under
            ``RAW_CAMCAN_DIR``.

    Returns:
        ``<directory>/CamCAN.<metric>.<raw|clean>.csv.gz``.
    """
    if cleaned:
        directory, suffix = CLEAN_CAMCAN_DIR, "clean"
    else:
        directory, suffix = RAW_CAMCAN_DIR, "raw"
    return directory / f"CamCAN.{metric}.{suffix}.csv.gz"


def detect_extreme_sids(camcan_df, z_threshold):
    """Return the subject ids that are extreme in at least one bundle.

    The input is passed (as a copy) through ``remove_covariates_effects``;
    the result is restricted to rows whose ``site`` equals ``"camcan"``
    (case-insensitive). Within each ``bundle`` group, the ``mean_no_cov``
    column is z-scored using the population standard deviation (``ddof=0``).

    Args:
        camcan_df: Frame for one metric. The frame returned by
            ``remove_covariates_effects`` must expose the ``site``,
            ``bundle``, ``sid`` and ``mean_no_cov`` columns.
        z_threshold: A row is flagged when ``|z| > z_threshold``.

    Returns:
        Set of ``sid`` values flagged in any bundle. Bundles with fewer than
        three non-missing values, or with zero spread, are skipped.
    """
    df_cov = remove_covariates_effects(camcan_df.copy())
    df_cov = df_cov[df_cov["site"].str.lower() == "camcan"]
    extreme = set()
    for _, bundle_df in df_cov.groupby("bundle"):
        # Keep the full column (NaN included) so the boolean mask built below
        # shares bundle_df's index. Masking with a NaN-dropped Series makes
        # ``.loc`` raise IndexingError as soon as one value is missing.
        values = bundle_df["mean_no_cov"]
        # count(), mean() and std() all ignore NaN, so the statistics match
        # what would be computed on ``values.dropna()``.
        if values.count() < 3:
            continue
        std = values.std(ddof=0)
        if std == 0:
            continue
        z_scores = (values - values.mean()) / std
        # NaN z-scores compare as False, i.e. missing values are never flagged.
        is_extreme = z_scores.abs() > z_threshold
        extreme.update(bundle_df.loc[is_extreme, "sid"].tolist())
    return extreme


def detect_camcan_outliers():
    """Run outlier detection on every metric that has a raw CamCAN export.

    Metrics come from ``get_metrics()``; those without a raw file on disk
    are ignored. Detection uses ``detect_extreme_sids`` with ``Z_THRESHOLD``.

    Returns:
        A 3-tuple ``(metrics, rows, outliers)`` where ``metrics`` is the list
        of metrics that were processed, ``rows`` is a list of per-metric
        summary dicts (row/subject counts, number of flagged sids, source
        path) and ``outliers`` maps each metric to its set of flagged sids.
    """
    metrics_found = []
    summary = []
    flagged_by_metric = {}

    for metric in get_metrics():
        raw_path = get_camcan_metric_path(metric)
        if raw_path.exists():
            metrics_found.append(metric)
            raw_df = pd.read_csv(raw_path)
            flagged = detect_extreme_sids(raw_df, Z_THRESHOLD)
            flagged_by_metric[metric] = flagged
            row = {
                "metric": metric,
                "input_rows": len(raw_df),
                "input_subjects": raw_df["sid"].nunique(),
                "detected_outlier_sids": len(flagged),
                "source_path": str(raw_path),
            }
            summary.append(row)

    return metrics_found, summary, flagged_by_metric


def write_clean_camcan_metric(metric, global_outliers):
    """Write the cleaned export of ``metric`` with outlier subjects removed.

    Args:
        metric: Metric whose raw export is filtered.
        global_outliers: Iterable of ``sid`` values to drop. Any iterable is
            accepted; it is materialised into a set once.

    Returns:
        ``None`` when the raw export does not exist. Otherwise a report dict
        with the output row/subject counts, the number of removed rows, the
        number of listed sids actually present in the raw file, and the
        output path.
    """
    src = get_camcan_metric_path(metric)
    if not src.exists():
        return None
    # Normalise before any I/O: the ``&`` below requires a real set, and a
    # list/tuple would otherwise only fail after the file had been written.
    outlier_sids = set(global_outliers)
    df = pd.read_csv(src)
    cleaned_df = df[~df["sid"].isin(outlier_sids)].copy()
    dst = get_camcan_metric_path(metric, cleaned=True)
    cleaned_df.to_csv(dst, index=False, compression="gzip")
    return {
        "metric": metric,
        "output_rows": len(cleaned_df),
        "output_subjects": cleaned_df["sid"].nunique(),
        "removed_rows": len(df) - len(cleaned_df),
        # Count only the outliers that occur in this metric's raw file.
        "removed_sids": len(outlier_sids & set(df["sid"].unique())),
        "output_path": str(dst),
    }


def list_raw_camcan_metrics():
    """Return the metrics from ``get_metrics()`` that have a raw export on disk.

    The order of ``get_metrics()`` is preserved.
    """
    return [
        metric
        for metric in get_metrics()
        if get_camcan_metric_path(metric).exists()
    ]


def clean_camcan_files_exist(metrics):
    """Tell whether every metric in ``metrics`` already has a cleaned export.

    Args:
        metrics: Collection of metric names to check.

    Returns:
        ``False`` when ``metrics`` is empty (nothing to vouch for) or when at
        least one cleaned file is missing; ``True`` otherwise.
    """
    if metrics:
        for metric in metrics:
            if not get_camcan_metric_path(metric, cleaned=True).exists():
                return False
        return True
    return False


In [3]:
# Work out which metrics have raw data and whether all of them already have a
# cleaned counterpart; this drives the rebuild-or-skip decision below.
raw_camcan_metrics = list_raw_camcan_metrics()
clean_files_present = clean_camcan_files_exist(raw_camcan_metrics)

# Defaults so these names exist whichever branch runs.
camcan_detection_df = pd.DataFrame()
camcan_cleaning_df = pd.DataFrame()
global_camcan_outliers = set()

if FORCE_REPROCESS_CAMCAN or not clean_files_present:
    available_camcan_metrics, camcan_detection_rows, metric_outliers = detect_camcan_outliers()

    # A subject flagged for any metric is removed from every metric, so the
    # cleaned files all describe the same cohort.
    for sids in metric_outliers.values():
        global_camcan_outliers.update(sids)

    camcan_cleaning_rows = []
    for metric in available_camcan_metrics:
        report = write_clean_camcan_metric(metric, global_camcan_outliers)
        # ``None`` means the raw file was missing at write time.
        if report:
            camcan_cleaning_rows.append(report)

    camcan_detection_df = pd.DataFrame(camcan_detection_rows)
    camcan_cleaning_df = pd.DataFrame(camcan_cleaning_rows)

    print(f"Global CamCAN outliers to remove across metrics: {len(global_camcan_outliers)} subjects")
    if not camcan_detection_df.empty:
        print("Detection stats per metric:")
        display(camcan_detection_df)
    if not camcan_cleaning_df.empty:
        print("Cleaned CamCAN files per metric (after removing global outliers):")
        display(camcan_cleaning_df)
else:
    # The dash in this message was mojibake (UTF-8 decoded as cp1252); the
    # intended en dash is restored.
    print("Clean CamCAN files already exist – skipping reprocessing. Set FORCE_REPROCESS_CAMCAN = True to force a rebuild.")
    existing_rows = []
    for metric in raw_camcan_metrics:
        clean_path = get_camcan_metric_path(metric, cleaned=True)
        # Guard against a file disappearing between the check above and now.
        if not clean_path.exists():
            continue
        df = pd.read_csv(clean_path)
        existing_rows.append({
            "metric": metric,
            "output_rows": len(df),
            "output_subjects": df["sid"].nunique(),
            # Removal counts are unknown when nothing was recomputed.
            "removed_rows": None,
            "removed_sids": None,
            "output_path": str(clean_path),
        })
    camcan_cleaning_df = pd.DataFrame(existing_rows)
    if not camcan_cleaning_df.empty:
        display(camcan_cleaning_df)


Global CamCAN outliers to remove across metrics: 94 subjects
Detection stats per metric:


Unnamed: 0,metric,input_rows,input_subjects,detected_outlier_sids,source_path
0,ad,19845,441,30,DONNES/raw_CAMCAN/CamCAN.ad.raw.csv.gz
1,adt,19845,441,28,DONNES/raw_CAMCAN/CamCAN.adt.raw.csv.gz
2,afd,19845,441,32,DONNES/raw_CAMCAN/CamCAN.afd.raw.csv.gz
3,fa,19845,441,27,DONNES/raw_CAMCAN/CamCAN.fa.raw.csv.gz
4,fat,19845,441,30,DONNES/raw_CAMCAN/CamCAN.fat.raw.csv.gz
5,fw,19845,441,34,DONNES/raw_CAMCAN/CamCAN.fw.raw.csv.gz
6,md,19845,441,30,DONNES/raw_CAMCAN/CamCAN.md.raw.csv.gz
7,mdt,19845,441,27,DONNES/raw_CAMCAN/CamCAN.mdt.raw.csv.gz
8,rd,19845,441,27,DONNES/raw_CAMCAN/CamCAN.rd.raw.csv.gz
9,rdt,19845,441,25,DONNES/raw_CAMCAN/CamCAN.rdt.raw.csv.gz


Cleaned CamCAN files per metric (after removing global outliers):


Unnamed: 0,metric,output_rows,output_subjects,removed_rows,removed_sids,output_path
0,ad,15615,347,4230,94,DONNES/processed/CamCAN_clean/CamCAN.ad.clean....
1,adt,15615,347,4230,94,DONNES/processed/CamCAN_clean/CamCAN.adt.clean...
2,afd,15615,347,4230,94,DONNES/processed/CamCAN_clean/CamCAN.afd.clean...
3,fa,15615,347,4230,94,DONNES/processed/CamCAN_clean/CamCAN.fa.clean....
4,fat,15615,347,4230,94,DONNES/processed/CamCAN_clean/CamCAN.fat.clean...
5,fw,15615,347,4230,94,DONNES/processed/CamCAN_clean/CamCAN.fw.clean....
6,md,15615,347,4230,94,DONNES/processed/CamCAN_clean/CamCAN.md.clean....
7,mdt,15615,347,4230,94,DONNES/processed/CamCAN_clean/CamCAN.mdt.clean...
8,rd,15615,347,4230,94,DONNES/processed/CamCAN_clean/CamCAN.rd.clean....
9,rdt,15615,347,4230,94,DONNES/processed/CamCAN_clean/CamCAN.rdt.clean...
