In [6]:
# NOTE(review): looks like a leftover kernel smoke test — consider removing.
print("hi")

import os
import pandas as pd

import numpy as np
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, shapiro

from robust_evaluation_tools.robust_utils import get_info, get_bundles, get_metrics, remove_covariates_effects
import subprocess

# Relative paths used by the cells below.
RAW_DIRECTORY = "DONNES/CamCAN/"          # passed as `in_dir` (source CSVs)
FILTERED_DIRECTORY = "DONNES_F/CamCAN/"   # passed as `filt_dir` (filtered CSVs)
PLOTS_DIRECTORY = "RESULTS/CamCAN_PLOTS"  # root folder for the diagnostic plots

# NOTE(review): not referenced in the visible cells — presumably used elsewhere; confirm.
method = "classic"

hi


In [7]:
import os, glob, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from scipy.stats import shapiro, norm

def get_extreme_sids(in_dir, Z):
    """Collect the subject ids flagged as z-score outliers in at least one bundle.

    For every metric returned by ``get_metrics()`` the file
    ``CamCAN.<metric>.raw.csv.gz`` is read from ``in_dir`` (missing files are
    skipped) and passed through ``remove_covariates_effects``. Rows whose
    ``site`` equals "camcan" (case-insensitive) are grouped by ``bundle``; in
    each group the non-null ``mean_no_cov`` values are standardised with the
    population standard deviation (``ddof=0``). A subject is flagged when the
    absolute z-score exceeds ``Z``.

    Groups with fewer than 3 non-null values or zero spread are ignored.

    Parameters
    ----------
    in_dir : str
        Directory containing the ``CamCAN.<metric>.raw.csv.gz`` files.
    Z : float
        Threshold on the absolute z-score.

    Returns
    -------
    set
        Values of the ``sid`` column for every flagged row, across all
        metrics and bundles.
    """
    extreme = set()
    for metric in get_metrics():
        csv_path = os.path.join(in_dir, f"CamCAN.{metric}.raw.csv.gz")
        if not os.path.isfile(csv_path):
            continue
        # Per the original author's note, this call adds the `mean_no_cov` column.
        df = remove_covariates_effects(pd.read_csv(csv_path))
        cam = df[df["site"].str.lower() == "camcan"]
        for _, g in cam.groupby("bundle"):
            # Drop the NaN rows from the frame itself so that values and sids
            # stay aligned; masking the full group with a mask built on the
            # NaN-dropped values makes `.loc` raise an IndexingError.
            valid = g.dropna(subset=["mean_no_cov"])
            vals = valid["mean_no_cov"]
            sd = vals.std(ddof=0)
            if len(vals) < 3 or sd == 0:
                continue
            z = (vals - vals.mean()) / sd
            # Positional boolean mask: independent of index labels/duplicates.
            is_extreme = (z.abs() > Z).to_numpy()
            extreme.update(valid.loc[is_extreme, "sid"])
    return extreme

def plot_bundle(g, extreme, title, out_path):
    """Save a three-panel diagnostic figure for one bundle.

    Panels, left to right:
      1. density histograms of ``mean_no_cov`` for all rows and for the rows
         whose ``sid`` is not in ``extreme``, plus the normal pdf fitted on
         all rows;
      2. scatter of ``mean`` against ``age`` (or against the row position when
         there is no ``age`` column), outliers in red;
      3. the same scatter for ``mean_no_cov``.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of a single bundle; must provide ``sid``, ``mean`` and
        ``mean_no_cov``. The frame is copied, never modified.
    extreme : set
        Subject ids to flag as outliers.
    title : str
        Prefix of the first panel's title.
    out_path : str
        Destination of the saved figure (300 dpi).

    Returns
    -------
    float or None
        Shapiro p-value of the filtered (non-outlier) values. ``None`` when
        fewer than 3 non-null values exist overall (nothing is plotted);
        ``nan`` when fewer than 3 values remain after filtering, since the
        Shapiro test needs at least 3 samples.
    """
    # Work on a copy so the helper column never leaks into the caller's frame.
    g = g.copy()
    g["outlier"] = g["sid"].isin(extreme)

    all_vals = g["mean_no_cov"].dropna()
    clean = g.loc[~g["outlier"], "mean_no_cov"].dropna()
    if len(all_vals) < 3:
        return None

    mu, sg = all_vals.mean(), all_vals.std(ddof=0)
    p = shapiro(all_vals)[1]
    # mean/std of an empty Series are NaN, so only the test itself needs a guard.
    muf, sgf = clean.mean(), clean.std(ddof=0)
    pf = shapiro(clean)[1] if len(clean) >= 3 else float("nan")

    fig, ax = plt.subplots(1, 3, figsize=(18, 5))
    try:
        # distributions
        sns.histplot(all_vals, bins=30, kde=True, stat="density",
                     ax=ax[0], color="forestgreen", alpha=.35, label="all")
        if not clean.empty:
            sns.histplot(clean, bins=30, kde=True, stat="density",
                         ax=ax[0], color="royalblue", alpha=.35, label="filtered")
        if sg > 0:
            # A zero-width normal has no drawable pdf; skip the reference curve.
            x = np.linspace(all_vals.min(), all_vals.max(), 250)
            ax[0].plot(x, norm.pdf(x, mu, sg), color="black", lw=1.5)
        ax[0].set_title(f"{title}\nμ={mu:.3f},σ={sg:.3f},p={p:.2g} | μf={muf:.3f},σf={sgf:.3f},pf={pf:.2g}")
        ax[0].legend()

        palette = {True: "red", False: "black"}
        x_axis = g["age"] if "age" in g.columns else range(len(g))

        sns.scatterplot(x=x_axis, y=g["mean"],
                        hue=g["outlier"], palette=palette,
                        ax=ax[1], legend=False, s=12)
        ax[1].set_title("mean")

        sns.scatterplot(x=x_axis, y=g["mean_no_cov"],
                        hue=g["outlier"], palette=palette,
                        ax=ax[2], legend=False, s=12, marker="X")
        ax[2].set_title("mean_no_cov")

        fig.tight_layout()
        fig.savefig(out_path, dpi=300)
    finally:
        # Close this specific figure even if plotting/saving failed, so that
        # looping over many bundles does not accumulate open figures.
        plt.close(fig)
    return pf

def build_camcan_plots_outliers(in_dir, out_root, Z):
    """Plot every (metric, bundle) pair and summarise the outlier filtering.

    Outlier subjects are obtained from ``get_extreme_sids(in_dir, Z)``. For
    each metric whose ``CamCAN.<metric>.raw.csv.gz`` file exists in ``in_dir``,
    one figure per bundle is written by ``plot_bundle`` to
    ``<out_root>/<Z>/<metric>/<bundle>.png``.

    Parameters
    ----------
    in_dir : str
        Directory containing the ``CamCAN.<metric>.raw.csv.gz`` files.
    out_root : str
        Root directory of the generated figures (created if needed).
    Z : float
        Threshold on the absolute z-score; also used as a sub-folder name.

    Returns
    -------
    tuple
        ``(number of extreme subjects, mean filtered Shapiro p-value)``.
        Bundles for which ``plot_bundle`` yields no usable p-value (``None``
        or NaN) are left out of the mean; the mean is NaN when none is usable.
    """
    extreme = get_extreme_sids(in_dir, Z)
    pfs = []

    for metric in get_metrics():
        csv_path = os.path.join(in_dir, f"CamCAN.{metric}.raw.csv.gz")
        if not os.path.isfile(csv_path):
            continue
        df = remove_covariates_effects(pd.read_csv(csv_path))

        metric_dir = os.path.join(out_root, str(Z), metric)
        os.makedirs(metric_dir, exist_ok=True)

        for bundle, g in df.groupby("bundle", sort=False):
            out_path = os.path.join(metric_dir, f"{bundle}.png")
            pf = plot_bundle(g, extreme, f"{metric} • {bundle}", out_path)
            # plot_bundle returns None for bundles too small to test; averaging
            # a list containing None would raise a TypeError.
            if pf is not None and not np.isnan(pf):
                pfs.append(pf)

    mean_pf = np.mean(pfs) if pfs else np.nan
    return len(extreme), mean_pf



In [8]:
def make_filtered_CAM_CAN(in_dir, filt_dir, Z):
    """Write a copy of each CamCAN metric file with the extreme subjects removed.

    The subjects to drop come from ``get_extreme_sids(in_dir, Z)``. Every
    ``CamCAN.<metric>.raw.csv.gz`` found in ``in_dir`` is re-read unchanged,
    stripped of the rows whose ``sid`` is flagged, and saved under the same
    file name in ``filt_dir`` (gzip-compressed, without the index).
    Metrics with no source file are skipped.
    """
    os.makedirs(filt_dir, exist_ok=True)
    outlier_sids = get_extreme_sids(in_dir, Z)

    for metric in get_metrics():
        file_name = f"CamCAN.{metric}.raw.csv.gz"
        src_csv = os.path.join(in_dir, file_name)
        if os.path.isfile(src_csv):
            raw = pd.read_csv(src_csv)
            keep_mask = ~raw["sid"].isin(outlier_sids)
            raw[keep_mask].to_csv(
                os.path.join(filt_dir, file_name),
                index=False,
                compression="gzip",
            )

In [9]:
# from joblib import Parallel, delayed
# vals= [2, 2.25, 2.5, 2.75, 3, 3.25, 3.5, 3.75, 4, 4.25]
# # run in parallel
# results = Parallel(n_jobs=-1)(
#     delayed(build_camcan_plots_outliers)(RAW_DIRECTORY, PLOTS_DIRECTORY, i)
#     for i in vals
# )

# for v, (nb_extreme, shape_value) in zip(vals, results):
#     print(f"Z={v}: nb_extreme={nb_extreme}, shape_value={shape_value}")

In [None]:
# Write the filtered CamCAN files, dropping subjects whose |z-score| exceeds 3.
make_filtered_CAM_CAN(
    in_dir=RAW_DIRECTORY,
    filt_dir=FILTERED_DIRECTORY,
    Z=3,
)

