## IMPORTS and UTILS

In [22]:
# NOTE(review): looks like a leftover smoke-test output — presumably safe to remove; confirm.
print("hi")

import os
import pandas as pd
import subprocess
import re
import numpy as np
import json
import csv

from joblib import Parallel, delayed



import matplotlib.pyplot as plt


import os

from scripts import combat_info
from scripts import combat_quick_apply
from scripts import combat_quick_QC
from robust_evaluation_tools.robust_utils import get_site, robust_text, rwp_text, get_camcan_file, get_diseases, get_metrics, add_nb_patients_and_diseased, get_disease, remove_covariates_effects
from robust_evaluation_tools.robust_harmonization import fit, apply, visualize_harmonization, QC, compare_with_compilation, create_presentation, compare_distances
from robust_evaluation_tools.synthectic_sites_generations import generate_sites
from robust_evaluation_tools.robust_outlier_detection import scatter_plot_with_colors, find_outliers, get_matching_indexes

# Root folder under which this notebook reads and writes its results.
MAINFOLDER = "RESULTS/SEUIL"
# Location of the synthetic train/test sites consumed by the analysis.
SYNTHETIC_SITES = MAINFOLDER + "/SYNTHETIC_SITES"
# Destination folder handed to the plotting step.
ANALYSIS_FOLDER = MAINFOLDER + "/ANALYSIS"

# Robust methods whose per-method compilation CSV is loaded for plotting.
robust_methods_for_analysis = [f"TOP{pct}" for pct in (5, 10, 20, 30, 40, 50)]

hi


## HARMONIZATION

In [None]:
def harmonize(
    f_train, ref_data_file, metric, harmonizartion_method, f_test, directory,
    robust, rwp, hc, disease, sample_size, disease_ratio, test_index,
):
    """Fit a harmonization model on a training site, apply it, and score it.

    The pipeline is: ``fit`` on ``f_train`` -> ``apply`` to ``f_test`` ->
    ``visualize_harmonization`` -> ``compare_with_compilation`` on the
    harmonized CSV. When ``robust`` is anything other than ``"No"``, a scatter
    plot of the training data (after ``remove_covariates_effects``) is also
    drawn, highlighting the indexes returned by ``get_matching_indexes``.

    Args:
        f_train: Path to the training CSV.
        ref_data_file: Path to the reference data file.
        metric: Metric name forwarded to ``fit`` and ``apply``.
        harmonizartion_method: Harmonization method forwarded to ``fit`` and
            ``apply`` (parameter name kept as-is for keyword callers).
        f_test: Path to the test CSV the fitted model is applied to.
        directory: Output folder; created if missing.
        robust: Robust method identifier, or ``"No"`` to disable it.
        rwp: Flag forwarded to ``fit``/``apply`` and used in file names.
        hc: Flag forwarded to ``fit``.
        disease, sample_size, disease_ratio, test_index: Only used to build
            the outlier file name and the scatter-plot name/title.

    Returns:
        The object returned by ``compare_with_compilation`` with an extra
        ``'site'`` entry set to ``get_site(f_train)``.
    """
    os.makedirs(directory, exist_ok=True)
    print(f_train)

    # Fit on the training site, then apply the resulting model to the test site.
    model_path = fit(f_train, ref_data_file, metric, harmonizartion_method, robust, rwp, directory, hc)
    harmonized_path = apply(f_test, model_path, metric, harmonizartion_method, robust, rwp, directory)

    visualize_harmonization(f_test, harmonized_path, ref_data_file, directory, bundles='')

    harmonized = pd.read_csv(harmonized_path)
    mae = compare_with_compilation(harmonized)
    mae['site'] = get_site(f_train)

    # Without a robust method there are no outliers to display.
    if robust == "No":
        return mae

    # Presumably this CSV is produced during the fit step — confirm.
    outliers_file = os.path.join(
        directory,
        f'outliers_{sample_size}_patients_{int(disease_ratio*100)}_percent_{test_index}_{robust_text(robust)}_{rwp_text(rwp)}.csv',
    )
    outliers_idx = get_matching_indexes(f_train, outliers_file)

    train_df = pd.read_csv(f_train)
    # Re-align the covariate-free frame on the original row order.
    residuals = remove_covariates_effects(train_df).loc[train_df.index]

    # NOTE(review): the ",_" in the plot name looks like a typo; kept unchanged
    # because existing outputs may rely on it.
    plot_name = f'Scatter_{robust_text(robust)}_{rwp_text(rwp)},_{disease}_{sample_size}_{int(disease_ratio*100)}_{test_index}'
    plot_title = f'Scatter for method {robust} disease: {disease}, with {sample_size} patients, {disease_ratio * 100} % of sick #{test_index}'
    scatter_plot_with_colors(
        residuals, outliers_idx, 'mean_no_cov',
        os.path.join(directory, 'SCATTER_PLOT'),
        plot_name, plot_title,
    )

    return mae

In [None]:
def analyse_site(f_train,f_test, robust, directory, ref_data_file, metric,harmonizartion_method, disease, sample_size, disease_ratio, test_index):
    """Run the four harmonization variants on one site and stack their MAE.

    The variants are run in this order, each in its own sub-folder of
    ``directory``: ``hc``, ``NoRobust``, ``robust`` and ``robust_rwp``.

    Returns:
        The concatenation of the four ``harmonize`` results (index reset) with
        a ``'method'`` column set to ``hc``, ``no_robust``, ``robust`` and
        ``robust_rwp`` respectively.
    """
    # (method label, output sub-folder, robust, rwp, hc)
    variants = (
        ('hc', "hc", "No", False, True),
        ('no_robust', "NoRobust", "No", False, False),
        ('robust', "robust", robust, False, False),
        ('robust_rwp', "robust_rwp", robust, True, False),
    )

    scores = []
    for _label, subfolder, robust_arg, rwp_arg, hc_arg in variants:
        scores.append(
            harmonize(
                f_train, ref_data_file, metric, harmonizartion_method, f_test,
                os.path.join(directory, subfolder), robust_arg, rwp_arg, hc_arg,
                disease, sample_size, disease_ratio, test_index,
            )
        )

    # create_presentation(directory, harmonizartion_method) is currently disabled.

    # Combine the MAE of every variant in a single DataFrame.
    mea_combined = pd.concat(scores, ignore_index=True)
    mea_combined['method'] = [label for label, *_ in variants]

    # TODO: bundles and outlier analysis
    return mea_combined

In [None]:

# Bundles removed from both datasets before harmonization.
_EXCLUDED_BUNDLES = ['left_ventricle', 'right_ventricle']
# Columns dropped from both datasets before they are handed to the harmonizer.
_DROPPED_COLUMNS = ['mean_no_cov', 'metric_bundle']


def _clean_and_save(raw_df, destination):
    """Drop the excluded bundles and columns from *raw_df* and write it to *destination*."""
    cleaned = raw_df[~raw_df['bundle'].isin(_EXCLUDED_BUNDLES)]
    cleaned = cleaned.drop(columns=_DROPPED_COLUMNS)
    cleaned.to_csv(destination, index=False)


# Function to analyze a single (sample_size, disease_ratio) combination
def process_analysis(disease, metric, sample_size, disease_ratio,harmonization_method, robust_method, SYNTHETIC_SITES_VERSION, num_tests):
    """Analyse every test repetition of one parameter combination.

    For each ``test_index`` in ``range(num_tests)`` the synthetic train/test
    CSVs are read from
    ``SYNTHETIC_SITES/<version>/<disease>/<size>_<pct>/<test_index>/``,
    cleaned, copied to
    ``MAINFOLDER/<robust_method>/<disease>/<metric>/<size>_<pct>/<test_index>/``
    and passed to ``analyse_site``. A scatter plot of the raw training data
    with the outliers found by ``find_outliers`` is also produced.

    Args:
        disease: Disease identifier (used in paths and in the output rows).
        metric: Metric name (used in file names and in the output rows).
        sample_size: Number of patients of the synthetic site.
        disease_ratio: Fraction of diseased patients; ``int(ratio * 100)`` is
            used in folder and file names.
        harmonization_method: Forwarded to ``analyse_site``.
        robust_method: Robust method identifier.
        SYNTHETIC_SITES_VERSION: Sub-folder of ``SYNTHETIC_SITES`` to read from.
        num_tests: Number of repetitions to process.

    Returns:
        Path of the ``mea_compilation.csv`` written for this combination.
    """
    ratio_pct = int(disease_ratio * 100)

    directory = os.path.join(MAINFOLDER, robust_method, disease, metric)
    directory_site = os.path.join(SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION, disease)

    sizeDir = os.path.join(directory, f"{sample_size}_{ratio_pct}")
    sizeDir_site = os.path.join(directory_site, f"{sample_size}_{ratio_pct}")

    # The compilation is written into sizeDir, so make sure it exists even
    # when the loop below does not run (num_tests == 0).
    os.makedirs(sizeDir, exist_ok=True)

    per_test_scores = []

    for test_index in range(num_tests):
        tempDir = os.path.join(sizeDir, f"{test_index}")
        tempDir_site = os.path.join(sizeDir_site, f"{test_index}")
        os.makedirs(tempDir, exist_ok=True)

        train_file_name = f"train_{sample_size}_{ratio_pct}_{test_index}_{metric}.csv"
        test_file_name = f"test_{sample_size}_{ratio_pct}_{test_index}_{metric}.csv"

        # Read the raw training data once: it is needed both for the cleaned
        # copy and, untouched, for outlier detection and plotting below.
        train_file = os.path.join(tempDir_site, train_file_name)
        raw_train_df = pd.read_csv(train_file)
        new_train_file = os.path.join(tempDir, train_file_name)
        _clean_and_save(raw_train_df, new_train_file)

        test_file = os.path.join(tempDir_site, test_file_name)
        new_test_file = os.path.join(tempDir, test_file_name)
        _clean_and_save(pd.read_csv(test_file), new_test_file)

        ref_data_file = get_camcan_file(metric)

        # Analyze the site
        mea_analyze = analyse_site(
            new_train_file, new_test_file, robust_method, tempDir, ref_data_file,
            metric, harmonization_method, disease, sample_size, disease_ratio, test_index,
        )

        # A copy is passed so the frame used for plotting is guaranteed to be
        # the data as read from disk, whatever find_outliers does to its input.
        outliers_idx = find_outliers(raw_train_df.copy(), robust_method)

        scatter_plot_with_colors(
            raw_train_df, outliers_idx, 'mean_no_cov', os.path.join(sizeDir, 'SCATTER_PLOT'),
            f'Scatter_{robust_method}_{disease}_{sample_size}_{int(disease_ratio*100)}_{test_index}',
            f'Scatter for method {robust_method} disease: {disease}, with {sample_size} patients, {disease_ratio * 100} % of sick #{test_index}'
        )

        mea_analyze['robust_method'] = robust_method
        mea_analyze['disease'] = disease
        mea_analyze['metric'] = metric
        per_test_scores.append(mea_analyze)

    # Concatenate once instead of growing a frame inside the loop.
    if per_test_scores:
        mea_compilation = pd.concat(per_test_scores, ignore_index=True)
    else:
        mea_compilation = pd.DataFrame()

    # Save results for this combination
    compilation_path = os.path.join(sizeDir, "mea_compilation.csv")
    mea_compilation.to_csv(compilation_path, index=False)

    return compilation_path

# Parallelized analysis method (excluding num_tests from parallelization)
def analyse_method(sample_sizes, disease_ratios, num_tests, robust_methods,diseases, metrics, harmonization_method, SYNTHETIC_SITES_VERSION):
    """Run ``process_analysis`` for every parameter combination and merge the results.

    One job is created per (robust_method, disease, sample_size,
    disease_ratio, metric) combination; the ``num_tests`` repetitions are
    handled inside each job. The per-combination CSVs returned by the jobs
    are concatenated and written to ``MAINFOLDER/mea_compilation.csv``.

    Returns:
        None.
    """
    run_one = delayed(process_analysis)

    # Same nesting order as the result rows: robust method outermost, metric innermost.
    jobs = [
        run_one(
            disease, metric, sample_size, disease_ratio,
            harmonization_method, robust_method, SYNTHETIC_SITES_VERSION, num_tests,
        )
        for robust_method in robust_methods
        for disease in diseases
        for sample_size in sample_sizes
        for disease_ratio in disease_ratios
        for metric in metrics
    ]

    # Each job returns the path of the CSV it wrote.
    result_files = Parallel(n_jobs=-1)(jobs)

    frames = [pd.read_csv(result_file) for result_file in result_files]
    compiled = pd.concat(frames, ignore_index=True)

    # Save final compiled results
    os.makedirs(MAINFOLDER, exist_ok=True)
    compiled.to_csv(os.path.join(MAINFOLDER, "mea_compilation.csv"), index=False)



In [None]:
def generate_sites_for_disease(disease, SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION, sample_sizes, disease_ratios, num_tests):
    """Generate the synthetic sites of one disease.

    Args:
        disease: Disease identifier; selects the input compilation file and
            the output sub-folder.
        SYNTHETIC_SITES: Root folder of the synthetic sites.
        SYNTHETIC_SITES_VERSION: Version sub-folder under ``SYNTHETIC_SITES``.
        sample_sizes: Forwarded to ``generate_sites``.
        disease_ratios: Forwarded to ``generate_sites``.
        num_tests: Forwarded to ``generate_sites``.

    Returns:
        None.
    """
    # Compilation file holding the data of this disease.
    data_path = os.path.join('DONNES', 'COMPILATIONS_AUG_3', f'{disease}_combination_all_metrics_CamCAN.csv.gz')

    # Sites are written under <SYNTHETIC_SITES>/<version>/<disease>.
    directory_site = os.path.join(SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION, disease)

    # NOTE(review): disease=None is passed although `disease` is available
    # here — confirm this is intentional.
    generate_sites(sample_sizes, disease_ratios, num_tests, directory_site, data_path, disease=None)

## EXECUTOR

In [None]:
# Harmonization method forwarded down to the fit/apply steps.
harmonization_method = "classic"

# Version sub-folder of SYNTHETIC_SITES the train/test sites are read from.
SYNTHETIC_SITES_VERSION = "v1"

metrics = get_metrics()
# diseases = get_diseases(True)
diseases = ["SYN_2"]
robust_methods = [f"TOP{pct}" for pct in (5, 10, 20, 30, 40, 50)] + ['CHEAT']

sample_sizes = [100]  # Sample sizes to evaluate
disease_ratios = [0.1, 0.3, 0.5]  # Proportions of diseased patients to evaluate
num_tests = 6  # Number of repetitions for each combination

# Site generation is disabled; uncomment to (re)generate the synthetic sites.
# Parallel(n_jobs=-1)(
#     delayed(generate_sites_for_disease)(disease, SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION, sample_sizes, disease_ratios, num_tests)
#     for disease in diseases
# )

In [None]:
# Run the full analysis with the configuration defined above.
analyse_method(
    sample_sizes=sample_sizes,
    disease_ratios=disease_ratios,
    num_tests=num_tests,
    robust_methods=robust_methods,
    diseases=diseases,
    metrics=metrics,
    harmonization_method=harmonization_method,
    SYNTHETIC_SITES_VERSION=SYNTHETIC_SITES_VERSION,
)

In [None]:
# Load the main compilation file.
mea_compilation = pd.read_csv(os.path.join(MAINFOLDER, "mea_compilation.csv"))

# Distinct robust methods present in the compilation.
methods = mea_compilation['robust_method'].unique()

# Write one CSV per robust method, each inside its own sub-folder of MAINFOLDER.
for method in methods:
    # Keep only the rows of the current method.
    method_df = mea_compilation.loc[mea_compilation['robust_method'].eq(method)]

    method_directory = os.path.join(MAINFOLDER, method)
    os.makedirs(method_directory, exist_ok=True)

    method_file_path = os.path.join(method_directory, f"{method}_mea_compilation.csv")
    method_df.to_csv(method_file_path, index=False)

In [None]:
def gather_all_method_files(main_folder, methods):
    """Load and stack the per-method MAE compilation CSVs.

    For every name in ``methods`` the file
    ``<main_folder>/<name>/<name>_mea_compilation.csv`` is read when it
    exists; missing files are skipped.

    Returns:
        The concatenation of the files found (index reset), or an empty
        DataFrame when none exists.
    """
    candidate_paths = (
        os.path.join(main_folder, name, f"{name}_mea_compilation.csv")
        for name in methods
    )
    frames = [pd.read_csv(csv_path) for csv_path in candidate_paths if os.path.exists(csv_path)]

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

## ANALYSIS

In [23]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed

def plot_mea(df, sample_size, disease, metric, directory):
    """Save one grouped MAE box plot per bundle column.

    Rows of ``df`` are restricted to the requested ``sample_size`` (column
    ``num_patients``), ``disease`` and ``metric``. Every remaining column that
    is not a known metadata column is treated as a bundle, and one PNG named
    ``<bundle>_boxplot.png`` is written under
    ``<directory>/MAE_PLOTS/<disease>/<metric>/<sample_size>/``.

    In each figure the boxes are grouped by ``disease_ratio``; each group
    holds one box per plotted series: ``hc``, ``no_robust`` and one
    ``robust_<robust_method>`` per robust method present in the filtered data.

    Args:
        df: DataFrame with the metadata columns listed below plus one column
            per bundle.
        sample_size: Value of ``num_patients`` to keep.
        disease: Value of ``disease`` to keep.
        metric: Value of ``metric`` to keep.
        directory: Root output folder.

    Returns:
        None.
    """
    directory = os.path.join(directory, "MAE_PLOTS", disease, metric, str(sample_size))
    os.makedirs(directory, exist_ok=True)

    df_filtered = df[
        (df['num_patients'] == sample_size)
        & (df['disease'] == disease)
        & (df['metric'] == metric)
    ]

    # Base methods + robust methods.
    robust_methods = df_filtered['robust_method'].dropna().unique()
    base_methods = ["hc", "no_robust"]
    robust_base_methods = ["robust"]
    # Palette per robust base method; "robust_rwp" is ready to be enabled by
    # adding it to robust_base_methods.
    robust_palettes = {"robust": "viridis", "robust_rwp": "magma"}

    # Each series is (legend label, value of 'method', value of 'robust_method'
    # or None). Keeping the parts separate avoids re-parsing the label, which
    # would break on robust method names that contain an underscore.
    method_colors = {"hc": "green", "no_robust": "red"}
    series = [(name, name, None) for name in base_methods]
    for base in robust_base_methods:
        palette = sns.color_palette(robust_palettes[base], len(robust_methods))
        for color, robust_name in zip(palette, robust_methods):
            label = f"{base}_{robust_name}"
            method_colors[label] = color
            series.append((label, base, robust_name))

    # Columns that describe a row rather than hold a bundle value.
    metadata_columns = {
        'site', 'method', 'num_patients', 'disease_ratio',
        'num_diseased', 'metric', 'disease', 'robust_method',
    }
    carried_columns = ['site', 'method', 'num_patients',
                       'disease_ratio', 'num_diseased', 'robust_method']

    # Total width allotted to one group of boxes, shared evenly by all series.
    group_width = 0.8
    box_width = group_width / len(series)

    # Loop over the bundles.
    for bundle_column in df_filtered.columns:
        if bundle_column in metadata_columns:
            continue

        bundle_df = df_filtered[[bundle_column] + carried_columns].copy()
        # Sorted so the groups always appear in increasing ratio order.
        unique_ratios = sorted(bundle_df['disease_ratio'].unique())

        fig, ax = plt.subplots(figsize=(14, 7))

        # One abscissa per ratio.
        x = np.arange(len(unique_ratios))

        for i_m, (label, base, robust_name) in enumerate(series):
            mask = bundle_df['method'] == base
            if robust_name is not None:
                mask = mask & (bundle_df['robust_method'] == robust_name)
            method_df = bundle_df[mask]

            # Values of this series for each ratio.
            data = [
                method_df.loc[method_df['disease_ratio'] == ratio, bundle_column].values
                for ratio in unique_ratios
            ]

            # Centre the boxes of a group around its x position.
            positions = x - group_width / 2 + (i_m + 0.5) * box_width

            # Only draw when at least one ratio has data for this series.
            if any(len(values) > 0 for values in data):
                color = method_colors.get(label, "black")
                ax.boxplot(
                    data,
                    positions=positions,
                    widths=box_width * 0.8,  # slightly narrower than the slot
                    patch_artist=True,
                    boxprops=dict(facecolor=color, color=color),
                    medianprops=dict(color='black')
                )

        ax.set_xlabel('Prct de patients malades')
        ax.set_ylabel('MAE')
        ax.set_title(
            f"MAE de l'harmonization selon le pourcentage de patients malades\n"
            f"Maladie: {disease}  |  Metric: {metric}  |  Bundle: {bundle_column}\n"
            f"Nb patient total: {sample_size}"
        )

        # boxplot() moves the ticks to the box positions; put them back on the
        # centre of each group.
        ax.set_xticks(x)
        ax.set_xticklabels(unique_ratios)

        # Manual legend: one entry per series, drawn or not.
        legend_handles = [
            plt.Line2D([0], [0], color=method_colors[label], lw=3, label=f'Method: {label}')
            for label, _base, _robust_name in series
        ]
        ax.legend(handles=legend_handles, loc="upper left", bbox_to_anchor=(1, 1))

        # Act on this figure explicitly rather than on the implicit current one.
        fig.tight_layout()
        fig.savefig(os.path.join(directory, f'{bundle_column}_boxplot.png'), bbox_inches="tight")
        plt.close(fig)


# Load the per-method compilations.
mea_df = gather_all_method_files(MAINFOLDER, robust_methods_for_analysis)
# Return value is unused; presumably adds the num_patients / num_diseased
# columns to mea_df in place — confirm.
add_nb_patients_and_diseased(mea_df)

# One plotting task per (disease, sample size, metric) combination.
tasks = [
    (mea_df, size, disease_name, metric_name, ANALYSIS_FOLDER)
    for disease_name in diseases
    for size in sample_sizes
    for metric_name in metrics
]

# Run all tasks in parallel
Parallel(n_jobs=-1)([delayed(plot_mea)(*task) for task in tasks])

69722.89s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
69723.06s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
69723.23s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
69723.39s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
69723.56s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
69723.73s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
69723.90s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
69724.06s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
69724.23s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
69724.40s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
69724.57s - pydevd: Sending message rela

[None, None, None, None, None, None, None, None, None, None]