## IMPORTS and UTILS

In [None]:
import os
import pandas as pd
import numpy as np
from scipy.stats import entropy
from math import sqrt
import seaborn as sns

import matplotlib.pyplot as plt

from joblib import Parallel, delayed


from robust_evaluation_tools.robust_utils import get_complete_combination, get_metrics, get_diseases
from robust_evaluation_tools.robust_outlier_detection import find_outliers, analyze_detection_performance
from robust_evaluation_tools.synthectic_sites_generations import generate_sites
from robust_evaluation_tools.robust_analysis import calculate_precision_by_bundle
from clinical_combat.harmonization.QuickCombat import QuickCombat

# Metric and disease identifiers supplied by the project helpers.
all_metrics = get_metrics()
# NOTE(review): the meaning of the positional False flag is defined in robust_utils -- confirm.
all_diseases = get_diseases(False)

# Root folder under which the cells below write their outputs.
MAINFOLDER = "RESULTS/DISTRIBUTION_ANALYSIS"
SYNTHETIC_SITES = f"{MAINFOLDER}/SYNTHETIC_SITES"

ANALYSIS_FOLDER = "ANALYSIS"

# Folder holding the per-disease '<disease>_combination_all_metrics_CamCAN.csv.gz' files read below.
raw_directory = os.path.join('DONNES_F', 'COMPILATIONS')

In [None]:
# Helper computing the Kullback-Leibler (KL) divergence
def kl_divergence(p, q):
    """Return the KL divergence of ``p`` relative to ``q``.

    Thin wrapper around ``scipy.stats.entropy`` called with both a ``pk``
    and a ``qk`` argument; see the SciPy documentation for how the inputs
    are normalised.
    """
    divergence = entropy(p, qk=q)
    return divergence

## EXECUTOR

In [None]:
# # Displays scatter plots instead of distributions

# # Assuming 'HC' is a specific value in the 'disease' column that indicates healthy controls
# HC_LABEL = 'HC'

# # Plot distribution for each bundle
# for disease in all_diseases:
#     combination = pd.read_csv(os.path.join(raw_directory, f'{disease}_combination_all_metrics_CamCAN.csv.gz'))
#     metric_bundles = combination['metric_bundle'].unique()
#     for metric_bundle in metric_bundles:
#         subset_all = combination[combination['metric_bundle'] == metric_bundle]
#         subset = subset_all[subset_all['old_site'] != 'CamCAN']
#         bundle = subset_all.bundle.unique()[0]
#         metric = subset_all.metric.unique()[0]
#         camCan_subset = subset_all[subset_all['old_site'] == 'CamCAN']
        
#         # Scatter plot with mean
#         plt.figure(figsize=(10, 6))
#         plt.scatter(subset[subset['disease'] == HC_LABEL]['age'], subset[subset['disease'] == HC_LABEL]['mean'], color='blue', label='HC', alpha=0.5)
#         plt.scatter(camCan_subset['age'], camCan_subset['mean'], color='green', label='CamCAN', alpha=0.5)
#         plt.scatter(subset[subset['disease'] != HC_LABEL]['age'], subset[subset['disease'] != HC_LABEL]['mean'], color='red', label=f'{disease}', alpha=0.5)
        
#         plt.title(f'Distribution of {disease} in metric {metric} for bundle: {bundle} (mean)')
#         plt.xlabel('Age')
#         plt.ylabel('Mean')
#         plt.legend()
#         plt.show()
        
#         # Scatter plot with mean_no_cov
#         plt.figure(figsize=(10, 6))
#         plt.scatter(subset[subset['disease'] == HC_LABEL]['age'], subset[subset['disease'] == HC_LABEL]['mean_no_cov'], color='blue', label='HC', alpha=0.5)
#         plt.scatter(camCan_subset['age'], camCan_subset['mean_no_cov'], color='green', label='CamCAN', alpha=0.5)
#         plt.scatter(subset[subset['disease'] != HC_LABEL]['age'], subset[subset['disease'] != HC_LABEL]['mean_no_cov'], color='red', label=f'{disease}', alpha=0.5)
        
#         plt.title(f'Distribution of {disease} in metric {metric} for bundle: {bundle} (mean_no_cov)')
#         plt.xlabel('Age')
#         plt.ylabel('Mean_no_cov')
#         plt.legend()
#         plt.show()

In [None]:
# Displays scatter plots instead of distributions
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed

# Value of the 'disease' column used to select healthy controls.
HC_LABEL = 'HC'
# Root output folder for the scatter plots saved by this cell (created up front).
SCATTER_FOLDER = os.path.join(MAINFOLDER, 'COMPILATION_SCATTER_PLOTS')
os.makedirs(SCATTER_FOLDER, exist_ok=True)


def process_disease(disease, raw_directory, out_root):
    """Save age-vs-value scatter plots for every metric/bundle of one disease.

    Reads ``<raw_directory>/<disease>_combination_all_metrics_CamCAN.csv.gz``,
    discards the rows whose ``old_site`` is ``'CamCAN'`` and, for each
    ``metric_bundle`` group, writes up to two PNG files under
    ``<out_root>/<disease>/<metric>/``:

    * ``<metric_bundle>_mean.png`` -- ``mean`` against ``age``;
    * ``<metric_bundle>_mean_no_cov.png`` -- ``mean_no_cov`` against ``age``
      with a marginal density panel sharing the y axis.

    Rows whose ``disease`` equals ``HC_LABEL`` are drawn in blue, all other
    rows in red. Groups (or columns) with nothing left to plot are skipped.

    Args:
        disease: Disease identifier; used in the input file name, the plot
            title/legend and the output sub-folder.
        raw_directory: Folder containing the compressed CSV compilations.
        out_root: Root folder for the generated figures.

    Returns:
        None. The function is called for its side effect of writing figures.
    """
    # (column to plot, y-axis label, draw the marginal density panel?)
    # Defined once: it does not depend on the group being processed.
    plot_specs = (
        ('mean', 'Mean', False),
        ('mean_no_cov', 'Mean (no covariates)', True),
    )

    combination = pd.read_csv(
        os.path.join(raw_directory, f'{disease}_combination_all_metrics_CamCAN.csv.gz')
    )

    for metric_bundle, subset_all in combination.groupby('metric_bundle'):
        subset = subset_all[subset_all['old_site'] != 'CamCAN']
        if subset.empty:
            continue

        metric = subset_all.metric.iloc[0]
        out_dir = os.path.join(out_root, disease, metric)

        for value_col, ylabel, with_marginals in plot_specs:
            subset_plot = subset.dropna(subset=['age', value_col])
            if subset_plot.empty:
                continue

            # Split the rows once and reuse the two frames for both panels.
            # Controls come first so that patients are drawn on top of them.
            is_hc = subset_plot['disease'] == HC_LABEL
            groups = (
                (subset_plot[is_hc], 'blue', 'HC'),
                (subset_plot[~is_hc], 'red', disease),
            )

            if with_marginals:
                fig = plt.figure(figsize=(12, 6))
                gs = fig.add_gridspec(1, 2, width_ratios=[4, 1], wspace=0.05)
                ax = fig.add_subplot(gs[0])
                ax_marginal = fig.add_subplot(gs[1], sharey=ax)
            else:
                fig, ax = plt.subplots(figsize=(10, 6))
                ax_marginal = None

            for rows, colour, label in groups:
                ax.scatter(rows['age'], rows[value_col],
                           color=colour, alpha=0.5, label=label)
            ax.set_title(f'{disease} - {metric_bundle} ({ylabel})')
            ax.set_xlabel('Age')
            ax.set_ylabel(ylabel)
            ax.legend()
            ax.grid(True, alpha=0.2)

            if ax_marginal is not None:
                for rows, colour, _ in groups:
                    sns.kdeplot(
                        y=rows[value_col],
                        ax=ax_marginal,
                        fill=True,
                        bw_adjust=0.8,
                        linewidth=0,
                        color=colour,
                        alpha=0.35,
                    )
                ax_marginal.set_xlabel('Density')
                ax_marginal.grid(False)
                # The y tick labels are already shown on the scatter panel.
                ax_marginal.tick_params(labelleft=False)
                ax_marginal.set_xlim(left=0)
                ax_marginal.spines['top'].set_visible(False)
                ax_marginal.spines['right'].set_visible(False)

            # Only create the folder when there is actually a figure to save.
            os.makedirs(out_dir, exist_ok=True)
            out_path = os.path.join(out_dir, f'{metric_bundle}_{value_col}.png')
            fig.tight_layout()
            fig.savefig(out_path)
            plt.close(fig)


# Dispatch one process_disease call per entry of all_diseases through joblib
# (n_jobs=-1); inside each call the metric bundles are handled sequentially.
Parallel(n_jobs=-1)(
    delayed(process_disease)(d, raw_directory, SCATTER_FOLDER)
    for d in all_diseases
)


In [None]:

# Assuming 'HC' is a specific value in the 'disease' column that indicates healthy controls
HC_LABEL = 'HC'
# Root output folder for the density plots saved by this cell (created up front).
DISTRIBUTION_FOLDER = os.path.join(MAINFOLDER, 'COMPILATION_DISTRIBUTION')
os.makedirs(DISTRIBUTION_FOLDER, exist_ok=True)

def process_disease(disease, raw_directory, DISTRIBUTION_FOLDER):
    """Save HC-vs-patient density plots of ``mean_no_cov`` for one disease.

    Reads ``<raw_directory>/<disease>_combination_all_metrics_CamCAN.csv.gz``
    and, for each ``metric_bundle`` group, overlays two kernel density
    estimates of the ``mean_no_cov`` column computed on the rows whose
    ``old_site`` is not ``'CamCAN'``: rows with ``disease == HC_LABEL`` in
    blue and every other row in red. Each figure is written to
    ``<DISTRIBUTION_FOLDER>/<disease>/<metric>/<metric_bundle>.png``.

    Groups that contain no non-CamCAN row are skipped instead of producing
    an empty figure.

    Args:
        disease: Disease identifier; used in the input file name, the plot
            title/legend and the output sub-folder.
        raw_directory: Folder containing the compressed CSV compilations.
        DISTRIBUTION_FOLDER: Root folder for the generated figures.

    Returns:
        None. The function is called for its side effect of writing figures.
    """
    print(f"Processing {disease}...")  # progress trace
    combination = pd.read_csv(
        os.path.join(raw_directory, f'{disease}_combination_all_metrics_CamCAN.csv.gz')
    )

    # groupby splits the table in one pass and ignores rows without a
    # metric_bundle, which cannot be attributed to any figure.
    for metric_bundle, subset_all in combination.groupby('metric_bundle'):
        # CamCAN rows are deliberately left out of these plots.
        subset = subset_all[subset_all['old_site'] != 'CamCAN']
        if subset.empty:
            continue

        bundle = subset_all.bundle.iloc[0]
        metric = subset_all.metric.iloc[0]
        is_hc = subset['disease'] == HC_LABEL

        # Draw on an explicit figure/axes pair rather than pyplot's implicit
        # "current figure" state.
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.kdeplot(subset.loc[is_hc, 'mean_no_cov'],
                    color='blue', label='HC', fill=True, ax=ax)
        sns.kdeplot(subset.loc[~is_hc, 'mean_no_cov'],
                    color='red', label=f'{disease}', fill=True, ax=ax)
        ax.set_title(f'Distribution of {disease} in metric {metric} for bundle: {bundle}')
        ax.set_xlabel('Residual mean (covariate-corrected)')
        ax.set_ylabel('Frequency')
        ax.legend()

        output_dir = os.path.join(DISTRIBUTION_FOLDER, disease, metric)
        os.makedirs(output_dir, exist_ok=True)

        # Save the plot
        output_path = os.path.join(output_dir, f'{metric_bundle}.png')
        fig.savefig(output_path)
        plt.close(fig)

# Run diseases in parallel, but keep metric bundles sequential:
# one joblib task per entry of all_diseases (n_jobs=-1), each task looping
# over its own metric bundles inside process_disease.
Parallel(n_jobs=-1)(
    delayed(process_disease)(disease, raw_directory, DISTRIBUTION_FOLDER)
    for disease in all_diseases
)

In [None]:

# Value of the 'disease' column assumed to identify healthy controls
HC_LABEL = 'HC'


def process_disease(disease, raw_directory):
    """Compute HC-vs-patient distance metrics for each metric/bundle of a disease.

    Reads ``<raw_directory>/<disease>_combination_all_metrics_CamCAN.csv.gz``
    and compares, per ``metric_bundle``, the ``mean_no_cov`` values of two
    groups:

    * controls: rows with ``disease == HC_LABEL`` and ``old_site == 'CamCAN'``;
    * patients: every row with ``disease != HC_LABEL``.

    A metric bundle is skipped unless both groups hold at least two
    non-missing values.

    Args:
        disease: Disease identifier; used in the input file name and copied
            into every result row.
        raw_directory: Folder containing the compressed CSV compilations.

    Returns:
        A list with one dict per retained metric bundle holding the keys
        ``disease``, ``metric_bundle``, ``bundle``, ``metric``,
        ``bhattacharyya_distance``, ``kl_divergence``, ``euclidean_distance``,
        ``mahalanobis_distance``, ``prct_distance`` and ``d_cohen``.
    """
    print(f"Processing {disease}...")  # Debugging info

    combination = pd.read_csv(os.path.join(raw_directory, f'{disease}_combination_all_metrics_CamCAN.csv.gz'))
    results = []

    for metric_bundle in combination['metric_bundle'].unique():
        subset_all = combination[combination['metric_bundle'] == metric_bundle]
        bundle = subset_all.bundle.unique()[0]
        metric = subset_all.metric.unique()[0]

        is_camcan_control = (subset_all['disease'] == HC_LABEL) & (subset_all['old_site'] == 'CamCAN')
        is_patient = subset_all['disease'] != HC_LABEL
        controls = subset_all[is_camcan_control]['mean_no_cov'].dropna()
        patients = subset_all[is_patient]['mean_no_cov'].dropna()

        n_hc = len(controls)
        n_non_hc = len(patients)
        # Both groups need more than one value for the statistics below.
        if n_hc <= 1 or n_non_hc <= 1:
            continue

        # Histograms on a shared grid: the 50 bin edges come from the controls
        # and are reused for the patients.
        # NOTE(review): patient values outside the control range fall outside
        # these edges -- confirm this is intended for the KL estimate.
        hc_hist, bins = np.histogram(controls, bins=50, density=True)
        non_hc_hist, _ = np.histogram(patients, bins=bins, density=True)

        mean_hc = np.mean(controls)
        mean_non_hc = np.mean(patients)
        # np.std is called with its default ddof while the variances below
        # use ddof=1; both are kept exactly as they were.
        std_hc = np.std(controls)
        std_non_hc = np.std(patients)
        var_hc = np.var(controls, ddof=1)
        var_non_hc = np.var(patients, ddof=1)

        pooled_variance = ((n_hc - 1) * var_hc + (n_non_hc - 1) * var_non_hc) / (n_hc + n_non_hc - 2)
        pooled_std = abs(np.sqrt(pooled_variance))

        # Absolute difference of the two group means, shared by several metrics.
        mean_gap = abs(mean_hc - mean_non_hc)

        results.append({
            'disease': disease,
            'metric_bundle': metric_bundle,
            'bundle': bundle,
            'metric': metric,
            'bhattacharyya_distance': QuickCombat.bhattacharyya_distance(controls, patients),
            # A small epsilon keeps empty bins from producing a division by zero.
            'kl_divergence': kl_divergence(hc_hist + 1e-10, non_hc_hist + 1e-10),
            'euclidean_distance': mean_gap,
            'mahalanobis_distance': mean_gap / sqrt((std_hc**2 + std_non_hc**2) / 2),
            'prct_distance': abs(mean_gap / mean_hc),
            'd_cohen': mean_gap / pooled_std
        })

    return results


# One joblib task per disease; each task returns a list of result rows.
all_results = Parallel(n_jobs=-1)(
    delayed(process_disease)(disease, raw_directory)
    for disease in all_diseases
)

# Concatenate the per-disease lists into a single flat list of rows.
results = []
for disease_rows in all_results:
    results.extend(disease_rows)

# Tabulate the rows and write them to <MAINFOLDER>/distance_metrics_results.csv.
results_df = pd.DataFrame(results)

output_dir = MAINFOLDER
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'distance_metrics_results.csv')
results_df.to_csv(output_path, index=False)

print(f"Results saved to {output_path}")


In [None]:
# For each disease, find the metric_bundle that maximises each distance measure.
max_results = []

output_path = f'{MAINFOLDER}/distance_metrics_results.csv'
results_df = pd.read_csv(output_path)

# (suffix of the 'metric_bundle_max_*' key, name of the distance column)
distance_columns = (
    ('bhatt', 'bhattacharyya_distance'),
    ('kl', 'kl_divergence'),
    ('euclidean', 'euclidean_distance'),
    ('mahalanobis', 'mahalanobis_distance'),
)

for disease in results_df['disease'].unique():
    disease_subset = results_df[results_df['disease'] == disease]

    # Keys are inserted in the order: disease, then (bundle, value) per measure.
    summary = {'disease': disease}
    for short_name, distance_col in distance_columns:
        best_row = disease_subset.loc[disease_subset[distance_col].idxmax()]
        summary[f'metric_bundle_max_{short_name}'] = best_row['metric_bundle']
        summary[distance_col] = best_row[distance_col]
    max_results.append(summary)

# One row per disease.
max_results_df = pd.DataFrame(max_results)

print(max_results_df)

In [None]:
# Remove the rows of the two ventricle bundles from the results table.
is_ventricle = results_df['bundle'].isin(['left_ventricle', 'right_ventricle'])
results_df = results_df[~is_ventricle]


In [None]:
# Histogram of the selected distance column: one figure per (disease, metric).
column = 'prct_distance'

output_dir = f'{MAINFOLDER}/PRCT_HISTOGRAMS/{column}/PER_METRIC'
os.makedirs(output_dir, exist_ok=True)

for disease in results_df['disease'].unique():
    disease_subset = results_df[results_df['disease'] == disease]

    # Each disease gets its own sub-folder, created once per disease.
    disease_dir = os.path.join(output_dir, disease)
    os.makedirs(disease_dir, exist_ok=True)

    for metric in disease_subset['metric'].unique():
        metric_subset = disease_subset[disease_subset['metric'] == metric]

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.hist(metric_subset[column], bins=30, alpha=0.7, color='blue', edgecolor='black')
        ax.set_title(f'Distribution of {column} for {disease} - {metric}')
        ax.set_xlabel(f'{column}')
        ax.set_ylabel('Frequency')
        ax.grid(True)

        # Save the figure and release it.
        output_path = os.path.join(disease_dir, f'{disease}_{metric}_{column}_histogram.png')
        fig.savefig(output_path)
        plt.close(fig)

In [None]:
# Histogram of `column`: one figure per disease, all metrics pooled.
output_dir = f'{MAINFOLDER}/PRCT_HISTOGRAMS/{column}/PER_DISEASE'
os.makedirs(output_dir, exist_ok=True)

for disease in results_df['disease'].unique():
    disease_subset = results_df[results_df['disease'] == disease]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(disease_subset[column], bins=30, alpha=0.7, color='blue', edgecolor='black')
    ax.set_title(f'Distribution of {column} for {disease} -')
    ax.set_xlabel(f'{column}')
    ax.set_ylabel('Frequency')
    ax.grid(True)

    # Save the figure and release it.
    output_path = os.path.join(output_dir, f'{disease}__{column}_histogram.png')
    fig.savefig(output_path)
    plt.close(fig)

In [None]:
# Overlaid histograms of `column`: one figure per metric, one series per disease.
output_dir = f'{MAINFOLDER}/PRCT_HISTOGRAMS/{column}/PER_METRIC_ALL'
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): this rebinds the notebook-level name `all_metrics` (and, inside
# the loop, `all_diseases`) to the values found in results_df.
all_metrics = results_df['metric'].unique()

for metric in all_metrics:
    # Rows of the current metric and the diseases present among them.
    metric_subset = results_df[results_df['metric'] == metric]
    all_diseases = metric_subset['disease'].unique()

    fig, ax = plt.subplots(figsize=(10, 6))

    # One semi-transparent histogram per disease on the same axes.
    for disease in all_diseases:
        disease_subset = metric_subset[metric_subset['disease'] == disease]
        ax.hist(
            disease_subset[column],
            bins=30,
            alpha=0.5,
            label=disease,
            edgecolor='black',
        )

    # Title, axis labels, legend and grid.
    ax.set_title(f'Distribution de prct_distance pour le metric : {metric}')
    ax.set_xlabel(f'{column}')
    ax.set_ylabel('Fréquence')
    ax.legend()
    ax.grid(True)

    # Save the figure and release it.
    output_path = os.path.join(output_dir, f'{metric}_all_diseases_{column}_histogram.png')
    fig.savefig(output_path)
    plt.close(fig)

In [None]:
# Single figure overlaying the histogram of `column` for every disease,
# with all metrics pooled together.
output_dir = f'{MAINFOLDER}/PRCT_HISTOGRAMS/{column}'
os.makedirs(output_dir, exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))

# Diseases present in the results table. Kept as a notebook-level name
# because the cell that overlays the SYN diseases iterates over it.
all_diseases = results_df['disease'].unique()

# One semi-transparent histogram per disease on the same axes.
for disease in all_diseases:
    disease_subset = results_df[results_df['disease'] == disease]
    ax.hist(
        disease_subset[column],
        bins=30,
        alpha=0.5,
        label=disease,
        edgecolor='black',
    )

# The figure pools every metric, so the title must not name a single one:
# a `metric` variable would only hold whatever an earlier cell's loop left
# behind (or be undefined if that cell was not run).
ax.set_title(f'Distribution de {column} (tous metrics confondus)')
# Label the axis with the column actually plotted rather than a fixed name.
ax.set_xlabel(f'{column}')
ax.set_ylabel('Fréquence')
ax.legend()
ax.grid(True)

# Save the figure and release it.
output_path = os.path.join(output_dir, f'all_diseases_{column}_histogram.png')
fig.savefig(output_path)
plt.close(fig)

In [None]:
# Hard-coded labels of the synthetic ('SYN') diseases drawn behind each histogram.
syn_diseases = ["SYN_0.5", "SYN_1",]
output_dir = f'{MAINFOLDER}/PRCT_HISTOGRAMS/{column}/PER_METRIC_WITH_SYN'
os.makedirs(output_dir, exist_ok=True)

# Rows of each synthetic disease, selected once and reused for every figure.
syn_subsets = {dis: results_df[results_df['disease'] == dis] for dis in syn_diseases}

for disease in all_diseases:
    disease_subset = results_df[results_df['disease'] == disease]

    for metric in disease_subset['metric'].unique():
        metric_subset = disease_subset[disease_subset['metric'] == metric]

        fig, ax = plt.subplots(figsize=(10, 6))

        # Faint reference histograms of the synthetic diseases, drawn first.
        for dis, syn_sub in syn_subsets.items():
            syn_met = syn_sub[syn_sub['metric'] == metric]
            ax.hist(
                syn_met[column],
                bins=30,
                alpha=0.2,
                label=dis,
                edgecolor='black',
            )

        # Histogram of the current disease on top.
        ax.hist(metric_subset[column], bins=30, alpha=0.7, color='blue', edgecolor='black', label=disease)
        ax.set_title(f'Distribution of {column} for {disease} - {metric}')
        ax.set_xlabel(f'{column}')
        ax.set_ylabel('Frequency')
        ax.legend()
        ax.grid(True)

        # One sub-folder per disease.
        disease_dir = os.path.join(output_dir, disease)
        os.makedirs(disease_dir, exist_ok=True)

        # Save the figure and release it.
        output_path = os.path.join(disease_dir, f'{disease}_{metric}_{column}_histogram.png')
        fig.savefig(output_path)
        plt.close(fig)

In [None]:
# output_path = 'RESULTS/DISTRIBUTION_RESULTS/distance_metrics_results.csv'
# distance_metrics_results = pd.read_csv(output_path)

# average_results = distance_metrics_results.groupby(['disease', 'metric']).mean(numeric_only=True).reset_index()
# average_results
# highest_lowest_metrics = average_results.groupby('disease').agg(
#     highest_bhattacharyya=('bhattacharyya_distance', lambda x: average_results.loc[x.idxmax(), 'metric']),
#     lowest_bhattacharyya=('bhattacharyya_distance', lambda x: average_results.loc[x.idxmin(), 'metric']),
#     highest_kl=('kl_divergence', lambda x: average_results.loc[x.idxmax(), 'metric']),
#     lowest_kl=('kl_divergence', lambda x: average_results.loc[x.idxmin(), 'metric']),
#     highest_euclidean=('euclidean_distance', lambda x: average_results.loc[x.idxmax(), 'metric']),
#     lowest_euclidean=('euclidean_distance', lambda x: average_results.loc[x.idxmin(), 'metric']),
#     highest_mahalanobis=('mahalanobis_distance', lambda x: average_results.loc[x.idxmax(), 'metric']),
#     lowest_mahalanobis=('mahalanobis_distance', lambda x: average_results.loc[x.idxmin(), 'metric'])
# ).reset_index()

# highest_lowest_metrics

In [None]:
# output_path = 'RESULTS/DISTRIBUTION_RESULTS/distance_metrics_results.csv'
# distance_metrics_results = pd.read_csv(output_path)

# # Find the bundle/metric with the highest and lowest value for each distance metric for each disease
# highest_lowest_values = distance_metrics_results.groupby('disease').agg(
#     highest_bhattacharyya=('bhattacharyya_distance', lambda x: distance_metrics_results.loc[x.idxmax(), 'metric_bundle']),
#     lowest_bhattacharyya=('bhattacharyya_distance', lambda x: distance_metrics_results.loc[x.idxmin(), 'metric_bundle']),
#     highest_kl=('kl_divergence', lambda x: distance_metrics_results.loc[x.idxmax(), 'metric_bundle']),
#     lowest_kl=('kl_divergence', lambda x: distance_metrics_results.loc[x.idxmin(), 'metric_bundle']),
#     highest_euclidean=('euclidean_distance', lambda x: distance_metrics_results.loc[x.idxmax(), 'metric_bundle']),
#     lowest_euclidean=('euclidean_distance', lambda x: distance_metrics_results.loc[x.idxmin(), 'metric_bundle']),
#     highest_mahalanobis=('mahalanobis_distance', lambda x: distance_metrics_results.loc[x.idxmax(), 'metric_bundle']),
#     lowest_mahalanobis=('mahalanobis_distance', lambda x: distance_metrics_results.loc[x.idxmin(), 'metric_bundle'])
# ).reset_index()

# highest_lowest_values

In [None]:
# Reload the saved metrics table; this replaces whatever results_df held in
# memory, including any rows filtered out by earlier cells.
output_path = f'{MAINFOLDER}/distance_metrics_results.csv'
results_df = pd.read_csv(output_path)

# Rows collected for the final table.
top_3_smallest_d_cohen = []

for disease in results_df['disease'].unique():
    disease_subset = results_df[results_df['disease'] == disease]

    for metric in disease_subset['metric'].unique():
        metric_subset = disease_subset[disease_subset['metric'] == metric]

        # The three rows with the smallest d_cohen for this disease/metric.
        smallest_d_cohen = metric_subset.nsmallest(3, 'd_cohen')

        # Note: the 'bundle' key stores the metric_bundle identifier.
        top_3_smallest_d_cohen.extend(
            {
                'disease': disease,
                'metric': metric,
                'bundle': metric_bundle,
                'd_cohen': d_cohen,
            }
            for metric_bundle, d_cohen in zip(
                smallest_d_cohen['metric_bundle'], smallest_d_cohen['d_cohen']
            )
        )

# Tabulate and display.
top_3_smallest_d_cohen_df = pd.DataFrame(top_3_smallest_d_cohen)

print(top_3_smallest_d_cohen_df)

In [None]:
# Reload the saved metrics table from disk.
output_path = f'{MAINFOLDER}/distance_metrics_results.csv'
results_df = pd.read_csv(output_path)

# Rows collected for the final table.
top_3_worst_d_cohen = []

for disease in results_df['disease'].unique():
    disease_subset = results_df[results_df['disease'] == disease]

    for metric in disease_subset['metric'].unique():
        metric_subset = disease_subset[disease_subset['metric'] == metric]

        # The three rows with the largest d_cohen for this disease/metric.
        worst_d_cohen = metric_subset.nlargest(3, 'd_cohen')

        # Note: the 'bundle' key stores the metric_bundle identifier.
        for record in worst_d_cohen.itertuples(index=False):
            top_3_worst_d_cohen.append({
                'disease': disease,
                'metric': metric,
                'bundle': record.metric_bundle,
                'd_cohen': record.d_cohen,
            })

# Tabulate and display.
top_3_worst_d_cohen_df = pd.DataFrame(top_3_worst_d_cohen)

print(top_3_worst_d_cohen_df)

In [None]:
# 1. Drop the diseases whose name starts with 'SYN'.
is_synthetic = results_df['disease'].str.startswith('SYN')
filtered_df = results_df[~is_synthetic]

# 2. Mean d_cohen for each (metric, bundle) pair.
d_cohen_by_metric_bundle = filtered_df.groupby(['metric', 'bundle'])['d_cohen']
mean_dcohen = d_cohen_by_metric_bundle.mean().reset_index()

# 3. For each metric, keep the three bundles with the smallest mean d_cohen.
ranked_dcohen = mean_dcohen.sort_values(['metric', 'd_cohen'])
top3_per_metric = ranked_dcohen.groupby('metric').head(3).reset_index(drop=True)

# Display
print(top3_per_metric)


In [None]:
# 1. Again drop the diseases whose name starts with 'SYN'.
is_synthetic = results_df['disease'].str.startswith('SYN')
filtered_df = results_df[~is_synthetic]

# 2. Mean d_cohen for each (metric, bundle) pair.
d_cohen_by_metric_bundle = filtered_df.groupby(['metric', 'bundle'])['d_cohen']
mean_dcohen_metric_bundle = d_cohen_by_metric_bundle.mean().reset_index()

# 3. Mean of those per-metric means for each bundle (all metrics pooled).
d_cohen_by_bundle = mean_dcohen_metric_bundle.groupby('bundle')['d_cohen']
mean_dcohen_per_bundle = d_cohen_by_bundle.mean().reset_index()

# 4. The three bundles with the smallest overall mean.
top3_bundles_overall = mean_dcohen_per_bundle.nsmallest(3, 'd_cohen')

# Display
print(top3_bundles_overall)


In [None]:
# 1. Names of the three bundles selected in top3_bundles_overall.
top_bundles = top3_bundles_overall['bundle'].tolist()

# 2. Rows belonging to those bundles, excluding the 'SYN' diseases.
in_top_bundles = results_df['bundle'].isin(top_bundles)
is_synthetic = results_df['disease'].str.startswith('SYN')
worst_cases = results_df[in_top_bundles & ~is_synthetic]

# 3. For each bundle, the row with the largest d_cohen.
worst_row_labels = worst_cases.groupby('bundle')['d_cohen'].idxmax()
worst_per_bundle = worst_cases.loc[
    worst_row_labels, ['bundle', 'disease', 'metric', 'd_cohen']
].reset_index(drop=True)

# Display
print(worst_per_bundle)
