## IMPORTS and UTILS

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from joblib import Parallel, delayed


from robust_evaluation_tools.robust_utils import get_complete_combination, get_diseases, add_nb_patients_and_diseased
from robust_evaluation_tools.robust_outlier_detection import find_outliers, analyze_detection_performance, scatter_plot_with_colors
from robust_evaluation_tools.synthectic_sites_generations import generate_sites
from robust_evaluation_tools.robust_analysis import calculate_precision_by_bundle

# Root folder that receives every output of this precision test.
MAINFOLDER = "RESULTS/PRECISION_TEST"
# Sub-folder holding the generated synthetic sites.
SYNTHETIC_SITES = MAINFOLDER + "/SYNTHETIC_SITES"

# Sub-folder reserved for analysis outputs.
ANALYSIS_FOLDER = MAINFOLDER + "/ANALYSIS"

In [None]:
def generate_sites_for_disease(disease, SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION, sample_sizes, disease_ratios, num_tests):
    """Generate the synthetic sites of a single disease.

    Builds the path of the disease's compiled metrics file and the output
    directory ``<SYNTHETIC_SITES>/<SYNTHETIC_SITES_VERSION>/<disease>``, then
    delegates the actual generation to ``generate_sites``.

    Args:
        disease: Disease identifier; used in the input file name and as the
            last component of the output directory.
        SYNTHETIC_SITES: Root folder of the synthetic sites.
        SYNTHETIC_SITES_VERSION: Sub-folder identifying the generation version.
        sample_sizes: Forwarded unchanged to ``generate_sites``.
        disease_ratios: Forwarded unchanged to ``generate_sites``.
        num_tests: Forwarded unchanged to ``generate_sites``.

    Returns:
        None.
    """
    # Compiled metrics file of the disease (relative to the working directory)
    data_path = os.path.join('DONNES', 'COMPILATIONS', f'{disease}_combination_all_metrics_CamCAN.csv.gz')

    # Output directory of the sites generated for this disease
    directory_site = os.path.join(SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION, disease)

    # NOTE(review): ``disease=None`` is passed although ``disease`` is
    # available here -- confirm against generate_sites that this is intended.
    generate_sites(sample_sizes, disease_ratios, num_tests, directory_site, data_path, disease=None)

## EXECUTOR

In [None]:

# Run configuration shared by the following cells.
# NOTE(review): `method` is not referenced anywhere else in the visible part of this notebook.
method= "classic"
SYNTHETIC_SITES_VERSION = "v1"

# Diseases to process, as returned by the project helper get_diseases.
diseases = get_diseases(True)
# diseases = ['SYN_0.1']
# robust_methods = ['IQR', 'MAD', 'MAD_MEAN']
# Outlier-detection methods evaluated by the later cells.
robust_methods = ['MAD_MEAN', "IQR"]

print('do')


#sample_sizes = [30, 50, 100, 150, 200, 300]  # Different sample sizes
#disease_ratios = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7]  # Different proportions of diseased subjects
sample_sizes = [100]  # Different sample sizes
disease_ratios = [0.3]  # Different proportions of diseased subjects
num_tests = 10  # Number of tests to run for each combination


# Generate the synthetic sites of every disease in parallel, one joblib task
# per disease (n_jobs=-1 lets joblib use all available cores).
Parallel(n_jobs=-1)(
    delayed(generate_sites_for_disease)(disease, SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION, sample_sizes, disease_ratios, num_tests)
    for disease in diseases
)




In [None]:
# for robust_method in robust_methods:
#     directory = os.path.join(MAINFOLDER, robust_method)
#     # Initialize DataFrames to store the results
#     outliers_compilation = pd.DataFrame()
#     detection_metrics_summary = pd.DataFrame()
#     for disease in diseases:
#         directory_disease = os.path.join(directory, disease)
#         directory_site = os.path.join(SYNTHETIC_SITES ,SYNTHETIC_SITES_VERSION, disease)
#         for sample_size in sample_sizes:
#             for disease_ratio in disease_ratios:        
#                 sizeDir = os.path.join(directory_disease, f"{sample_size}_{int(disease_ratio*100)}")
#                 sizeDir_site = os.path.join(directory_site, f"{sample_size}_{int(disease_ratio*100)}")
#                 for i in range(num_tests):
#                     tempDir = os.path.join(sizeDir, f"{i}")
#                     tempDir_site = os.path.join(sizeDir_site, f"{i}")
#                     os.makedirs(tempDir, exist_ok=True)

#                     train_file_name = f"train_{sample_size}_{int(disease_ratio*100)}_{i}_all.csv"
#                     test_file_name = f"test_{sample_size}_{int(disease_ratio*100)}_{i}_all.csv"
                    
#                     # Save the sample to a temporary file
#                     temp_file = os.path.join(tempDir_site,train_file_name )
#                     train_df = pd.read_csv(temp_file)
#                     train_df = train_df[~train_df['bundle'].isin(['left_ventricle', 'right_ventricle'])]
#                     train_df.to_csv(os.path.join(tempDir,train_file_name ), index=False)

#                     test_file = os.path.join(tempDir_site, test_file_name)
#                     test_df = pd.read_csv(test_file)
#                     test_df = test_df[~test_df['bundle'].isin(['left_ventricle', 'right_ventricle'])]
#                     test_df.to_csv(os.path.join(tempDir,test_file_name ), index=False)



#                     outliers_idx = find_outliers(train_df, robust_method)
#                     detection_performance = analyze_detection_performance(outliers_idx, train_df)
#                     scatter_plot_with_colors(train_df, outliers_idx,  'mean_no_cov', sizeDir ,f'Scatter_{robust_method}_{disease}_{sample_size}_{int(disease_ratio*100)}_{i}', f'Scatter for method {robust_method} disease: {disease}, with {sample_size} patients, {disease_ratio * 100} % of sick #{i}')

#                     detection_performance['disease'] = disease

#                     outliers = train_df.loc[outliers_idx]
                    

#                     detection_metrics_summary = pd.concat([detection_metrics_summary, detection_performance])
#                     outliers_compilation = pd.concat([outliers_compilation, outliers])
#     # Save the metrics and distances compilation DataFrames to CSV files
#     detection_metrics_summary.to_csv(os.path.join(directory, "detection_metrics_summary.csv"), index=False)
#     outliers_compilation.to_csv(os.path.join(directory, "outliers_compilation.csv"), index=False)

In [None]:
import os
import pandas as pd
from joblib import Parallel, delayed

# Bundles removed from every train/test file before outlier detection.
_EXCLUDED_BUNDLES = ['left_ventricle', 'right_ventricle']


def _load_without_ventricles(source_csv, destination_csv):
    """Read ``source_csv``, drop the excluded bundles, save the result to ``destination_csv`` and return it."""
    df = pd.read_csv(source_csv)
    df = df[~df['bundle'].isin(_EXCLUDED_BUNDLES)]
    df.to_csv(destination_csv, index=False)
    return df


# Function to process one combination of robust_method, disease, sample_size, and disease_ratio
def process_combination(robust_method, disease, sample_size, disease_ratio, MAINFOLDER, SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION, num_tests):
    """Run outlier detection on every test of one (method, disease, size, ratio) combination.

    For each test index ``i`` in ``range(num_tests)`` the function:

    1. reads the train and test CSV files of the synthetic site, removes the
       ventricle bundles and saves the filtered copies under
       ``<MAINFOLDER>/<robust_method>/<disease>/<size>_<ratio>/<i>``;
    2. calls ``find_outliers`` and ``analyze_detection_performance`` on the
       filtered training set and saves a scatter plot;
    3. writes the detection metrics and the outlier rows to per-test CSV files.

    Args:
        robust_method: Name of the outlier-detection method, forwarded to
            ``find_outliers`` and used as an output sub-folder.
        disease: Disease identifier (sub-folder name and ``disease`` column).
        sample_size: Site sample size; part of the folder and file names.
        disease_ratio: Fraction of diseased subjects; ``int(disease_ratio*100)``
            is part of the folder and file names.
        MAINFOLDER: Root folder of the results.
        SYNTHETIC_SITES: Root folder of the synthetic sites.
        SYNTHETIC_SITES_VERSION: Version sub-folder of the synthetic sites.
        num_tests: Number of test indices to process.

    Returns:
        A tuple ``(detection_metrics_files, outliers_files)`` with the paths of
        the CSV files written, one entry per test index.
    """
    # Computed once: the same "<size>_<percent>" tag names folders and files.
    # int() is kept as-is so the names match the ones this function has always used.
    ratio_tag = f"{sample_size}_{int(disease_ratio*100)}"

    sizeDir = os.path.join(MAINFOLDER, robust_method, disease, ratio_tag)
    sizeDir_site = os.path.join(SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION, disease, ratio_tag)

    detection_metrics_files = []
    outliers_files = []

    for i in range(num_tests):
        tempDir = os.path.join(sizeDir, f"{i}")
        tempDir_site = os.path.join(sizeDir_site, f"{i}")
        os.makedirs(tempDir, exist_ok=True)

        train_file_name = f"train_{ratio_tag}_{i}_all.csv"
        test_file_name = f"test_{ratio_tag}_{i}_all.csv"

        # Load and filter training dataset
        train_df = _load_without_ventricles(
            os.path.join(tempDir_site, train_file_name),
            os.path.join(tempDir, train_file_name),
        )

        # The test dataset is only filtered and copied next to the training one
        _load_without_ventricles(
            os.path.join(tempDir_site, test_file_name),
            os.path.join(tempDir, test_file_name),
        )

        # Outlier detection
        outliers_idx = find_outliers(train_df, robust_method)
        detection_performance = analyze_detection_performance(outliers_idx, train_df)

        # Scatter plot for visualization. The ratio is formatted with ':g' so
        # that float noise (0.3 * 100 == 30.000000000000004) stays out of the title.
        scatter_plot_with_colors(
            train_df, outliers_idx, 'mean_no_cov', sizeDir,
            f'Scatter_{robust_method}_{disease}_{ratio_tag}_{i}',
            f'Scatter for method {robust_method} disease: {disease}, with {sample_size} patients, {disease_ratio * 100:g} % of sick #{i}'
        )

        # Store results (presumably a DataFrame: it supports column assignment and to_csv)
        detection_performance['disease'] = disease
        detection_performance['method'] = robust_method

        # .copy() gives an independent frame, so adding the column below never
        # writes into (or warns about) a slice of train_df.
        outliers = train_df.loc[outliers_idx].copy()
        outliers['method'] = robust_method

        detection_file = os.path.join(sizeDir, f"detection_metrics_summary_{ratio_tag}_{i}.csv")
        outliers_file = os.path.join(sizeDir, f"outliers_compilation_{ratio_tag}_{i}.csv")

        detection_performance.to_csv(detection_file, index=False)
        outliers.to_csv(outliers_file, index=False)

        detection_metrics_files.append(detection_file)
        outliers_files.append(outliers_file)

    return detection_metrics_files, outliers_files

# Arguments that are identical for every task
shared_args = (MAINFOLDER, SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION, num_tests)

# One argument tuple per (method, disease, sample size, disease ratio) combination
tasks = [
    (m, d, n, r) + shared_args
    for m in robust_methods
    for d in diseases
    for n in sample_sizes
    for r in disease_ratios
]

# Dispatch every combination to joblib; each result is the pair of file-path
# lists returned by process_combination
job = delayed(process_combination)
results = Parallel(n_jobs=-1)(job(*args) for args in tasks)

print(f"Final results saved in {MAINFOLDER}")


In [None]:
# Gather the per-test CSV paths returned by every task into two flat lists
detection_metrics_files = []
outliers_files = []
for run_paths in results:
    detection_metrics_files.extend(run_paths[0])
    outliers_files.extend(run_paths[1])

# Stack all per-test detection metrics into a single table
detection_frames = [pd.read_csv(csv_path) for csv_path in detection_metrics_files]
detection_metrics_summary = pd.concat(detection_frames, ignore_index=True)

# Stack all per-test outlier rows into a single table
outlier_frames = [pd.read_csv(csv_path) for csv_path in outliers_files]
outliers_compilation = pd.concat(outlier_frames, ignore_index=True)

# Write both aggregated tables at the root of the results folder
summary_csv = os.path.join(MAINFOLDER, "detection_metrics_summary.csv")
detection_metrics_summary.to_csv(summary_csv, index=False)
compilation_csv = os.path.join(MAINFOLDER, "outliers_compilation.csv")
outliers_compilation.to_csv(compilation_csv, index=False)

In [None]:
# Show the aggregated detection metrics as the cell output
detection_metrics_summary

In [None]:
# # Example usage
# precision_df = calculate_precision_by_bundle(pd.read_csv(os.path.join(MAINFOLDER, robust_method, "detection_metrics_summary.csv")))
# precision_df = precision_df.sort_values(by='precision', ascending=False)
# precision_df.to_csv(os.path.join(directory_disease, "metrics_per_bundle.csv"), index=True)
# precision_df

In [None]:
# precision_df = precision_df.sort_values(by='f1_score', ascending=False)
# precision_df

In [None]:

# Load the aggregated detection metrics from the results folder
data_combined = pd.read_csv(os.path.join(MAINFOLDER, 'detection_metrics_summary.csv'))

results_folder = os.path.join(MAINFOLDER, 'DETECTION_PERFORMANCE_BOXPLOTS')
os.makedirs(results_folder, exist_ok=True)

# Loop-invariant settings: defined once instead of once per disease
metrics_of_interest = ['precision', 'f1_score', 'recall', 'false_positives']
sns.set(style="whitegrid")

# One 2x2 figure per disease, one subplot per metric
for disease in diseases:
    disease_data = data_combined[data_combined['disease'] == disease]

    # Keep the relevant rows and columns in a single selection. The .copy()
    # makes the result an independent frame, so the column added below is not
    # written into a slice of data_combined (SettingWithCopyWarning).
    filtered_data = disease_data.loc[
        disease_data['metric'].isin(metrics_of_interest),
        ['metric', 'overall', 'site', 'method'],
    ].copy()

    # Group sites by their "<n>_patients_<p>_percent" label extracted from the site name
    filtered_data['site_group'] = filtered_data['site'].str.extract(r'(\d+_patients_\d+_percent)')[0]

    fig, axes = plt.subplots(2, 2, figsize=(20, 14))
    axes = axes.flatten()

    for ax, metric in zip(axes, metrics_of_interest):
        metric_data = filtered_data[filtered_data['metric'] == metric]

        # Boxplot of the overall value per site group, split by detection method
        sns.boxplot(
            x='site_group',
            y='overall',
            hue='method',
            data=metric_data,
            showfliers=False,  # Hide fliers for clarity
            ax=ax
        )

        # Customize the plot
        ax.set_title(f'Boxplot of {metric} across patient-malade combinations (by method) for {disease}', fontsize=16)
        ax.set_xlabel('Patient-Malade Combination', fontsize=12)
        ax.set_ylabel('Overall Value', fontsize=12)
        ax.tick_params(axis='x', rotation=45)
        ax.legend(title='Method', fontsize=10)

    plt.tight_layout()
    plt.savefig(os.path.join(results_folder, f'{disease}_detection_performance.png'))
    # Close the figure so open figures do not accumulate across diseases
    plt.close(fig)


In [None]:
# Enrich the metrics table through the project helper. The later cells rely on
# 'num_patients', 'disease_ratio' and 'num_diseased' columns -- presumably
# added here; TODO confirm against add_nb_patients_and_diseased.
data_combined = add_nb_patients_and_diseased(data_combined)





In [None]:
# Independent copy of the metrics table without the 'site' and 'num_diseased'
# columns (a column that is absent is skipped instead of raising)
data_combinedCopy = data_combined.copy().drop(columns=['site', 'num_diseased'], errors='ignore')
data_combinedCopy

In [None]:

# Average every remaining column within each
# (num_patients, disease_ratio, disease, metric, method) group
group_keys = ['num_patients', 'disease_ratio', 'disease', 'metric', 'method']
grouped_means = data_combinedCopy.groupby(group_keys).mean()

# Turn the group keys back into regular columns
mean_df = grouped_means.reset_index()

mean_df