## IMPORTS and UTILS

In [None]:
import os
import pandas as pd
import shutil
import random


In [None]:
# Delete, below the subdirectories of a folder, every file not prefixed 'adni'
def delete_non_adni_files(folder):
    """Remove files whose name does not start with ``'adni'``.

    Every file located in a subdirectory of ``folder`` (at any depth) is
    examined. Files sitting directly in ``folder`` are left untouched.

    Args:
        folder: Path of the directory whose subdirectories are cleaned.
            A missing path is silently ignored because ``os.walk`` yields
            nothing for it.

    Note:
        Symbolic links to directories are not followed (``os.walk`` default).
    """
    walker = os.walk(folder)
    # The first entry produced by os.walk is `folder` itself; skipping it
    # keeps top-level files untouched while a single traversal still reaches
    # every nested directory exactly once.
    next(walker, None)
    for root, _, files in walker:
        for file in files:
            if not file.startswith('adni'):
                os.remove(os.path.join(root, file))

# Run the cleanup on the raw ADNI folder (path is relative to the working directory)
delete_non_adni_files("RAW/ADNI")

In [None]:
# Flatten a folder: move every file found in its subdirectories up to the folder root
def change_camCan(folder):
    """Move all files nested below ``folder`` directly into ``folder``.

    Every file found in a subdirectory of ``folder`` (at any depth) is renamed
    to ``folder/<file name>``. Files already sitting in ``folder`` are not
    touched, and the emptied subdirectories are left in place.

    Args:
        folder: Path of the directory to flatten. A missing path is silently
            ignored because ``os.walk`` yields nothing for it.

    Note:
        Two nested files sharing the same name collide at the destination.
        ``os.rename`` is documented to replace an existing file silently on
        Unix and to raise ``FileExistsError`` on Windows.
        Symbolic links to directories are not followed (``os.walk`` default).
    """
    walker = os.walk(folder)
    # Skip the root entry: its files are already where they should be.
    next(walker, None)
    for root, _, files in walker:
        for file in files:
            os.rename(os.path.join(root, file), os.path.join(folder, file))
# Run the function on the raw CamCAN folder (path is relative to the working directory)
change_camCan("RAW/CamCAN")

In [None]:

def organize_files_by_disease_new_structure(main_folder, output_root="RAW"):
    """Split per-metric CSV files into one output tree per disease.

    ``main_folder`` is expected to contain one subdirectory per metric, each
    holding ``.csv`` / ``.csv.gz`` files with a ``disease`` column. For every
    such file:

    * for each disease label other than ``'HC'``, the rows of that disease
      plus all ``'HC'`` rows are written to
      ``<output_root>/<disease>/<metric>/<file>``;
    * when the file contains no label other than ``'HC'``, it is copied
      unchanged to ``<output_root>/HC/<metric>/<file>``.

    Rows whose ``disease`` value is missing are ignored when listing the
    diseases, so they do not create an output folder and do not appear in any
    per-disease file.

    Args:
        main_folder: Directory containing the per-metric subdirectories.
        output_root: Directory under which the per-disease trees are created.
            Defaults to ``"RAW"`` (relative to the working directory).

    Raises:
        FileNotFoundError: If ``main_folder`` does not exist.
        KeyError: If a CSV file has no ``disease`` column.
    """
    for metric in os.listdir(main_folder):
        metric_folder = os.path.join(main_folder, metric)
        if not os.path.isdir(metric_folder):
            continue

        for file in os.listdir(metric_folder):
            if not file.endswith(('.csv.gz', '.csv')):
                continue

            source_path = os.path.join(metric_folder, file)
            df = pd.read_csv(source_path)

            # Missing labels are dropped: a NaN would otherwise reach
            # os.path.join as a float and raise TypeError.
            diseases = [
                d for d in df['disease'].dropna().unique() if d != 'HC'
            ]

            if not diseases:
                # Healthy controls only: keep the file as-is under HC/.
                target_folder = os.path.join(output_root, 'HC', metric)
                os.makedirs(target_folder, exist_ok=True)
                shutil.copy(source_path, os.path.join(target_folder, file))
                continue

            # Loop-invariant mask of the healthy-control rows.
            is_hc = df['disease'] == 'HC'
            for disease in diseases:
                target_folder = os.path.join(output_root, disease, metric)
                os.makedirs(target_folder, exist_ok=True)
                filtered_df = df[(df['disease'] == disease) | is_hc]
                filtered_df.to_csv(os.path.join(target_folder, file),
                                   index=False)
                    

# Split the files found under DONNES/raw into per-disease folders
organize_files_by_disease_new_structure("DONNES/raw")

In [None]:
def update_handess_column(folder):
    """Overwrite the ``handedness`` column with ``1`` in nested CSV files.

    Every ``.csv`` / ``.csv.gz`` file located in a subdirectory of ``folder``
    (at any depth) is loaded; when it has a ``handedness`` column, the whole
    column is set to ``1`` and the file is rewritten in place without the
    index. Files without that column, and files sitting directly in
    ``folder``, are left untouched.

    Args:
        folder: Path of the directory whose subdirectories are processed.
            A missing path is silently ignored because ``os.walk`` yields
            nothing for it.

    Note:
        Symbolic links to directories are not followed (``os.walk`` default).
    """
    walker = os.walk(folder)
    # Skip the root entry so top-level files stay untouched; one traversal
    # then visits each nested file exactly once instead of once per ancestor.
    next(walker, None)
    for root, _, files in walker:
        for file in files:
            if not file.endswith(('.csv.gz', '.csv')):
                continue
            file_path = os.path.join(root, file)
            df = pd.read_csv(file_path)
            if 'handedness' in df.columns:
                df['handedness'] = 1
                df.to_csv(file_path, index=False)

# Call the function on the SCHZ folder
update_handess_column("RAW/SCHZ")

In [None]:
def build_astmix_dataframe(
    diseases,
    source_dir='DONNES_F/COMPILATIONS_AUG_3',
    random_state=0,
    output_dir=None
):
    """Build and save a disease-balanced "ASTMIX" compilation.

    For each disease ``d``, the file
    ``<source_dir>/<d>_combination_all_metrics_CamCAN.csv.gz`` is read and
    split into diseased rows (``disease != 'HC'``) and healthy-control rows.
    The same number of diseased subjects (the smallest count of unique ``sid``
    among the diseases) is randomly kept for every disease, all HC rows are
    kept, and rows duplicated on ``(sid, metric, bundle)`` are dropped.

    The result is written to
    ``<output_dir>/ASTMIX_combination_all_metrics_CamCAN.csv.gz``.

    Args:
        diseases: Iterable of disease names used to build the input file names.
        source_dir: Directory containing the per-disease compilations.
        random_state: Seed of the private random generator used for sampling.
        output_dir: Directory receiving the output file. Defaults to
            ``source_dir`` when ``None``.

    Returns:
        The combined ``pandas.DataFrame`` that was written to disk.

    Raises:
        ValueError: If ``diseases`` is empty.
        FileNotFoundError: If one of the per-disease files is missing.
    """
    diseases = list(diseases)
    if not diseases:
        raise ValueError("diseases must contain at least one disease name")

    # Private generator: same draws as seeding the global `random` module with
    # `random_state`, but without altering the process-wide random state.
    rng = random.Random(random_state)

    parts = []
    diseased_counts = {}

    for d in diseases:
        path = os.path.join(source_dir,
                            f'{d}_combination_all_metrics_CamCAN.csv.gz')
        df = pd.read_csv(path, compression='gzip')

        # Separate diseased subjects from healthy controls
        is_hc = df['disease'] == 'HC'
        diseased = df[~is_hc]
        hc = df[is_hc]

        diseased_counts[d] = diseased['sid'].nunique()
        parts.append((diseased, hc))

    # Smallest number of diseased subjects across all diseases
    min_diseased = min(diseased_counts.values())

    balanced_chunks = []
    for diseased, hc in parts:
        sids = diseased['sid'].drop_duplicates().tolist()
        sampled_sids = rng.sample(sids, min_diseased)

        # Keep the balanced diseased subjects and every healthy control
        balanced = pd.concat([
            diseased[diseased['sid'].isin(sampled_sids)],
            hc
        ])
        balanced_chunks.append(balanced)

    astmix_df = pd.concat(balanced_chunks, ignore_index=True)

    # Drop duplicated (sid, metric, bundle) rows, e.g. controls shared by
    # several per-disease files
    astmix_df = astmix_df.drop_duplicates(subset=['sid', 'metric', 'bundle'])

    # Save the result
    if output_dir is None:
        output_dir = source_dir
    os.makedirs(output_dir, exist_ok=True)
    out_file = os.path.join(output_dir,
                            'ASTMIX_combination_all_metrics_CamCAN.csv.gz')
    astmix_df.to_csv(out_file, index=False, compression='gzip')
    print(f'✔️ Fichier créé : {out_file}')

    return astmix_df

In [None]:
# Build and save the balanced ASTMIX compilation from the AD, SCHZ and TBI files
build_astmix_dataframe(
    diseases=['AD', 'SCHZ', 'TBI'],
    source_dir='DONNES_F/COMPILATIONS_AUG_3',
    random_state=0
)

In [None]:
# Load the ASTMIX compilation from disk.
# NOTE(review): build_astmix_dataframe writes its output into its `source_dir`
# ('DONNES_F/COMPILATIONS_AUG_3' in the call above), whereas this reads from
# 'ASTMIX_OUTPUT/' — confirm the file was copied or moved there.
astmix_df = pd.read_csv('ASTMIX_OUTPUT/ASTMIX_combination_all_metrics_CamCAN.csv.gz', compression='gzip')

In [None]:
# Keep one row per subject: for each `sid`, the first non-null value of every column
df_unique = astmix_df.groupby('sid').first().reset_index()

# Count the number of subjects per disease.
# NOTE(review): when run as a notebook cell only the last expression is
# displayed, so this result is presumably not shown — confirm.
df_unique['disease'].value_counts()

# Same count, returned as a tidy DataFrame with columns `disease` and `count`
df_unique.groupby('disease').size().reset_index(name='count')

In [None]:
import shutil
import os
from joblib import Parallel, delayed
from robust_evaluation_tools.robust_utils import get_metrics
# Root directory of the result tree, relative to the working directory
MAINFOLDER = "RESULTS/MAE_TEST"


def remove_distribution_folder(disease, sample_size, disease_ratio, test_index,
                               metrics):
    """Delete the ``DISTRIBUTION`` folder of every metric for one test run.

    The run directory is
    ``MAINFOLDER/PROCESS/<disease>/<sample_size>_<ratio %>/<test_index>``,
    where the ratio percentage is ``int(disease_ratio * 100)``. For each
    metric, ``<run directory>/<metric>/DISTRIBUTION`` is removed recursively
    when it exists, and a confirmation line is printed.

    Args:
        disease: Disease name used as a path component.
        sample_size: Sample size used in the ``<size>_<ratio>`` component.
        disease_ratio: Fraction of diseased subjects (e.g. ``0.3``).
        test_index: Index of the test run, used as a path component.
        metrics: Iterable of metric folder names.

    Returns:
        None.
    """
    # NOTE(review): int() truncates, so a ratio whose product with 100 lands
    # just below an integer maps to the lower percentage — presumably this
    # mirrors how the folders were named when created; confirm.
    ratio_percent = int(disease_ratio * 100)
    run_dir = os.path.join(
        MAINFOLDER,
        'PROCESS',
        disease,
        f"{sample_size}_{ratio_percent}",
        str(test_index),
    )

    for metric in metrics:
        folder_path = os.path.join(run_dir, metric, "DISTRIBUTION")
        if not os.path.exists(folder_path):
            continue
        shutil.rmtree(folder_path)
        print(f"Folder '{folder_path}' has been deleted.")




# Metric names come from the project helper; each one is used as a folder name
metrics = get_metrics()
#diseases = get_diseases(True)
diseases = ["ASTMIX", "AD", "SCHZ", "TBI"]

sample_sizes = [5,10,20,30,100,150]  # Sample sizes to process
disease_ratios = [0.03, 0.1, 0.3, 0.5]  # Fractions of diseased subjects to process
num_tests = 12  # Number of test runs per combination (indices 0..11)
n_jobs=1  # Number of joblib workers

# One task per (disease, sample size, disease ratio, test index) combination
tasks = [
        (disease, sample_size, disease_ratio, num_test,metrics)
        for disease in diseases
        for sample_size in sample_sizes
        for disease_ratio in disease_ratios
        for num_test in range(num_tests)
    ]

    # Run the cleanup for every combination; remove_distribution_folder returns
    # nothing, so `results` is a list of None
results = Parallel(n_jobs=n_jobs)(delayed(remove_distribution_folder)(*task) for task in tasks)