# Analyse des sites vs maladies (DONNES/raw/ad)

Ce notebook parcourt tous les fichiers `*.ad.raw.csv.gz` du dossier `DONNES/raw/ad`, charge `sid`, `site`, `disease`, et **filtre immédiatement** les sites qui ne contiennent que des cas `HC` (à l'exception de `CamCAN`). On compte ensuite les sujets uniques (`sid`) par couple site/maladie.

## Pré-requis
- Les fichiers compressés `.csv.gz` doivent être présents dans `DONNES/raw/ad`.
- Chaque fichier doit contenir les colonnes `sid` et `disease`. La colonne `site` est facultative : si elle est absente, le nom du fichier (sans le suffixe `.ad.raw.csv.gz`) sert d'identifiant de site.

In [6]:
from pathlib import Path
import pandas as pd

# Folder holding the raw per-site files, given relative to the notebook's working directory.
DATA_DIR = Path('../DONNES/raw/ad')
# is_dir() instead of exists(): a regular file sitting at this path is not a
# usable data folder and must fail fast with the same error.
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f'Dossier introuvable: {DATA_DIR.resolve()}')

DATA_DIR

PosixPath('../DONNES/raw/ad')

In [7]:
# Only pick up files following the `<site>.ad.raw.csv.gz` naming convention, so
# that any other compressed CSV placed in the folder is not loaded by mistake.
# Sorting makes the file order (and therefore the run) deterministic.
csv_files = sorted(DATA_DIR.glob('*.ad.raw.csv.gz'))
print(f'Nombre de fichiers trouvés: {len(csv_files)}')
csv_files[:5]

Nombre de fichiers trouvés: 79


[PosixPath('../DONNES/raw/ad/35343.ad.raw.csv.gz'),
 PosixPath('../DONNES/raw/ad/35426.ad.raw.csv.gz'),
 PosixPath('../DONNES/raw/ad/43zR_iAwl9Ck.ad.raw.csv.gz'),
 PosixPath('../DONNES/raw/ad/47k_ouILA7fA.ad.raw.csv.gz'),
 PosixPath('../DONNES/raw/ad/4N..m6cr3es6.ad.raw.csv.gz')]

In [8]:
def load_site_disease_data(files, site_exception='CamCAN'):
    """Load subject/site/diagnosis rows and drop sites that only contain `HC` rows.

    Parameters
    ----------
    files : iterable of path-like
        CSV files (compression inferred by pandas from the extension). Each
        file must provide `sid` and `disease`; when `site` is missing, the
        file name without its `.ad.raw.csv.gz` suffix is used as the site.
    site_exception : str, default 'CamCAN'
        Site that is kept even if it has no non-`HC` diagnosis.

    Returns
    -------
    pandas.DataFrame
        Columns `sid`, `site`, `disease`, de-duplicated on that triple and
        restricted to the kept sites.

    Raises
    ------
    ValueError
        If `files` is empty, or if a file lacks the `sid` or `disease` column.
    """
    required_cols = ['sid', 'site', 'disease']
    filename_suffix = '.ad.raw.csv.gz'

    frames = []
    for file in files:
        file = Path(file)
        # Parse only the columns used below; the raw files may be much wider.
        df = pd.read_csv(file, usecols=lambda col: col in required_cols)
        if 'site' not in df.columns:
            if file.name.endswith(filename_suffix):
                # Strip the exact suffix so a site name that itself contains
                # '.ad' is not truncated.
                df['site'] = file.name[:-len(filename_suffix)]
            else:
                # File does not follow the naming convention: keep the
                # historical fallback.
                df['site'] = file.stem.split('.ad')[0]
        for required_col in required_cols:
            if required_col not in df.columns:
                raise ValueError(f"Colonne '{required_col}' manquante dans {file}")
        frames.append(df[required_cols])

    if not frames:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError('Aucun fichier à charger: la liste `files` est vide')

    combined = pd.concat(frames, ignore_index=True)
    combined = combined.drop_duplicates(subset=required_cols)

    # One set of diagnosis labels per site.
    disease_sets = combined.groupby('site')['disease'].agg(set)
    allowed_sites = [
        site for site, diseases in disease_sets.items()
        # A missing label (NaN) is not a diagnosis: it must not make an
        # HC-only site look like it has patients.
        if site == site_exception
        or any(pd.notna(d) and d != 'HC' for d in diseases)
    ]
    filtered = combined[combined['site'].isin(allowed_sites)].copy()

    # key=str / map(str, ...) keep the report working when site identifiers
    # were parsed as numbers rather than strings.
    removed_sites = sorted(set(disease_sets.index) - set(allowed_sites), key=str)
    print(f'Sites conservés: {len(allowed_sites)} | Sites retirés: {len(removed_sites)}')
    if removed_sites:
        preview = ', '.join(map(str, removed_sites[:20]))
        print('Sites retirés (uniquement HC):', preview)
        if len(removed_sites) > 20:
            print('...')

    return filtered

# Load every discovered file and apply the HC-only site filter
# (default exception site).
site_disease_df = load_site_disease_data(files=csv_files)
site_disease_df.head(5)

Sites conservés: 62 | Sites retirés: 17
Sites retirés (uniquement HC): 47k_ouILA7fA, 4RYQhNAwMfVs, 6AKHWNAG7HKlJGn5s36.v2k, Bristol, Cheadle, LeipzigU, MRC-CBSU_Siemens_3T_2, NIMH, Newcastle, Pentera, Pentera3T, Reading, WashU, adni_177_Philips_3T, adni_20_Siemens_3T, adni_94_Siemens_3T, stockolm


Unnamed: 0,sid,site,disease
0,sub-10228,35343,HC
1,sub-10891,35343,HC
2,sub-10460,35343,HC
3,sub-10855,35343,HC
4,sub-10325,35343,HC


In [9]:
# Number of distinct subject ids for each (site, disease) pair, in long format.
sid_by_site_disease = site_disease_df.groupby(['site', 'disease'])['sid']
unique_counts = sid_by_site_disease.nunique().reset_index(name='unique_sid_count')
unique_counts.head()

Unnamed: 0,site,disease,unique_sid_count
0,35343,ADHD,20
1,35343,BIP,26
2,35343,HC,96
3,35343,SCHZ,22
4,35426,ADHD,19


In [10]:
# Wide table: one row per site, one column per disease.
# Pairs that never occur become 0 instead of NaN, so the counts stay integers.
counts_pivot = (
    unique_counts
    .pivot(index='site', columns='disease', values='unique_sid_count')
    .fillna(0)
    .astype(int)
)
counts_pivot.head()

disease,AD,ADHD,BIP,HC,MCI,SCHZ,TBI
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
35343,0,20,26,96,0,22,0
35426,0,19,22,23,0,23,0
43zR_iAwl9Ck,0,0,0,23,0,0,70
4N..m6cr3es6,0,0,0,3,0,0,7
4b4Ef0_uu0Hg,0,0,0,6,0,0,18


In [11]:
# Minimum number of unique subjects a site needs in each group to be kept.
MIN_HC_PER_SITE = 8
MIN_SICK_PER_SITE = 5

disease_sites_df = counts_pivot.copy()
# .get falls back to 0 when no site has any HC subject (no 'HC' column).
disease_sites_df['NB_HC'] = disease_sites_df.get('HC', 0)
# Every diagnosis column except HC and the two summary columns.
non_hc_cols = [col for col in disease_sites_df.columns if col not in ['NB_HC', 'NB_Sick', 'HC']]
# Sum of per-disease unique counts: a subject listed under several diagnoses
# at the same site is counted once per diagnosis.
disease_sites_df['NB_Sick'] = disease_sites_df[non_hc_cols].sum(axis=1)

# NOTE(review): a site with no non-HC subject (e.g. the HC-only exception site
# kept by load_site_disease_data) has NB_Sick == 0 and is dropped here —
# confirm this is intended.
has_enough_hc = disease_sites_df['NB_HC'] >= MIN_HC_PER_SITE
has_enough_sick = disease_sites_df['NB_Sick'] >= MIN_SICK_PER_SITE
filtered_disease_sites_df = disease_sites_df[has_enough_hc & has_enough_sick]
print(
    f"Sites après filtre NB_HC>={MIN_HC_PER_SITE} et NB_Sick>={MIN_SICK_PER_SITE}: "
    f"{len(filtered_disease_sites_df)} / {len(disease_sites_df)}"
)
filtered_disease_sites_df.head()

Sites après filtre NB_HC>=8 et NB_Sick>=5: 33 / 62


disease,AD,ADHD,BIP,HC,MCI,SCHZ,TBI,NB_HC,NB_Sick
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
35343,0,20,26,96,0,22,0,96,68
35426,0,19,22,23,0,23,0,23,64
43zR_iAwl9Ck,0,0,0,23,0,0,70,23,70
4pGtp5fZYgoo,0,0,0,32,0,0,34,32,34
6KOBCCZqPeFuYxTI03Or2qE,0,0,0,35,0,0,28,35,28


In [12]:
# Keep only the long-format rows whose site passed the NB_HC / NB_Sick filter.
site_is_kept = unique_counts['site'].isin(filtered_disease_sites_df.index)
unique_counts_filtered = unique_counts.loc[site_is_kept].copy()

# Largest (site, disease) groups first.
unique_counts_filtered.sort_values(by='unique_sid_count', ascending=False).head(20)

Unnamed: 0,site,disease,unique_sid_count
2,35343,HC,96
28,MRN,HC,80
24,6WlbpwoscdGuK5Jvkt3vndE,TBI,76
48,adni_127_GE_3T,HC,73
9,43zR_iAwl9Ck,TBI,70
91,adni_27_GE_3T,MCI,66
29,MRN,SCHZ,66
90,adni_27_GE_3T,HC,62
47,adni_127_GE_3T,AD,53
21,6KOBCCZqPeFuYxTI03Or2qE,HC,35


In [13]:
# Persist the filtered per-(site, disease) counts as a plain CSV inside DATA_DIR.
output_path = DATA_DIR.joinpath('site_disease_unique_sid_counts.csv')
unique_counts_filtered.to_csv(output_path, index=False)
print(f'Resultats sauvegardés dans {output_path.resolve()}')

Resultats sauvegardés dans /home/local/USHERBROOKE/davy3001/Documents/COMBAT/Jodoin/Combat_robust/DONNES/raw/ad/site_disease_unique_sid_counts.csv


In [14]:
# Number of sites that passed the filter.
n_sites = len(filtered_disease_sites_df)
print(f"Nombre de sites après filtrage : {n_sites}")

# Per-disease total: sum of the per-site unique subject counts over the kept sites.
totaux_par_maladie = (
    unique_counts_filtered
    .groupby('disease')['unique_sid_count']
    .sum()
    .sort_values(ascending=False)
)
print("\nPatients uniques par maladie (somme sur sites filtrés) :")
print(totaux_par_maladie.to_string())

# Aggregated HC / Sick totals, only when both summary columns are available.
if all(col in filtered_disease_sites_df.columns for col in ('NB_HC', 'NB_Sick')):
    total_hc = filtered_disease_sites_df['NB_HC'].sum()
    total_sick = filtered_disease_sites_df['NB_Sick'].sum()
    print(f"\nTotal HC (somme NB_HC sur sites filtrés) : {total_hc}")
    print(f"Total Sick (somme NB_Sick sur sites filtrés) : {total_sick}")

# Summary table displayed as the cell output.
resultats = totaux_par_maladie.reset_index(name="total_unique_sid")
resultats

Nombre de sites après filtrage : 33

Patients uniques par maladie (somme sur sites filtrés) :
disease
HC      885
MCI     378
TBI     208
AD      200
SCHZ    111
BIP      48
ADHD     39

Total HC (somme NB_HC sur sites filtrés) : 885
Total Sick (somme NB_Sick sur sites filtrés) : 984


Unnamed: 0,disease,total_unique_sid
0,HC,885
1,MCI,378
2,TBI,208
3,AD,200
4,SCHZ,111
5,BIP,48
6,ADHD,39
