# Analyse des sites vs maladies (DONNES/raw/ad)

Ce notebook parcourt tous les fichiers `*.ad.raw.csv.gz` du dossier `DONNES/raw/ad`, charge les colonnes `sid`, `site` et `disease`, puis **écarte immédiatement** les sites qui ne contiennent que des cas `HC` (à l'exception de `CamCAN`, qui est conservé). On compte ensuite les sujets uniques (`sid`) par couple site/maladie.

## Pré-requis
- Les fichiers compressés `.csv.gz` doivent être présents dans `DONNES/raw/ad`.
- Chaque fichier doit contenir les colonnes `sid` et `disease`. La colonne `site` est facultative : si elle est absente, le début du nom du fichier (la partie qui précède `.ad`) sert d'identifiant de site.

In [None]:
from pathlib import Path
import pandas as pd

# Folder holding the compressed per-site tables. The path is relative, so it
# is resolved against the working directory the notebook is run from.
DATA_DIR = Path('../DONNES/raw/ad')
# is_dir() (rather than exists()) also rejects a regular file sitting at this
# path, so the failure is reported here instead of at file discovery.
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f'Dossier introuvable: {DATA_DIR.resolve()}')

# Bare expression: displayed as the cell output in the notebook.
DATA_DIR

FileNotFoundError: Dossier introuvable: /home/local/USHERBROOKE/davy3001/Documents/COMBAT/Jodoin/Combat_robust/notebooks/DONNES/raw/ad

In [None]:
# Collect the gzip-compressed CSV files of the data folder in sorted order,
# so the files are always processed in the same sequence.
# NOTE(review): the pattern matches any `.csv.gz`, not only `*.ad.raw.csv.gz`
# as described in the notebook introduction — confirm this is intended.
csv_files = list(DATA_DIR.glob('*.csv.gz'))
csv_files.sort()
print(f'Nombre de fichiers trouvés: {len(csv_files)}')
# Preview of the first few paths (cell output).
csv_files[:5]

In [None]:
def load_site_disease_data(files, site_exception='CamCAN'):
    """Load subject tables and drop the sites that only hold healthy controls.

    Each file is read with ``pd.read_csv`` and reduced to the columns
    ``sid``, ``site`` and ``disease``. When a file has no ``site`` column,
    the part of its file name preceding ``.ad`` is used as the site label.
    Exact duplicate ``(sid, site, disease)`` rows are removed.

    A site is kept when it has at least one non-missing ``disease`` value
    other than ``'HC'``, or when its label equals ``site_exception``.
    A short summary of kept/removed sites is printed.

    Args:
        files: Iterable of CSV paths (``pathlib.Path`` or ``str``).
        site_exception: Site label that is always kept, even if it only
            contains ``'HC'`` rows.

    Returns:
        DataFrame with columns ``sid``, ``site``, ``disease`` restricted to
        the kept sites.

    Raises:
        ValueError: If ``files`` is empty, or if a file lacks a required
            column.
    """
    frames = []
    for file in files:
        df = pd.read_csv(file)
        if 'site' not in df.columns:
            # Path(...) lets callers pass plain string paths as well.
            df['site'] = Path(file).stem.split('.ad')[0]
        for required_col in ['sid', 'disease', 'site']:
            if required_col not in df.columns:
                raise ValueError(f"Colonne '{required_col}' manquante dans {file}")
        frames.append(df[['sid', 'site', 'disease']])

    if not frames:
        # pd.concat([]) would raise a ValueError with an unhelpful message;
        # keep the exception type but say what actually went wrong.
        raise ValueError('Aucun fichier à charger: la liste de fichiers est vide')

    combined = pd.concat(frames, ignore_index=True)
    combined = combined.drop_duplicates(subset=['sid', 'site', 'disease'])

    disease_sets = combined.groupby('site')['disease'].agg(set)
    allowed_sites = [
        site for site, diseases in disease_sets.items()
        # A missing label (NaN) compares unequal to 'HC'; it must not count
        # as evidence that the site has patients.
        if site == site_exception
        or any(pd.notna(d) and d != 'HC' for d in diseases)
    ]
    filtered = combined[combined['site'].isin(allowed_sites)].copy()

    removed_sites = sorted(set(disease_sets.index) - set(allowed_sites))
    print(f'Sites conservés: {len(allowed_sites)} | Sites retirés: {len(removed_sites)}')
    if removed_sites:
        # Only the first 20 removed sites are listed to keep the output short.
        preview = ', '.join(removed_sites[:20])
        print('Sites retirés (uniquement HC):', preview)
        if len(removed_sites) > 20:
            print('...')

    return filtered

# Build the filtered subject table from the discovered files, using the
# loader's default site exception.
site_disease_df = load_site_disease_data(files=csv_files)
# Preview of the first rows (cell output).
site_disease_df.head()

In [None]:
# Number of distinct subject ids for every (site, disease) pair, returned as
# a flat table with columns site, disease, unique_sid_count.
unique_counts = (
    site_disease_df.groupby(['site', 'disease'])['sid']
    .nunique()
    .reset_index(name='unique_sid_count')
)
unique_counts.head()

In [None]:
# Reshape to one row per site and one column per disease. Pairs that were
# never observed come out of the pivot as missing and are turned into 0.
counts_pivot = (
    pd.pivot(
        unique_counts,
        index='site',
        columns='disease',
        values='unique_sid_count',
    )
    .fillna(0)
    .astype(int)
)
counts_pivot.head()

In [None]:
# Per-site summary: NB_HC is the healthy-control count, NB_Sick the sum of
# all other disease columns. Work on a copy so counts_pivot stays untouched.
disease_sites_df = counts_pivot.copy()
hc_counts = disease_sites_df.get('HC', 0)  # 0 when no 'HC' column exists
disease_sites_df['NB_HC'] = hc_counts

summary_cols = {'NB_HC', 'NB_Sick', 'HC'}
non_hc_cols = [col for col in disease_sites_df.columns if col not in summary_cols]
disease_sites_df['NB_Sick'] = disease_sites_df[non_hc_cols].sum(axis=1)

# Keep only sites with at least 8 controls and at least 5 patients.
# NOTE(review): a site without any non-HC subject (e.g. one kept upstream only
# through the CamCAN exception) has NB_Sick == 0 and cannot pass this filter
# — confirm this is intended.
enough_hc = disease_sites_df['NB_HC'] >= 8
enough_sick = disease_sites_df['NB_Sick'] >= 5
filtered_disease_sites_df = disease_sites_df[enough_hc & enough_sick]
print(f"Sites après filtre NB_HC>=8 et NB_Sick>=5: {len(filtered_disease_sites_df)} / {len(disease_sites_df)}")
filtered_disease_sites_df.head()

In [None]:
# Restrict the long-format counts to the sites that passed the size filter.
kept_sites = filtered_disease_sites_df.index
unique_counts_filtered = unique_counts.loc[unique_counts['site'].isin(kept_sites)].copy()
# Show the 20 largest (site, disease) groups (cell output).
unique_counts_filtered.sort_values(by='unique_sid_count', ascending=False).head(20)

In [None]:
# Write the filtered counts next to the input data, without the row index.
output_path = DATA_DIR.joinpath('site_disease_unique_sid_counts.csv')
unique_counts_filtered.to_csv(output_path, index=False)
print(f'Resultats sauvegardés dans {output_path.resolve()}')