In [6]:
import pandas as pd
from tqdm import tqdm
import os
from typing import List, Dict

In [28]:
# Human-readable display name for every task, keyed by task ID.
# Entries are listed in the same order as in the paper.
task_2_name: Dict[str, str] = {
    # --- Operational outcomes ---
    "longlos": "Long LOS",
    "30dayreadmit": "30-Day Readmission",
    "icuadmit": "ICU Admission",
    # --- Anticipating lab test results ---
    "thrombocytopenia": "Thrombocytopenia",
    "hyperkalemia": "Hyperkalemia",
    "hypoglycemia": "Hypoglycemia",
    "hyponatremia": "Hyponatremia",
    "anemia": "Anemia",
    # --- Assignment of new diagnoses ---
    "hypertension": "Hypertension",
    "hyperlipidemia": "Hyperlipidemia",
    "pancan": "Pancreatic Cancer",
    "celiac": "Celiac",
    "lupus": "Lupus",
    "ami": "Acute MI",
    # --- Anticipating chest x-ray findings ---
    "chexpert": "Chest X-Ray",
}

# Kind of label stored for every task, keyed by task ID.
task_2_value_type: Dict[str, str] = {
    # Binary labels
    "pancan": "boolean",
    "celiac": "boolean",
    "lupus": "boolean",
    "ami": "boolean",
    "hypertension": "boolean",
    "hyperlipidemia": "boolean",
    "longlos": "boolean",
    "30dayreadmit": "boolean",
    "icuadmit": "boolean",
    # Integer-coded labels
    "thrombocytopenia": "multiclass",
    "hyperkalemia": "multiclass",
    "hypoglycemia": "multiclass",
    "hyponatremia": "multiclass",
    "anemia": "multiclass",
    # Several labels per example
    "chexpert": "multilabel",
}

In [29]:
# CSV with one row per patient giving the split that patient belongs to.
# The location can be overridden with the EHRSHOT_SPLITS_PATH environment variable so
# the notebook is not tied to a single machine; when the variable is unset the original
# local path is used.
path_to_splits = os.environ.get(
    'EHRSHOT_SPLITS_PATH',
    '/Users/mwornow/Downloads/som-nero-nigam-starr.starr_omop_cdm5_confidential_filtered_2024_02_12_ehrshot_release_dua/person_id_map/merged.csv',
)
df_splits = pd.read_csv(path_to_splits)

# Fail early, with a clear message, if the columns used for the per-split counts are absent.
missing_columns = {'omop_person_id', 'split'} - set(df_splits.columns)
if missing_columns:
    raise ValueError(f"{path_to_splits} is missing required column(s): {sorted(missing_columns)}")

df_splits.shape

(6732, 2)

# Get label counts for each task

Counts are broken down by split (train / val / test) and also reported across all patients.

In [37]:
# Raw count columns produced for every (task, cohort) row, in display order.
COUNT_COLUMNS: List[str] = [
    'task',
    'task_name',
    'n_patients',
    'n_positive_patients',
    'n_labels',
    'n_positive_labels',
]
# Patient splits to report separately (an extra 'all' cohort is added below).
SPLITS: List[str] = ['train', 'test', 'val']


def summarize_labels(df_labels: pd.DataFrame, task: str, task_name: str) -> dict:
    """Count patients and labels (total and positive) in one task's label table.

    Args:
        df_labels: Label rows with a `patient_id` column and a boolean
            `is_positive_label` column.
        task: Task ID, copied into the returned row.
        task_name: Display name of the task, copied into the returned row.

    Returns:
        A dict with one entry per name in `COUNT_COLUMNS`.
    """
    return {
        'task': task,
        'task_name': task_name,
        'n_patients': df_labels['patient_id'].nunique(),
        # A patient counts as positive if at least one of their labels is positive.
        'n_positive_patients': df_labels.groupby('patient_id')['is_positive_label'].max().sum(),
        'n_labels': df_labels.shape[0],
        'n_positive_labels': df_labels['is_positive_label'].sum(),
    }


# Patient IDs of each split. This does not depend on the task, so compute it once
# instead of once per task.
split_2_patient_ids = {
    split_name: df_splits.loc[df_splits['split'] == split_name, 'omop_person_id']
    for split_name in SPLITS
}

# Per-cohort list of row dicts; converted to DataFrames after the loop.
rows = {cohort: [] for cohort in SPLITS + ['all']}
for task, task_name in tqdm(task_2_name.items()):
    path_to_labels = f"./{task}_labels.csv"
    if not os.path.exists(path_to_labels):
        # No label file for this task in the working directory.
        print(f"Skipping {task_name}")
        continue
    df = pd.read_csv(path_to_labels)

    value_type = task_2_value_type[task]
    if value_type == "boolean":
        df['is_positive_label'] = df["boolean_value"]
    elif value_type == "multiclass":
        # Any class above 0 is counted as a positive label.
        df['is_positive_label'] = df["integer_value"] > 0
    else:
        # Other label types (e.g. multilabel) are not summarized here.
        print(f"Skipping {task_name}")
        continue

    # Splits
    for split in SPLITS:
        df_split = df[df['patient_id'].isin(split_2_patient_ids[split])]
        rows[split].append(summarize_labels(df_split, task, task_name))

    # All
    rows['all'].append(summarize_labels(df, task, task_name))

results = {}
for key, cohort_rows in rows.items():
    # Passing `columns` keeps the derived columns computable even when every task was
    # skipped and `cohort_rows` is empty.
    df_counts = pd.DataFrame(cohort_rows, columns=COUNT_COLUMNS)
    df_counts['n_negative_labels'] = df_counts['n_labels'] - df_counts['n_positive_labels']
    df_counts['n_negative_patients'] = df_counts['n_patients'] - df_counts['n_positive_patients']
    df_counts['label_prevalence'] = df_counts['n_positive_labels'] / df_counts['n_labels']
    results[key] = df_counts

100%|██████████| 15/15 [00:01<00:00, 14.53it/s]

Skipping Chest X-Ray





In [38]:
# Counts over every patient, not restricted to a single split
results["all"]

Unnamed: 0,task,task_name,n_patients,n_positive_patients,n_labels,n_positive_labels,n_negative_labels,n_negative_patients,label_prevalence
0,longlos,Long LOS,4636,1772,14723,3286,11437,2864,0.223188
1,30dayreadmit,30-Day Readmission,4777,2363,32725,17327,15398,2414,0.529473
2,icuadmit,ICU Admission,4421,587,13915,713,13202,3834,0.05124
3,thrombocytopenia,Thrombocytopenia,6076,2605,206099,66168,139931,3471,0.32105
4,hyperkalemia,Hyperkalemia,5948,1310,234290,5535,228755,4638,0.023625
5,hypoglycemia,Hypoglycemia,5993,1422,356066,5431,350635,4571,0.015253
6,hyponatremia,Hyponatremia,5937,3735,252578,71111,181467,2202,0.281541
7,anemia,Anemia,6096,4308,214483,147625,66858,1788,0.688283
8,hypertension,Hypertension,2829,747,12452,3232,9220,2082,0.259557
9,hyperlipidemia,Hyperlipidemia,3250,810,16213,3703,12510,2440,0.228397


In [39]:
# Counts for patients in the train split
results["train"]

Unnamed: 0,task,task_name,n_patients,n_positive_patients,n_labels,n_positive_labels,n_negative_labels,n_negative_patients,label_prevalence
0,longlos,Long LOS,1620,645,5562,1267,4295,975,0.227796
1,30dayreadmit,30-Day Readmission,1674,850,12302,6628,5674,824,0.538774
2,icuadmit,ICU Admission,1560,228,5298,271,5027,1332,0.051151
3,thrombocytopenia,Thrombocytopenia,2090,922,79303,24989,54314,1168,0.315108
4,hyperkalemia,Hyperkalemia,2046,469,89381,2202,87179,1577,0.024636
5,hypoglycemia,Hypoglycemia,2063,528,135600,2123,133477,1535,0.015656
6,hyponatremia,Hyponatremia,2043,1310,96298,27665,68633,733,0.287285
7,anemia,Anemia,2096,1495,81941,56961,24980,601,0.695147
8,hypertension,Hypertension,955,258,4158,1055,3103,697,0.253728
9,hyperlipidemia,Hyperlipidemia,1118,279,6044,1242,4802,839,0.205493


In [40]:
# Counts for patients in the val split
results["val"]

Unnamed: 0,task,task_name,n_patients,n_positive_patients,n_labels,n_positive_labels,n_negative_labels,n_negative_patients,label_prevalence
0,longlos,Long LOS,1511,548,4440,979,3461,963,0.220495
1,30dayreadmit,30-Day Readmission,1556,758,10251,5447,4804,798,0.531363
2,icuadmit,ICU Admission,1427,178,4167,221,3946,1249,0.053036
3,thrombocytopenia,Thrombocytopenia,1983,814,61955,19712,42243,1169,0.318166
4,hyperkalemia,Hyperkalemia,1940,430,69919,1566,68353,1510,0.022397
5,hypoglycemia,Hypoglycemia,1957,437,107605,1758,105847,1520,0.016338
6,hyponatremia,Hyponatremia,1935,1186,75961,20541,55420,749,0.270415
7,anemia,Anemia,1993,1394,64634,43992,20642,599,0.680632
8,hypertension,Hypertension,940,241,4246,1104,3142,699,0.260009
9,hyperlipidemia,Hyperlipidemia,1058,270,5173,1189,3984,788,0.229847


In [41]:
# Counts for patients in the test split
results["test"]

Unnamed: 0,task,task_name,n_patients,n_positive_patients,n_labels,n_positive_labels,n_negative_labels,n_negative_patients,label_prevalence
0,longlos,Long LOS,1505,579,4721,1040,3681,926,0.220292
1,30dayreadmit,30-Day Readmission,1547,755,10172,5252,4920,792,0.516319
2,icuadmit,ICU Admission,1434,181,4450,221,4229,1253,0.049663
3,thrombocytopenia,Thrombocytopenia,2003,869,64841,21467,43374,1134,0.331071
4,hyperkalemia,Hyperkalemia,1962,411,74990,1767,73223,1551,0.023563
5,hypoglycemia,Hypoglycemia,1973,457,112861,1550,111311,1516,0.013734
6,hyponatremia,Hyponatremia,1959,1239,80319,22905,57414,720,0.285175
7,anemia,Anemia,2007,1419,67908,46672,21236,588,0.687283
8,hypertension,Hypertension,934,248,4048,1073,2975,686,0.265069
9,hyperlipidemia,Hyperlipidemia,1074,261,4996,1272,3724,813,0.254604
