In [69]:
import pandas as pd

from sklearn.metrics import roc_auc_score
import numpy as np

In [70]:
!ls ../../data/MIMIC-CXR/mimic-cxr-jpg-2.0.0-small

admissions.csv		      mimic-cxr-2.0.0-split.csv
clip-files		      mimic-cxr-2.1.0-test-set-labeled.csv
final-datasets		      mimic_cxr_labels_reports_splits.csv
IMAGE_FILENAMES		      mimic-cxr-lt
images			      mimic_cxr_sectioned.csv
LICENSE.txt		      mimic_cxr_sections.csv
mimic-cxr-2.0.0-chexpert.csv  patients.csv
mimic-cxr-2.0.0-merged.csv    README
mimic-cxr-2.0.0-metadata.csv  reports
mimic-cxr-2.0.0-negbio.csv    resize_mimic.py


In [71]:
# MIMIC Demographic Robustness


admissions = pd.read_csv("../../data/MIMIC-CXR/mimic-cxr-jpg-2.0.0-small/admissions.csv", sep=",")[['subject_id','race']]
admissions['subject_id'] = admissions['subject_id'].astype(int)
admissions.columns

Index(['subject_id', 'race'], dtype='object')

In [92]:
# Create a mapping dictionary for race categories
race_mapping = {
    # White/European categories
    'WHITE': 'White',
    'WHITE - OTHER EUROPEAN': 'White',
    'WHITE - RUSSIAN': 'White',
    'WHITE - EASTERN EUROPEAN': 'White',
    'WHITE - BRAZILIAN': 'White',
    'PORTUGUESE': 'White',
    
    # Black categories
    'BLACK/AFRICAN AMERICAN': 'Black',
    'BLACK/CAPE VERDEAN': 'Black',
    'BLACK/CARIBBEAN ISLAND': 'Black',
    'BLACK/AFRICAN': 'Black',
    
    # Asian categories
    'ASIAN': 'East Asian',
    'ASIAN - CHINESE': 'East Asian',
    'ASIAN - SOUTH EAST ASIAN': 'South East Asian',
    'ASIAN - ASIAN INDIAN': 'Indian',
    'ASIAN - KOREAN': 'East Asian',
    
    # Hispanic/Latino categories
    'HISPANIC/LATINO - PUERTO RICAN': 'Hispanic/Latino',
    'HISPANIC OR LATINO': 'Hispanic/Latino',
    'HISPANIC/LATINO - DOMINICAN': 'Hispanic/Latino',
    'HISPANIC/LATINO - GUATEMALAN': 'Hispanic/Latino',
    'HISPANIC/LATINO - SALVADORAN': 'Hispanic/Latino',
    'HISPANIC/LATINO - COLUMBIAN': 'Hispanic/Latino',
    'HISPANIC/LATINO - MEXICAN': 'Hispanic/Latino',
    'HISPANIC/LATINO - HONDURAN': 'Hispanic/Latino',
    'HISPANIC/LATINO - CUBAN': 'Hispanic/Latino',
    'HISPANIC/LATINO - CENTRAL AMERICAN': 'Hispanic/Latino',
    'SOUTH AMERICAN': 'Hispanic/Latino',
    
    # US American categories
    'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': 'Pacific Islander',
    'AMERICAN INDIAN/ALASKA NATIVE': 'Indigenous American',
    
    # Other categories
    'OTHER': 'Other',
    'MULTIPLE RACE/ETHNICITY': 'Other',
    
    # Unknown/Missing categories
    'UNKNOWN': 'Other',
    'UNABLE TO OBTAIN': 'Other',
    'PATIENT DECLINED TO ANSWER': 'Other'
}

# Apply the mapping to create a new simplified race column
admissions['race_simplified'] = admissions['race'].map(race_mapping)

# View the simplified distribution
print(admissions['race_simplified'].value_counts())

race_simplified
White                  362601
Black                   89057
Other                   39894
Hispanic/Latino         32984
East Asian              16117
South East Asian         1973
Indian                   1661
Indigenous American      1247
Pacific Islander          494
Name: count, dtype: int64


In [109]:
def get_subject_from_path(path: str) -> str:
    """
    Extract subject ID from the file path.
    
    Args:
        path: String containing the file path (e.g., mimic/files/p10/p10046166/s56173345/...)
        
    Returns:
        Subject ID without 'p' prefix (e.g., '10046166')
    """
    # Split the path and get the 4th component (index 3) which contains the full subject ID
    return path.split('/')[3][1:]  # Remove the 'p' prefix

def calculate_macro_auc(df, category_col, category_value):
    mask = df[category_col] == category_value
    category_data = df[mask]
    
    aucs = []
    conditions = [col.replace('_pred', '') for col in df.columns if col.endswith('_pred')]
    
    for condition in conditions:
        y_true = category_data[f'{condition}_true']
        y_pred = category_data[f'{condition}_pred']
        
        if len(np.unique(y_true)) < 2:
            continue
            
        try:
            auc = roc_auc_score(y_true, y_pred)
            aucs.append(auc)
        except ValueError:
            continue
    
    return np.mean(aucs) if aucs else np.nan

def calculate_group_metrics(df, group_col, categories):
    aucs = {}
    for category in categories:
        macro_auc = calculate_macro_auc(df, group_col, category)
        aucs[category] = macro_auc
        
    results = pd.DataFrame({
        'Macro AUC-ROC': aucs,
        'Sample Size': df[group_col].value_counts()
    }).round(3)
    
    return results

In [110]:
results_clip = pd.read_csv("../../results/mimic/clip-mimic-full-ft/test_predictions.csv", sep=",")
results_clip['subject_id'] = results_clip['path'].apply(lambda x: get_subject_from_path(x)).astype(int)
print(results_clip.shape)
# Merge the results with admissions data
# Drop duplicates from admissions before merging to avoid multiplying rows
results_clip = results_clip.merge(admissions[['subject_id', 'race_simplified']].drop_duplicates(subset=['subject_id']), on='subject_id', how='left')

# Convert age to numeric, coercing errors to NaN
results_clip['age'] = pd.to_numeric(results_clip['age'], errors='coerce')

# Calculate for race
races = ['White', 'Black', 'Hispanic/Latino', 'Other', 'East Asian', 
         'Indian', 'South East Asian', 'Indigenous American']
race_results = calculate_group_metrics(results_clip, 'race_simplified', races)

# Calculate for sex
sexes = ['M', 'F', 'unknown']
sex_results = calculate_group_metrics(results_clip, 'sex', sexes)

# Create age bins with new ranges
results_clip['age_bin'] = pd.cut(results_clip['age'], 
                                bins=[17, 35, 50, 70, 100],
                                labels=['18-35', '36-50', '51-70', '71+'])
age_results = calculate_group_metrics(results_clip, 'age_bin', 
                                    results_clip['age_bin'].dropna().unique())

# Print results
print("\nMacro AUC-ROC scores by race:")
print(race_results)
print("\nMacro AUC-ROC scores by sex:")
print(sex_results)
print("\nMacro AUC-ROC scores by age bin:")
print(age_results.sort_index())

(5159, 32)

Macro AUC-ROC scores by race:
                     Macro AUC-ROC  Sample Size
White                        0.770         3411
Black                        0.781         1026
Hispanic/Latino              0.744          154
Other                        0.751          123
East Asian                   0.713           84
Indian                       0.696           53
South East Asian             0.624           17
Indigenous American          0.735           14

Macro AUC-ROC scores by sex:
         Macro AUC-ROC  Sample Size
M                0.762         2744
F                0.781         2138
unknown          0.734          277

Macro AUC-ROC scores by age bin:
       Macro AUC-ROC  Sample Size
18-35          0.740           67
36-50          0.806          426
51-70          0.770         2772
71+            0.762         1617


In [111]:
results_dino = pd.read_csv("../../results/mimic/dino-mimic-full-ft/test_predictions.csv", sep=",")
results_dino['subject_id'] = results_dino['path'].apply(lambda x: get_subject_from_path(x)).astype(int)
print(results_dino.shape)
# Merge the results with admissions data
# Drop duplicates from admissions before merging to avoid multiplying rows
results_dino = results_dino.merge(admissions[['subject_id', 'race_simplified']].drop_duplicates(subset=['subject_id']), on='subject_id', how='left')

# Convert age to numeric, coercing errors to NaN
results_dino['age'] = pd.to_numeric(results_dino['age'], errors='coerce')

# Calculate for race
races = ['White', 'Black', 'Hispanic/Latino', 'Other', 'East Asian', 
         'Indian', 'South East Asian', 'Indigenous American']
race_results = calculate_group_metrics(results_dino, 'race_simplified', races)

# Calculate for sex
sexes = ['M', 'F', 'unknown']
sex_results = calculate_group_metrics(results_dino, 'sex', sexes)

# Create age bins with new ranges
results_dino['age_bin'] = pd.cut(results_dino['age'], 
                                bins=[17, 35, 50, 70, 100],
                                labels=['18-35', '36-50', '51-70', '71+'])
age_results = calculate_group_metrics(results_dino, 'age_bin', 
                                    results_dino['age_bin'].dropna().unique())

# Print results
print("\nMacro AUC-ROC scores by race:")
print(race_results)
print("\nMacro AUC-ROC scores by sex:")
print(sex_results)
print("\nMacro AUC-ROC scores by age bin:")
print(age_results.sort_index())

(5159, 32)

Macro AUC-ROC scores by race:
                     Macro AUC-ROC  Sample Size
White                        0.767         3411
Black                        0.781         1026
Hispanic/Latino              0.711          154
Other                        0.741          123
East Asian                   0.743           84
Indian                       0.594           53
South East Asian             0.499           17
Indigenous American          0.795           14

Macro AUC-ROC scores by sex:
         Macro AUC-ROC  Sample Size
M                0.763         2744
F                0.777         2138
unknown          0.719          277

Macro AUC-ROC scores by age bin:
       Macro AUC-ROC  Sample Size
18-35          0.741           67
36-50          0.800          426
51-70          0.769         2772
71+            0.757         1617


In [113]:
results_clip = pd.read_csv("../../results/nih/clip-linear-bce-1e-5/test_predictions.csv", sep=",")
results_clip

Unnamed: 0,path,age,sex,Effusion_pred,Effusion_true,Cardiomegaly_pred,Cardiomegaly_true,Hernia_pred,Hernia_true,Infiltration_pred,...,Pneumothorax_pred,Pneumothorax_true,Consolidation_pred,Consolidation_true,Fibrosis_pred,Fibrosis_true,Atelectasis_pred,Atelectasis_true,Emphysema_pred,Emphysema_true
0,nih/files/00000003_000.jpg,81,F,0.006558,0.0,0.005496,0.0,0.261728,1.0,0.187914,...,0.009401,0.0,0.016332,0.0,0.049511,0.0,0.126238,0.0,0.035488,0.0
1,nih/files/00000003_001.jpg,74,F,0.033695,0.0,0.000766,0.0,0.021580,1.0,0.072808,...,0.009996,0.0,0.013031,0.0,0.052587,0.0,0.024457,0.0,0.063976,0.0
2,nih/files/00000003_002.jpg,75,F,0.038123,0.0,0.000563,0.0,0.010076,1.0,0.139784,...,0.067669,0.0,0.019529,0.0,0.028977,0.0,0.250383,0.0,0.050591,0.0
3,nih/files/00000003_003.jpg,76,F,0.007421,0.0,0.002291,0.0,0.197293,1.0,0.190932,...,0.006647,0.0,0.008383,0.0,0.057987,0.0,0.084552,0.0,0.028125,0.0
4,nih/files/00000003_004.jpg,77,F,0.026186,0.0,0.005705,0.0,0.136466,1.0,0.141375,...,0.020259,0.0,0.012189,0.0,0.116217,0.0,0.153732,0.0,0.110478,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25591,nih/files/00030800_000.jpg,33,F,0.002021,0.0,0.000814,0.0,0.000123,0.0,0.069264,...,0.001955,0.0,0.001592,0.0,0.003554,0.0,0.002743,0.0,0.000820,0.0
25592,nih/files/00030802_000.jpg,28,M,0.008127,0.0,0.000126,0.0,0.001026,0.0,0.024381,...,0.000690,0.0,0.003689,0.0,0.026131,0.0,0.004384,0.0,0.004066,0.0
25593,nih/files/00030803_000.jpg,42,F,0.000445,0.0,0.000232,0.0,0.000064,0.0,0.026123,...,0.005908,0.0,0.001366,0.0,0.001946,0.0,0.001059,0.0,0.001289,0.0
25594,nih/files/00030804_000.jpg,29,F,0.001043,0.0,0.001165,0.0,0.000260,0.0,0.012811,...,0.000711,0.0,0.000901,0.0,0.001119,0.0,0.001156,0.0,0.000957,0.0


In [114]:
# NIH


results_clip = pd.read_csv("../../results/nih/clip-linear-bce-1e-5/test_predictions.csv", sep=",")
print(results_clip.shape)
# Merge the results with admissions data

# Convert age to numeric, coercing errors to NaN
results_clip['age'] = pd.to_numeric(results_clip['age'], errors='coerce')


# Calculate for sex
sexes = ['M', 'F', 'unknown']
sex_results = calculate_group_metrics(results_clip, 'sex', sexes)

# Create age bins with new ranges
results_clip['age_bin'] = pd.cut(results_clip['age'], 
                                bins=[17, 35, 50, 70, 100],
                                labels=['18-35', '36-50', '51-70', '71+'])
age_results = calculate_group_metrics(results_clip, 'age_bin', 
                                    results_clip['age_bin'].dropna().unique())


print("\nMacro AUC-ROC scores by sex:")
print(sex_results)
print("\nMacro AUC-ROC scores by age bin:")
print(age_results.sort_index())

(25596, 33)

Macro AUC-ROC scores by sex:
         Macro AUC-ROC  Sample Size
F                0.789      10714.0
M                0.792      14882.0
unknown            NaN          NaN

Macro AUC-ROC scores by age bin:
       Macro AUC-ROC  Sample Size
18-35          0.736         6076
36-50          0.796         6431
51-70          0.784        10538
71+            0.772         1481


In [115]:


results_dino = pd.read_csv("../../results/nih/dino-linear-bce-1e-5/test_predictions.csv", sep=",")
print(results_dino.shape)
# Merge the results with admissions data

# Convert age to numeric, coercing errors to NaN
results_dino['age'] = pd.to_numeric(results_dino['age'], errors='coerce')


# Calculate for sex
sexes = ['M', 'F', 'unknown']
sex_results = calculate_group_metrics(results_dino, 'sex', sexes)

# Create age bins with new ranges
results_dino['age_bin'] = pd.cut(results_dino['age'], 
                                bins=[17, 35, 50, 70, 100],
                                labels=['18-35', '36-50', '51-70', '71+'])
age_results = calculate_group_metrics(results_dino, 'age_bin', 
                                    results_dino['age_bin'].dropna().unique())


print("\nMacro AUC-ROC scores by sex:")
print(sex_results)
print("\nMacro AUC-ROC scores by age bin:")
print(age_results.sort_index())

(25596, 33)

Macro AUC-ROC scores by sex:
         Macro AUC-ROC  Sample Size
F                0.789      10714.0
M                0.782      14882.0
unknown            NaN          NaN

Macro AUC-ROC scores by age bin:
       Macro AUC-ROC  Sample Size
18-35          0.739         6076
36-50          0.795         6431
51-70          0.774        10538
71+            0.760         1481
