In [1]:
from pathlib import Path
import pandas as pd

In [4]:
from mae_mimic.config import METADATA_PATH, SPLIT_PATH, NEGIBOX_PATH, CHEXPERT_PATH, DATA_PATH

# Dataset stats

In [5]:
# load in mapping file
df = pd.read_csv(METADATA_PATH, header=0, sep=',')

n = df.shape[0]
print(f'{n} DICOMs in MIMIC-CXR v2.0.0.')

n = df['study_id'].nunique()
print(f'  {n} studies.')

n = df['subject_id'].nunique()
print(f'  {n} subjects.')

dicoms = set(df['dicom_id'].tolist())

377110 DICOMs in MIMIC-CXR v2.0.0.
  227835 studies.
  65379 subjects.


# Load dataframes

In [6]:
df_split = pd.read_csv(SPLIT_PATH)
df_metadata = pd.read_csv(METADATA_PATH)

## Generate view

In [7]:
# initialize view with a mapping from ViewPosition
VIEW_MAP = {
    'AP': 'frontal',
    'PA': 'frontal',
    'LATERAL': 'lateral',
    'LL': 'lateral',
    'LPO': 'other',
    'RAO': 'other',
    'RPO': 'other',
    'LAO': 'other',
    # the below are overwritten in some instances by manual review
    'AP AXIAL': 'other',
    'XTABLE LATERAL': 'other',
    'AP LLD': 'other',
    'PA LLD': 'other',
    'L5 S1': 'other',
    'SWIMMERS': 'other',
    'AP RLD': 'other',
    'PA RLD': 'other',
}

df_metadata['view'] = df_metadata['ViewPosition'].map(VIEW_MAP)

# Merge dataframes

In [8]:
df = df_split.merge(df_metadata.drop(['study_id', 'subject_id'], axis=1),
                   on='dicom_id', how='inner')


nb = pd.read_csv(NEGIBOX_PATH)
# avoid redundant columns
nb.drop('subject_id', axis=1, inplace=True)
findings = [x for x in nb.columns if x != 'study_id']
df = df.merge(nb, how='left', on='study_id')

# indicator flag for the study having a NegBio finding
df['has_negbio_finding'] = df[[x for x in findings if x != 'No Finding']].notnull().sum(axis=1) > 0

df[['dicom_id', 'split', 'view'] + findings].head(n=16).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
dicom_id,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,096052b7-d256dc40-453a102b-fa7d01c6-1b22c6b4,b79e55c3-735ce5ac-64412506-cdc9ea79-f1af521f,dcfeeac4-1597e318-d0e6736a-8b2c2238-47ac3f1b,0c4eb1e1-b801903c-bcebe8a4-3da9cd3c-3b94a27c,2a280266-c8bae121-54d75383-cac046f4-ca37aa16,8959e402-2175d68d-edba5a6c-baab51c3-9359f700,9e7a6aae-2580e589-6212d336-9813ebbd-a9239a34,b75df1bd-0f22d631-52d73526-2ae7b85a-d843b39d,d0b71acc-b5a62046-bbb5f6b8-7b173b85-65cdf738
split,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
view,frontal,lateral,frontal,lateral,frontal,frontal,frontal,frontal,lateral,lateral,lateral,frontal,lateral,lateral,frontal,frontal
Atelectasis,,,,,,,,,,,,,,,,
Cardiomegaly,,,,,,,,,,,,,,,,
Consolidation,,,,,,,,1.0,1.0,1.0,,,,,,
Edema,,,,,,,,,,,,,,,,-1.0
Enlarged Cardiomediastinum,,,,,,,,,,,,,,,,
Fracture,,,,,,,,,,,,,,,,
Lung Lesion,,,,,,,,,,,,,,,,


# Frequency of findings

In [9]:
nb = pd.read_csv(NEGIBOX_PATH)
cx = pd.read_csv(CHEXPERT_PATH)

# merge these findings to create a table
# both agree -> output label
# disagree -> output -9

# drop subject_id from cx - we have it in nb
df = nb.merge(
    cx.drop('subject_id', axis=1),
    how='left',
    left_on='study_id', right_on='study_id',
    suffixes=('', '_cx')
)

# subselect to training set
study_ids = set(df_split.loc[df_split['split']=='train', 'study_id'])
df = df.loc[df['study_id'].isin(study_ids)]

# replace numeric labels with meaningful labels
# also annotate disagreements between the two labelers
labels = {0: 'Negative', 1: 'Positive', -1: 'Uncertain', -9: 'Disagreement'}
for c in df.columns:
    if c in ('subject_id', 'study_id'):
        continue
    elif c.endswith('_cx'):
        continue
    
    # chexpert column
    c_cx = f'{c}_cx'
    
    # annotate disagreement
    for val in labels.keys():
        if val == -9:
            continue
        
        # check one is null and the other isn't
        idx = df[c].isnull() & df[c_cx].notnull()
        df.loc[idx, c] = -9
        
        idx = df[c].notnull() & df[c_cx].isnull()
        df.loc[idx, c] = -9
        
        # check both non-null, but different value
        idx = df[c].notnull() & df[c_cx].notnull() & (df[c] != df[c_cx])
        df.loc[idx, c] = -9
        
    # now for those missing in negbio
    idx = df[c].isnull() & df[f'{c}_cx'].notnull()
    df.loc[idx, c] = -9
    
    df[c] = df[c].map(labels)
    
# drop chexpert columns
cols_drop = [c for c in df.columns if c.endswith('_cx')]
df.drop(cols_drop, axis=1, inplace=True)

# display a few example cases
# display(df.head(n=10))

# create a summary table of the findings
grp_cols = [c for c in df.columns if c not in ('subject_id', 'study_id')]
tbl = {}
for c in grp_cols:
    tbl[c] = df[c].value_counts().to_dict()
tbl = pd.DataFrame.from_dict(tbl, orient='index')


# pretty format the labels
N = df.shape[0]
for c in tbl.columns:
    tbl[c] = tbl[c].apply(lambda x: f'{x:,} ({100.0*x/N:3.1f}%)')

# sort columns
print(f'Frequency of labels in MIMIC-CXR-JPG on the training subset of {df.shape[0]:,} unique radiologic studies.')
tbl = tbl[['Positive', 'Negative', 'Uncertain', 'Disagreement']]
tbl.style.to_latex('findings_frequency.tex')
tbl

Frequency of labels in MIMIC-CXR-JPG on the training subset of 222,750 unique radiologic studies.


Unnamed: 0,Positive,Negative,Uncertain,Disagreement
Atelectasis,"44,012 (19.8%)",921.0 (0.4%),"9,623.0 (4.3%)","1,705 (0.8%)"
Cardiomegaly,"38,002 (17.1%)","15,563.0 (7.0%)","5,753.0 (2.6%)","5,769 (2.6%)"
Consolidation,"10,199 (4.6%)","7,791.0 (3.5%)","2,913.0 (1.3%)","1,576 (0.7%)"
Edema,"25,549 (11.5%)","24,746.0 (11.1%)","11,426.0 (5.1%)","2,282 (1.0%)"
Enlarged Cardiomediastinum,"6,798 (3.1%)","5,158.0 (2.3%)","9,015.0 (4.0%)",248 (0.1%)
Fracture,"3,675 (1.6%)",871.0 (0.4%),295.0 (0.1%),867 (0.4%)
Lung Lesion,"5,939 (2.7%)",822.0 (0.4%),996.0 (0.4%),289 (0.1%)
Lung Opacity,"49,512 (22.2%)","2,794.0 (1.3%)","2,052.0 (0.9%)","2,460 (1.1%)"
No Finding,"74,019 (33.2%)",nan (nan%),nan (nan%),"3,825 (1.7%)"
Pleural Effusion,"51,680 (23.2%)","26,532.0 (11.9%)","5,184.0 (2.3%)","1,617 (0.7%)"


In [10]:
metadata_cols = ['dicom_id', 'subject_id', 'study_id', 'view']
nb_cols = ['Pleural Effusion', 'subject_id', 'study_id']
split_cols = ['dicom_id', 'split']

df = df_metadata[metadata_cols]\
        .merge(nb[nb_cols], on=['subject_id', 'study_id'])\
        .drop(columns=['subject_id', 'study_id'])\
        .merge(df_split[split_cols], on='dicom_id')
df = df[df.view == 'frontal']


df['Pleural Effusion'] = df['Pleural Effusion'].fillna('na')
mapper = {
    -1   : 0,
    'na' : 0,
    0    : 0,
    1    : 1
}
df['Pleural Effusion_mapped'] = df['Pleural Effusion'].map(mapper)


df.groupby(['split', 'Pleural Effusion_mapped']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,dicom_id,view,Pleural Effusion
split,Pleural Effusion_mapped,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
test,0,2327,2327,2327
test,1,1076,1076,1076
train,0,182839,182839,182839
train,1,55123,55123,55123
validate,0,1461,1461,1461
validate,1,498,498,498
