In [1]:
import pandas as pd
import numpy as np

from src.config import METADATA_PATH, SPLIT_PATH, NEGIBOX_PATH, CHEXPERT_PATH

%load_ext autoreload
%autoreload 2

# Summarize dataset

In [2]:
# load in mapping file
df = pd.read_csv(METADATA_PATH, header=0, sep=',')

n = df.shape[0]
print(f'{n} DICOMs in MIMIC-CXR v2.0.0.')

n = df['study_id'].nunique()
print(f'  {n} studies.')

n = df['subject_id'].nunique()
print(f'  {n} subjects.')

dicoms = set(df['dicom_id'].tolist())

377110 DICOMs in MIMIC-CXR v2.0.0.
  227835 studies.
  65379 subjects.


# Load dataframes

In [3]:
df_split = pd.read_csv(SPLIT_PATH)
df_metadata = pd.read_csv(METADATA_PATH)

In [4]:
df_metadata

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,
...,...,...,...,...,...,...,...,...,...,...,...,...
377105,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,CHEST (PA AND LAT),PA,3056,2544,21520708,224550.171,CHEST (PA AND LAT),postero-anterior,Erect
377106,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,19999733,57132437,CHEST (PA AND LAT),LATERAL,3056,2544,21520708,224550.171,CHEST (PA AND LAT),lateral,Erect
377107,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,CHEST (PORTABLE AP),AP,2544,3056,21451104,51448.218,CHEST (PORTABLE AP),antero-posterior,Erect
377108,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,CHEST (PORTABLE AP),AP,3056,2544,21451102,202809.234,CHEST (PORTABLE AP),antero-posterior,Erect


In [8]:
# initialize view with a mapping from ViewPosition
VIEW_MAP = {
    'AP': 'frontal',
    'PA': 'frontal',
    'LATERAL': 'lateral',
    'LL': 'lateral',
    'LPO': 'other',
    'RAO': 'other',
    'RPO': 'other',
    'LAO': 'other',
    # the below are overwritten in some instances by manual review
    'AP AXIAL': 'other',
    'XTABLE LATERAL': 'other',
    'AP LLD': 'other',
    'PA LLD': 'other',
    'L5 S1': 'other',
    'SWIMMERS': 'other',
    'AP RLD': 'other',
    'PA RLD': 'other',
}
df_metadata['view'] = df_metadata['ViewPosition'].map(VIEW_MAP)

In [9]:
df_metadata

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,view
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect,frontal
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect,lateral
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect,frontal
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect,lateral
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,,frontal
...,...,...,...,...,...,...,...,...,...,...,...,...,...
377105,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,CHEST (PA AND LAT),PA,3056,2544,21520708,224550.171,CHEST (PA AND LAT),postero-anterior,Erect,frontal
377106,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,19999733,57132437,CHEST (PA AND LAT),LATERAL,3056,2544,21520708,224550.171,CHEST (PA AND LAT),lateral,Erect,lateral
377107,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,CHEST (PORTABLE AP),AP,2544,3056,21451104,51448.218,CHEST (PORTABLE AP),antero-posterior,Erect,frontal
377108,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,CHEST (PORTABLE AP),AP,3056,2544,21451102,202809.234,CHEST (PORTABLE AP),antero-posterior,Erect,frontal


In [11]:
splits = ['train', 'validate', 'test']
split_views = df.groupby(['split', 'view'])[['dicom_id']].count()

row_idx = ['frontal', 'lateral']

# number of images in each set
n_images = {}
for c in splits:
    n_images[c] = split_views.loc[c].loc[row_idx, 'dicom_id'].sum()

tbl = pd.DataFrame.from_dict(n_images, orient='index')
tbl.columns = ['Number of images']
tbl = tbl.T

# number of images in each set for each view
n_images = {}
for c in splits:
    n_images[c] = {}
    for view in row_idx:
        n_images[c][view] = split_views.loc[c].loc[view, 'dicom_id']
n_images = pd.DataFrame.from_dict(n_images, orient='index')
n_images = n_images.T


# convert frontal/lateral/other into "N (%)"
for i in n_images.index:
    for c in splits:
        val = n_images.loc[i, c]
        n_images.loc[i, c] = f'{val} ({100.0*val/tbl.loc["Number of images", c]:3.1f}%)'

tbl = pd.concat([tbl, n_images], axis=0, sort=False)

# add in the number of subjects
n_studies = df.groupby('split')[['study_id']].nunique().T
n_studies.index = ['Number of studies']
tbl = pd.concat([tbl, n_studies], axis=0, sort=False)

# studies with a finding
n_studies = df.loc[df['has_negbio_finding']].groupby('split')[['study_id']].nunique().T
n_studies.index = ['  with a finding']
for c in splits:
    val = n_studies.loc['  with a finding', c]
    n_studies.loc['  with a finding', c] = f'{val} ({100.0*val/tbl.loc["Number of studies", c]:3.1f}%)'
tbl = pd.concat([tbl, n_studies], axis=0, sort=False)

# patients
n_pt = df.groupby('split')[['subject_id']].nunique().T
n_pt.index = ['Number of patients']
tbl = pd.concat([tbl, n_pt], axis=0, sort=False)


# patients with a finding
n_studies = df.loc[df['has_negbio_finding']].groupby('split')[['subject_id']].nunique().T
n_studies.index = ['  with a finding']
for c in splits:
    val = n_studies.loc['  with a finding', c]
    n_studies.loc['  with a finding', c] = f'{val} ({100.0*val/tbl.loc["Number of patients", c]:3.1f}%)'
tbl = pd.concat([tbl, n_studies], axis=0, sort=False)

tbl.to_latex('table2.tex')

tbl

  tbl.to_latex('table2.tex')


Unnamed: 0,train,validate,test
Number of images,353622,2867,4837
frontal,237973 (67.3%),1959 (68.3%),3405 (70.4%)
lateral,115649 (32.7%),908 (31.7%),1432 (29.6%)
Number of studies,222758,1808,3269
with a finding,170420 (76.5%),1394 (77.1%),2912 (89.1%)
Number of patients,64586,500,293
with a finding,44157 (68.4%),344 (68.8%),288 (98.3%)


In [18]:
nb['Pleural Effusion'].value_counts(dropna=False, normalize=True)

 NaN    0.616937
 1.0    0.234138
 0.0    0.120872
-1.0    0.028052
Name: Pleural Effusion, dtype: float64

In [19]:
nb

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227822,19999442,58708861,,,,,,,,,1.0,,,,,1.0
227823,19999733,57132437,,,,,,,,,1.0,,,,,
227824,19999987,55368167,1.0,-1.0,,,,,0.0,,,0.0,,,0.0,
227825,19999987,58621812,1.0,,,,,,,,,,,,,1.0


# Frequency of findings

In [12]:
nb = pd.read_csv(NEGIBOX_PATH)
cx = pd.read_csv(CHEXPERT_PATH)


# merge these findings to create a table
# both agree -> output label
# disagree -> output -9

# drop subject_id from cx - we have it in nb
df = nb.merge(
    cx.drop('subject_id', axis=1),
    how='left',
    left_on='study_id', right_on='study_id',
    suffixes=('', '_cx')
)

# subselect to training set
study_ids = set(df_split.loc[df_split['split']=='train', 'study_id'])
df = df.loc[df['study_id'].isin(study_ids)]

# replace numeric labels with meaningful labels
# also annotate disagreements between the two labelers
labels = {0: 'Negative', 1: 'Positive', -1: 'Uncertain', -9: 'Disagreement'}
for c in df.columns:
    if c in ('subject_id', 'study_id'):
        continue
    elif c.endswith('_cx'):
        continue
    
    # chexpert column
    c_cx = f'{c}_cx'
    
    # annotate disagreement
    for val in labels.keys():
        if val == -9:
            continue
        
        # check one is null and the other isn't
        idx = df[c].isnull() & df[c_cx].notnull()
        df.loc[idx, c] = -9
        
        idx = df[c].notnull() & df[c_cx].isnull()
        df.loc[idx, c] = -9
        
        # check both non-null, but different value
        idx = df[c].notnull() & df[c_cx].notnull() & (df[c] != df[c_cx])
        df.loc[idx, c] = -9
        
    # now for those missing in negbio
    idx = df[c].isnull() & df[f'{c}_cx'].notnull()
    df.loc[idx, c] = -9
    
    df[c] = df[c].map(labels)
    
# drop chexpert columns
cols_drop = [c for c in df.columns if c.endswith('_cx')]
df.drop(cols_drop, axis=1, inplace=True)

# display a few example cases
display(df.head(n=10))

# create a summary table of the findings
grp_cols = [c for c in df.columns if c not in ('subject_id', 'study_id')]
tbl = {}
for c in grp_cols:
    tbl[c] = df[c].value_counts().to_dict()
tbl = pd.DataFrame.from_dict(tbl, orient='index')


# pretty format the labels
N = df.shape[0]
for c in tbl.columns:
    tbl[c] = tbl[c].apply(lambda x: f'{x:,} ({100.0*x/N:3.1f}%)')

# sort columns
print(f'Frequency of labels in MIMIC-CXR-JPG on the training subset of {df.shape[0]:,} unique radiologic studies.')
tbl = tbl[['Positive', 'Negative', 'Uncertain', 'Disagreement']]
tbl.to_latex('findings_frequency.tex')
tbl

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,Positive,,,,,
1,10000032,53189527,,,,,,,,,Positive,,,,,
2,10000032,53911762,,,,,,,,,Positive,,,,,
3,10000032,56699142,,,,,,,,,Positive,,,,,
4,10000764,57375967,,,Positive,,,,,,,,,Uncertain,,
5,10000898,50771383,,,,,,,,,Positive,,,,,
6,10000898,54205396,,,,,,,,,Positive,,,,,
7,10000935,50578979,,,,Uncertain,,,,Disagreement,,Positive,,Positive,,
8,10000935,51178377,,,,,,,,Positive,,,,Uncertain,,
9,10000935,55697293,,,,,,,,,Positive,,,,,


Frequency of labels in MIMIC-CXR-JPG on the training subset of 222,750 unique radiologic studies.


  tbl.to_latex('findings_frequency.tex')


Unnamed: 0,Positive,Negative,Uncertain,Disagreement
Atelectasis,"44,012 (19.8%)",921.0 (0.4%),"9,623.0 (4.3%)","1,705 (0.8%)"
Cardiomegaly,"38,002 (17.1%)","15,563.0 (7.0%)","5,753.0 (2.6%)","5,769 (2.6%)"
Consolidation,"10,199 (4.6%)","7,791.0 (3.5%)","2,913.0 (1.3%)","1,576 (0.7%)"
Edema,"25,549 (11.5%)","24,746.0 (11.1%)","11,426.0 (5.1%)","2,282 (1.0%)"
Enlarged Cardiomediastinum,"6,798 (3.1%)","5,158.0 (2.3%)","9,015.0 (4.0%)",248 (0.1%)
Fracture,"3,675 (1.6%)",871.0 (0.4%),295.0 (0.1%),867 (0.4%)
Lung Lesion,"5,939 (2.7%)",822.0 (0.4%),996.0 (0.4%),289 (0.1%)
Lung Opacity,"49,512 (22.2%)","2,794.0 (1.3%)","2,052.0 (0.9%)","2,460 (1.1%)"
No Finding,"74,019 (33.2%)",nan (nan%),nan (nan%),"3,825 (1.7%)"
Pleural Effusion,"51,680 (23.2%)","26,532.0 (11.9%)","5,184.0 (2.3%)","1,617 (0.7%)"


# Report sectioning

In [8]:
df = pd.read_csv(mimic_cxr_path / 'cxr-record-list.csv.gz', header=0, sep=',')
sections = pd.read_csv(mimic_cxr_path / 'mimic-cxr-sections/mimic_cxr_sectioned.csv')

print(sections.columns)
N = df['study_id'].nunique()
print(f'Of the total {N:,} reports.. ')
idx = sections['study'].notnull()
for c in ['impression', 'findings', 'last_paragraph']:
    n = sections.loc[idx, c].count()
    print(f'  {n:,} ({100.0*n/N:3.1f}%) had a {c} section')
    # limit next check to only rows where this section is null
    idx = idx & sections[c].isnull()


Index(['study', 'impression', 'findings', 'last_paragraph', 'comparison'], dtype='object')
Of the total 227,835 reports.. 
  189,561 (83.2%) had a impression section
  27,684 (12.2%) had a findings section
  10,514 (4.6%) had a last_paragraph section
