# MIMIC CSV

This notebook generates `mimic.csv` and `mimic_nf.csv` assuming the following are downloaded:

* [MIMIC-CXR-JPG](https://physionet.org/content/mimic-cxr-jpg/2.0.0/) dataset


In [1]:
from os.path import join

import pandas as pd

from common import read_toml
from common import AGE_INTERVAL
from common import MIMIC_PATHOLOGIES as PATHOLOGIES

UNIQUE_STUDIES = False

### Listing files

In [2]:
ds_name = 'mimic'
config = read_toml('config.toml')
metachest_dir = config['metachest_dir']
base_dir = join(config[f'{ds_name}_dir'], 'physionet.org/files/mimic-cxr-jpg/2.0.0')
!ls -hs1 {base_dir}

total 34M
4.0K files
4.0K index.html
4.0K LICENSE.txt
2.1M mimic-cxr-2.0.0-chexpert.csv.gz
 16M mimic-cxr-2.0.0-metadata.csv.gz
2.1M mimic-cxr-2.0.0-negbio.csv.gz
 12M mimic-cxr-2.0.0-split.csv.gz
2.3M patients.csv.gz
 12K README
4.0K SHA256SUMS.txt


### Generating CSV

In [3]:
meta_csv_path = join(base_dir, 'mimic-cxr-2.0.0-metadata.csv.gz')
meta_df = pd.read_csv(meta_csv_path, compression='gzip', low_memory=False)
meta_df

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,
...,...,...,...,...,...,...,...,...,...,...,...,...
377105,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,CHEST (PA AND LAT),PA,3056,2544,21520708,224550.171,CHEST (PA AND LAT),postero-anterior,Erect
377106,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,19999733,57132437,CHEST (PA AND LAT),LATERAL,3056,2544,21520708,224550.171,CHEST (PA AND LAT),lateral,Erect
377107,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,CHEST (PORTABLE AP),AP,2544,3056,21451104,51448.218,CHEST (PORTABLE AP),antero-posterior,Erect
377108,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,CHEST (PORTABLE AP),AP,3056,2544,21451102,202809.234,CHEST (PORTABLE AP),antero-posterior,Erect


In [4]:
labels_csv_path = join(base_dir, 'mimic-cxr-2.0.0-chexpert.csv.gz')
labels_df = pd.read_csv(labels_csv_path, compression='gzip', low_memory=False)
labels_df

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227822,19999442,58708861,,,,,,,,,1.0,,,,,1.0
227823,19999733,57132437,,,,,,,,,1.0,,,,,
227824,19999987,55368167,1.0,-1.0,,,,,0.0,,,0.0,,,0.0,
227825,19999987,58621812,1.0,,,,,,,,,,,,,1.0


In [None]:
# https://github.com/MIT-LCP/mimic-iv-website/blob/master/content/core/patients.md
# https://physionet.org/content/mimiciv/2.2/hosp/patients.csv.gz
patients_csv_path = join(base_dir, 'patients.csv.gz')
patients_df = pd.read_csv(patients_csv_path, compression='gzip', low_memory=False)
patients_df

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
1,10000048,F,23,2126,2008 - 2010,
2,10000068,F,19,2160,2008 - 2010,
3,10000084,M,72,2160,2017 - 2019,2161-02-13
4,10000102,F,27,2136,2008 - 2010,
...,...,...,...,...,...,...
299707,19999828,F,46,2147,2017 - 2019,
299708,19999829,F,28,2186,2008 - 2010,
299709,19999840,M,58,2164,2008 - 2010,2164-09-17
299710,19999914,F,49,2158,2017 - 2019,


Preprocessing `meta_df`:

In [6]:
original_size = len(meta_df)
print('Available views: ', meta_df.ViewPosition.unique())
meta_df = meta_df.dropna(subset=['ViewPosition'])
meta_df = meta_df.rename(columns={'ViewPosition': 'view'})
meta_df = meta_df[meta_df['view'].isin(['AP', 'PA'])]
meta_df['view'] = meta_df['view'].str.lower()
meta_df = meta_df[['dicom_id', 'subject_id', 'study_id', 'view']]
meta_df = meta_df.reset_index(drop=True)
dropped_size = len(meta_df)
print(f'meta_df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')
meta_df

Available views:  ['PA' 'LATERAL' 'AP' 'LL' nan 'LAO' 'RAO' 'AP AXIAL' 'SWIMMERS' 'PA LLD'
 'AP LLD' 'XTABLE LATERAL' 'AP RLD' 'PA RLD' 'LPO']
meta_df size: original 377110, new 243334, dropped 133776


Unnamed: 0,dicom_id,subject_id,study_id,view
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,pa
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,pa
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,ap
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,ap
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,ap
...,...,...,...,...
243329,3fcd0406-9b111603-feae7033-96632b3a-111333e5,19999733,57132437,pa
243330,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,pa
243331,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,ap
243332,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,ap


Preprocessing `labels_df`:

In [None]:
original_size = len(labels_df)
labels_df = labels_df[~(labels_df['No Finding'] == 1.0)]
labels_df = labels_df.drop(['No Finding'], axis=1)

labels_df = labels_df.rename(columns={
    'Pleural Effusion': 'Effusion',
    'Lung Opacity': 'Lung_opacity',
})
for pathology in PATHOLOGIES:
    labels_df = labels_df.rename(columns={pathology.capitalize(): pathology})

labels_df = labels_df[['subject_id', 'study_id'] + PATHOLOGIES]

labels_df = labels_df.fillna(0)
labels_df = labels_df.replace(-1.0, 0)
labels_df = labels_df.astype(int)

labels_df = labels_df.reset_index(drop=True)
dropped_size = len(labels_df)
print(f'labels_df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')
labels_df

labels_df size: original 227827, new 152372, dropped 75455


Unnamed: 0,subject_id,study_id,atelectasis,cardiomegaly,consolidation,edema,effusion,lung_opacity,pneumonia,pneumothorax
0,10000764,57375967,0,0,1,0,0,0,0,0
1,10000935,50578979,0,0,0,0,1,0,1,0
2,10000935,51178377,0,0,0,0,0,1,0,0
3,10000935,56164612,0,0,0,0,0,1,0,0
4,10000935,56522600,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
152367,19999287,58938059,0,0,0,0,0,1,0,0
152368,19999442,58497551,1,0,0,0,0,1,0,0
152369,19999987,55368167,1,0,0,0,0,0,0,0
152370,19999987,58621812,1,0,0,0,0,0,0,0


Preprocessing `patients_df`:

In [None]:
original_size = len(patients_df)
patients_df = patients_df.rename(columns={'anchor_age': 'age', 'gender': 'sex'})
patients_df = patients_df.dropna(subset=['age', 'sex'])
patients_df = patients_df[patients_df['age'].between(*AGE_INTERVAL, inclusive='both')]
patients_df = patients_df[['subject_id', 'age', 'sex']]
patients_df['sex'] = patients_df['sex'].str.lower()
patients_df = patients_df.reset_index(drop=True)
dropped_size = len(patients_df)
print(f'patients_df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')
patients_df

patients_df size: original 299712, new 274579, dropped 25133


Unnamed: 0,subject_id,age,sex
0,10000032,52,f
1,10000048,23,f
2,10000068,19,f
3,10000084,72,m
4,10000102,27,f
...,...,...,...
274574,19999828,46,f
274575,19999829,28,f
274576,19999840,58,m
274577,19999914,49,f


Join `meta_df` and `patients_df`:

In [None]:
meta_patients_df = pd.merge(meta_df, patients_df, how='inner', on=['subject_id'])
print(f'Drop {len(patients_df) - len(meta_patients_df)} records')
meta_patients_df

Drop 75877 records


Unnamed: 0,dicom_id,subject_id,study_id,view,age,sex
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,pa,52,f
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,pa,52,f
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,ap,52,f
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,ap,52,f
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,ap,52,f
...,...,...,...,...,...,...
198697,3fcd0406-9b111603-feae7033-96632b3a-111333e5,19999733,57132437,pa,19,f
198698,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,pa,19,f
198699,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,ap,57,f
198700,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,ap,57,f


Join all:

In [None]:
all_df = pd.merge(meta_patients_df, labels_df, how='inner', on=['subject_id', 'study_id'])
print(f'Drop {len(meta_df) - len(labels_df)} records')
all_df

Drop 90962 records


Unnamed: 0,dicom_id,subject_id,study_id,view,age,sex,atelectasis,cardiomegaly,consolidation,edema,effusion,lung_opacity,pneumonia,pneumothorax
0,d0b71acc-b5a62046-bbb5f6b8-7b173b85-65cdf738,10000935,50578979,ap,52,f,0,0,0,0,1,0,1,0
1,9b314ad7-fbcb0422-6db62dfc-732858d0-a5527d8b,10000935,51178377,ap,52,f,0,0,0,0,0,1,0,0
2,8e3f2822-0c1d4b71-2a265bbf-5b96e531-ccf5fa30,10000935,56164612,ap,52,f,0,0,0,0,0,1,0,0
3,f1adcae3-2921c0a8-5d9652f9-4191ecd7-f2a96f35,10000935,56522600,pa,52,f,0,0,0,0,0,0,0,0
4,88498b37-c21dc7ba-bc202800-b517a62d-f7ac5bcf,10000935,58219844,ap,52,f,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127903,2eb70dfe-52fa728e-a36e09be-ec0ed3cf-0a2ea7f0,19999287,58938059,ap,71,f,0,0,0,0,0,1,0,0
127904,ee9155f3-944c056b-c76c73d0-3f792f2c-92ae461e,19999442,58497551,ap,41,m,1,0,0,0,0,1,0,0
127905,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,ap,57,f,1,0,0,0,0,0,0,0
127906,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,ap,57,f,1,0,0,0,0,0,0,0


Final df:

In [11]:
all_df['name'] = (
    'p' +
    all_df['subject_id'].astype('string').str[:2] +
    '/' +
    'p' +
    all_df['subject_id'].astype('string') +
    '/' +
    's' +
    all_df['study_id'].astype('string') +
    '/' +
    all_df['dicom_id']
)
if UNIQUE_STUDIES:
    all_df = all_df.groupby(['subject_id']).first()
cols = ['name', 'age', 'sex', 'view'] + PATHOLOGIES
df = all_df[cols]
df.insert(0, 'dataset', 'mimic', True)
df

Unnamed: 0,dataset,name,age,sex,view,atelectasis,cardiomegaly,consolidation,edema,effusion,lung_opacity,pneumonia,pneumothorax
0,mimic,p10/p10000935/s50578979/d0b71acc-b5a62046-bbb5...,52,f,ap,0,0,0,0,1,0,1,0
1,mimic,p10/p10000935/s51178377/9b314ad7-fbcb0422-6db6...,52,f,ap,0,0,0,0,0,1,0,0
2,mimic,p10/p10000935/s56164612/8e3f2822-0c1d4b71-2a26...,52,f,ap,0,0,0,0,0,1,0,0
3,mimic,p10/p10000935/s56522600/f1adcae3-2921c0a8-5d96...,52,f,pa,0,0,0,0,0,0,0,0
4,mimic,p10/p10000935/s58219844/88498b37-c21dc7ba-bc20...,52,f,ap,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
127903,mimic,p19/p19999287/s58938059/2eb70dfe-52fa728e-a36e...,71,f,ap,0,0,0,0,0,1,0,0
127904,mimic,p19/p19999442/s58497551/ee9155f3-944c056b-c76c...,41,m,ap,1,0,0,0,0,1,0,0
127905,mimic,p19/p19999987/s55368167/58766883-376a15ce-3b32...,57,f,ap,1,0,0,0,0,0,0,0
127906,mimic,p19/p19999987/s58621812/7ba273af-3d290f8d-e28d...,57,f,ap,1,0,0,0,0,0,0,0


In [12]:
wf_df = df[df[PATHOLOGIES].any(axis=1)]
nf_df = df[~df[PATHOLOGIES].any(axis=1)]

### Overview

In [13]:
print(
    f'        Total: {df.shape[0]}\n'
    f'With findings: {wf_df.shape[0]:6d}\n'
    f'  No findings: {nf_df.shape[0]:6d}'
)
df[PATHOLOGIES].sum()

        Total: 127908
With findings: 111869
  No findings:  16039


atelectasis      38297
cardiomegaly     36512
consolidation     9183
edema            21894
effusion         43544
lung_opacity     42779
pneumonia        13679
pneumothorax      9215
dtype: int64

### Saving


With findings:

In [14]:
wf_filepath = join(metachest_dir, f'{ds_name}.csv')
wf_df.to_csv(wf_filepath, index=False)
wf_filepath

'/data/datasets/metachest/mimic.csv'

No findings and MTL partition:

In [None]:
def generate_mtl_nf_partition(nf_df, seed=0, mset=(0, 1, 2)):
    n_mtrn = 380503
    n_mval = 6793
    n_mtst = 209198
    n_total = n_mtrn + n_mval + n_mtst
    pct_mtrn = n_mtrn / n_total
    pct_mval = n_mval / n_total
    pct_mtst = n_mtst / n_total

    nf_df = nf_df.iloc[:, :5]
    nf_df = nf_df.sample(frac=1)

    n_total = nf_df.shape[0]
    n_mtrn = int(n_total * pct_mtrn)
    n_mtst = int(n_total * pct_mtst)
    n_mval = n_total - (n_mtrn + n_mtst)

    mtrn_df = nf_df.iloc[:n_mtrn].copy()
    mval_df = nf_df.iloc[n_mtrn:n_mtrn+n_mval].copy()
    mtst_df = nf_df.iloc[n_mtrn+n_mval:].copy()
    mtrn_df['mset'] = mset[0]
    mval_df['mset'] = mset[1]
    mtst_df['mset'] = mset[2]

    nf_mtl_df = pd.concat([mtrn_df, mval_df, mtst_df])
    nf_filepath = join(metachest_dir, f'{ds_name}_nf.csv')
    nf_mtl_df.to_csv(nf_filepath, index=False)

    final_pct_mtrn = mtrn_df.shape[0] / nf_mtl_df.shape[0]
    final_pct_mval = mval_df.shape[0] / nf_mtl_df.shape[0]
    final_pct_mtst = mtst_df.shape[0] / nf_mtl_df.shape[0]

    print(f'Original: '
        f'mtrn={pct_mtrn*100:5.2f}% '
        f'mval={pct_mval*100:5.2f}% '
        f'mtst={pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={final_pct_mtrn*100:5.2f}% '
        f'mval={final_pct_mval*100:5.2f}% '
        f'mtst={final_pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={mtrn_df.shape[0]} '
        f'mval={mval_df.shape[0]} '
        f'mtst={mtst_df.shape[0]}'
    )

    print(f'Saved to {nf_filepath}')


generate_mtl_nf_partition(nf_df)


Original: mtrn=63.79% mval= 1.14% mtst=35.07%
     New: mtrn=63.79% mval= 1.14% mtst=35.07%
     New: mtrn=10231 mval=183 mtst=5625
Saved to /data/datasets/metachest/mimic_nf.csv
