# MIMIC CSV

This notebook generates `mimic.csv` assuming the following are downloaded:

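* [MIMIC-CXR-JPG](https://physionet.org/content/mimic-cxr-jpg/2.0.0/) dataset
* `patients.csv.gz` from [MIMIC-IV](https://physionet.org/content/mimiciv/2.2/), placed in the MIMIC-CXR-JPG `2.0.0` directory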


In [None]:
from os.path import join

import pandas as pd

from common import read_toml
from common import AGE_INTERVAL
from common import MIMIC_PATHOLOGIES as PATHOLOGIES

# When True, the final table is collapsed with `groupby(['subject_id']).first()`,
# i.e. to a single row per `subject_id`.
UNIQUE_STUDIES = False

### Listing files

In [None]:
# Load the project configuration and locate the MIMIC-CXR-JPG 2.0.0 files
# under the configured `mimic_dir`.
config = read_toml('config.toml')
base_dir = join(config['mimic_dir'], 'physionet.org/files/mimic-cxr-jpg/2.0.0')
# List the directory contents with human-readable sizes, one entry per line.
!ls -hs1 {base_dir}

### Generating CSV

In [None]:
# Metadata table (gzip-compressed CSV). The columns used later in this notebook
# are `dicom_id`, `subject_id`, `study_id` and `ViewPosition`.
meta_csv_path = join(base_dir, 'mimic-cxr-2.0.0-metadata.csv.gz')
# low_memory=False parses the file in one pass so column dtypes are inferred consistently.
meta_df = pd.read_csv(meta_csv_path, compression='gzip', low_memory=False)
meta_df

In [None]:
# CheXpert label table (gzip-compressed CSV). It is keyed by `subject_id` and
# `study_id` in the joins below and carries one column per finding.
labels_csv_path = join(base_dir, 'mimic-cxr-2.0.0-chexpert.csv.gz')
# low_memory=False parses the file in one pass so column dtypes are inferred consistently.
labels_df = pd.read_csv(labels_csv_path, compression='gzip', low_memory=False)
labels_df

In [None]:
# Patient demographics table from MIMIC-IV (not part of MIMIC-CXR-JPG).
# Column documentation:
# https://github.com/MIT-LCP/mimic-iv-website/blob/master/content/core/patients.md
# Download location:
# https://physionet.org/content/mimiciv/2.2/hosp/patients.csv.gz
# The file is read from `base_dir`, so it must be copied next to the MIMIC-CXR-JPG CSVs.
patients_csv_path = join(base_dir, 'patients.csv.gz')
# low_memory=False parses the file in one pass so column dtypes are inferred consistently.
patients_df = pd.read_csv(patients_csv_path, compression='gzip', low_memory=False)
patients_df

Preprocessing `meta_df`:

In [None]:
# Reduce the metadata table to AP/PA images and the identifier columns needed
# for the joins below, with the view position stored lower-cased in `view`.
original_size = len(meta_df)
print('Available views: ', meta_df.ViewPosition.unique())
meta_df = (
    meta_df
    .dropna(subset=['ViewPosition'])
    .rename(columns={'ViewPosition': 'view'})
    # Keep only the AP and PA views.
    .loc[lambda frame: frame['view'].isin(['AP', 'PA'])]
    .loc[:, ['dicom_id', 'subject_id', 'study_id', 'view']]
    # `.assign` builds a new frame, so the lower-casing never writes into a
    # filtered slice (the pattern that triggers SettingWithCopyWarning).
    .assign(view=lambda frame: frame['view'].str.lower())
    .reset_index(drop=True)
)
dropped_size = len(meta_df)
print(f'meta_df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')
meta_df

Preprocessing `labels_df`:

In [None]:
# Turn the CheXpert labels into binary indicators for the pathologies of
# interest and keep only studies with at least one positive pathology.
original_size = len(labels_df)

# Discard studies explicitly labelled as "No Finding", then the column itself.
has_no_finding = labels_df['No Finding'] == 1.0
labels_df = labels_df[~has_no_finding].drop(columns=['No Finding'])

# Align the column names with the names listed in PATHOLOGIES.
labels_df = labels_df.rename(columns={
    'Pleural Effusion': 'Effusion',
    'Lung Opacity': 'Lung_opacity',
})
for pathology in PATHOLOGIES:
    capitalized = pathology.capitalize()
    labels_df = labels_df.rename(columns={capitalized: pathology})

labels_df = (
    labels_df[['subject_id', 'study_id'] + PATHOLOGIES]
    # Missing (NaN) and -1.0 labels are both mapped to 0.
    .fillna(0)
    .replace(-1.0, 0)
    .astype(int)
    # Keep rows with at least one non-zero pathology label.
    .loc[lambda frame: frame[PATHOLOGIES].any(axis=1)]
    .reset_index(drop=True)
)

dropped_size = len(labels_df)
print(f'labels_df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')
labels_df

Preprocessing `patients_df`:

In [None]:
# Reduce the patients table to `subject_id`, `age` and lower-cased `sex`,
# keeping only patients whose age lies inside AGE_INTERVAL (bounds included).
original_size = len(patients_df)
patients_df = (
    patients_df
    .rename(columns={'anchor_age': 'age', 'gender': 'sex'})
    .dropna(subset=['age', 'sex'])
    .loc[lambda frame: frame['age'].between(*AGE_INTERVAL, inclusive='both')]
    .loc[:, ['subject_id', 'age', 'sex']]
    # `.assign` builds a new frame, so the lower-casing never writes into a
    # filtered slice (the pattern that triggers SettingWithCopyWarning).
    .assign(sex=lambda frame: frame['sex'].str.lower())
    .reset_index(drop=True)
)
dropped_size = len(patients_df)
print(f'patients_df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')
patients_df

Join `meta_df` and `patients_df`:

In [None]:
# Attach `age` and `sex` to every metadata record. The inner join discards
# records whose `subject_id` has no row in `patients_df`.
# `validate` makes the merge fail loudly if `patients_df` ever contains a
# duplicated `subject_id`, which would otherwise silently multiply rows.
meta_patients_df = pd.merge(
    meta_df,
    patients_df,
    how='inner',
    on=['subject_id'],
    validate='many_to_one',
)
# Dropped records are counted against the left table: rows of `meta_df`
# that did not survive the join.
print(f'Drop {len(meta_df) - len(meta_patients_df)} records')
meta_patients_df

Join the result with `labels_df`:

In [None]:
# Attach the pathology labels to every record. The inner join discards records
# whose (`subject_id`, `study_id`) pair has no row in `labels_df`.
# `validate` makes the merge fail loudly if `labels_df` ever contains a
# duplicated key pair, which would otherwise silently multiply rows.
all_df = pd.merge(
    meta_patients_df,
    labels_df,
    how='inner',
    on=['subject_id', 'study_id'],
    validate='many_to_one',
)
# Dropped records are counted against the left table: rows of
# `meta_patients_df` that did not survive the join.
print(f'Drop {len(meta_patients_df) - len(all_df)} records')
all_df

Final df:

In [None]:
# Build the `name` column as
#   p<first two characters of subject_id>/p<subject_id>/s<study_id>/<dicom_id>
subject_ids = all_df['subject_id'].astype('string')
study_ids = all_df['study_id'].astype('string')
all_df['name'] = (
    'p' + subject_ids.str[:2]
    + '/p' + subject_ids
    + '/s' + study_ids
    + '/' + all_df['dicom_id']
)

if UNIQUE_STUDIES:
    # NOTE(review): this groups by `subject_id`, so it yields one row per
    # subject rather than one per study — confirm this matches the flag's intent.
    all_df = all_df.groupby(['subject_id']).first()

# Final column layout: a constant `dataset` column, then name, demographics,
# view and one column per pathology.
cols = ['name', 'age', 'sex', 'view'] + PATHOLOGIES
df = all_df[cols]
df.insert(loc=0, column='dataset', value='mimic', allow_duplicates=True)
df

Check that no normal examples (rows without any positive pathology label) remain; the result below should be empty:

In [None]:
# Select the rows in which no pathology column is set.
has_any_pathology = df[PATHOLOGIES].astype(bool).any(axis=1)
df[~has_any_pathology]

### Saving CSV

In [None]:
# Write the final table, without the index, to `mimic.csv` inside the
# configured `metachest_dir`, then display the destination path.
metachest_dir = read_toml('config.toml')['metachest_dir']
path = join(metachest_dir, 'mimic.csv')
df.to_csv(path, index=False)
path

Overview:

In [None]:
# Number of rows in the final table, followed by the column sum for each pathology.
total_rows = len(df)
print(f'Total: {total_rows}')
df[PATHOLOGIES].sum()