# MIMIC CSV

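This notebook generates `mimic.csv` and `mimic_nf.csv`, assuming the following have been downloaded:

* [MIMIC-CXR-JPG](https://physionet.org/content/mimic-cxr-jpg/2.0.0/) dataset
* `patients.csv.gz` from [MIMIC-IV](https://physionet.org/content/mimiciv/2.2/) (`hosp` module), placed inside the MIMIC-CXR-JPG `2.0.0` directory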


In [None]:
import sys
from os.path import join

import pandas as pd

sys.path.append('../')
from common import read_toml
from common import AGE_INTERVAL
from common import MIMIC_PATHOLOGIES as PATHOLOGIES

# When True, the "Final df" cell collapses the joined table to a single row
# per `subject_id` (via groupby(...).first()) before building the output.
UNIQUE_STUDIES = False

### Listing files

In [None]:
# Dataset key: selects the `<ds_name>_dir` entry of the config and prefixes
# the names of the CSV files written at the end of the notebook.
ds_name = 'mimic'
config = read_toml('../config.toml')
# Directory where the generated CSV files are saved.
metachest_dir = config['metachest_dir']
# Directory holding the MIMIC-CXR-JPG 2.0.0 files read by the cells below.
base_dir = join(config[f'{ds_name}_dir'], 'physionet.org/files/mimic-cxr-jpg/2.0.0')
# Shell listing (with sizes) of that directory, as a quick check of the download.
!ls -hs1 {base_dir}

### Generating CSV

In [None]:
# Metadata table of MIMIC-CXR-JPG; the columns used later in this notebook
# are dicom_id, subject_id, study_id and ViewPosition.
meta_csv_path = join(base_dir, 'mimic-cxr-2.0.0-metadata.csv.gz')
meta_df = pd.read_csv(meta_csv_path, compression='gzip', low_memory=False)
meta_df

In [None]:
# Label table (the `chexpert` file); it is keyed by subject_id and study_id
# and carries a `No Finding` column plus one column per finding.
labels_csv_path = join(base_dir, 'mimic-cxr-2.0.0-chexpert.csv.gz')
labels_df = pd.read_csv(labels_csv_path, compression='gzip', low_memory=False)
labels_df

In [None]:
# Patient table from MIMIC-IV (not part of MIMIC-CXR-JPG); it is expected to
# have been copied into `base_dir`. Columns used later: subject_id,
# anchor_age and gender.
# https://github.com/MIT-LCP/mimic-iv-website/blob/master/content/core/patients.md
# https://physionet.org/content/mimiciv/2.2/hosp/patients.csv.gz
patients_csv_path = join(base_dir, 'patients.csv.gz')
patients_df = pd.read_csv(patients_csv_path, compression='gzip', low_memory=False)
patients_df

Preprocessing `meta_df`:

In [None]:
# Keep only frontal views (AP/PA) and the identifier columns needed for the joins.
original_size = len(meta_df)
print('Available views: ', meta_df.ViewPosition.unique())
# Built as one chain ending in a fresh frame: the lower-casing is done with
# `.assign` instead of assigning into a filtered slice, which avoids pandas'
# SettingWithCopyWarning.
meta_df = (
    meta_df
    .dropna(subset=['ViewPosition'])
    .rename(columns={'ViewPosition': 'view'})
    .loc[
        lambda frame: frame['view'].isin(['AP', 'PA']),
        ['dicom_id', 'subject_id', 'study_id', 'view'],
    ]
    .assign(view=lambda frame: frame['view'].str.lower())
    .reset_index(drop=True)
)
dropped_size = len(meta_df)
print(f'meta_df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')
meta_df

Preprocessing `labels_df`:

In [None]:
original_size = len(labels_df)

# Discard the studies flagged as "No Finding", drop that column, and rename
# the two columns whose target name is not just a re-capitalization.
labels_df = (
    labels_df[labels_df['No Finding'] != 1.0]
    .drop(columns=['No Finding'])
    .rename(columns={
        'Pleural Effusion': 'Effusion',
        'Lung Opacity': 'Lung_opacity',
    })
)
# Rename every remaining capitalized column to its PATHOLOGIES spelling,
# one pathology at a time.
for pathology in PATHOLOGIES:
    labels_df = labels_df.rename(columns={pathology.capitalize(): pathology})

# Keep identifiers and pathology labels only; missing values and -1.0
# entries are both mapped to 0 before casting everything to int.
labels_df = (
    labels_df[['subject_id', 'study_id'] + PATHOLOGIES]
    .fillna(0)
    .replace(-1.0, 0)
    .astype(int)
    .reset_index(drop=True)
)
dropped_size = len(labels_df)
print(f'labels_df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')
labels_df

Preprocessing `patients_df`:

In [None]:
# Keep patients with known sex and an age inside AGE_INTERVAL (bounds included).
original_size = len(patients_df)
# Built as one chain ending in a fresh frame: `sex` is lower-cased with
# `.assign` instead of assigning into a filtered slice, which avoids pandas'
# SettingWithCopyWarning.
patients_df = (
    patients_df
    .rename(columns={'anchor_age': 'age', 'gender': 'sex'})
    .dropna(subset=['age', 'sex'])
    .loc[
        lambda frame: frame['age'].between(*AGE_INTERVAL, inclusive='both'),
        ['subject_id', 'age', 'sex'],
    ]
    .assign(sex=lambda frame: frame['sex'].str.lower())
    .reset_index(drop=True)
)
dropped_size = len(patients_df)
print(f'patients_df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')
patients_df

Join `meta_df` and `patients_df`:

In [None]:
# Attach age/sex to every image record; images whose subject has no row in
# `patients_df` are removed by the inner join.
meta_patients_df = pd.merge(meta_df, patients_df, how='inner', on=['subject_id'])
# The join filters image records, so the drop count is measured against
# `meta_df` (the previous baseline, `patients_df`, counts patients, not images).
print(f'Drop {len(meta_df) - len(meta_patients_df)} records')
meta_patients_df

Join all:

In [None]:
# Attach the study-level labels to every image record; images whose
# (subject_id, study_id) pair is absent from `labels_df` are removed.
all_df = pd.merge(meta_patients_df, labels_df, how='inner', on=['subject_id', 'study_id'])
# Records lost by this join = rows going in minus rows coming out.
print(f'Drop {len(meta_patients_df) - len(all_df)} records')
all_df

Final df:

In [None]:
# Build the record name as
#   p<first two characters of subject_id>/p<subject_id>/s<study_id>/<dicom_id>
subject_str = all_df['subject_id'].astype('string')
study_str = all_df['study_id'].astype('string')
all_df['name'] = (
    'p' + subject_str.str[:2] +
    '/p' + subject_str +
    '/s' + study_str +
    '/' + all_df['dicom_id']
)
if UNIQUE_STUDIES:
    # NOTE(review): despite the flag name, this keeps one row per subject
    # (patient), not one per study -- confirm the intended grouping key.
    # `as_index=False` keeps `subject_id` as a regular column, so this cell
    # can be re-run without a KeyError on `all_df['subject_id']`.
    # `.first()` takes the first non-null value of each column per group.
    all_df = all_df.groupby(['subject_id'], as_index=False).first()
cols = ['name', 'age', 'sex', 'view'] + PATHOLOGIES
# Work on an explicit copy so that adding the `dataset` column never touches `all_df`.
df = all_df[cols].copy()
# Tag every record with the dataset key (same value that prefixes the output files).
df.insert(0, 'dataset', ds_name, allow_duplicates=True)
df

In [None]:
# A record counts as "with findings" when at least one pathology label is set.
has_findings = df[PATHOLOGIES].any(axis=1)
wf_df = df[has_findings]
nf_df = df[~has_findings]

### Overview

In [None]:
# Record counts for the full table and for the with-/no-findings splits.
print(
    f'        Total: {df.shape[0]}\n'
    f'With findings: {wf_df.shape[0]:6d}\n'
    f'  No findings: {nf_df.shape[0]:6d}'
)
# Column-wise sum of the pathology labels over all records.
df[PATHOLOGIES].sum()

### Saving


With findings:

In [None]:
# Write the with-findings records to `<metachest_dir>/<ds_name>.csv`
# (without the DataFrame index) and display the resulting path.
wf_filepath = join(metachest_dir, f'{ds_name}.csv')
wf_df.to_csv(wf_filepath, index=False)
wf_filepath

No findings and MTL partition:

In [None]:
def generate_mtl_nf_partition(nf_df, seed=0, mset=(0, 1, 2)):
    """Shuffle the no-finding records, split them into three meta-sets and save them.

    The records are shuffled, cut into consecutive train / validation / test
    slices whose relative sizes follow the hard-coded reference counts below,
    tagged with an `mset` column and written to
    `<metachest_dir>/<ds_name>_nf.csv`. A summary of the reference and the
    resulting proportions is printed.

    Relies on the notebook-level names `metachest_dir` and `ds_name`.

    Args:
        nf_df: DataFrame of records without findings; only its first five
            columns are kept.
        seed: Random state used for the shuffle, so the same input always
            produces the same partition. Pass None for a non-deterministic
            shuffle.
        mset: Values written to the `mset` column for the train, validation
            and test slices, in that order.

    Returns:
        None. The partition is written to disk.
    """
    # Reference split sizes; only their proportions are used.
    # NOTE(review): presumably the sizes of an existing meta-train/val/test
    # partition that this split is meant to mirror -- confirm the source.
    n_mtrn = 380503
    n_mval = 6793
    n_mtst = 209198
    n_total = n_mtrn + n_mval + n_mtst
    pct_mtrn = n_mtrn / n_total
    pct_mval = n_mval / n_total
    pct_mtst = n_mtst / n_total

    # Keep the first five columns only (the pathology columns of a no-finding
    # table carry no information), then shuffle reproducibly. The shuffle
    # previously ignored `seed`, so every run produced a different partition.
    nf_df = nf_df.iloc[:, :5]
    nf_df = nf_df.sample(frac=1, random_state=seed)

    # Train and test sizes are rounded down; validation absorbs the remainder
    # so that the three slices always cover every record.
    n_total = nf_df.shape[0]
    n_mtrn = int(n_total * pct_mtrn)
    n_mtst = int(n_total * pct_mtst)
    n_mval = n_total - (n_mtrn + n_mtst)

    mtrn_df = nf_df.iloc[:n_mtrn].copy()
    mval_df = nf_df.iloc[n_mtrn:n_mtrn+n_mval].copy()
    mtst_df = nf_df.iloc[n_mtrn+n_mval:].copy()
    mtrn_df['mset'] = mset[0]
    mval_df['mset'] = mset[1]
    mtst_df['mset'] = mset[2]

    nf_mtl_df = pd.concat([mtrn_df, mval_df, mtst_df])
    nf_filepath = join(metachest_dir, f'{ds_name}_nf.csv')
    nf_mtl_df.to_csv(nf_filepath, index=False)

    final_pct_mtrn = mtrn_df.shape[0] / nf_mtl_df.shape[0]
    final_pct_mval = mval_df.shape[0] / nf_mtl_df.shape[0]
    final_pct_mtst = mtst_df.shape[0] / nf_mtl_df.shape[0]

    print(f'Original: '
        f'mtrn={pct_mtrn*100:5.2f}% '
        f'mval={pct_mval*100:5.2f}% '
        f'mtst={pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={final_pct_mtrn*100:5.2f}% '
        f'mval={final_pct_mval*100:5.2f}% '
        f'mtst={final_pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={mtrn_df.shape[0]} '
        f'mval={mval_df.shape[0]} '
        f'mtst={mtst_df.shape[0]}'
    )

    print(f'Saved to {nf_filepath}')

# Partition the no-finding records and write them to `<ds_name>_nf.csv`.
generate_mtl_nf_partition(nf_df)
