This pipeline generates the 1st stage of the preprocessed dataset that is meant to be fed to models later on. 

Structure (json)

Each top-level entry of the data is a patient

- A patient may contain several admissions
    - each admission has information:
        - codes
            - can be formatted in different ways
        - timestamps
        - clinical notes (todo)

# Data structure

(easier to read in the raw markdown source than in the rendered output)

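{metadata : {groupings: \[list_of_icd9_groupings\]}, <br>
 data: { <br>
  patient_1: { <br>
   grouping1: \[\[code1,code2,code3\], ..., \[code1,code2\]\], <br>
   ... <br>
   groupingn: \[\[code1,code2,code3\], ..., \[code1,code2\]\] <br>
  }, <br>
  ..., <br>
  patient_n: {<br>
  ...<br>
  }<br>
 }<br>
}

Each grouping holds one list of codes per admission of the patient.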

### What is the advantage of having data stored in this way?


- data available on demand to input to models, no need to constantly perform data retrieval and preprocessing

In [1]:
import os
cwd = os.getcwd()

# protection against running this cell multiple times: the cell may only run
# while the notebook still sits one level below the 'master-thesis' folder.
# os.path.basename is used instead of splitting on '/' so that the check does
# not depend on the platform's path separator.
assert os.path.basename(os.path.dirname(cwd)) == 'master-thesis','Oops, directory already changed previously as indended. Ignoring...'

# change working directory (if assert passed)
new_cwd = os.path.dirname(cwd) # parent directory
os.chdir(new_cwd)

In [2]:
# show all outputs: display the value of every top-level expression of a cell,
# not only the last one (later cells rely on this to show both head() and shape)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import os
from MimicIII import MimicIII
from MimicIV import MimicIV
from ICDCodesGrouper import ICDCodesGrouper

import pandas as pd
import numpy as np

#from tqdm.notebook import tqdm

from torch.utils.data import Dataset
from torch import nn
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence, pack_sequence
import torch.nn.functional as F

import json

from config import Settings; settings = Settings()

In [4]:
# One ICD code grouper, built from the project settings, is shared by the
# MIMIC-III and the MIMIC-IV readers.
grouper = ICDCodesGrouper(settings)
mimicIII = MimicIII(settings=settings,grouper=grouper)
mimicIV = MimicIV(settings=settings,grouper=grouper)

## Get patients eligible for modelling

In [6]:
# text file with the ids of the MIMIC-III patients eligible for modelling
filename = 'mimicIII_eligible_patients_exc.nodiag_single.adm.txt'
filepath = os.path.join(settings.data_base,settings.eligible_patients_folder,filename)

# ndmin=1 keeps the result one-dimensional even when the file holds a single
# id; without it np.loadtxt returns a 0-d array, which breaks len() and iteration
mimic_III_eligible_patients = np.loadtxt(filepath,dtype=int,ndmin=1)
print(f"{len(mimic_III_eligible_patients)=}")

len(mimic_III_eligible_patients)=7499


## Build dictionary of data

In [7]:
# Admissions of the eligible patients only, reduced to the columns used below.
# .where() blanks every row of a non-eligible patient with NaN (which is why the
# ids are displayed as floats) and dropna(how='all') then removes those rows.
admission_columns = ['SUBJECT_ID','ADMITTIME','HADM_ID']
adm = (
    mimicIII.read_admissions()
    .where(lambda df: df.SUBJECT_ID.isin(mimic_III_eligible_patients))
    .dropna(how='all')
    .loc[:, admission_columns]
)
adm.head(1)
adm.shape

Unnamed: 0,SUBJECT_ID,ADMITTIME,HADM_ID
25361,20957.0,2100-06-24 22:37:00,113808.0


(19917, 3)

In [18]:
# where it all begins
# data maps patient id -> {grouping name -> one list of codes per admission}
data = {}

# dumb, error prone way of getting only the columns of icd codings (eg.: icd9,ccs,chapters,etc)
grouping_columns = [col for col in mimicIII.read_diagnoses() if col not in ["ROW_ID","SUBJECT_ID","HADM_ID","SEQ_NUM"]]

print_every = 0.2 # fraction of the patients between two progress messages (0.2 -> every 20%)
# Progress is tracked in whole percentage points: repeatedly adding a float step
# drifts (0.1 + 0.1 + 0.1 != 0.3), which can print odd percentages or skip the
# final message for other values of print_every.
step_pct = max(1, round(print_every * 100))
next_pct = step_pct
n_patients = len(mimic_III_eligible_patients)

for idx,p in enumerate(mimic_III_eligible_patients):

    # builtin int instead of a numpy integer (json.dump rejects numpy integer keys)
    p = int(p)

    data[p] = {}

    p_adm_data = adm[adm.SUBJECT_ID == p]

    # sanity check that all admissions are sorted inside each patient data
    assert p_adm_data.ADMITTIME.is_monotonic_increasing, f'Oopsie, p={p}'

    for hadm in p_adm_data['HADM_ID']:
        #diagnoses data
        diag_data = mimicIII.get_diagnoses_for_admission(hadm)

        # if no information about diagnostics then ignore
        if diag_data.ICD9_CODE.isna().all():
            print('No diagnoses found for this admission. Skipping')
            print('patient',p)
            print('admission',int(hadm))
            print('-----\n')
            continue

        # sanity check that diagnostics are sorted (important for future experiments)
        assert diag_data.SEQ_NUM.is_monotonic_increasing, f'Oopsie, p={p}, hadm_id={hadm}'

        # append this admission's codes to the patient's list, once per grouping
        for grouping in grouping_columns:
            data[p].setdefault(grouping, []).append(diag_data[grouping].tolist())

    # exact integer form of (idx+1)/n_patients >= next_pct/100
    while (idx+1)*100 >= next_pct*n_patients:
        print(f'{next_pct}% done.\n')
        next_pct += step_pct

No diagnoses found for this admission. Skipping
patient 690
admission 174817
-----

No diagnoses found for this admission. Skipping
patient 3369
admission 126808
-----

20% done.

No diagnoses found for this admission. Skipping
patient 11438
admission 154602
-----

40% done.

No diagnoses found for this admission. Skipping
patient 24975
admission 109963
-----

60% done.

No diagnoses found for this admission. Skipping
patient 31928
admission 153208
-----

80% done.

No diagnoses found for this admission. Skipping
patient 73686
admission 112990
-----

100% done.



In [19]:
# add metadata of available groupings
metadata = {'groupings':grouping_columns}
# Running this cell again must not nest the dataset inside itself: when `data`
# is already wrapped, reuse its inner per-patient dictionary instead of
# wrapping a second time (patient keys are ids, never 'metadata' or 'data').
patient_data = data['data'] if ('metadata' in data and 'data' in data) else data
data = {'metadata':metadata,'data':patient_data}

## Save

In [20]:
data_id = 'diag_only'
datapath = os.path.join(settings.data_base,settings.model_ready_dataset_folder,data_id)

# create folder of this dataset; makedirs also creates missing parent folders
# and exist_ok=True makes the call a no-op when the folder is already there
os.makedirs(datapath, exist_ok=True)

In [21]:
# Write the MIMIC-III dataset (metadata + per-patient data) as JSON inside the dataset folder.
dataset_filename = 'dataset.json'
dataset_filepath = os.path.join(datapath,dataset_filename)

# JSON object keys are always strings, so the integer patient ids used as
# dictionary keys are stored (and later read back) as strings.
with open(dataset_filepath, 'w') as fp:
    json.dump(data, fp)

In [23]:
# display the path the dataset was written to
dataset_filepath

'data/model_ready_dataset/diag_only/dataset.json'

# Now for MIMIC-IV

## Get patients eligible for modelling

In [5]:
# text file with the ids of the MIMIC-IV patients eligible for modelling
filename = 'mimicIV_eligible_patients_exc.nodiag_single.adm_no.icd10.txt'
filepath = os.path.join(settings.data_base,settings.eligible_patients_folder,filename)

# ndmin=1 keeps the result one-dimensional even when the file holds a single
# id; without it np.loadtxt returns a 0-d array, which breaks len() and iteration
mimic_IV_eligible_patients = np.loadtxt(filepath,dtype=int,ndmin=1)
print(f"{len(mimic_IV_eligible_patients)=}")

len(mimic_IV_eligible_patients)=55483


## Build dictionary of data

In [6]:
# Admissions of the eligible patients only, reduced to the columns of interest.
# .where() blanks every row of a non-eligible patient with NaN (which is why the
# ids are displayed as floats) and dropna(how='all') then removes those rows.
admission_columns = ['subject_id','admittime','hadm_id']
adm = (
    mimicIV.read_admissions()
    .where(lambda df: df.subject_id.isin(mimic_IV_eligible_patients))
    .dropna(how='all')
    .loc[:, admission_columns]
)

# full diagnoses table, used by the next cell to build the dataset
diagnoses = mimicIV.read_diagnoses()

adm.head(1)
adm.shape

Unnamed: 0,subject_id,admittime,hadm_id
14588,16233333.0,2109-08-31 04:20:00,26733622.0


(262727, 3)

In [21]:
# where it all begins
# every eligible patient gets an (initially empty) entry keyed by a builtin int;
# data maps patient id -> {grouping name -> one list of codes per admission}
data = {int(p):{} for p in mimic_IV_eligible_patients}

# dumb, error prone way of getting only the columns of icd codings (eg.: icd9,ccs,chapters,etc)
# (column names come from the already loaded `diagnoses` frame, so the table is not read a second time)
grouping_columns = [col for col in diagnoses if col not in ["icd_version","icd9_chapters","icd9_level3","subject_id","hadm_id","seq_num","hadm_index"]]

# include only eligible patients
diagnoses_eligible = diagnoses[diagnoses.subject_id.isin(mimic_IV_eligible_patients)]

for grouping in grouping_columns:
    # first groupby: one list of codes per (patient, admission), rows kept in table order;
    # second groupby: collect each patient's admission lists into a single list
    res = (diagnoses_eligible
           .groupby(['subject_id','hadm_index'])[grouping]
           .apply(lambda codes: codes.tolist())
           .groupby('subject_id')
           .apply(list)
          )
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    for p,diags in res.items():
        data[p][grouping] = diags

    print(f'{grouping} done')

icd_code done
ccs done
icd9chapters done


## Save

In [24]:
# add metadata of available groupings
metadata = {'groupings':grouping_columns}
# Running this cell again must not nest the dataset inside itself: when `data`
# is already wrapped, reuse its inner per-patient dictionary instead of
# wrapping a second time (patient keys are ids, never 'metadata' or 'data').
patient_data = data['data'] if ('metadata' in data and 'data' in data) else data
data = {'metadata':metadata,'data':patient_data}

In [25]:
data_id = 'diag_only'
datapath = os.path.join(settings.data_base,settings.model_ready_dataset_folder,data_id)

# create folder of this dataset; makedirs also creates missing parent folders
# and exist_ok=True makes the call a no-op when the folder is already there
os.makedirs(datapath, exist_ok=True)

In [26]:
# Write the MIMIC-IV dataset (metadata + per-patient data) as JSON inside the dataset folder.
dataset_filename = 'mimic_iv_quick_baseline_dataset.json'
dataset_filepath = os.path.join(datapath,dataset_filename)

# JSON object keys are always strings, so the integer patient ids used as
# dictionary keys are stored (and later read back) as strings.
with open(dataset_filepath, 'w') as fp:
    json.dump(data, fp)

# Test

### Read

In [27]:
# Round-trip check: load the file at dataset_filepath back from disk.
# The loaded object replaces the in-memory `data`; its patient ids are strings,
# since JSON object keys are always strings.
with open(dataset_filepath,'r') as fp:
    data = json.load(fp)

### read all patient ids

In [28]:
# the keys of the 'data' mapping are the patient ids
patient_ids = [patient_id for patient_id in data['data']]
len(patient_ids)

55483