This pipeline generates the 1st stage of the preprocessed dataset that is meant to be fed to models later on. 

Structure (json)

Each top-level entry of the data is a patient

- A patient may contain several admissions
    - each admission has information:
        - codes
            - can be formatted in different ways
        - timestamps
        - clinical notes (todo)

# Data structure

(easier to read in the raw markdown source than in the rendered output)

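{metadata: {groupings: \[list_of_icd9_groupings\]}, <br>
 data: { <br>
  patient_1: { <br>
   grouping1: \[\[adm1_code1,adm1_code2\],\[adm2_code1,adm2_code2\]\], <br>
   ... <br>
   groupingn: \[\[adm1_code1,adm1_code2\],\[adm2_code1,adm2_code2\]\] <br>
  }, <br>
  ..., <br>
  patient_n: {<br>
  ...<br>
  }<br>
 }<br>
}

Each grouping holds one list of codes per admission, in admission order.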

### What is the advantage of having data stored in this way?


- data available on demand to input to models, no need to constantly perform data retrieval and preprocessing

In [1]:
import os
from Mimic import Mimic
from ICDCodesGrouper import ICDCodesGrouper

import pandas as pd
import numpy as np

#from tqdm.notebook import tqdm

from torch.utils.data import Dataset
from torch import nn
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence, pack_sequence
import torch.nn.functional as F

import json

from config import Settings; settings = Settings()

In [2]:
# Build the ICD code grouper first, then hand it to the MIMIC accessor;
# both are configured from the same project settings object.
grouper = ICDCodesGrouper(settings)
mimic = Mimic(
    settings=settings,
    grouper=grouper,
)

## Get patients eligible for modelling

In [3]:
# Text file listing the patients eligible for modelling; it lives in the
# eligible-patients folder configured in the project settings.
filename = 'eligible_patients_exc.nodiag_single.adm.txt'
filepath = os.path.join(settings.data_base,settings.eligible_patients_folder,filename)

# ndmin=1 keeps the result one-dimensional even when the file holds a single id;
# without it np.loadtxt returns a 0-d array and len() below raises TypeError.
eligible_patients = np.loadtxt(filepath,dtype=int,ndmin=1)
print(f"{len(eligible_patients)=}")

len(eligible_patients)=7499


## Build dictionary of data

In [4]:
# Admissions of the eligible patients only, reduced to the columns used below.
# Assuming read_admissions() returns a pandas DataFrame: `where` blanks out (NaN)
# every row whose SUBJECT_ID is not eligible, and `dropna(how='all')` then
# removes those fully blank rows.
columns_of_interest = ['SUBJECT_ID','ADMITTIME','HADM_ID']
adm = (
    mimic.read_admissions()
    .where(lambda df: df.SUBJECT_ID.isin(eligible_patients))
    .dropna(how='all')
    [columns_of_interest]
)
adm.head(1)
adm.shape

(19917, 3)

In [5]:
# where it all begins
# data[patient_id][grouping] -> list with one entry per admission, each entry
# being the list of codes of that admission under that grouping.
data = {}

# Every column of the diagnoses table that is not a bookkeeping column is taken
# to be an ICD coding (eg.: icd9,ccs,chapters,etc). This is error prone: a new
# non-coding column in that table must also be added to the exclusion set.
non_grouping_columns = {"ROW_ID","SUBJECT_ID","HADM_ID","SEQ_NUM"}
grouping_columns = [col for col in mimic.read_diagnoses() if col not in non_grouping_columns]

# Split the admissions once per patient instead of re-scanning the whole
# admissions frame for every patient. groupby keeps the original row order
# inside each group, so the sort-order check below is unaffected.
admissions_by_patient = {pid: rows for pid, rows in adm.groupby('SUBJECT_ID')}
no_admissions = adm.iloc[0:0]  # empty frame for patients without any admission row

# (patient, admission) pairs skipped for lack of diagnoses, kept for inspection
skipped_admissions = []

print_every = 0.2 # fraction of patients between two progress messages
n_patients = len(eligible_patients)
# Progress is tracked in whole percents so thresholds do not drift through
# repeated floating point additions.
step_pct = max(1, round(print_every * 100))
next_pct = step_pct

for idx,p in enumerate(eligible_patients):

    # built-in int: json cannot serialise numpy integers as dictionary keys
    p = int(p)

    data[p] = {}

    p_adm_data = admissions_by_patient.get(p, no_admissions)

    # sanity check that all admissions are sorted inside each patient data
    assert p_adm_data.ADMITTIME.is_monotonic_increasing, f'Oopsie, p={p}'

    for hadm in p_adm_data['HADM_ID']:
        #diagnoses data
        diag_data = mimic.get_diagnoses_for_admission(hadm)

        # if no information about diagnostics then ignore (but remember which one)
        if diag_data.ICD9_CODE.isna().all():
            print('sanity check. shouldn\'t hit here')
            skipped_admissions.append((p, hadm))
            continue

        # sanity check that diagnostics are sorted (important for future experiments)
        assert diag_data.SEQ_NUM.is_monotonic_increasing, f'Oopsie, p={p}, hadm_id={hadm}'

        # append this admission's codes to the patient's list, once per grouping
        for grouping in grouping_columns:
            data[p].setdefault(grouping, []).append(diag_data[grouping].tolist())

    # integer comparison equivalent to (idx+1)/n_patients >= next_pct/100
    if (idx+1)*100 >= next_pct*n_patients:
        print(f'{next_pct}% done.')
        next_pct += step_pct

sanity check. shouldn't hit here
sanity check. shouldn't hit here
20% done.
sanity check. shouldn't hit here
40% done.
sanity check. shouldn't hit here
60% done.
sanity check. shouldn't hit here
80% done.
sanity check. shouldn't hit here
100% done.


In [6]:
# add metadata of available groupings
metadata = {'groupings':grouping_columns}

# Re-running this cell must not nest the dataset inside itself again: when
# `data` is already the {'metadata', 'data'} wrapper, unwrap the per-patient
# dictionary first and wrap it afresh.
patients_data = data['data'] if set(data) == {'metadata','data'} else data
data = {'metadata':metadata,'data':patients_data}

# save

In [12]:
# Identifier of this dataset; it is also the name of the folder holding its files.
data_id = 'diag_only'
datapath = os.path.join(settings.data_base,settings.model_ready_dataset_folder,data_id)

# create folder of this dataset
# makedirs also creates missing parent folders and exist_ok=True makes the call
# a no-op when the folder is already there (no separate isdir check needed).
os.makedirs(datapath, exist_ok=True)

In [17]:
# Write the whole `data` dictionary to a single JSON file inside the dataset folder.
dataset_filename = 'dataset.json'
dataset_filepath = os.path.join(datapath, dataset_filename)

with open(dataset_filepath, mode='w') as json_file:
    json.dump(data, json_file)

# Test

### Read

In [19]:
# Load the JSON file back into `data`.
# JSON object keys are always strings, so dictionary keys come back as str.
with open(dataset_filepath, mode='r') as json_file:
    data = json.load(json_file)

### read all patient ids

In [20]:
# Iterating a dict yields its keys, i.e. the patient id's stored under 'data'.
patient_ids = list(data['data'])
len(patient_ids)

7499