This pipeline generates the 1st stage of the preprocessed dataset that is meant to be fed to models later on. 

Structure (json)

Each level is a patient

- A patient may contain several admissions
    - each admission has information:
        - codes
            - can be formatted in different ways
        - timestamps
        - clinical notes (todo)

# Data structure

(easier to visualize in md editor rather than md compiled)

{metadata : {groupings: \[list_of_icd9_groupings\] <br>
 data: { <br>
  patient_1: { <br>
   grouping1: \[code1,code2,code3\], <br>
   ... <br>
   groupingn: \[code1,code2,code3\] <br>
  }, <br>
  ..., <br>
  patient_n: {<br>
  ...<br>
  }<br>
}

### What is the advantage of having data stored in this way?


- data available on demand to input to models, no need to constantly perform data retrieval and preprocessing

In [1]:
import os
cwd = os.getcwd()

# protection against running this cell multiple times
assert os.path.dirname(cwd).split('/')[-1] == 'master-thesis','Oops, directory already changed previously as indended. Ignoring...'

# change working directory (if assert passed)
new_cwd = os.path.dirname(cwd) # parent directory
os.chdir(new_cwd)

In [2]:
# show all outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [17]:
import os
from MimicIII import MimicIII
from MimicIV import MimicIV
from ICDCodesGrouper import ICDCodesGrouper

import pandas as pd
import numpy as np

#from tqdm.notebook import tqdm

from torch.utils.data import Dataset
from torch import nn
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence, pack_sequence
import torch.nn.functional as F

import matplotlib.pyplot as plt
import json

from config import Settings; settings = Settings()

In [4]:
grouper = ICDCodesGrouper(settings)
mimicIII = MimicIII(settings=settings,grouper=grouper)
mimicIV = MimicIV(settings=settings,grouper=grouper)

## Get admissions eligible for modelling

In [5]:
#possible m [1,3,6,12]
m = 6
filename = f'mimicIII_eligible_admissions_time_window_{m}m_exc.nodiag_single.window.txt'
filepath = os.path.join(settings.data_base,settings.eligible_patients_folder,filename)

mimic_III_window_eligible_admissions = np.loadtxt(filepath,dtype=int)
print(f"{len(mimic_III_window_eligible_admissions)=}")

len(mimic_III_window_eligible_admissions)=13261


# Timewindows

In [6]:
admissions = mimicIII.read_admissions()
diagnoses = mimicIII.read_diagnoses()
admissions = admissions[admissions.HADM_ID.isin(mimic_III_window_eligible_admissions)]
admissions.head(3)

print('patients:',admissions.SUBJECT_ID.nunique())

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
25361,20957,113808,2100-06-24 22:37:00,2100-07-03 12:31:00,NaT,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Private,,PROTESTANT QUAKER,MARRIED,WHITE,2100-06-24 13:37:00,2100-06-25 00:10:00,BILATERAL PNEUMONIA,0,1
7378,4521,167070,2100-06-28 19:29:00,2100-07-30 11:02:00,NaT,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Medicare,,CATHOLIC,SINGLE,WHITE,NaT,NaT,ISCHEMIC ULCER R GREAT TOE;DIABETES,0,0
11061,9319,137275,2100-07-01 12:00:00,2100-07-15 16:30:00,NaT,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,ENGL,PROTESTANT QUAKER,WIDOWED,WHITE,NaT,NaT,LEFT RENAL CA/SDA,0,1


patients: 4433


In [7]:
timewindows = (admissions
               .groupby('SUBJECT_ID')
               .apply(lambda subdf: (subdf
                                     .set_index('ADMITTIME')
                                     .sort_index()
                                     .resample(f'{m*30}d')
                                     .HADM_ID
                                     .apply(list)
                                    )
                     )
              )

In [8]:
timewindows.apply(len).value_counts().rename('nº windows per patient distribution')

0     19176
1      9045
2      1375
3       325
4        69
5        25
6         7
7         4
11        1
9         1
Name: nº windows per patient distribution, dtype: int64

In [21]:
admissions.groupby('SUBJECT_ID').ADMITTIME.apply(lambda x: x.diff().median().days).describe().rename('MIMIC-III Median difference between admissions in all patients')

count    4433.000000
mean      737.257162
std       724.537495
min         4.000000
25%       231.000000
50%       464.000000
75%      1017.000000
max      4121.000000
Name: MIMIC-III Median difference between admissions in all patients, dtype: float64

In [20]:
diagnostic_cols_mimicIII = [col for col in diagnoses if col not in ['ROW_ID','SUBJECT_ID','HADM_ID','SEQ_NUM']]

In [22]:
# create mask of the targets. 
# Then for each target gather the features (history up to that point) This will define our datapoints

pred_window = m # months

windows_w_admissions_mask = timewindows.apply(lambda x: len(x) > 0)
targets_mask = windows_w_admissions_mask.groupby('SUBJECT_ID').shift(pred_window,fill_value=False)

targets = timewindows[targets_mask]

print('targets')
targets

targets


SUBJECT_ID  ADMITTIME 
23          2156-08-18          []
34          2189-07-02          []
36          2134-04-14    [165660]
85          2165-02-14          []
107         2118-02-04          []
                            ...   
98347       2124-10-11    [177195]
98761       2188-12-31          []
98813       2131-10-27          []
99088       2175-12-06          []
99650       2155-05-23          []
Name: HADM_ID, Length: 2347, dtype: object

In [23]:
targets.apply(len).value_counts()

0    1740
1     480
2      91
3      23
4      10
5       2
6       1
Name: HADM_ID, dtype: int64

In [24]:
targets

SUBJECT_ID  ADMITTIME 
23          2156-08-18          []
34          2189-07-02          []
36          2134-04-14    [165660]
85          2165-02-14          []
107         2118-02-04          []
                            ...   
98347       2124-10-11    [177195]
98761       2188-12-31          []
98813       2131-10-27          []
99088       2175-12-06          []
99650       2155-05-23          []
Name: HADM_ID, Length: 2347, dtype: object

# How many admissions within each target

In [12]:
targets.apply(len).value_counts()

0    8581
1    1847
2     123
3       3
Name: HADM_ID, dtype: int64

In [143]:
idx = pd.IndexSlice
#admissions_w_index = admissions.set_index(['SUBJECT_ID','ADMITTIME']).sort_index() # to speed up queries below
targets.to_frame().reset_index().apply(lambda row: 
                         (admissions_w_index
                          .loc[idx[row.SUBJECT_ID,:row.ADMITTIME],'HADM_ID'],
                         ),
                         axis=1
                        )

0                ([194023],)
1                      ([],)
2                ([109451],)
3                      ([],)
4                ([152223],)
                ...         
16945                  ([],)
16946            ([164914],)
16947                  ([],)
16948            ([151454],)
16949    ([151454, 112748],)
Length: 16950, dtype: object

In [148]:
admissions[(admissions.SUBJECT_ID == 21) & (admissions.ADMITTIME < '2134-09-11')]

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA


In [145]:
targets.to_frame().reset_index()

Unnamed: 0,SUBJECT_ID,ADMITTIME,HADM_ID
0,17,2135-01-26,[]
1,21,2134-09-11,[109451]
2,21,2134-10-11,[]
3,23,2153-09-03,[152223]
4,23,2153-10-03,[]
...,...,...,...
16945,99923,2201-02-23,[164914]
16946,99923,2201-03-25,[]
16947,99982,2156-11-28,[151454]
16948,99982,2156-12-28,[112748]


In [None]:
targets.to_frame().apply(lambda row: ,axis=1)

In [None]:
pd.Series().diff()

In [42]:
(timewindows
 .groupby('SUBJECT_ID')
 .apply(lambda subdf: )

SUBJECT_ID  ADMITTIME 
17          2134-12-27    [194023]
            2135-01-26          []
            2135-02-25          []
            2135-03-27          []
            2135-04-26    [161087]
                            ...   
99923       2201-03-25          []
            2201-04-24    [192053]
99982       2156-11-28    [151454]
            2156-12-28    [112748]
            2157-01-27    [183791]
Name: HADM_ID, Length: 175257, dtype: object

In [28]:
res = (timewindows.iloc[:2]
       .apply(lambda row: 
              pd.DataFrame(
                  [
                      diagnoses
                      .loc[diagnoses.HADM_ID == adm, diagnostic_cols_mimicIII]
                      .to_dict(orient='list')
                      for adm in row # obtain a list of diagnoses of each admission inside each window
                  ]
              ).sum() # concat all lists of diagnoses of each admission inside each window to make one big list per window
             )
      )

In [29]:
res = (timewindows.iloc[:50]
       .apply(lambda row: 
              pd.DataFrame(
                  [
                      diagnoses
                      .loc[diagnoses.HADM_ID == adm, diagnostic_cols_mimicIII]
                      .to_dict(orient='list')
                      for adm in row # obtain a list of diagnoses of each admission inside each window
                  ]
              ).sum() # concat all lists of diagnoses of each admission inside each window to make one big list per window
             )
      )

In [32]:
admissions[admissions.SUBJECT_ID == 23]

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
1,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,NaT,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,NaT,NaT,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,NaT,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,NaT,NaT,BRAIN MASS,0,1


In [35]:
admissions.head(2)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
25361,20957,113808,2100-06-24 22:37:00,2100-07-03 12:31:00,NaT,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Private,,PROTESTANT QUAKER,MARRIED,WHITE,2100-06-24 13:37:00,2100-06-25 00:10:00,BILATERAL PNEUMONIA,0,1
7378,4521,167070,2100-06-28 19:29:00,2100-07-30 11:02:00,NaT,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Medicare,,CATHOLIC,SINGLE,WHITE,NaT,NaT,ISCHEMIC ULCER R GREAT TOE;DIABETES,0,0


In [34]:
admissions.ADMITTIME.diff()

25361                 NaT
7378      3 days 20:52:00
11061     2 days 16:31:00
37595     1 days 07:28:00
17400    12 days 01:24:00
               ...       
56600    84 days 04:42:00
3611    174 days 04:45:00
29422   155 days 09:35:00
31085    16 days 16:16:00
30835   382 days 03:22:00
Name: ADMITTIME, Length: 17680, dtype: timedelta64[ns]

In [39]:
res = (admissions
 .groupby('SUBJECT_ID')
 .ADMITTIME
 .apply(lambda series: series.diff().max())
)

In [40]:
res

SUBJECT_ID
17       133 days 06:56:00
21       141 days 08:33:00
23      1506 days 12:19:00
34      1680 days 12:37:00
36      1093 days 15:41:00
               ...        
99822     16 days 23:23:00
99883    265 days 06:55:00
99897    331 days 17:28:00
99923     80 days 16:30:00
99982     42 days 00:04:00
Name: ADMITTIME, Length: 6397, dtype: timedelta64[ns]

In [41]:
res.describe()

count                           6397
mean     688 days 10:45:04.905424408
std      760 days 23:17:29.296653968
min                 16 days 23:23:00
25%                117 days 07:46:00
50%                391 days 12:15:00
75%               1011 days 12:53:00
max               4121 days 03:16:00
Name: ADMITTIME, dtype: object

In [37]:
res.days.describe()

AttributeError: 'DataFrame' object has no attribute 'days'

## Build dictionary of data

In [21]:
adm = mimicIII.read_admissions().where(lambda df: df.SUBJECT_ID.isin(mimic_III_eligible_patients)).dropna(how='all')[['SUBJECT_ID','ADMITTIME','HADM_ID']]
adm.head(1)
adm.shape

(19917, 3)

In [22]:
# where it all begins
data = {}

# dumb, error prone way of getting only the columns of icd codings (eg.: icd9,ccs,chapters,etc)
grouping_columns = [col for col in mimicIII.read_diagnoses() if col not in ["ROW_ID","SUBJECT_ID","HADM_ID","SEQ_NUM"]]

print_every = 0.2 # percent
current = print_every
for idx,p in enumerate(eligible_patients):
    
    p = int(p)
    
    data[p] = {}
    
    p_adm_data = adm[adm.SUBJECT_ID == p]
    
    # sanity check that all admissions are sorted inside each patient data
    assert p_adm_data.ADMITTIME.is_monotonic_increasing, f'Oopsie, p={p}'
    
    for hadm in p_adm_data['HADM_ID']:
        #diagnoses data
        diag_data = mimic.get_diagnoses_for_admission(hadm)
        
        # if no information about diagnostics then ignore
        if diag_data.ICD9_CODE.isna().all():
            print('No diagnoses found for this admission. Skipping')
            print('patient',p)
            print('admission',int(hadm))
            print('-----\n')
            continue
        
        # sanity check that diagnostics are sorted (important for future experiments)
        assert diag_data.SEQ_NUM.is_monotonic_increasing, f'Oopsie, p={p}, hadm_id={hadm}'
        
        for grouping in grouping_columns:
            if grouping not in data[p].keys():
                data[p][grouping] = [diag_data[grouping].tolist()]
            else:
                data[p][grouping].append(diag_data[grouping].tolist())
    if (idx+1)/len(eligible_patients) >= current:
        print(f'{int(current*100)}% done.\n')
        current += print_every

No diagnoses found for this admission. Skipping
patient 690
admission 174817
-----

No diagnoses found for this admission. Skipping
patient 3369
admission 126808
-----

20% done.

No diagnoses found for this admission. Skipping
patient 11438
admission 154602
-----

40% done.

No diagnoses found for this admission. Skipping
patient 24975
admission 109963
-----

60% done.

No diagnoses found for this admission. Skipping
patient 31928
admission 153208
-----

80% done.

No diagnoses found for this admission. Skipping
patient 73686
admission 112990
-----

100% done.



In [23]:
# add metadata of available groupings
metadata = {'groupings':grouping_columns}
data = {'metadata':metadata,'data':data}

# save

In [24]:
data_id = 'diag_only'
datapath = os.path.join(settings.data_base,settings.model_ready_dataset_folder,data_id)

# create folder of this dataset
if not os.path.isdir(datapath):
    os.mkdir(datapath)

In [25]:
dataset_filename = 'dataset.json'
dataset_filepath = os.path.join(datapath,dataset_filename)

with open(dataset_filepath, 'w') as fp:
    json.dump(data, fp)

# Now for MIMIC-IV

## Get patients elegible for modelling

In [5]:
filename = 'mimicIV_eligible_patients_exc.nodiag_single.adm_no.icd10.txt'
filepath = os.path.join(settings.data_base,settings.eligible_patients_folder,filename)

mimic_IV_eligible_patients = np.loadtxt(filepath,dtype=int)
print(f"{len(mimic_IV_eligible_patients)=}")

len(mimic_IV_eligible_patients)=55483


## Build dictionary of data

In [6]:
adm = mimicIV.read_admissions().where(lambda df: df.subject_id.isin(mimic_IV_eligible_patients)).dropna(how='all')[['subject_id','admittime','hadm_id']]
diagnoses = mimicIV.read_diagnoses()
adm.head(1)
adm.shape

Unnamed: 0,subject_id,admittime,hadm_id
14588,16233333.0,2109-08-31 04:20:00,26733622.0


(262727, 3)

In [21]:
# where it all begins
data = {int(p):{} for p in mimic_IV_eligible_patients} 

# dumb, error prone way of getting only the columns of icd codings (eg.: icd9,ccs,chapters,etc)
grouping_columns = [col for col in mimicIV.read_diagnoses() if col not in ["icd_version","icd9_chapters","icd9_level3","subject_id","hadm_id","seq_num","hadm_index"]]

# include only eligible patients
diagnoses_eligible = diagnoses[diagnoses.subject_id.isin(mimic_IV_eligible_patients)]

for grouping in grouping_columns:
    res = (diagnoses_eligible
           .groupby(['subject_id','hadm_index'])
           .apply(lambda subdf:subdf[grouping].tolist())
           .groupby('subject_id')
           .apply(list)
          )
    for idx,(p,diags) in enumerate(res.iteritems()):
        data[p][grouping] = diags
        
    print(f'{grouping} done')

icd_code done
ccs done
icd9chapters done


## Save

In [24]:
# add metadata of available groupings
metadata = {'groupings':grouping_columns}
data = {'metadata':metadata,'data':data}

In [25]:
data_id = 'diag_only'
datapath = os.path.join(settings.data_base,settings.model_ready_dataset_folder,data_id)

# create folder of this dataset
if not os.path.isdir(datapath):
    os.mkdir(datapath)

In [26]:
dataset_filename = 'mimic_iv_quick_baseline_dataset.json'
dataset_filepath = os.path.join(datapath,dataset_filename)

with open(dataset_filepath, 'w') as fp:
    json.dump(data, fp)

# Test

### Read

In [27]:
with open(dataset_filepath,'r') as fp:
    data = json.load(fp)

### read all patient ids

In [28]:
patient_ids = list(data['data'].keys()) # patient id's
len(patient_ids)

55483