In [1]:
import os
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)

# Protection against running this cell multiple times: the directory change is
# only valid while the notebook's working directory sits directly inside the
# 'master-thesis' folder. os.path.basename is used instead of splitting on '/'
# so the check also works with other path separators.
assert os.path.basename(parent_dir) == 'master-thesis', 'Working directory was already changed (or the notebook is not located one level below master-thesis). Not changing it again.'

# change working directory (only reached if the assert passed)
new_cwd = parent_dir # parent directory
os.chdir(new_cwd)

In [2]:
# Show the output of every top-level expression in a cell, not only the last
# one. Later cells that end with several bare expressions rely on this setting.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import os
import json

from rnn_utils import DiagnosesDataset, split_dataset, MYCOLLATE
from rnn_utils import RNN, train_one_epoch, eval_model

from config import Settings; settings = Settings()

import torch
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.model_selection import ParameterGrid, ParameterSampler

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

import wandb

In [9]:
# One-row summary table: monthly visit statistics for the three prior years
# next to the figure for March 2020.
df = pd.DataFrame(
    data=[[447,350,542,250]],
    index=['Total ed visits by all 972 High Users'],
    columns=[
        'Monthly average 3 years prior',
        'Monthly min 3 years prior',
        'Monthly max 3 years prior',
        "Month of March 2020",
    ],
)
df

Unnamed: 0,Monthly average 3 years prior,Monthly min 3 years prior,Monthly max 3 years prior,Month of March 2020
Total ed visits by all 972 High Users,447,350,542,250


In [28]:
# List the files in the `icu` folder of the local MIMIC-IV 1.0 download.
# NOTE(review): the relative path depends on the working directory set in the first cell.
os.listdir('../Datasets/physionet.org/files/mimiciv/1.0/icu')

['chartevents.csv.gz',
 'procedureevents.csv.gz',
 'icustays.csv.gz',
 'index.html',
 'datetimeevents.csv.gz',
 'd_items.csv.gz',
 'outputevents.csv.gz',
 'inputevents.csv.gz']

In [26]:
# List the files in the `core` folder of the local MIMIC-IV 1.0 download
# (contains admissions.csv.gz, which MimicIV reads).
os.listdir('../Datasets/physionet.org/files/mimiciv/1.0/core')

['index.html', 'patients.csv.gz', 'admissions.csv.gz', 'transfers.csv.gz']

In [24]:
# List the files in the `hosp` folder of the local MIMIC-IV 1.0 download
# (contains diagnoses_icd.csv.gz and d_icd_diagnoses.csv.gz, which MimicIV reads).
os.listdir('../Datasets/physionet.org/files/mimiciv/1.0/hosp')

['services.csv.gz',
 'hcpcsevents.csv.gz',
 'd_icd_diagnoses.csv.gz',
 'labevents.csv.gz',
 'drgcodes.csv.gz',
 'pharmacy.csv.gz',
 'index.html',
 'emar.csv.gz',
 'poe.csv.gz',
 'diagnoses_icd.csv.gz',
 'microbiologyevents.csv.gz',
 'poe_detail.csv.gz',
 'emar_detail.csv.gz',
 'prescriptions.csv.gz',
 'd_hcpcs.csv.gz',
 'd_labitems.csv.gz',
 'd_icd_procedures.csv.gz',
 'procedures_icd.csv.gz']

In [None]:
mimic

In [84]:
%%file MimicIV.py
import numpy as np
import pandas as pd
from config import Settings; settings = Settings()
import os

class MimicIV:
    """Loads the MIMIC-IV admissions and diagnoses tables into DataFrames.

    On construction both tables are read eagerly:

    * ``self.admissions``: the admissions table sorted by ``admittime`` with an
      extra ``hadm_index`` column (0-based position of the admission within
      its patient's admissions).
    * ``self.diagnoses``: the ICD-9 rows of the diagnoses table, optionally
      enriched with one column per grouper, carrying ``hadm_index`` and sorted
      by ``hadm_index`` with ``seq_num`` order kept inside each admission.

    Parameters
    ----------
    settings
        Object exposing ``mimic_path`` and ``mimic_iv_path``; joined, they must
        point to the folder that contains the ``core`` and ``hosp`` folders.
    grouper
        Optional object exposing ``get_available_groupers()`` and
        ``lookup(name, codes)``. When given, every available grouper adds a
        column (named after the grouper) to the diagnoses table.

    Raises
    ------
    AssertionError
        If the dataset folder does not exist.
    """

    def __init__(self,settings,grouper=None):

        self.grouper = grouper
        self.path_dataset_folder = os.path.join(settings.mimic_path,settings.mimic_iv_path)

        assert os.path.isdir(self.path_dataset_folder), f'Error: Please input a valid path to the dataset. Got: {self.path_dataset_folder}'

        # paths relative to the dataset folder
        self.filepath_admissions = 'core/admissions.csv.gz'

        self.filepath_meta_diagnoses = 'hosp/d_icd_diagnoses.csv.gz'
        self.filepath_diagnoses = 'hosp/diagnoses_icd.csv.gz'

        # admissions must be read first: the diagnoses table is merged with it
        self.admissions = self.__read_admissions()
        self.diagnoses = self.__read_diagnoses()

    def __read_admissions(self):
        """Read the admissions table, parse its dates and add ``hadm_index``."""
        filepath = os.path.join(self.path_dataset_folder,self.filepath_admissions)

        df = pd.read_csv(filepath,
                         compression='gzip')

        # Convert after reading instead of using read_csv's `date_parser`,
        # which is deprecated since pandas 2.0.
        for column in ('admittime', 'dischtime'):
            df[column] = pd.to_datetime(df[column],format='%Y-%m-%d %H:%M:%S')

        # Stable sort so that admissions with identical admittime keep their
        # file order, which makes hadm_index reproducible.
        df = df.sort_values('admittime', ascending=True, kind='mergesort')
        df['hadm_index'] = df.groupby('subject_id').cumcount()
        return df

    def read_meta_diagnoses(self):
        """Read and return the ICD diagnoses dictionary table (read on every call)."""
        filepath = os.path.join(self.path_dataset_folder,self.filepath_meta_diagnoses)
        df = pd.read_csv(filepath,
                         compression='gzip')
        return df

    def get_diagnoses_for_admission(self,hadm_id: int) -> pd.DataFrame:
        """Return the rows of the loaded diagnoses table whose ``hadm_id`` matches."""
        return self.diagnoses[self.diagnoses.hadm_id == hadm_id]

    def __read_diagnoses(self):
        """Read the diagnoses table, keep ICD-9 rows, apply groupers, add ``hadm_index``."""
        filepath = os.path.join(self.path_dataset_folder,self.filepath_diagnoses)
        # ICD codes are identifiers, not numbers: reading them as str keeps
        # leading zeros and avoids a mixed int/str column.
        df = pd.read_csv(filepath,
                         compression='gzip',
                         dtype={'icd_code': str})

        # temporary measure: remove all diagnoses with icd10 coding
        # (.copy() so the column assignments below act on an independent frame
        # rather than on a slice of the frame that was read)
        df = df[df.icd_version == 9].copy()

        if self.grouper is not None:
            for g in self.grouper.get_available_groupers():
                df[g] = self.grouper.lookup(g,df['icd_code'])

        # Order rows by seq_num first, then by hadm_index. Both sorts are
        # stable, so rows of the same admission stay in seq_num order after
        # the final sort (the default quicksort gives no such guarantee).
        df = df.sort_values('seq_num',ascending=True,kind='mergesort')
        # inner join: diagnoses whose hadm_id is not in the admissions table are dropped
        df = pd.merge(df,self.admissions[['hadm_id','hadm_index']],on='hadm_id')
        df = df.sort_values('hadm_index',kind='mergesort')
        return df

    def read_diagnoses(self):
        """Return the diagnoses table loaded at construction time."""
        return self.diagnoses

    def read_admissions(self):
        """Return the admissions table loaded at construction time."""
        return self.admissions

Overwriting MimicIV.py


In [78]:
# Build the ICD code grouper and load the MIMIC-IV tables with it.
# NOTE(review): no `from MimicIV import MimicIV` is visible in this notebook;
# the %%file cell only writes MimicIV.py to disk — confirm where the class is
# brought into scope before relying on Restart & Run All.
from ICDCodesGrouper import ICDCodesGrouper
grouper = ICDCodesGrouper(settings)
mimic = MimicIV(settings,grouper=grouper)

In [68]:
admissions = mimic.read_admissions()

In [69]:
admissions

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag,hadm_index
453341,16904137,21081215,2105-10-04 17:26:00,2105-10-12 11:11:00,,URGENT,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,MARRIED,OTHER,,,0,0
14588,16233333,26733622,2109-08-31 04:20:00,2109-08-31 07:51:00,,EU OBSERVATION,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2109-08-31 02:46:00,2109-08-31 07:51:00,0,0
503485,12024697,20302177,2109-12-14 22:50:00,2110-01-15 14:53:00,,EW EMER.,EMERGENCY ROOM,REHAB,Other,ENGLISH,MARRIED,WHITE,2109-12-14 19:31:00,2109-12-15 01:56:00,0,0
459453,13308789,22079847,2110-01-11 00:57:00,2110-01-13 12:45:00,,ELECTIVE,,HOME,Other,ENGLISH,,BLACK/AFRICAN AMERICAN,,,0,0
405361,15350437,20383396,2110-01-11 08:02:00,2110-01-12 18:45:00,,EU OBSERVATION,TRANSFER FROM HOSPITAL,,Other,ENGLISH,SINGLE,WHITE,2110-01-11 03:43:00,2110-01-11 08:41:00,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37662,15273135,25809426,2211-12-02 23:03:00,2211-12-09 16:29:00,,EW EMER.,WALK-IN/SELF REFERRAL,HOME HEALTH CARE,Medicare,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2211-12-02 16:10:00,2211-12-03 00:23:00,0,8
252723,16573705,26923952,2212-01-12 23:47:00,2212-01-17 18:17:00,,EW EMER.,WALK-IN/SELF REFERRAL,HOME HEALTH CARE,Other,ENGLISH,WIDOWED,WHITE,2212-01-12 12:47:00,2212-01-13 01:18:00,0,18
422069,11973788,27306647,2212-01-19 15:43:00,2212-01-23 17:21:00,,OBSERVATION ADMIT,PHYSICIAN REFERRAL,HOME HEALTH CARE,Medicare,ENGLISH,MARRIED,WHITE,2212-01-19 09:04:00,2212-01-19 17:59:00,0,6
419519,11973788,23238116,2212-01-28 12:08:00,2212-02-01 17:48:00,,EW EMER.,PHYSICIAN REFERRAL,HOME HEALTH CARE,Medicare,ENGLISH,MARRIED,WHITE,2212-01-27 20:34:00,2212-01-28 13:17:00,0,7


In [91]:
# One list of `ccs` codes per (subject_id, hadm_index) pair, in the row order of `diagnoses`.
per_admission = diagnoses.groupby(['subject_id','hadm_index'])
groups = per_admission.apply(lambda admission_rows: admission_rows.ccs.tolist())

In [96]:
# Collect each patient's per-admission code lists into a single list
# (one list of lists per subject_id).
res = groups.groupby('subject_id').apply(list)

In [99]:
a = res.to_dict()

In [100]:
a

{10000019: [[218, 10, 224, 256]],
 10000032: [[151, 151, 663, 651, 657, 127, 6, 151],
  [663, 5, 151, 127, 55, 62, 151, 6],
  [131, 127, 657, 151, 259, 5, 663, 55, 238, 6, 52, 55, 151],
  [6, 151, 131, 151, 55, 55, 5, 663, 155, 127]],
 10000068: [[660]],
 10000074: [[218, 10]],
 10000108: [[137, 136]],
 10000200: [[10, 218]],
 10000248: [[59, 62, 2621, 2603, 239, 239]],
 10000280: [[197]],
 10000306: [[218, 214, 10]],
 10000560: [[33, 663, 1]],
 10000635: [[98, 49, 106]],
 10000674: [[10, 215, 218]],
 10000719: [[10, 196, 181, 193, 195]],
 10000724: [[2616, 238, 95, 2606, 229, 211, 229]],
 10000730: [[218], [222]],
 10000764: [[53,
   244,
   63,
   101,
   134,
   245,
   62,
   157,
   100,
   228,
   53,
   101,
   98,
   96,
   663,
   257,
   114,
   2621,
   2603]],
 10000771: [[10, 256, 224, 218]],
 10000816: [[224, 10, 218]],
 10000826: [[151, 122, 660, 151, 159, 52, 55, 651, 59, 155, 660, 55, 660, 130],
  [660, 2, 151, 55, 159, 660, 660, 663, 130],
  [151, 55, 660, 62, 660, 20

In [88]:
# Per-admission `ccs` code lists for a single patient (subject_id 15464144).
# NOTE(review): grouping is by hadm_id here, not hadm_index, so the order of the
# resulting lists presumably follows hadm_id values rather than admission time — confirm.
diagnoses[diagnoses.subject_id == 15464144].groupby('hadm_id').apply(lambda subdf: subdf.ccs.to_list()).to_list()

[[259, 660, 235, 2603],
 [660, 255],
 [660],
 [660],
 [255, 660],
 [660],
 [255, 660],
 [660],
 [660],
 [660],
 [660, 95],
 [660],
 [204, 660],
 [660],
 [259, 660],
 [259, 660],
 [660],
 [660],
 [51, 660, 204],
 [660],
 [660],
 [660, 85],
 [204, 259, 660],
 [660],
 [660],
 [660],
 [660],
 [660],
 [660],
 [259, 660, 255],
 [660],
 [255, 660],
 [660],
 [660],
 [95, 660],
 [660],
 [660],
 [660],
 [660],
 [660],
 [660],
 [660],
 [660],
 [255, 660],
 [660],
 [660],
 [660],
 [660],
 [660],
 [660],
 [660],
 [660],
 [660],
 [660, 259],
 [259, 660],
 [255, 660],
 [255, 660],
 [660],
 [660],
 [660, 204],
 [660],
 [660],
 [660],
 [660],
 [660],
 [255, 660],
 [660],
 [660, 255],
 [204, 660],
 [660, 259],
 [660],
 [204, 660],
 [660, 133],
 [660],
 [660],
 [660],
 [660],
 [660, 255],
 [660],
 [660],
 [660],
 [660],
 [660],
 [660, 660],
 [660],
 [204, 660],
 [660],
 [660, 204, 211],
 [660],
 [660],
 [660],
 [660],
 [204, 660],
 [259, 660],
 [660],
 [2603, 235, 660, 2621],
 [660],
 [204, 259, 660],
 [

In [79]:
diagnoses = mimic.read_diagnoses()

In [80]:
# Per patient: is hadm_index non-decreasing in the row order of `diagnoses`?
# NOTE(review): this reuses the name `res`, which another cell assigns to the
# per-patient code lists — whichever cell ran last wins.
res = diagnoses.groupby('subject_id').hadm_index.is_monotonic_increasing

In [76]:
admissions[(admissions.hadm_id == 21033226) | (admissions.hadm_id == 26071774)]

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag,hadm_index
383585,19999840,26071774,2164-07-25 00:27:00,2164-07-28 12:15:00,,EW EMER.,EMERGENCY ROOM,HOME,Other,ENGLISH,WIDOWED,WHITE,2164-07-24 21:16:00,2164-07-25 01:20:00,0,0
374265,19999840,21033226,2164-09-10 13:47:00,2164-09-17 13:42:00,2164-09-17 13:42:00,EW EMER.,EMERGENCY ROOM,DIED,Other,ENGLISH,WIDOWED,WHITE,2164-09-10 11:09:00,2164-09-10 14:46:00,1,1


In [74]:
diagnoses[diagnoses.subject_id == 19999840]

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,ccs,icd9chapters,icd9_level3,hadm_index
778422,19999840,21033226,1,3453,9,83,6,345,1
778423,19999840,21033226,2,51881,9,131,8,518,1
778424,19999840,21033226,3,5070,9,129,8,507,1
778425,19999840,21033226,4,5180,9,130,8,518,1
778426,19999840,21033226,5,42741,9,107,7,427,1
778427,19999840,21033226,6,43821,9,113,7,438,1
778428,19999840,21033226,7,43811,9,113,7,438,1
778429,19999840,21033226,8,4019,9,98,7,401,1
778430,19999840,21033226,9,2724,9,53,3,272,1
778431,19999840,21033226,10,4589,9,117,7,458,1


In [63]:
# Number each patient's admissions 0, 1, 2, ... in the current row order and
# store the result in place on `admissions` (same computation as in
# MimicIV.__read_admissions).
# assumes `admissions` is already sorted by admittime — TODO confirm
admissions['hadm_index'] = admissions.groupby('subject_id').admittime.cumcount()

In [64]:
admissions.groupby('subject_id').size()

subject_id
10000019     1
10000032     4
10000044     1
10000068     1
10000074     1
            ..
19999768     1
19999784    18
19999828     2
19999840     2
19999987     1
Length: 256878, dtype: int64

In [65]:
# Admissions of one patient (subject_id 19999784) with their rank.
# NOTE(review): no visible cell creates a 'rank' column on `admissions`; this
# selection relies on kernel state from a cell that is not in the notebook and
# would raise KeyError on a fresh run — confirm whether 'hadm_index' was meant.
admissions[admissions.subject_id == 19999784][['hadm_id','admittime','rank']]

Unnamed: 0,hadm_id,admittime,rank
180273,26194817,2119-06-18 21:08:00,1
143988,24935234,2119-07-09 22:31:00,2
164568,23664472,2119-07-24 03:59:00,3
160815,25715748,2119-08-11 11:36:00,4
157622,21739106,2119-09-05 11:20:00,5
175305,28216091,2119-09-19 10:22:00,6
160223,23519817,2119-10-03 09:43:00,7
168488,29355057,2119-10-17 10:28:00,8
192810,23064891,2119-10-31 09:47:00,9
190978,29234099,2119-12-05 09:31:00,10


In [44]:
# Diagnoses table as loaded by the MimicIV instance.
diag = mimic.read_diagnoses()
# NOTE(review): the admissions load below is commented out, yet other cells use `adm`.
#adm = mimic.read_admissions()

In [45]:
diag

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,ccs,icd9chapters,icd9_level3
3106108,18564065,21575927,1,67484,9,195,11,674
1841572,18056803,22962834,1,59010,9,159,10,590
1841580,16315616,23277859,1,34510,9,83,6,345
1841582,16944861,28880103,1,78650,9,102,16,786
1841593,14108013,20449979,1,34982,9,95,6,349
...,...,...,...,...,...,...,...,...
83788,14716782,23027657,39,V1254,9,117,18,V12
1914234,17069955,25919612,39,E8788,9,2616,19,E87
1408673,19206592,21154874,39,5680,9,155,9,568
1885556,12935838,25150751,39,73390,9,212,13,733


In [41]:
diag.head(2)

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,ccs,icd9chapters,icd9_level3
0,15734973,20475282,3,2825,9,61,4,282
1,15734973,20475282,2,V0251,9,10,18,V02


In [33]:
g

<ICDCodesGrouper.ICDCodesGrouper at 0x7f9e46c47c10>

In [None]:
diag

In [12]:
diag.head(2)

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,15734973,20475282,3,2825,9
1,15734973,20475282,2,V0251,9


In [13]:
adm.head(2)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag
453341,16904137,21081215,2105-10-04 17:26:00,2105-10-12 11:11:00,,URGENT,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,MARRIED,OTHER,,,0
14588,16233333,26733622,2109-08-31 04:20:00,2109-08-31 07:51:00,,EU OBSERVATION,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2109-08-31 02:46:00,2109-08-31 07:51:00,0


In [56]:
# Attach the admission type of each admission to its diagnosis rows
# (inner join on hadm_id).
admission_types = adm[['hadm_id','admission_type']]
df = diag.merge(admission_types, on='hadm_id')
df

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,admission_type
0,15734973,20475282,3,2825,9,URGENT
1,15734973,20475282,2,V0251,9,URGENT
2,15734973,20475282,5,V270,9,URGENT
3,15734973,20475282,1,64891,9,URGENT
4,15734973,20475282,4,66481,9,URGENT
...,...,...,...,...,...,...
5280342,13747041,25594844,6,R531,10,OBSERVATION ADMIT
5280343,13747041,25594844,8,R0902,10,OBSERVATION ADMIT
5280344,13747041,25594844,4,F1120,10,OBSERVATION ADMIT
5280345,13747041,25594844,2,J189,10,OBSERVATION ADMIT


In [62]:
# How many patients have 1, 2, 3, ... distinct admissions (first 10 entries of the counts).
df.groupby(['subject_id']).hadm_id.nunique().value_counts()[:10]

1     169813
2      38904
3      16945
4       9224
5       5502
6       3546
7       2413
8       1784
9       1362
10       992
Name: hadm_id, dtype: int64

In [57]:
# Row counts per admission type, restricted to ICD-9 rows.
# NOTE(review): if `df` is the diagnoses/admissions merge, these are counts of
# diagnosis rows, not of admissions — confirm which `df` was in scope.
df[df.icd_version == 9].admission_type.value_counts()

EW EMER.                       1595068
EU OBSERVATION                  395117
URGENT                          339303
ELECTIVE                        223544
SURGICAL SAME DAY ADMISSION     206450
DIRECT EMER.                    192002
DIRECT OBSERVATION               64229
AMBULATORY OBSERVATION           51289
OBSERVATION ADMIT                23368
Name: admission_type, dtype: int64

In [35]:
# Preview the first rows and the dimensions of `df`; both expressions are shown
# only because ast_node_interactivity is set to "all" earlier in the notebook.
# NOTE(review): judging by the recorded output, `df` held the admissions table
# when this cell ran, not the merged frame assigned to `df` elsewhere.
df.head(3)
df.shape

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag
0,14679932,21038362,2139-09-26 14:16:00,2139-09-28 11:30:00,,ELECTIVE,,HOME,Other,ENGLISH,SINGLE,UNKNOWN,,,0
1,15585972,24941086,2123-10-07 23:56:00,2123-10-12 11:22:00,,ELECTIVE,,HOME,Other,ENGLISH,,WHITE,,,0
2,11989120,21965160,2147-01-14 09:00:00,2147-01-17 14:25:00,,ELECTIVE,,HOME,Other,ENGLISH,,UNKNOWN,,,0


(523740, 15)

In [36]:
df.admission_type.value_counts()

EW EMER.                       157896
EU OBSERVATION                 100445
ELECTIVE                        72072
OBSERVATION ADMIT               55497
URGENT                          47930
SURGICAL SAME DAY ADMISSION     41074
DIRECT EMER.                    21581
DIRECT OBSERVATION              19991
AMBULATORY OBSERVATION           7254
Name: admission_type, dtype: int64