In [1]:
import os
cwd = os.getcwd()

# protection against running this cell multiple times
assert os.path.dirname(cwd).split('/')[-1] == 'master-thesis','Oops, directory already changed previously as indended. Ignoring...'

# change working directory (if assert passed)
new_cwd = os.path.dirname(cwd) # parent directory
os.chdir(new_cwd)

In [2]:
# show all outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import os
from MimicIII import MimicIII
from MimicIV import MimicIV
from ICDCodesGrouper import ICDCodesGrouper

import pandas as pd
import numpy as np

#from tqdm.notebook import tqdm

from torch.utils.data import Dataset
from torch import nn
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence, pack_sequence
import torch.nn.functional as F

from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import json
from tqdm.notebook import tqdm

from config import Settings; settings = Settings()

In [4]:
grouper = ICDCodesGrouper(settings)
mimicIII = MimicIII(settings=settings,grouper=grouper)
mimicIV = MimicIV(settings=settings,grouper=grouper)

## Get admissions eligible for modelling

In [11]:
eligible_pats_mimic_III

SUBJECT_ID
17       2.0
21       2.0
23       2.0
34       2.0
36       3.0
        ... 
99822    3.0
99883    2.0
99897    2.0
99923    2.0
99982    3.0
Length: 7537, dtype: float64

In [15]:
eligible_pats_mimic_III = mimicIII.read_admissions().groupby('SUBJECT_ID').size().where(lambda x: x > 1).dropna().index
eligible_pats_mimic_III

Int64Index([   17,    21,    23,    34,    36,    61,    67,    68,    84,
               85,
            ...
            99660, 99712, 99756, 99781, 99783, 99822, 99883, 99897, 99923,
            99982],
           dtype='int64', name='SUBJECT_ID', length=7537)

In [22]:
res_III = (mimicIII
           .read_admissions()
           .where(lambda x: # subjects with more than 1 visit
                  x.SUBJECT_ID.isin(mimicIII.read_admissions().groupby('SUBJECT_ID').size().where(lambda x: x > 1).dropna().index)
                 )
           .groupby('SUBJECT_ID')
           .ADMITTIME
           .apply(lambda x: x.diff().median().days).describe().rename('MIMIC-III')
          )

res_IV = (mimicIV
          .read_admissions()
          .where(lambda x: # subjects with more than 1 visit
                 x.subject_id.isin(mimicIV.read_admissions().groupby('subject_id').size().where(lambda x: x > 1).dropna().index)
                )
          .groupby('subject_id')
          .admittime
          .apply(lambda x: x.diff().median().days).describe().rename('MIMIC-IV')
         )

In [26]:
mimicIV.read_admissions().admission_type.value_counts()

EW EMER.                       157896
EU OBSERVATION                 100445
ELECTIVE                        72072
OBSERVATION ADMIT               55497
URGENT                          47930
SURGICAL SAME DAY ADMISSION     41074
DIRECT EMER.                    21581
DIRECT OBSERVATION              19991
AMBULATORY OBSERVATION           7254
Name: admission_type, dtype: int64

In [40]:
m = 1 # months
timewindows = (mimicIII
               .read_admissions()
               .where(lambda x: # subjects with more than 1 visit
                 x.SUBJECT_ID.isin(mimicIII.read_admissions().groupby('SUBJECT_ID').size().where(lambda x: x > 1).dropna().index)
                )
               .dropna(how='all')
               .groupby('SUBJECT_ID')
               .apply(lambda subdf: (subdf
                                     .set_index('ADMITTIME')
                                     .sort_index()
                                     .resample(f'{m*30}d')
                                     .HADM_ID
                                     .apply(list)
                                    )
                     )
              )

KeyboardInterrupt: 

In [29]:
timewindows.apply(len)

SUBJECT_ID  ADMITTIME 
17.0        2134-12-27    1
            2135-01-26    0
            2135-02-25    0
            2135-03-27    0
            2135-04-26    1
                         ..
99923.0     2201-03-25    0
            2201-04-24    1
99982.0     2156-11-28    1
            2156-12-28    1
            2157-01-27    1
Name: HADM_ID, Length: 176544, dtype: int64

In [None]:
pd.DataFrame().dropna(

In [59]:
from tqdm.notebook import tqdm

In [5]:
pd.Series([1,2,3]).size

3

In [23]:
#trying to reduce the number of 
timesteps = [1,2,3] #months
for ts in tqdm(timesteps):
    resIV = (mimicIV
               .read_admissions()
               .where(lambda x: # subjects with more than 1 visit
                      x.subject_id.isin(mimicIV.read_admissions().groupby('subject_id').size().where(lambda x: x > 1).dropna().index)
                     )
               .dropna(how='all')
             .groupby('subject_id')
               .apply(lambda subdf: (subdf
                                         .set_index('admittime')
                                         .sort_index()
                                         .resample(f'{ts*30}d')
                                         .hadm_id
                                         .apply(list)
                                        )
                     )
              )
    resIV.to_csv(f'data/mimicIV_ts.{ts}.month')

In [38]:
df.hadm_id.iloc[0]

'[22595853.0]'

In [50]:
from ast import literal_eval
df = pd.read_csv('mimicIV_ts.1.month',index_col=[0,1]).sort_index().hadm_id
df = df.apply(literal_eval)

In [57]:
df.apply(len).where(lambda x: x > 0).dropna()

subject_id  admittime 
10000032.0  2180-05-06    1.0
            2180-06-05    1.0
            2180-07-05    1.0
            2180-08-04    1.0
10000084.0  2160-11-21    1.0
                         ... 
19999784.0  2121-05-08    1.0
19999828.0  2147-07-18    1.0
            2149-01-08    1.0
19999840.0  2164-07-25    1.0
            2164-08-24    1.0
Name: hadm_id, Length: 296088, dtype: float64

In [58]:
(mimicIV
.read_admissions()
.where(lambda x: # subjects with more than 1 visit
      x.subject_id.isin(mimicIV.read_admissions().groupby('subject_id').size().where(lambda x: x > 1).dropna().index)
     )
.dropna(how='all')
)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag,hadm_index
453341,16904137.0,21081215.0,2105-10-04 17:26:00,2105-10-12 11:11:00,,URGENT,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,MARRIED,OTHER,,,0.0,0.0
14588,16233333.0,26733622.0,2109-08-31 04:20:00,2109-08-31 07:51:00,,EU OBSERVATION,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2109-08-31 02:46:00,2109-08-31 07:51:00,0.0,0.0
503485,12024697.0,20302177.0,2109-12-14 22:50:00,2110-01-15 14:53:00,,EW EMER.,EMERGENCY ROOM,REHAB,Other,ENGLISH,MARRIED,WHITE,2109-12-14 19:31:00,2109-12-15 01:56:00,0.0,0.0
389518,17195991.0,23542772.0,2110-01-11 22:47:00,2110-01-18 10:25:00,,EW EMER.,EMERGENCY ROOM,CHRONIC/LONG TERM ACUTE CARE,Other,ENGLISH,SINGLE,UNABLE TO OBTAIN,2110-01-11 21:42:00,2110-01-12 00:54:00,0.0,0.0
369970,17922008.0,25929249.0,2110-01-12 00:34:00,2110-01-23 15:00:00,,EW EMER.,EMERGENCY ROOM,REHAB,Medicare,ENGLISH,MARRIED,WHITE,2110-01-11 18:01:00,2110-01-12 01:30:00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37662,15273135.0,25809426.0,2211-12-02 23:03:00,2211-12-09 16:29:00,,EW EMER.,WALK-IN/SELF REFERRAL,HOME HEALTH CARE,Medicare,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2211-12-02 16:10:00,2211-12-03 00:23:00,0.0,8.0
252723,16573705.0,26923952.0,2212-01-12 23:47:00,2212-01-17 18:17:00,,EW EMER.,WALK-IN/SELF REFERRAL,HOME HEALTH CARE,Other,ENGLISH,WIDOWED,WHITE,2212-01-12 12:47:00,2212-01-13 01:18:00,0.0,18.0
422069,11973788.0,27306647.0,2212-01-19 15:43:00,2212-01-23 17:21:00,,OBSERVATION ADMIT,PHYSICIAN REFERRAL,HOME HEALTH CARE,Medicare,ENGLISH,MARRIED,WHITE,2212-01-19 09:04:00,2212-01-19 17:59:00,0.0,6.0
419519,11973788.0,23238116.0,2212-01-28 12:08:00,2212-02-01 17:48:00,,EW EMER.,PHYSICIAN REFERRAL,HOME HEALTH CARE,Medicare,ENGLISH,MARRIED,WHITE,2212-01-27 20:34:00,2212-01-28 13:17:00,0.0,7.0


In [17]:
%%timeit -r 2
resIV = (mimicIV
           .read_admissions()
           .where(lambda x: # subjects with more than 1 visit
                  x.subject_id.isin(mimicIV.read_admissions().groupby('subject_id').size().where(lambda x: x > 1).dropna().index)
                 )
           .dropna(how='all')
         .iloc[:10000]
         .groupby('subject_id')
           .apply(lambda subdf: (subdf
                                     .set_index('admittime')
                                     .sort_index()
                                     .resample(f'{1*30}d')
                                     .hadm_id
                                     .apply(list)
                                    )
                 )
          )

10.2 s ± 32.1 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)


In [16]:
admissionsIIIv2 = mimicIII.read_admissions().set_index(['SUBJECT_ID','ADMITTIME']).sort_index()
admissionsIVv2 = mimicIV.read_admissions().set_index(['subject_id','admittime']).sort_index()
all_res_III_dist = []
all_res_III = []
all_res_IV_dist = []
all_res_IV = []
idx = pd.IndexSlice
for pred_window in tqdm([1,3,6,12]):
    resIII = (mimicIII
           .read_admissions()
           .where(lambda x: # subjects with more than 1 visit
                  x.SUBJECT_ID.isin(mimicIII.read_admissions().groupby('SUBJECT_ID').size().where(lambda x: x > 1).dropna().index)
                 )
           .dropna(how='all')
           .groupby(['SUBJECT_ID','ADMITTIME'])
           .apply(lambda subdf: admissionsIIIv2.loc[idx[subdf.name[0],subdf.name[1]+timedelta(days=1):subdf.name[1]+timedelta(days=pred_window*30)],'HADM_ID'].tolist())
          )
    all_res_III.append(resIII)
    resIII_eligible_dist = resIII.apply(len).where(lambda x: x > 0).dropna().describe()
    resIII_eligible_dist = pd.concat([resIII_eligible_dist,pd.Series([resIII.apply(len).where(lambda x: x == 0).dropna().size],index=['Empty admissions'])]).rename(f'{pred_window} months')
    all_res_III_dist.append(resIII_eligible_dist)
    
    resIV = (mimicIV
           .read_admissions()
           .where(lambda x: # subjects with more than 1 visit
                  x.subject_id.isin(mimicIV.read_admissions().groupby('subject_id').size().where(lambda x: x > 1).dropna().index)
                 )
           .dropna(how='all')
           .groupby(['subject_id','admittime'])
           .apply(lambda subdf: admissionsIVv2.loc[idx[subdf.name[0],subdf.name[1]+timedelta(days=1):subdf.name[1]+timedelta(days=pred_window*30)],'hadm_id'].tolist())
          )
    all_res_IV.append(resIV)
    resIV_eligible_dist = resIV.apply(len).where(lambda x: x > 0).dropna().describe()
    resIV_eligible_dist = pd.concat([resIV_eligible_dist,pd.Series([resIV.apply(len).where(lambda x: x == 0).dropna().size],index=['Empty admissions'])]).rename(f'{pred_window} months')
    all_res_mimicIV.append(resIV_eligible_dist)

  0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [11]:
all_res_III_dist[0]

count                2464.000000
mean                    1.044237
std                     0.211503
min                     1.000000
25%                     1.000000
50%                     1.000000
75%                     1.000000
max                     3.000000
Empty admissions    17529.000000
dtype: float64

In [56]:
res.apply(len).describe()

count    19993.000000
mean         0.316111
std          0.596952
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          7.000000
dtype: float64

In [54]:
res.apply(len).value_counts()

0    14723
1     4464
2      636
3      121
4       31
5       12
6        5
7        1
dtype: int64

In [32]:
windows_w_admissions_mask = timewindows.apply(lambda x: len(x) > 0)
targets_mask = windows_w_admissions_mask.groupby('SUBJECT_ID').shift(m,fill_value=False)

targets = timewindows[targets_mask]

targets_at_least_one_mask = targets.apply(lambda x: len(x) > 0)

In [33]:
targets[targets_at_least_one_mask]

SUBJECT_ID  ADMITTIME 
109.0       2140-01-23              [108375.0]
            2140-04-22              [175347.0]
            2141-09-14    [172335.0, 126055.0]
            2141-12-13    [140167.0, 135923.0]
            2142-01-12    [124657.0, 176760.0]
                                  ...         
99756.0     2191-07-15              [145054.0]
99783.0     2125-10-13              [126090.0]
99822.0     2197-06-29              [195871.0]
99982.0     2156-12-28              [112748.0]
            2157-01-27              [183791.0]
Name: HADM_ID, Length: 2014, dtype: object

In [19]:
timewindows.groupby(['SUBJECT_ID']).size().describe()

count    7537.000000
mean       23.423643
std        28.331397
min         1.000000
25%         3.000000
50%        11.000000
75%        35.000000
max       141.000000
Name: HADM_ID, dtype: float64

In [16]:
timewindows.groupby(['SUBJECT_ID']).size().groupby('SUBJECT_ID').max().describe()

count    7537.0
mean        1.0
std         0.0
min         1.0
25%         1.0
50%         1.0
75%         1.0
max         1.0
Name: HADM_ID, dtype: float64

In [24]:
res = pd.concat([res_III,res_IV],axis=1)
res.index.name = 'Median difference between visits in each (eligible) patient'
res

Unnamed: 0_level_0,MIMIC-III,MIMIC-IV
Median difference between visits in each (eligible) patient,Unnamed: 1_level_1,Unnamed: 2_level_1
count,7537.0,85798.0
mean,455.38238,424.935243
std,650.218604,628.835643
min,0.0,0.0
25%,46.0,35.0
50%,172.0,146.0
75%,580.0,559.0
max,4121.0,4330.0


In [31]:
print(res.style.to_latex())

\begin{tabular}{lrr}
 & MIMIC-III & MIMIC-IV \\
Median difference between visits in each (eligible) patient &  &  \\
count & 7537.000000 & 85798.000000 \\
mean & 455.382380 & 424.935243 \\
std & 650.218604 & 628.835643 \\
min & 0.000000 & 0.000000 \\
25% & 46.000000 & 35.000000 \\
50% & 172.000000 & 146.000000 \\
75% & 580.000000 & 559.000000 \\
max & 4121.000000 & 4330.000000 \\
\end{tabular}



In [19]:
(mimicIV
 .read_admissions()
 .where(lambda x: 
        x.subject_id.isin(mimicIV.read_admissions().groupby('subject_id').size().where(lambda x: x > 1).dropna().index)
       )
 .groupby('subject_id')
 .admittime
 .apply(lambda x: x.diff().median().days).describe().rename('MIMIC-IV Median difference between admissions in all patients')
)

count    85798.000000
mean       424.935243
std        628.835643
min          0.000000
25%         35.000000
50%        146.000000
75%        559.000000
max       4330.000000
Name: MIMIC-IV Median difference between admissions in all patients, dtype: float64

In [20]:
diagnostic_cols_mimicIII = [col for col in diagnoses if col not in ['ROW_ID','SUBJECT_ID','HADM_ID','SEQ_NUM']]

In [22]:
# create mask of the targets. 
# Then for each target gather the features (history up to that point) This will define our datapoints

pred_window = m # months

windows_w_admissions_mask = timewindows.apply(lambda x: len(x) > 0)
targets_mask = windows_w_admissions_mask.groupby('SUBJECT_ID').shift(pred_window,fill_value=False)

targets = timewindows[targets_mask]

print('targets')
targets

targets


SUBJECT_ID  ADMITTIME 
23          2156-08-18          []
34          2189-07-02          []
36          2134-04-14    [165660]
85          2165-02-14          []
107         2118-02-04          []
                            ...   
98347       2124-10-11    [177195]
98761       2188-12-31          []
98813       2131-10-27          []
99088       2175-12-06          []
99650       2155-05-23          []
Name: HADM_ID, Length: 2347, dtype: object

In [23]:
targets.apply(len).value_counts()

0    1740
1     480
2      91
3      23
4      10
5       2
6       1
Name: HADM_ID, dtype: int64

In [24]:
targets

SUBJECT_ID  ADMITTIME 
23          2156-08-18          []
34          2189-07-02          []
36          2134-04-14    [165660]
85          2165-02-14          []
107         2118-02-04          []
                            ...   
98347       2124-10-11    [177195]
98761       2188-12-31          []
98813       2131-10-27          []
99088       2175-12-06          []
99650       2155-05-23          []
Name: HADM_ID, Length: 2347, dtype: object

# How many admissions within each target

In [12]:
targets.apply(len).value_counts()

0    8581
1    1847
2     123
3       3
Name: HADM_ID, dtype: int64

In [143]:
idx = pd.IndexSlice
#admissions_w_index = admissions.set_index(['SUBJECT_ID','ADMITTIME']).sort_index() # to speed up queries below
targets.to_frame().reset_index().apply(lambda row: 
                         (admissions_w_index
                          .loc[idx[row.SUBJECT_ID,:row.ADMITTIME],'HADM_ID'],
                         ),
                         axis=1
                        )

0                ([194023],)
1                      ([],)
2                ([109451],)
3                      ([],)
4                ([152223],)
                ...         
16945                  ([],)
16946            ([164914],)
16947                  ([],)
16948            ([151454],)
16949    ([151454, 112748],)
Length: 16950, dtype: object

In [148]:
admissions[(admissions.SUBJECT_ID == 21) & (admissions.ADMITTIME < '2134-09-11')]

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA


In [145]:
targets.to_frame().reset_index()

Unnamed: 0,SUBJECT_ID,ADMITTIME,HADM_ID
0,17,2135-01-26,[]
1,21,2134-09-11,[109451]
2,21,2134-10-11,[]
3,23,2153-09-03,[152223]
4,23,2153-10-03,[]
...,...,...,...
16945,99923,2201-02-23,[164914]
16946,99923,2201-03-25,[]
16947,99982,2156-11-28,[151454]
16948,99982,2156-12-28,[112748]


In [None]:
targets.to_frame().apply(lambda row: ,axis=1)

In [None]:
pd.Series().diff()

In [42]:
(timewindows
 .groupby('SUBJECT_ID')
 .apply(lambda subdf: )

SUBJECT_ID  ADMITTIME 
17          2134-12-27    [194023]
            2135-01-26          []
            2135-02-25          []
            2135-03-27          []
            2135-04-26    [161087]
                            ...   
99923       2201-03-25          []
            2201-04-24    [192053]
99982       2156-11-28    [151454]
            2156-12-28    [112748]
            2157-01-27    [183791]
Name: HADM_ID, Length: 175257, dtype: object

In [28]:
res = (timewindows.iloc[:2]
       .apply(lambda row: 
              pd.DataFrame(
                  [
                      diagnoses
                      .loc[diagnoses.HADM_ID == adm, diagnostic_cols_mimicIII]
                      .to_dict(orient='list')
                      for adm in row # obtain a list of diagnoses of each admission inside each window
                  ]
              ).sum() # concat all lists of diagnoses of each admission inside each window to make one big list per window
             )
      )

In [29]:
res = (timewindows.iloc[:50]
       .apply(lambda row: 
              pd.DataFrame(
                  [
                      diagnoses
                      .loc[diagnoses.HADM_ID == adm, diagnostic_cols_mimicIII]
                      .to_dict(orient='list')
                      for adm in row # obtain a list of diagnoses of each admission inside each window
                  ]
              ).sum() # concat all lists of diagnoses of each admission inside each window to make one big list per window
             )
      )

In [32]:
admissions[admissions.SUBJECT_ID == 23]

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
1,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,NaT,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,NaT,NaT,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,NaT,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,NaT,NaT,BRAIN MASS,0,1


In [35]:
admissions.head(2)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
25361,20957,113808,2100-06-24 22:37:00,2100-07-03 12:31:00,NaT,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Private,,PROTESTANT QUAKER,MARRIED,WHITE,2100-06-24 13:37:00,2100-06-25 00:10:00,BILATERAL PNEUMONIA,0,1
7378,4521,167070,2100-06-28 19:29:00,2100-07-30 11:02:00,NaT,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Medicare,,CATHOLIC,SINGLE,WHITE,NaT,NaT,ISCHEMIC ULCER R GREAT TOE;DIABETES,0,0


In [34]:
admissions.ADMITTIME.diff()

25361                 NaT
7378      3 days 20:52:00
11061     2 days 16:31:00
37595     1 days 07:28:00
17400    12 days 01:24:00
               ...       
56600    84 days 04:42:00
3611    174 days 04:45:00
29422   155 days 09:35:00
31085    16 days 16:16:00
30835   382 days 03:22:00
Name: ADMITTIME, Length: 17680, dtype: timedelta64[ns]

In [39]:
res = (admissions
 .groupby('SUBJECT_ID')
 .ADMITTIME
 .apply(lambda series: series.diff().max())
)

In [40]:
res

SUBJECT_ID
17       133 days 06:56:00
21       141 days 08:33:00
23      1506 days 12:19:00
34      1680 days 12:37:00
36      1093 days 15:41:00
               ...        
99822     16 days 23:23:00
99883    265 days 06:55:00
99897    331 days 17:28:00
99923     80 days 16:30:00
99982     42 days 00:04:00
Name: ADMITTIME, Length: 6397, dtype: timedelta64[ns]

In [41]:
res.describe()

count                           6397
mean     688 days 10:45:04.905424408
std      760 days 23:17:29.296653968
min                 16 days 23:23:00
25%                117 days 07:46:00
50%                391 days 12:15:00
75%               1011 days 12:53:00
max               4121 days 03:16:00
Name: ADMITTIME, dtype: object

In [37]:
res.days.describe()

AttributeError: 'DataFrame' object has no attribute 'days'

## Build dictionary of data

In [21]:
adm = mimicIII.read_admissions().where(lambda df: df.SUBJECT_ID.isin(mimic_III_eligible_patients)).dropna(how='all')[['SUBJECT_ID','ADMITTIME','HADM_ID']]
adm.head(1)
adm.shape

(19917, 3)

In [22]:
# where it all begins
data = {}

# dumb, error prone way of getting only the columns of icd codings (eg.: icd9,ccs,chapters,etc)
grouping_columns = [col for col in mimicIII.read_diagnoses() if col not in ["ROW_ID","SUBJECT_ID","HADM_ID","SEQ_NUM"]]

print_every = 0.2 # percent
current = print_every
for idx,p in enumerate(eligible_patients):
    
    p = int(p)
    
    data[p] = {}
    
    p_adm_data = adm[adm.SUBJECT_ID == p]
    
    # sanity check that all admissions are sorted inside each patient data
    assert p_adm_data.ADMITTIME.is_monotonic_increasing, f'Oopsie, p={p}'
    
    for hadm in p_adm_data['HADM_ID']:
        #diagnoses data
        diag_data = mimic.get_diagnoses_for_admission(hadm)
        
        # if no information about diagnostics then ignore
        if diag_data.ICD9_CODE.isna().all():
            print('No diagnoses found for this admission. Skipping')
            print('patient',p)
            print('admission',int(hadm))
            print('-----\n')
            continue
        
        # sanity check that diagnostics are sorted (important for future experiments)
        assert diag_data.SEQ_NUM.is_monotonic_increasing, f'Oopsie, p={p}, hadm_id={hadm}'
        
        for grouping in grouping_columns:
            if grouping not in data[p].keys():
                data[p][grouping] = [diag_data[grouping].tolist()]
            else:
                data[p][grouping].append(diag_data[grouping].tolist())
    if (idx+1)/len(eligible_patients) >= current:
        print(f'{int(current*100)}% done.\n')
        current += print_every

No diagnoses found for this admission. Skipping
patient 690
admission 174817
-----

No diagnoses found for this admission. Skipping
patient 3369
admission 126808
-----

20% done.

No diagnoses found for this admission. Skipping
patient 11438
admission 154602
-----

40% done.

No diagnoses found for this admission. Skipping
patient 24975
admission 109963
-----

60% done.

No diagnoses found for this admission. Skipping
patient 31928
admission 153208
-----

80% done.

No diagnoses found for this admission. Skipping
patient 73686
admission 112990
-----

100% done.



In [23]:
# add metadata of available groupings
metadata = {'groupings':grouping_columns}
data = {'metadata':metadata,'data':data}

# save

In [24]:
data_id = 'diag_only'
datapath = os.path.join(settings.data_base,settings.model_ready_dataset_folder,data_id)

# create folder of this dataset
if not os.path.isdir(datapath):
    os.mkdir(datapath)

In [25]:
dataset_filename = 'dataset.json'
dataset_filepath = os.path.join(datapath,dataset_filename)

with open(dataset_filepath, 'w') as fp:
    json.dump(data, fp)

# Now for MIMIC-IV

## Get patients elegible for modelling

In [5]:
filename = 'mimicIV_eligible_patients_exc.nodiag_single.adm_no.icd10.txt'
filepath = os.path.join(settings.data_base,settings.eligible_patients_folder,filename)

mimic_IV_eligible_patients = np.loadtxt(filepath,dtype=int)
print(f"{len(mimic_IV_eligible_patients)=}")

len(mimic_IV_eligible_patients)=55483


## Build dictionary of data

In [6]:
adm = mimicIV.read_admissions().where(lambda df: df.subject_id.isin(mimic_IV_eligible_patients)).dropna(how='all')[['subject_id','admittime','hadm_id']]
diagnoses = mimicIV.read_diagnoses()
adm.head(1)
adm.shape

Unnamed: 0,subject_id,admittime,hadm_id
14588,16233333.0,2109-08-31 04:20:00,26733622.0


(262727, 3)

In [21]:
# where it all begins
data = {int(p):{} for p in mimic_IV_eligible_patients} 

# dumb, error prone way of getting only the columns of icd codings (eg.: icd9,ccs,chapters,etc)
grouping_columns = [col for col in mimicIV.read_diagnoses() if col not in ["icd_version","icd9_chapters","icd9_level3","subject_id","hadm_id","seq_num","hadm_index"]]

# include only eligible patients
diagnoses_eligible = diagnoses[diagnoses.subject_id.isin(mimic_IV_eligible_patients)]

for grouping in grouping_columns:
    res = (diagnoses_eligible
           .groupby(['subject_id','hadm_index'])
           .apply(lambda subdf:subdf[grouping].tolist())
           .groupby('subject_id')
           .apply(list)
          )
    for idx,(p,diags) in enumerate(res.iteritems()):
        data[p][grouping] = diags
        
    print(f'{grouping} done')

icd_code done
ccs done
icd9chapters done


## Save

In [24]:
# add metadata of available groupings
metadata = {'groupings':grouping_columns}
data = {'metadata':metadata,'data':data}

In [25]:
data_id = 'diag_only'
datapath = os.path.join(settings.data_base,settings.model_ready_dataset_folder,data_id)

# create folder of this dataset
if not os.path.isdir(datapath):
    os.mkdir(datapath)

In [26]:
dataset_filename = 'mimic_iv_quick_baseline_dataset.json'
dataset_filepath = os.path.join(datapath,dataset_filename)

with open(dataset_filepath, 'w') as fp:
    json.dump(data, fp)

# Test

### Read

In [27]:
with open(dataset_filepath,'r') as fp:
    data = json.load(fp)

### read all patient ids

In [28]:
patient_ids = list(data['data'].keys()) # patient id's
len(patient_ids)

55483