In [1]:
import csv
import os

import numpy as np
import pandas as pd

In [2]:
# Define MIMIC demo data path
# The $your_mimic_datapath$ token below is a placeholder, not valid Python: replace it
# with a quoted path string before running the notebook.
# Later cells build file names by plain concatenation (e.g. data_path+'INPUTEVENTS_MV.csv'),
# so the value must end with a path separator.
data_path = $your_mimic_datapath$

In [3]:
# Load tables
# Read the input-events table; the first line of the file is used as the header row.
input_events_file = data_path + 'INPUTEVENTS_MV.csv'
input_events = pd.read_csv(input_events_file, header=0)
input_events.head()

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,starttime,endtime,itemid,amount,amountuom,rate,...,totalamountuom,isopenbag,continueinnextdept,cancelreason,statusdescription,comments_editedby,comments_canceledby,comments_date,originalamount,originalrate
0,118897,42367,139932,250305,2147-10-29 16:45:00,2147-10-29 16:46:00,225799,60.0,ml,,...,ml,0,0,0,FinishedRunning,,,,60.0,60.0
1,118898,42367,139932,250305,2147-10-20 13:17:00,2147-10-20 13:18:00,223258,10.0,units,,...,,0,0,1,Rewritten,,RN,2147-10-20 13:18:00,10.0,10.0
2,118899,42367,139932,250305,2147-10-29 03:23:00,2147-10-29 03:53:00,226089,99.999999,ml,199.999998,...,ml,0,0,0,FinishedRunning,,,,100.0,200.0
3,118900,42367,139932,250305,2147-10-22 22:00:00,2147-10-22 22:01:00,225799,40.0,ml,,...,ml,0,0,0,FinishedRunning,,,,40.0,40.0
4,118901,42367,139932,250305,2147-10-16 06:21:00,2147-10-17 06:10:00,225936,1309.899995,ml,54.9993,...,ml,0,0,0,FinishedRunning,,,,1309.9,54.999298


In [4]:
# Read the procedures table (one ICD-9 procedure code per row) and preview it.
procedures = pd.read_csv(
    data_path + 'PROCEDURES_ICD.csv',
    header=0,
)
procedures.head()

Unnamed: 0,row_id,subject_id,hadm_id,seq_num,icd9_code
0,3994,10114,167957,1,3605
1,3995,10114,167957,2,3722
2,3996,10114,167957,3,8856
3,3997,10114,167957,4,9920
4,3998,10114,167957,5,9671


In [5]:
# Group by 'subject_id' and order each patient's rows by 'starttime'.
# Only 'starttime' is used as the sort key; 'itemid' is the column that is kept.
def _order_by_starttime(patient_rows):
    """Return one patient's input events sorted by their 'starttime' value."""
    return patient_rows.sort_values('starttime')


sorted_input_events = input_events.groupby(by='subject_id').apply(_order_by_starttime)

# Keep the item ids and drop index level 1 so the Series is addressed by subject_id only.
drug_events_only = sorted_input_events['itemid'].reset_index(level=[1], drop=True)

In [6]:
# Collapse each patient's item ids into a single Python list (one entry per subject_id).
drug_events_grouped = drug_events_only.groupby(by='subject_id')
drug_events_by_patient = drug_events_grouped.apply(list)

# Preview the first five patients.
drug_events_by_patient[:5]

subject_id
40124    [220949, 220949, 225152, 225152, 225943, 22216...
40177    [225158, 225893, 220949, 225158, 225158, 22515...
40204    [225158, 225974, 225158, 225859, 220949, 22597...
40277    [226452, 226452, 226452, 225161, 226452, 22645...
40286    [225158, 226375, 225828, 225168, 225168, 22582...
Name: itemid, dtype: object

In [7]:
# Group by subject_id and sort each patient's rows by hadm_id (first) and seq_num (second).
# NOTE(review): hadm_id is used here as the admission-level key; if true admission-time
# order matters, confirm that hadm_id order matches it.
procedures_sorted = procedures.groupby(by='subject_id').apply(
    lambda patient_rows: patient_rows.sort_values(['hadm_id', 'seq_num'])
)

# Keep the procedure codes and drop index level 1 so the Series is addressed by subject_id only.
procedure_codes = procedures_sorted['icd9_code'].reset_index(level=[1], drop=True)

In [8]:
# Collect each patient's procedure codes into a single list (one entry per subject_id).
# The explicit axis=0 argument is omitted: it is the default for groupby and passing it
# is deprecated in recent pandas releases.
procedures_by_patient = procedure_codes.groupby(by='subject_id').apply(list)

# Preview the first five patients.
procedures_by_patient[:5]

subject_id
10006    [9749, 5491, 3895, 3995, 3893, 9907, 14]
10011                                [9915, 3893]
10013                                      [3891]
10017                                [8181, 9904]
10019                      [9671, 17, 3995, 9904]
Name: icd9_code, dtype: object

In [9]:
# Merge drug events and procedures
# Inner join on subject_id: only patients present in both Series are kept.
drug_events_procedures_merged = pd.merge(
    left=drug_events_by_patient,
    right=procedures_by_patient,
    on='subject_id',
    how='inner',
)

In [10]:
# Preview the first five merged patients (drug-event list and procedure-code list per row).
drug_events_procedures_merged.head(n=5)

Unnamed: 0_level_0,itemid,icd9_code
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
40124,"[220949, 220949, 225152, 225152, 225943, 22216...","[9671, 9671, 9604, 3323, 3491]"
40177,"[225158, 225893, 220949, 225158, 225158, 22515...","[5198, 5110, 9671, 9604]"
40204,"[225158, 225974, 225158, 225859, 220949, 22597...",[3995]
40277,"[226452, 226452, 226452, 225161, 226452, 22645...",[3142]
40286,"[225158, 226375, 225828, 225168, 225168, 22582...","[8151, 7869, 9904]"


In [11]:
# (rows, columns) of the merged frame: one row per subject_id kept by the inner join.
drug_events_procedures_merged.shape

(42, 2)

In [12]:
# Load patients table
patients_file = data_path + 'PATIENTS.csv'
patients = pd.read_csv(patients_file, header=0)
patients.head()

Unnamed: 0,row_id,subject_id,gender,dob,dod,dod_hosp,dod_ssn,expire_flag
0,9467,10006,F,2094-03-05 00:00:00,2165-08-12 00:00:00,2165-08-12 00:00:00,2165-08-12 00:00:00,1
1,9472,10011,F,2090-06-05 00:00:00,2126-08-28 00:00:00,2126-08-28 00:00:00,,1
2,9474,10013,F,2038-09-03 00:00:00,2125-10-07 00:00:00,2125-10-07 00:00:00,2125-10-07 00:00:00,1
3,9478,10017,F,2075-09-21 00:00:00,2152-09-12 00:00:00,,2152-09-12 00:00:00,1
4,9479,10019,M,2114-06-20 00:00:00,2163-05-15 00:00:00,2163-05-15 00:00:00,2163-05-15 00:00:00,1


In [13]:
# Count the patients whose expire_flag equals 1.
# In the preview above every flagged row has a date of death (dod), and at least one
# flagged row has an empty dod_hosp, so the flag does not appear to be limited to
# in-hospital deaths.
# Original author's note: no patients survived in the demo data.
# The vectorised Series.sum() replaces the builtin sum(), which iterated row by row.
(patients['expire_flag'] == 1).sum()

100

In [14]:
# Make a sample with replacement, to construct the toy example.
# Both samples are drawn from the same merged frame; only the random seed differs.
toy_sample_options = dict(n=100, replace=True)
pos_data = drug_events_procedures_merged.sample(random_state=3, **toy_sample_options)
neg_data = drug_events_procedures_merged.sample(random_state=1, **toy_sample_options)

In [15]:
# Preview the first five rows of the positive toy sample.
pos_data.head(n=5)

Unnamed: 0_level_0,itemid,icd9_code
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
42346,"[226452, 225798, 220949, 221744, 221744, 22515...",[4685]
40277,"[226452, 226452, 226452, 225161, 226452, 22645...",[3142]
40595,"[225158, 221429, 221468, 220949, 225152, 22094...","[9605, 3391, 9656, 3323, 9671, 3491, 9605, 960..."
40124,"[220949, 220949, 225152, 225152, 225943, 22216...","[9671, 9671, 9604, 3323, 3491]"
42281,"[220949, 225879, 225168, 225158, 226361, 22636...","[4233, 9672, 5491, 4233, 4513, 4311, 966, 4823..."


In [16]:
# Preview the first five rows of the negative toy sample.
neg_data.head(n=5)

Unnamed: 0_level_0,itemid,icd9_code
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
43927,"[221749, 225158, 225943, 222168, 226364, 22637...","[3612, 3615, 3961]"
41795,"[225158, 225158, 226361, 226452, 223257, 22179...","[9672, 9604, 3893, 3893, 4513, 966]"
40595,"[225158, 221429, 221468, 220949, 225152, 22094...","[9605, 3391, 9656, 3323, 9671, 3491, 9605, 960..."
40601,"[220949, 225152, 221749, 225158, 225158, 22174...","[3723, 8856, 3893]"
40687,"[225158, 225893, 220949, 225893, 220949, 22589...","[9390, 3491]"


In [17]:
def _codes_to_text(codes):
    """Render a list of codes as one string with the codes separated by single spaces."""
    return ' '.join(map(str, codes))


# Text version of the positive sample: one space-joined string per list column.
pos_data2 = pd.DataFrame()
pos_data2['drug_events'] = pos_data['itemid'].apply(_codes_to_text)
pos_data2['procedure_codes'] = pos_data['icd9_code'].apply(_codes_to_text)

In [18]:
# Export to folder './toy_example/'
# Create the folder first: to_csv does not create missing parent directories.
os.makedirs('./toy_example/', exist_ok=True)

# The same positive sample is written to both the training and the validation file.
# Each row is written as "<drug_events> <procedure_codes>" with no header and no index.
# Quoting is disabled and the escape character is a space, so every space inside a
# column is written escaped, i.e. as two spaces.
# NOTE(review): confirm that your Python version's csv module still accepts an
# escapechar equal to the separator.
for pos_path in ('./toy_example/train_pos.txt', './toy_example/validation_pos.txt'):
    pos_data2.to_csv(path_or_buf=pos_path, index=False, header=False, sep=' ',
                     quoting=csv.QUOTE_NONE, escapechar=' ')

In [19]:
# Text version of the negative sample: each list column becomes one space-joined string.
neg_data2 = pd.DataFrame()
for text_column, list_column in (('drug_events', 'itemid'), ('procedure_codes', 'icd9_code')):
    neg_data2[text_column] = neg_data[list_column].apply(
        lambda codes: ' '.join(map(str, codes))
    )

In [20]:
# Write the negative sample to both the training and the validation file,
# space-separated, without header or index, and with quoting disabled.
neg_export_options = dict(index=False, header=False, sep=' ', quoting=csv.QUOTE_NONE, escapechar=' ')
neg_data2.to_csv(path_or_buf='./toy_example/train_neg.txt', **neg_export_options)
neg_data2.to_csv(path_or_buf='./toy_example/validation_neg.txt', **neg_export_options)