# Readmissions data processing.

In [1]:
### Import packages.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#%matplotlib inline 
import csv
import seaborn as sns
import os
import tensorflow
import keras

### MIMIC processing.

In [2]:
### Load files.

# MIMIC.
mimic = pd.read_csv('cohort.csv', header = 0)
print(mimic.shape)

mimic_features_nonlab = pd.read_csv('features_nonlab.csv', header = 0)
print(mimic_features_nonlab.shape)

mimic_labs_last = pd.read_csv('labs_last_icu.csv', header = 0)
print(mimic_labs_last.shape)

mimic_labs_max = pd.read_csv('labs_max_icu.csv', header = 0)
print(mimic_labs_max.shape)

mimic_labs_min = pd.read_csv('labs_min_icu.csv', header = 0)
print(mimic_labs_min.shape)

mimic_labs_counts_low = pd.read_csv('labs_counts_low_icu.csv', header = 0)
print(mimic_labs_counts_low.shape)

mimic_labs_counts_high = pd.read_csv('labs_counts_high_icu.csv', header = 0)
print(mimic_labs_counts_high.shape)

mimic_labs_counts_high2 = pd.read_csv('labs_counts_high_icu2.csv', header = 0)
print(mimic_labs_counts_high2.shape)



(13841, 18)
(13841, 18)
(325754, 6)
(325563, 5)
(325563, 5)
(231315, 11)
(1428712, 6)
(1427696, 6)


In [3]:
mimic['year'] = pd.DatetimeIndex(mimic['intime']).year
mimic.year.value_counts().sort_index()

2110    118
2111    146
2112    135
2113    121
2114    144
       ... 
2207      1
2208      1
2209      1
2210      1
2211      1
Name: year, Length: 102, dtype: int64

In [4]:
mimic.head()

Unnamed: 0,subject_id,hadm_id,stay_id,admittime,dischtime,intime,outtime,admission_location,discharge_location,first_careunit,last_careunit,insurance,age,gender,ethnicity,los,code,readmit,year
0,11218297,29892836,30894115,2161-06-17T15:35:00,2161-07-17T13:57:00,2161-06-17T15:44:00,2161-07-02T17:33:44,EMERGENCY ROOM,PSYCH FACILITY,Trauma SICU (TSICU),Trauma SICU (TSICU),Other,65,M,WHITE,15.076204,Full code,1,2161
1,18633699,26578317,39203081,2113-05-06T16:49:00,2113-05-19T13:44:00,2113-05-06T17:03:00,2113-05-10T21:00:58,EMERGENCY ROOM,,Trauma SICU (TSICU),Trauma SICU (TSICU),Other,74,M,WHITE,4.165255,Full code,1,2113
2,16571396,21265242,39226730,2146-10-17T06:39:00,2146-10-24T14:00:00,2146-10-17T08:23:00,2146-10-18T17:45:31,EMERGENCY ROOM,OTHER FACILITY,Trauma SICU (TSICU),Trauma SICU (TSICU),Medicaid,53,M,WHITE,1.390637,Full code,1,2146
3,14207241,28533530,33400289,2118-05-28T03:27:00,2118-06-09T15:30:00,2118-05-28T05:01:00,2118-05-30T18:52:46,EMERGENCY ROOM,PSYCH FACILITY,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),Medicaid,42,F,UNKNOWN,2.577616,Full code,1,2118
4,16280495,23168926,37779710,2164-01-10T01:57:00,2164-01-15T13:11:00,2164-01-10T15:22:22,2164-01-11T21:48:55,EMERGENCY ROOM,AGAINST ADVICE,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),Medicaid,40,M,WHITE,1.268438,Full code,1,2164


In [5]:
### Add admit_ed, hospital_los. Rename los to icu_los.

# Add admit_ed.
#mimic['readmit'] = np.where((mimic.no_icustays == 1), 0, 1)
mimic['admit_ed'] = np.where((mimic.admission_location == "EMERGENCY ROOM"), 1, 0)

# Add hospital_los. Take longer los between icu_los and hospital_los (admittime > intime depending on when MD inputs).
from datetime import datetime
hosp_len = []
for i in range(mimic.shape[0]):
    transfertime = datetime.strptime(mimic['outtime'][i], '%Y-%m-%dT%H:%M:%S')
    admittime = datetime.strptime(mimic['admittime'][i], '%Y-%m-%dT%H:%M:%S')
    delta = transfertime - admittime
    temp = delta.total_seconds()/(60*60*24)
    if mimic['los'][i] > temp:
        hosp_len.append(mimic['los'][i])
    else:
        hosp_len.append(temp)
    #hosp_len.append(delta.total_seconds()/(60*60*24))

mimic['hospital_los'] = hosp_len

# Rename los to icu_los.
mimic = mimic.rename(columns = {'los': 'icu_los'})

mimic.head()

Unnamed: 0,subject_id,hadm_id,stay_id,admittime,dischtime,intime,outtime,admission_location,discharge_location,first_careunit,...,insurance,age,gender,ethnicity,icu_los,code,readmit,year,admit_ed,hospital_los
0,11218297,29892836,30894115,2161-06-17T15:35:00,2161-07-17T13:57:00,2161-06-17T15:44:00,2161-07-02T17:33:44,EMERGENCY ROOM,PSYCH FACILITY,Trauma SICU (TSICU),...,Other,65,M,WHITE,15.076204,Full code,1,2161,1,15.082454
1,18633699,26578317,39203081,2113-05-06T16:49:00,2113-05-19T13:44:00,2113-05-06T17:03:00,2113-05-10T21:00:58,EMERGENCY ROOM,,Trauma SICU (TSICU),...,Other,74,M,WHITE,4.165255,Full code,1,2113,1,4.174977
2,16571396,21265242,39226730,2146-10-17T06:39:00,2146-10-24T14:00:00,2146-10-17T08:23:00,2146-10-18T17:45:31,EMERGENCY ROOM,OTHER FACILITY,Trauma SICU (TSICU),...,Medicaid,53,M,WHITE,1.390637,Full code,1,2146,1,1.462859
3,14207241,28533530,33400289,2118-05-28T03:27:00,2118-06-09T15:30:00,2118-05-28T05:01:00,2118-05-30T18:52:46,EMERGENCY ROOM,PSYCH FACILITY,Medical Intensive Care Unit (MICU),...,Medicaid,42,F,UNKNOWN,2.577616,Full code,1,2118,1,2.642894
4,16280495,23168926,37779710,2164-01-10T01:57:00,2164-01-15T13:11:00,2164-01-10T15:22:22,2164-01-11T21:48:55,EMERGENCY ROOM,AGAINST ADVICE,Medical Intensive Care Unit (MICU),...,Medicaid,40,M,WHITE,1.268438,Full code,1,2164,1,1.82772


In [6]:
### Keep necessary cols.
mimic = mimic[['subject_id', 'hadm_id', 'stay_id', 'admittime', 'dischtime', 'intime', 'outtime', 'age', 'gender', 'ethnicity', 'icu_los', 'hospital_los', 'admit_ed', 'readmit']]
mimic.head()


Unnamed: 0,subject_id,hadm_id,stay_id,admittime,dischtime,intime,outtime,age,gender,ethnicity,icu_los,hospital_los,admit_ed,readmit
0,11218297,29892836,30894115,2161-06-17T15:35:00,2161-07-17T13:57:00,2161-06-17T15:44:00,2161-07-02T17:33:44,65,M,WHITE,15.076204,15.082454,1,1
1,18633699,26578317,39203081,2113-05-06T16:49:00,2113-05-19T13:44:00,2113-05-06T17:03:00,2113-05-10T21:00:58,74,M,WHITE,4.165255,4.174977,1,1
2,16571396,21265242,39226730,2146-10-17T06:39:00,2146-10-24T14:00:00,2146-10-17T08:23:00,2146-10-18T17:45:31,53,M,WHITE,1.390637,1.462859,1,1
3,14207241,28533530,33400289,2118-05-28T03:27:00,2118-06-09T15:30:00,2118-05-28T05:01:00,2118-05-30T18:52:46,42,F,UNKNOWN,2.577616,2.642894,1,1
4,16280495,23168926,37779710,2164-01-10T01:57:00,2164-01-15T13:11:00,2164-01-10T15:22:22,2164-01-11T21:48:55,40,M,WHITE,1.268438,1.82772,1,1


In [7]:
# Nonlab features processing.
mimic_features_nonlab = mimic_features_nonlab.rename(columns = {'xr': 'xray', 'ekg': 'ecg'}) # Rename nonlab features.
mimic_features_nonlab = mimic_features_nonlab.fillna(0) # Fill NAs as 0 for count data. 
mimic_features_nonlab['transfusions'] = mimic_features_nonlab['transfusions'] + mimic_features_nonlab['massive_transfusions'] # Adjust transfusion feature.
mimic_features_nonlab = mimic_features_nonlab.drop(['massive_transfusions'], axis=1) # Remove massive transfusions.

mimic_features_nonlab.head() # Display nonlab features.

Unnamed: 0,subject_id,gcs_max,gcs_min,gcs_last,intub_days,extub_hrs,diuretics,antihypertensives,inotrope_pressor,sedatives,ecg,cxr,xray,ct,mri,filtration,transfusions
0,15667867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0
1,11693046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,0.0,0.0,0.0,0.0,0.0,0,0.0
2,14818825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
3,11395095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0,2.0
4,10249829,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0,0.0


In [8]:
mimic_features_nonlab.describe()

Unnamed: 0,subject_id,gcs_max,gcs_min,gcs_last,intub_days,extub_hrs,diuretics,antihypertensives,inotrope_pressor,sedatives,ecg,cxr,xray,ct,mri,filtration,transfusions
count,13841.0,13841.0,13841.0,13841.0,13841.0,13841.0,13841.0,13841.0,13841.0,13841.0,13841.0,13841.0,13841.0,13841.0,13841.0,13841.0,13841.0
mean,14950520.0,14.549527,8.776533,13.955422,1.081714,29.481107,1.638321,0.874865,0.649953,3.117766,0.714905,1.780363,1.958746,0.45582,0.129398,0.028683,0.932375
std,2883794.0,1.664668,5.123587,2.716885,2.934258,73.957472,3.277435,1.758038,1.547677,4.113105,1.157253,3.072278,3.339523,1.094628,0.400257,0.16692,2.715518
min,10001720.0,0.0,0.0,0.0,-4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12478390.0,15.0,3.0,15.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,14951130.0,15.0,8.0,15.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
75%,17422000.0,15.0,14.0,15.0,1.0,32.0,2.0,1.0,1.0,4.0,1.0,2.0,2.0,1.0,0.0,0.0,1.0
max,19999990.0,15.0,15.0,15.0,54.0,1489.0,67.0,26.0,27.0,51.0,22.0,61.0,67.0,20.0,6.0,1.0,57.0


In [9]:
mimic_labs_last.head()

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,feature,last_val
0,13681438,24548459,37064397,2110-06-08 02:49:00,AST,106.0
1,13681438,24548459,37064397,2110-06-08 18:26:00,lac,4.2
2,13681438,24548459,37064397,2110-05-28 20:51:00,lipase,358.0
3,13681438,24548459,37064397,2110-06-09 22:00:00,rr,20.0
4,13681438,24548459,37064397,2110-06-08 02:49:00,tbili,20.9


In [10]:
mimic_last = mimic_labs_last.pivot_table('last_val', ['subject_id', 'hadm_id', 'stay_id'], 'feature')
mimic_last.reset_index(drop = False, inplace = True)
mimic_last = mimic_last.rename_axis(None, axis=1)
#mimic_last.drop(mimic_last.columns[[19, 20, 21]], axis = 1, inplace = True)
mimic_last = mimic_last.drop(['ht','transfusion_consent','wgt'], axis = 1)
mimic_last.columns = ['subject_id', 'hadm_id', 'stay_id', 'alt_last', 'ast_last', 'alkphos_last', 'ammonia_last', 'anion_gap_last', 'hco3_last', 'bnp_last', 'bun_last', 'cl_last', 'ck_last', 'cortisol_last', 'cr_last', 'crp_last', 'dbili_last', 'esr_last', 'fio2_last', 'glu_last', 'hgb_last', 'hr_last', 'inr_last', 'k_last', 'lac_last', 'lipase_last', 'map_last', 'na_last', 'ph_last', 'paco2_last', 'pao2_last', 'plt_last', 'rass_last', 'rr_last', 'sao2_last', 'spo2_last', 'tbili_last', 'temp_last', 'tropT_last', 'wbc_last']
print(mimic_last.shape)
mimic_last.head()

(13841, 40)


Unnamed: 0,subject_id,hadm_id,stay_id,alt_last,ast_last,alkphos_last,ammonia_last,anion_gap_last,hco3_last,bnp_last,...,pao2_last,plt_last,rass_last,rr_last,sao2_last,spo2_last,tbili_last,temp_last,tropT_last,wbc_last
0,10001725,25563031,31205490,,,,,16.0,24.0,,...,,299.0,1.0,23.0,,97.0,,98.4,,20.1
1,10002013,23581541,39060235,,,,,12.0,23.0,,...,102.0,248.0,1.0,14.0,97.0,99.0,,97.8,,17.9
2,10002223,22494570,39638202,,,,,14.0,26.0,,...,,130.0,0.0,19.0,,99.0,,97.6,,10.1
3,10002428,28662225,33987268,,,,,20.0,16.0,,...,,230.0,0.0,22.0,,98.0,,97.5,,20.4
4,10002527,29112696,37121704,,,,,9.0,25.0,,...,107.0,183.0,,26.0,96.0,97.0,,98.4,,10.5


In [11]:
mimic_last.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'alt_last', 'ast_last',
       'alkphos_last', 'ammonia_last', 'anion_gap_last', 'hco3_last',
       'bnp_last', 'bun_last', 'cl_last', 'ck_last', 'cortisol_last',
       'cr_last', 'crp_last', 'dbili_last', 'esr_last', 'fio2_last',
       'glu_last', 'hgb_last', 'hr_last', 'inr_last', 'k_last', 'lac_last',
       'lipase_last', 'map_last', 'na_last', 'ph_last', 'paco2_last',
       'pao2_last', 'plt_last', 'rass_last', 'rr_last', 'sao2_last',
       'spo2_last', 'tbili_last', 'temp_last', 'tropT_last', 'wbc_last'],
      dtype='object')

In [12]:
mimic_labs_max.head()

Unnamed: 0,subject_id,hadm_id,stay_id,feature,max_val
0,13681438,24548459,37064397,temp,100.8
1,13681438,24548459,37064397,rr,59.0
2,13681438,24548459,37064397,rass,3.0
3,13681438,24548459,37064397,spo2,100.0
4,13681438,24548459,37064397,fio2,100.0


In [13]:
mimic_max = mimic_labs_max.pivot_table('max_val', ['subject_id', 'hadm_id', 'stay_id'], 'feature')

mimic_max.reset_index(drop = False, inplace = True)
mimic_max = mimic_max.rename_axis(None, axis=1)
#mimic_max.drop(mimic_max.columns[[19, 20, 21]], axis = 1, inplace = True)
mimic_max = mimic_max.drop(['hgb','ht','transfusion_consent','wgt'], axis = 1)
#df1 = df1.drop(['B', 'C'], axis=1)
mimic_max.columns = ['subject_id', 'hadm_id', 'stay_id', 'alt_max', 'ast_max', 'alkphos_max', 'ammonia_max', 'anion_gap_max', 'hco3_max', 'bnp_max', 'bun_max', 'cl_max', 'ck_max', 'cortisol_max', 'cr_max', 'crp_max', 'dbili_max', 'esr_max', 'fio2_max', 'glu_max', 'hr_max', 'inr_max', 'k_max', 'lac_max', 'lipase_max', 'map_max', 'na_max', 'ph_max', 'paco2_max', 'pao2_max', 'plt_max', 'rass_max', 'rr_max', 'sao2_max', 'spo2_max', 'tbili_max', 'temp_max', 'tropT_max', 'wbc_max']
print(mimic_max.shape)
mimic_max.head()

(13841, 39)


Unnamed: 0,subject_id,hadm_id,stay_id,alt_max,ast_max,alkphos_max,ammonia_max,anion_gap_max,hco3_max,bnp_max,...,pao2_max,plt_max,rass_max,rr_max,sao2_max,spo2_max,tbili_max,temp_max,tropT_max,wbc_max
0,10001725,25563031,31205490,,,,,16.0,24.0,,...,,330.0,1.0,23.0,,100.0,,98.4,,20.1
1,10002013,23581541,39060235,,,,,12.0,23.0,,...,462.0,254.0,1.0,23.0,99.0,100.0,,97.8,,20.2
2,10002223,22494570,39638202,,,,,14.0,26.0,,...,,130.0,0.0,32.0,,100.0,,98.6,,10.1
3,10002428,28662225,33987268,,,,,21.0,20.0,,...,,299.0,0.0,34.0,,100.0,,102.9,,33.4
4,10002527,29112696,37121704,,,,,9.0,25.0,,...,389.0,183.0,,30.0,96.0,100.0,,100.0,,10.5


In [14]:
mimic_max.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'alt_max', 'ast_max', 'alkphos_max',
       'ammonia_max', 'anion_gap_max', 'hco3_max', 'bnp_max', 'bun_max',
       'cl_max', 'ck_max', 'cortisol_max', 'cr_max', 'crp_max', 'dbili_max',
       'esr_max', 'fio2_max', 'glu_max', 'hr_max', 'inr_max', 'k_max',
       'lac_max', 'lipase_max', 'map_max', 'na_max', 'ph_max', 'paco2_max',
       'pao2_max', 'plt_max', 'rass_max', 'rr_max', 'sao2_max', 'spo2_max',
       'tbili_max', 'temp_max', 'tropT_max', 'wbc_max'],
      dtype='object')

In [15]:
mimic_labs_min.head()

Unnamed: 0,subject_id,hadm_id,stay_id,feature,min_val
0,10001725,25563031,31205490,hr,55.0
1,10001725,25563031,31205490,anion_gap,14.0
2,10001725,25563031,31205490,spo2,94.0
3,10001725,25563031,31205490,rr,13.0
4,10001725,25563031,31205490,rass,0.0


In [16]:
mimic_min = mimic_labs_min.pivot_table('min_val', ['subject_id', 'hadm_id', 'stay_id'], 'feature')

mimic_min.reset_index(drop = False, inplace = True)
mimic_min = mimic_min.rename_axis(None, axis=1)
#mimic_min.drop(mimic_min.columns[[19, 20, 21]], axis = 1, inplace = True)
mimic_min = mimic_min.drop(['ht','transfusion_consent','wgt'], axis = 1)
mimic_min.columns = ['subject_id', 'hadm_id', 'stay_id', 'alt_min', 'ast_min', 'alkphos_min', 'ammonia_min', 'anion_gap_min', 'hco3_min', 'bnp_min', 'bun_min', 'cl_min', 'ck_min', 'cortisol_min', 'cr_min', 'crp_min', 'dbili_min', 'esr_min', 'fio2_min', 'glu_min', 'hgb_min', 'hr_min', 'inr_min', 'k_min', 'lac_min', 'lipase_min', 'map_min', 'na_min', 'ph_min', 'paco2_min', 'pao2_min', 'plt_min', 'rass_min', 'rr_min', 'sao2_min', 'spo2_min', 'tbili_min', 'temp_min', 'tropT_min', 'wbc_min']
print(mimic_min.shape)
mimic_min.head()

(13841, 40)


Unnamed: 0,subject_id,hadm_id,stay_id,alt_min,ast_min,alkphos_min,ammonia_min,anion_gap_min,hco3_min,bnp_min,...,pao2_min,plt_min,rass_min,rr_min,sao2_min,spo2_min,tbili_min,temp_min,tropT_min,wbc_min
0,10001725,25563031,31205490,,,,,14.0,24.0,,...,,299.0,0.0,13.0,,94.0,,97.5,,17.0
1,10002013,23581541,39060235,,,,,12.0,23.0,,...,90.0,248.0,0.0,0.0,96.0,91.0,,97.2,,17.9
2,10002223,22494570,39638202,,,,,14.0,26.0,,...,,130.0,-1.0,12.0,,92.0,,96.9,,10.1
3,10002428,28662225,33987268,,,,,15.0,15.0,,...,,230.0,0.0,12.0,,87.0,,97.5,,20.4
4,10002527,29112696,37121704,,,,,9.0,23.0,,...,107.0,151.0,,0.0,96.0,92.0,,98.4,,7.0


In [17]:
mimic_min.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'alt_min', 'ast_min', 'alkphos_min',
       'ammonia_min', 'anion_gap_min', 'hco3_min', 'bnp_min', 'bun_min',
       'cl_min', 'ck_min', 'cortisol_min', 'cr_min', 'crp_min', 'dbili_min',
       'esr_min', 'fio2_min', 'glu_min', 'hgb_min', 'hr_min', 'inr_min',
       'k_min', 'lac_min', 'lipase_min', 'map_min', 'na_min', 'ph_min',
       'paco2_min', 'pao2_min', 'plt_min', 'rass_min', 'rr_min', 'sao2_min',
       'spo2_min', 'tbili_min', 'temp_min', 'tropT_min', 'wbc_min'],
      dtype='object')

In [18]:
mimic_labs_counts_low.head()

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,value,valuenum,valueuom,feature,item_id,intime,outtime
0,13681438,24548459,37064397,2110-05-31 14:00:00,0.0,0.0,insp/min,rr,224689,2110-05-28 20:06:54,2110-06-10 00:57:45
1,13681438,24548459,37064397,2110-06-03 12:00:00,0.0,0.0,insp/min,rr,224689,2110-05-28 20:06:54,2110-06-10 00:57:45
2,13681438,24548459,37064397,2110-05-28 20:51:00,19.0,19.0,mEq/L,bicarb,227443,2110-05-28 20:06:54,2110-06-10 00:57:45
3,13681438,24548459,37064397,2110-06-02 03:18:00,80.0,80.0,K/uL,plt,227457,2110-05-28 20:06:54,2110-06-10 00:57:45
4,13681438,24548459,37064397,2110-06-08 02:49:00,69.0,69.0,K/uL,plt,227457,2110-05-28 20:06:54,2110-06-10 00:57:45


In [19]:
mimic_labs_low = mimic_labs_counts_low.pivot_table('valuenum', ['subject_id', 'hadm_id', 'stay_id'], 'feature', aggfunc = 'count')

mimic_labs_low.reset_index(drop = False, inplace = True)
mimic_labs_low = mimic_labs_low.rename_axis(None, axis=1)
#mimic_labs_low.drop(labs_low.columns[[19, 20, 21]], axis = 1, inplace = True)
mimic_labs_low.columns = ['subject_id', 'hadm_id', 'stay_id', 'hco3_low', 'hgb_low', 'hr_low', 'k_low', 'map_low', 'na_low', 'plt_low', 'rr_low', 'temp_low', 'wbc_low']
mimic_labs_low = mimic_labs_low.fillna(0)
print(mimic_labs_low.shape)
mimic_labs_low.head()

(12312, 13)


Unnamed: 0,subject_id,hadm_id,stay_id,hco3_low,hgb_low,hr_low,k_low,map_low,na_low,plt_low,rr_low,temp_low,wbc_low
0,10002013,23581541,39060235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
1,10002223,22494570,39638202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,10002428,28662225,33987268,8.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
3,10002527,29112696,37121704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
4,10002760,28094813,31831386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0


In [20]:
mimic_labs_low.describe()

Unnamed: 0,subject_id,hadm_id,stay_id,hco3_low,hgb_low,hr_low,k_low,map_low,na_low,plt_low,rr_low,temp_low,wbc_low
count,12312.0,12312.0,12312.0,12312.0,12312.0,12312.0,12312.0,12312.0,12312.0,12312.0,12312.0,12312.0,12312.0
mean,14953900.0,25002620.0,34963910.0,1.8404,1.335201,1.477989,0.023554,0.048571,0.509503,1.593242,9.710201,1.955084,0.294022
std,2879168.0,2894882.0,2889377.0,4.517778,3.842839,7.872316,0.311318,0.869942,2.712961,5.382132,21.409131,6.257181,1.638761
min,10002010.0,20001360.0,30000650.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12490660.0,22487620.0,32449160.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,14964010.0,24972550.0,34950680.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
75%,17418070.0,27522430.0,37451960.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,9.0,2.0,0.0
max,19999990.0,29999620.0,39999230.0,95.0,79.0,385.0,25.0,55.0,71.0,142.0,867.0,235.0,51.0


In [21]:
mimic_labs_counts_high.head()

Unnamed: 0,subject_id,hadm_id,stay_id,feature,valuenum,valueuom
0,11750559,29378082,30114654,rr,24.0,insp/min
1,11750559,29378082,30114654,rr,21.0,insp/min
2,11750559,29378082,30114654,rr,23.0,insp/min
3,11750559,29378082,30114654,rr,28.0,insp/min
4,11750559,29378082,30114654,rr,24.0,insp/min


In [22]:
# BQ import duplicated.
mimic_labs_counts_high2.equals(mimic_labs_counts_high)


False

In [23]:
mimic_labs_high = mimic_labs_counts_high.pivot_table('valuenum', ['subject_id', 'hadm_id', 'stay_id'], 'feature', aggfunc = 'count')

mimic_labs_high.reset_index(drop = False, inplace = True)
mimic_labs_high = mimic_labs_high.rename_axis(None, axis=1)
#labs_low.drop(labs_low.columns[[19, 20, 21]], axis = 1, inplace = True)
# took out map_high. need to check map query.
mimic_labs_high.columns = ['subject_id', 'hadm_id', 'stay_id', 'hco3_high', 'bnp_high', 'bun_high', 'crp_high', 'esr_high', 'glu_high', 'hr_high', 'k_high', 'lac_high', 'na_high', 'rr_high', 'temp_high', 'tropT_high', 'wbc_high']
mimic_labs_high = mimic_labs_high.fillna(0)
print(mimic_labs_high.shape)
mimic_labs_high.head()

ValueError: Length mismatch: Expected axis has 18 elements, new values have 17 elements

In [None]:
mimic_labs_high = mimic_labs_counts_high.pivot_table('valuenum', ['subject_id', 'hadm_id', 'stay_id'], 'feature', aggfunc = 'count')

mimic_labs_high.reset_index(drop = False, inplace = True)
mimic_labs_high = mimic_labs_high.rename_axis(None, axis=1)
#mimic_labs_high.drop(labs_low.columns[[19, 20, 21]], axis = 1, inplace = True)
mimic_labs_high.columns = ['subject_id', 'hadm_id', 'stay_id', 'hco3_high', 'bnp_high', 'bun_high', 'crp_high', 'esr_high', 'glu_high', 'hr_high', 'k_high', 'lac_high', 'map_high', 'na_high', 'rr_high', 'temp_high', 'trop_high', 'wbc_high']
mimic_labs_high = mimic_labs_high.fillna(0)
print(mimic_labs_high.shape)
mimic_labs_high.head()

### STARR processing.

In [None]:
### Load files. 
starr = pd.read_csv('starr_cohort.csv', header = 0)
starr_labs_counts = pd.read_csv('starr_labs_counts.csv', header = 0)
starr_labs_last = pd.read_csv('starr_labs_last.csv', header = 0)
starr_labs_minmax = pd.read_csv('starr_labs_minmax.csv', header = 0)
starr_nonlab_features = pd.read_csv('starr_nonlab_features.csv', header = 0)

print('starr cohort: ', starr.shape)
print('starr lab counts: ', starr_labs_counts.shape)
print('starr last labs: ', starr_labs_last.shape)
print('starr min/max labs: ', starr_labs_minmax.shape)
print('starr nonlab features: ', starr_nonlab_features.shape)



In [None]:
from datetime import datetime 

starr['year'] = pd.DatetimeIndex(starr['icu_admit_time']).year
starr.year.value_counts().sort_index()

In [None]:
starr.head()

In [None]:
starr_labs_counts.head()

In [None]:
starr_labs_counts.isna().sum()

In [None]:
starr_labs_counts = starr_labs_counts.drop(['admit_time', 'trans_out_time', 'disch_time'], axis=1)
starr_labs_counts = starr_labs_counts.fillna(0)
starr_labs_counts.head()

In [None]:
starr_labs_counts.isna().sum()

In [None]:
starr_labs_last = starr_labs_last.drop(['admit_time', 'trans_out_time', 'disch_time'], axis=1)
starr_labs_last.head()

In [None]:
starr_labs_minmax = starr_labs_minmax.drop(['admit_time', 'trans_out_time', 'disch_time'], axis=1)
starr_labs_minmax.head()

In [None]:
starr_nonlab_features.head()

In [None]:
starr_nonlab_features.isna().sum()

In [None]:
starr_nonlab_features = starr_nonlab_features.fillna(0)
starr_nonlab_features = starr_nonlab_features.drop(['hospital_los', 'icu_los', 'age', 'readmit', 'icu_admit_time', 'trans_out_time', 'disch_time'], axis=1)
starr_nonlab_features.head()

In [None]:
starr_nonlab_features.isna().sum()

### Sync MIMIC + STARR features.

In [None]:
##### Sync MIMIC + STARR.
### Nonlab features.
# icu_los, hospital_los, admit_ed, gender, age, ethnicity, ecg, xray, cxr, mri, ct, diuretics, antihypertensives, inotrope_pressor, sedatives, intub_days, extub_hrs
starr_nonlab_features.isna().sum()
mimic_features_nonlab.isna().sum()
mimic.isna().sum()
starr.isna().sum()

mimic_features_nonlab = mimic_features_nonlab.drop(['filtration', 'transfusions'], axis = 1)


In [None]:
starr_labs_last.isna().sum()

In [None]:
##### Sync MIMIC + STARR.
### Last labs.
# alt, ast, hco3, bnp, bun, ck, cr, crp, dbili, esr, fio2, inr, hr, k, lac, ph, lipase, na, map, pao2, paco2, plt, rr, sao2, spo2, temp, tbili, wbc, trop, hgb, gcs, pafio2

starr_labs_last.isna().sum()
mimic_last.isna().sum()
#mimic_features_nonlab.isna().sum()

starr_labs_last = starr_labs_last.drop(['bmi_last', 'sao2_time', 'spo2_time', 'ibili_last'], axis = 1) #ibili, bmi, sao2_time, spo2_time

mimic_last = mimic_last.rename(columns = {'tropT_last': 'trop_last', 'pH':'ph'})
mimic_last = mimic_last.drop(['alkphos_last', 'ammonia_last', 'anion_gap_last', 'cl_last', 'cortisol_last', 'glu_last'], axis = 1) # alkphos, ammonia, anion_gap, cl, cortisol, glu
mimic_last['pafio2_last'] = mimic_last['pao2_last']/mimic_last['fio2_last']





In [None]:
##### Sync MIMIC + STARR.
### Min/max labs.
# max: alt, ast, hco3, bnp, bun, ck, cr, crp, esr, dbili, hr, k, inr, cortisol, lipase, lac, map, ph, na, paco2, pao2, rr, spo2, sao2, tbili, trop, temp, wbc, gcs 
# min: gcs, hco3, cort, hr, inr, k, map, ph, na, paco2, pao2, plt, rr, spo2, sao2, temp, wbc, hgb

starr_labs_minmax.isna().sum()
mimic_max.isna().sum()
mimic_features_nonlab.isna().sum()
mimic_min.isna().sum()

starr_labs_minmax = starr_labs_minmax.drop(['hgb_max', 'plt_max', 'bmi_max', 'bmi_min', 'ibili_max', 'tsh_min'], axis = 1)
starr_labs_minmax = starr_labs_minmax.rename(columns = {'cort_max':'cortisol_max', 'cort_min': 'cortisol_min'})
mimic_max = mimic_max.drop(['alkphos_max', 'ammonia_max', 'anion_gap_max', 'cl_max', 'fio2_max', 'glu_max', 'plt_max'], axis = 1)
mimic_max = mimic_max.rename(columns = {'bicarb_max': 'hco3_max', 'pH': 'ph', 'tropT_max':'trop_max'})
# Add min glucose?
mimic_min = mimic_min.drop(['alt_min', 'ast_min', 'alkphos_min', 'ammonia_min', 'anion_gap_min', 'bnp_min','bun_min', 'cl_min', 'ck_min', 'cr_min', 'crp_min', 'dbili_min', 'esr_min', 'fio2_min', 'glu_min', 'lac_min', 'lipase_min', 'tbili_min', 'tropT_min'], axis = 1)
mimic_min = mimic_min.rename(columns = {'pH':'ph'})


In [None]:
##### Sync MIMIC + STARR
### Lab Counts: 
# high: hco3, bnp, bun, inflamm_marker, glu, hr, k, na, lac, temp, trop, wbc, map
# low: hco3, hgb, map, hr, k, plt, na, wbc, temp
starr_labs_counts.isna().sum()
mimic_labs_high.isna().sum()
mimic_labs_low.isna().sum()


# Adjust mimic high lab counts.
mimic_labs_high['inflamm_marker'] = mimic_labs_high['crp_high'] + mimic_labs_high['esr_high']
mimic_labs_high = mimic_labs_high.drop(['crp_high', 'esr_high', 'rr_high'], axis = 1)
mimic_labs_high['map_high'] = 0
mimic_labs_high = mimic_labs_high.rename(columns = {'tropT_high': 'trop_high'})

# Adjust mimic low lab counts.
mimic_labs_low = mimic_labs_low.drop(['rr_low'], axis = 1)

In [None]:
### MIMIC full dataset.

print('mimic cohort: ', mimic.shape)
print('mimic nonlab features: ', mimic_features_nonlab.shape)
print('mimic last labs: ', mimic_last.shape)
print('mimic max labs: ', mimic_max.shape)
print('mimic min labs', mimic_min.shape)
print('mimic low counts labs: ', mimic_labs_low.shape)
print('mimic high counts labs: ', mimic_labs_high.shape)


merge1 = pd.merge(mimic, mimic_features_nonlab, how = 'left', on = ['subject_id'])
merge2 = pd.merge(merge1, mimic_last, how = 'left', on = ['subject_id', 'hadm_id', 'stay_id'])
merge3 = pd.merge(merge2, mimic_max, how = 'left', on = ['subject_id', 'hadm_id', 'stay_id'])
merge4 = pd.merge(merge3, mimic_min, how = 'left', on = ['subject_id', 'hadm_id', 'stay_id'])
merge5 = pd.merge(merge4, mimic_labs_low, how = 'left', on = ['subject_id', 'hadm_id', 'stay_id'])
mimic_final = pd.merge(merge5, mimic_labs_high, how = 'left', on = ['subject_id', 'hadm_id', 'stay_id'])


In [None]:
mimic_final = mimic_final.drop(['hadm_id', 'stay_id', 'admittime', 'dischtime', 'intime', 'outtime'], axis = 1)
print('mimic final cohort: ', mimic_final.shape)
mimic_final.head()

In [None]:
mimic_final.describe()

In [None]:
### STARR full dataset.

starr_merge = pd.merge(starr, starr_nonlab_features, how = 'left', on = ['id'])
starr_merge2 = pd.merge(starr_merge, starr_labs_counts, how = 'left', on = ['id'])
starr_merge3 = pd.merge(starr_merge2, starr_labs_last, how = 'left', on = ['id'])
starr_final = pd.merge(starr_merge3, starr_labs_minmax, how = 'left', on = ['id'])


In [None]:
starr_final = starr_final.drop(['icu_admit_time', 'trans_out_time', 'disch_time'], axis = 1)
print('starr final cohort: ', starr_final.shape)
starr_final.head()

In [None]:
starr_final.describe()

In [None]:
feature_order = ['age', 'gender', 'ethnicity', 'icu_los', 'hospital_los', 'admit_ed', 
                 'ecg', 'xray', 'cxr', 'mri', 'ct', 'intub_days', 'extub_hrs', 
                 'diuretics', 'antihypertensives', 'inotrope_pressor', 'sedatives', 
                 'alt_last', 'ast_last', 'hco3_last', 'bnp_last', 'bun_last', 'ck_last', 'cr_last', 'crp_last', 
                 'dbili_last', 'esr_last', 'fio2_last', 'inr_last', 'hr_last', 'k_last', 'lac_last', 'ph_last', 
                 'lipase_last', 'na_last', 'map_last', 'pao2_last', 'paco2_last', 'plt_last', 'rr_last', 'sao2_last', 
                 'spo2_last', 'temp_last', 'tbili_last', 'wbc_last', 'trop_last', 'hgb_last', 'gcs_last', 'pafio2_last',
                 'alt_max', 'ast_max', 'hco3_max', 'bnp_max', 'bun_max', 'ck_max', 'cr_max', 'crp_max', 'esr_max', 
                 'dbili_max', 'hr_max', 'k_max', 'inr_max', 'cortisol_max', 'lipase_max', 'lac_max', 'map_max', 'ph_max',
                 'na_max', 'paco2_max', 'pao2_max', 'rr_max', 'spo2_max', 'sao2_max', 'tbili_max', 'trop_max', 'temp_max',
                 'wbc_max', 'gcs_max', 'gcs_min', 'hco3_min', 'cortisol_min', 'hr_min', 'inr_min', 'k_min', 'map_min', 'ph_min', 
                 'na_min', 'paco2_min', 'pao2_min', 'plt_min', 'rr_min', 'spo2_min', 'sao2_min', 'temp_min', 'wbc_min', 
                 'hgb_min', 'hco3_high', 'bnp_high', 'bun_high', 'inflamm_marker', 'glu_high', 'hr_high', 'k_high', 'na_high', 
                 'lac_high', 'temp_high', 'trop_high', 'wbc_high', 'map_high', 'hco3_low', 'hgb_low', 'map_low', 'hr_low', 'k_low', 
                 'plt_low', 'na_low', 'wbc_low', 'temp_low']


In [None]:
#mimic_final[feature_order]
mimic_final.columns.values

In [None]:
starr_final = starr_final.rename(columns = {'pafio2': 'pafio2_last'})
#starr_final[feature_order]
starr_final.columns.values

### Sync MIMIC + STARR feature format/units.

Ethnicity: White, AAPI, Black, Native American, Unknown, Other

In [None]:
mimic_final['ethnicity'].value_counts()

In [None]:
starr_final['ethnicity'].value_counts()

In [None]:
starr_final['race'].value_counts()
#starr_final.columns.values

In [None]:
#mimic_final['gender'][mimic_final.gender == 'M'] = 1
#mimic_final['gender'][mimic_final.gender == 'F'] = 0

temp_gender = []
temp = []
for i in range(mimic_final.shape[0]):
    if mimic_final['gender'][i] == 'M':
        temp_gender.append(1)
    else:
        temp_gender.append(0)
    if mimic_final['ethnicity'][i] == 'WHITE':
        temp.append(1)
    elif mimic_final['ethnicity'][i] == 'BLACK/AFRICAN AMERICAN':
        temp.append(2)
    elif mimic_final['ethnicity'][i] == 'ASIAN':
        temp.append(3)
    elif mimic_final['ethnicity'][i] == 'HISPANIC/LATINO':
        temp.append(4)
    elif mimic_final['ethnicity'][i] == 'AMERICAN INDIAN/ALASKA NATIVE':
        temp.append(5)
    elif mimic_final['ethnicity'][i] == 'OTHER':
        temp.append(6)
    else: 
        temp.append(7)
mimic_final['ethnicity'] = temp
mimic_final['gender'] = temp_gender

mimic_final.head()

In [None]:
# STARR ethnicity
temp = []
for i in range(starr_final.shape[0]):
    if starr_final['ethnicity'][i] == 'Hispanic/Latino':
        temp.append(4)
    else:
        if starr_final['race'][i] == 'White':
            temp.append(1)
        elif starr_final['race'][i] == 'Black':
            temp.append(2)
        elif starr_final['race'][i] == 'Asian':
            temp.append(3)
        elif starr_final['race'][i] == 'Pacific Islander':
            temp.append(3)
        elif starr_final['race'][i] == 'Native American':
            temp.append(5)
        elif starr_final['race'][i] == 'Other':
            temp.append(6)
        else: 
            temp.append(7)
            
starr_final = starr_final.drop(['ethnicity', 'race'], axis = 1)
starr_final['ethnicity'] = temp

starr_final.head()


In [None]:
print(starr_final.shape)
print(mimic_final.shape)

In [None]:
### Write results to csv file.
# name of csv file 
filename = "starr.csv"
    
# writing to csv file 
with open(filename, 'w') as csvfile: 
    # creating a csv writer object 
    csvwriter = csv.writer(csvfile) 
        
    # writing the header
    csvwriter.writerow(starr_final.columns) 
        
    # writing the data rows 
    for i in range(starr_final.shape[0]):
        csvwriter.writerow(starr_final.iloc[i])

In [None]:
### Write results to csv file.
# name of csv file 
filename = "mimic.csv"
    
# writing to csv file 
with open(filename, 'w') as csvfile: 
    # creating a csv writer object 
    csvwriter = csv.writer(csvfile) 
        
    # writing the header
    csvwriter.writerow(mimic_final.columns) 
        
    # writing the data rows 
    for i in range(mimic_final.shape[0]):
        csvwriter.writerow(mimic_final.iloc[i])

### MICE

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
test_proportion = 0.2
# val_proportion is in relation to the train set
# that is 1.0 is the entire train set, not the entire data set
val_proportion = 0.25

# to make sure features from MIMIC and STARR dataframes are in the same order
feature_order = ["intub_days", "female", "ED_admit", "COPD_hx", "gcs", "lactate", "rass", 
                 "temp_f", "total_los_binned", "icu_los_binned", "age_binned", 
                 "hct_binned", "wbc_binned", "inr_binned", "cr_binned", "pafio2_binned", 
                 "pao2_binned", "plt_binned", "sao2_binned", "hr_binned", "map_binned",
                 "race_numeric"]

X = starr_data.drop(columns=["id", "readmit"]+cols_binned+cols_numeric).astype("float")
X = X[feature_order]
y = starr_data[["readmit"]]

# split into train+val and test
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=test_proportion, random_state=seed)

# split into train and val 
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train, test_size=val_proportion, random_state=seed)

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# X = features, y = readmit
# remove subject_id, id
# split into train, validate, test set

# define model
model = RandomForestClassifier()

# define imputer
imputer = IterativeImputer() # default is described below
imputer = IterativeImputer(estimator=BayesianRidge(), n_nearest_features=None, imputation_order='ascending')

# define pipeline
pipeline = Pipeline(steps=[('i', imputer), ('m', model)])

# define model evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# evaluate model
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

# fit on the dataset
imputer.fit(X)
# transform the dataset
Xtrans = imputer.transform(X)




In [None]:
### Helpful code.
mimic.loc[np.where((mimic.hospital_los == 0))]