In [6]:
import pandas as pd
import numpy as np

In [7]:
admissions = pd.read_csv('./data/admissions_ibd.csv')
diagnosis = pd.read_csv('./data/diagnoses_ibd.csv')

In [8]:
icd_codes = ['V1582', '5849', '5990', '3051', '2761', '25000', 'I10', '56089', '4280', '5866', '2875', '496', '99592', 'D649', '7850', '51881']

mapping = {
    "V1582": "Personal history of tobacco use",
    "5849": "Acute kidney failure",
    "5990": "Urinary tract infection",
    "3051": "Tobacco use disorder",
    "2761": "Hyposmolality and/or hyponatremia",
    "25000": "Diabetes mellitus",
    "I10": "Essential (primary) hypertension",
    "56089": "Other specified intestinal obstruction",
    "4280": "Congestive heart failure",
    "5866": "Long-term (current) use of aspirin",
    "2875": "Thrombocytopenia",
    "496": "Other finger(s) amputation status",
    "99592": "Severe sepsis",
    "D649": "Anemia",
    "7850": "Tachycardia",
    "51881": "Acute respiratory failure"
}

icd = pd.merge(diagnosis, admissions.loc[:, ['hadm_id', 'admittime']], on='hadm_id', how='left')

icd

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,admittime
0,10098672,21229395,1,9975,9,2142-05-16 04:04:00
1,10098672,21229395,2,5990,9,2142-05-16 04:04:00
2,10098672,21229395,3,5849,9,2142-05-16 04:04:00
3,10098672,21229395,4,5559,9,2142-05-16 04:04:00
4,10098672,21229395,5,5793,9,2142-05-16 04:04:00
...,...,...,...,...,...,...
122135,14458834,29475856,26,V5864,9,2161-06-20 16:25:00
122136,14458834,29475856,27,V5866,9,2161-06-20 16:25:00
122137,14458834,29475856,28,V1254,9,2161-06-20 16:25:00
122138,14458834,29475856,29,V1588,9,2161-06-20 16:25:00


# First ICU Record

In [11]:
data = pd.read_csv('./data_processed/data_first_record.csv', index_col=0)

print(data.shape)
print(data.head())

(652, 25)
      subject_id               intime      los  gender       age  heart_rate  \
327     10024331  2141-03-18 19:36:08  4.10571       1  0.622768   -0.922776   
1022    10025647  2176-09-22 17:57:15  1.96810       1  1.364195   -0.922776   
896     10037975  2185-01-17 19:12:12  4.87824       1 -0.225030    0.088031   
908     10056223  2122-09-23 15:08:45  5.04106       1 -0.824804    0.037490   
559     10063856  2174-03-08 01:04:16  1.37475       0 -0.473584   -2.287366   

      respiratory_rate  hematocrit       rdw  platelet  ...  language_ENGLISH  \
327          -0.562083   -0.203896 -1.315486 -0.620134  ...                 1   
1022         -0.043414    0.163687 -0.324740 -0.356911  ...                 1   
896           3.068597    1.371460  0.309338 -1.486846  ...                 1   
908           0.129475   -0.729015 -0.285110 -1.095220  ...                 0   
559          -0.389193   -0.116376 -1.355116  0.169536  ...                 1   

      race_BLACK  race

In [14]:
icd_ = icd[icd.subject_id.isin(data.subject_id)].loc[:, ['subject_id', 'icd_code', 'admittime']]

icd_ = icd_[icd_.icd_code.isin(icd_codes)]

t = pd.merge(icd_, data.loc[:, ['subject_id', 'intime']], on='subject_id', how='left')

t = t[(t.admittime < t.intime)].loc[:, ['subject_id', 'icd_code']]

p = pd.get_dummies(t, prefix='', prefix_sep='')

p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)

p = p.groupby('subject_id').max().reset_index()

p

1       0
2       0
3       0
4       0
       ..
4638    0
4639    0
4640    0
4642    0
4643    0
Name: 25000, Length: 2249, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1       0
2       0
3       0
4       0
       ..
4638    0
4639    1
4640    0
4642    0
4643    1
Name: 2761, Length: 2249, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1       0
2       0
3       1
4       0
       ..
4638    0
4639    0
4640    0
4642    0
4643    0
Name: 2875, Length: 2249, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1       0
2       0
3       0
4       0
       ..
4638    0
4639    0
4640    0
4642    0
4643    0
Name: 3051, Length: 2249, dtype: int32' has dtype incompatible with bool, please explicit

Unnamed: 0,subject_id,25000,2761,2875,3051,4280,496,51881,56089,5849,5990,7850,99592,D649,I10,V1582
0,10024331,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0
1,10025647,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
2,10037975,0,0,0,1,0,1,1,0,0,0,0,1,0,0,0
3,10056223,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1
4,10063856,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,19878969,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1
521,19890872,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1
522,19923690,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0
523,19940147,0,1,1,0,0,0,1,0,0,1,0,0,0,0,1


In [21]:
(
    data.merge(
        p,
        on='subject_id',
        how='left'
    ).fillna(0.0)
    .drop(columns=[
        'subject_id', 'intime',
        'race_BLACK', 'race_HISPANIC/LATINO', 'race_OTHER', 'race_WHITE', 
        'marital_status_SINGLE', 'marital_status_WIDOWED',
    ])
    .rename(columns=mapping)
    .to_csv('./data_processed/data_first_record_with_commorbidities.csv')
)

## Die in ICU

In [23]:
data = pd.read_csv('./data_processed/data_die_in_icu.csv', index_col=0)

print(data.shape)
print(data.head())

(139, 25)
      subject_id               intime       los  gender       age  heart_rate  \
896     10037975  2185-01-17 19:12:12  4.878240       1 -0.225030    0.088031   
1145    10135345  2131-09-05 17:16:00  0.942813       0 -0.892544   -1.124938   
447     10149929  2166-07-27 15:39:59  9.745490       1 -0.126950   -0.063590   
1072    10251182  2155-05-02 00:30:00  0.670891       1  0.179994   -0.922776   
139     10328470  2141-08-12 06:03:35  0.318137       0  0.584312    2.817210   

      respiratory_rate  hematocrit       rdw  platelet  ...  language_ENGLISH  \
896           3.068597    1.371460  0.309338 -1.486846  ...                 1   
1145         -0.389193    0.461254 -1.117337  0.631782  ...                 1   
447          -0.562083   -1.324150  0.230078 -0.774216  ...                 1   
1072         -0.389193    1.038885 -0.602149 -0.042327  ...                 1   
139           2.204149   -0.396440  0.269708  1.479233  ...                 1   

      race_BLACK

In [24]:
icd_ = icd[icd.subject_id.isin(data.subject_id)].loc[:, ['subject_id', 'icd_code', 'admittime']]

icd_ = icd_[icd_.icd_code.isin(icd_codes)]

t = pd.merge(icd_, data.loc[:, ['subject_id', 'intime']], on='subject_id', how='left')

t = t[(t.admittime < t.intime)].loc[:, ['subject_id', 'icd_code']]

p = pd.get_dummies(t, prefix='', prefix_sep='')

p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)

p = p.groupby('subject_id').max().reset_index()

p

1       1
2       0
3       0
4       0
       ..
1903    0
1904    0
1905    0
1906    0
1907    0
Name: 25000, Length: 1431, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1       0
2       0
3       0
4       0
       ..
1903    0
1904    1
1905    1
1906    0
1907    0
Name: 2761, Length: 1431, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1       0
2       0
3       0
4       0
       ..
1903    0
1904    0
1905    0
1906    0
1907    0
Name: 2875, Length: 1431, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1       0
2       0
3       0
4       0
       ..
1903    0
1904    0
1905    0
1906    0
1907    0
Name: 3051, Length: 1431, dtype: int32' has dtype incompatible with bool, please explicit

Unnamed: 0,subject_id,25000,2761,2875,3051,4280,496,51881,56089,5849,5990,7850,99592,D649,I10,V1582
0,10037975,0,0,0,1,0,1,1,0,0,0,0,1,0,0,0
1,10135345,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,10149929,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
3,10251182,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,10328470,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,19735084,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0
114,19848285,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
115,19878969,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1
116,19923690,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0


In [25]:
(
    pd.merge(
        data,
        p,
        on='subject_id',
        how='left'
    ).fillna(0.0)
    .drop(columns=[
        'subject_id', 'intime',
        'race_BLACK', 'race_HISPANIC/LATINO', 'race_OTHER', 'race_WHITE', 
        'marital_status_SINGLE', 'marital_status_WIDOWED',
    ])
    .rename(columns=mapping)
    .to_csv('./data_processed/data_die_in_icu_with_commorbidities.csv')
)

Unnamed: 0,los,gender,age,heart_rate,respiratory_rate,hematocrit,rdw,platelet,mcv,mch,...,Other finger(s) amputation status,Acute respiratory failure,Other specified intestinal obstruction,Acute kidney failure,Urinary tract infection,Tachycardia,Severe sepsis,Anemia,Essential (primary) hypertension,Personal history of tobacco use
0,4.878240,1,-0.225030,0.088031,3.068597,1.371460,0.309338,-1.486846,0.275588,0.354672,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.942813,0,-0.892544,-1.124938,-0.389193,0.461254,-1.117337,0.631782,-0.424617,-0.566844,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9.745490,1,-0.126950,-0.063590,-0.562083,-1.324150,0.230078,-0.774216,1.325896,1.784611,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.670891,1,0.179994,-0.922776,-0.389193,1.038885,-0.602149,-0.042327,-0.774720,-0.090198,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.318137,0,0.584312,2.817210,2.204149,-0.396440,0.269708,1.479233,0.392289,-0.566844,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,1.795720,1,-0.657220,0.138571,-0.562083,0.163687,-0.047331,-0.504573,0.042186,0.386449,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
135,19.829300,0,1.393125,1.250459,0.302365,-0.116376,-0.839928,0.779444,-0.424617,-0.566844,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
136,1.972990,0,1.488736,0.290192,0.302365,0.058663,-0.839928,0.394239,-0.074515,-0.407962,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
137,1.825230,0,2.190119,0.189112,-0.389193,2.614242,-0.483259,-0.048747,1.209195,0.354672,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
