In [13]:
import pandas as pd
import numpy as np

In [14]:
admissions = pd.read_csv('./data/admissions_ibd.csv')
diagnosis = pd.read_csv('./data/diagnoses_ibd.csv')

In [15]:
icd_codes = ['V1582', '5849', '5990', '3051', '2761', '25000', 'I10', '56089', '4280', '5866', '2875', '496', '99592', 'D649', '7850', '51881']

mapping = {
    "V1582": "Personal history of tobacco use",
    "5849": "Acute kidney failure",
    "5990": "Urinary tract infection",
    "3051": "Tobacco use disorder",
    "2761": "Hyposmolality and/or hyponatremia",
    "25000": "Diabetes mellitus",
    "I10": "Essential (primary) hypertension",
    "56089": "Other specified intestinal obstruction",
    "4280": "Congestive heart failure",
    "5866": "Long-term (current) use of aspirin",
    "2875": "Thrombocytopenia",
    "496": "Other finger(s) amputation status",
    "99592": "Severe sepsis",
    "D649": "Anemia",
    "7850": "Tachycardia",
    "51881": "Acute respiratory failure"
}

icd = pd.merge(diagnosis, admissions.loc[:, ['hadm_id', 'admittime']], on='hadm_id', how='left')

icd

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,admittime
0,10098672,21229395,1,9975,9,2142-05-16 04:04:00
1,10098672,21229395,2,5990,9,2142-05-16 04:04:00
2,10098672,21229395,3,5849,9,2142-05-16 04:04:00
3,10098672,21229395,4,5559,9,2142-05-16 04:04:00
4,10098672,21229395,5,5793,9,2142-05-16 04:04:00
...,...,...,...,...,...,...
122135,14458834,29475856,26,V5864,9,2161-06-20 16:25:00
122136,14458834,29475856,27,V5866,9,2161-06-20 16:25:00
122137,14458834,29475856,28,V1254,9,2161-06-20 16:25:00
122138,14458834,29475856,29,V1588,9,2161-06-20 16:25:00


# First ICU Record

In [16]:
data = pd.read_csv('./data_processed/data_first_record.csv', index_col=0)

print(data.shape)
print(data.head())

(652, 25)
      subject_id               intime      los  gender       age  heart_rate  \
327     10024331  2141-03-18 19:36:08  4.10571       1  0.684717     0.28750   
1022    10025647  2176-09-22 17:57:15  1.96810       1  0.833860     0.28750   
896     10037975  2185-01-17 19:12:12  4.87824       1  0.514176     0.41250   
908     10056223  2122-09-23 15:08:45  5.04106       1  0.393528     0.40625   
559     10063856  2174-03-08 01:04:16  1.37475       0  0.464178     0.11875   

      respiratory_rate  hematocrit       rdw  platelet  ...  language_ENGLISH  \
327               0.32    0.425234  0.484848  0.127773  ...                 1   
1022              0.38    0.474299  0.579545  0.164153  ...                 1   
896               0.74    0.635514  0.640152  0.007986  ...                 1   
908               0.40    0.355140  0.583333  0.062112  ...                 0   
559               0.34    0.436916  0.481061  0.236912  ...                 1   

      race_BLACK  race

In [17]:
icd_ = icd[icd.subject_id.isin(data.subject_id)].loc[:, ['subject_id', 'icd_code', 'admittime']]

icd_ = icd_[icd_.icd_code.isin(icd_codes)]

t = pd.merge(icd_, data.loc[:, ['subject_id', 'intime']], on='subject_id', how='left')

t = t[(t.admittime < t.intime)].loc[:, ['subject_id', 'icd_code']]

p = pd.get_dummies(t, prefix='', prefix_sep='')

p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)

p = p.groupby('subject_id').max().reset_index()

p

1       0
2       0
3       0
4       0
       ..
4638    0
4639    0
4640    0
4642    0
4643    0
Name: 25000, Length: 2249, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1       0
2       0
3       0
4       0
       ..
4638    0
4639    1
4640    0
4642    0
4643    1
Name: 2761, Length: 2249, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1       0
2       0
3       1
4       0
       ..
4638    0
4639    0
4640    0
4642    0
4643    0
Name: 2875, Length: 2249, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1       0
2       0
3       0
4       0
       ..
4638    0
4639    0
4640    0
4642    0
4643    0
Name: 3051, Length: 2249, dtype: int32' has dtype incompatible with bool, please explicit

Unnamed: 0,subject_id,25000,2761,2875,3051,4280,496,51881,56089,5849,5990,7850,99592,D649,I10,V1582
0,10024331,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0
1,10025647,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
2,10037975,0,0,0,1,0,1,1,0,0,0,0,1,0,0,0
3,10056223,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1
4,10063856,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,19878969,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1
521,19890872,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1
522,19923690,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0
523,19940147,0,1,1,0,0,0,1,0,0,1,0,0,0,0,1


In [18]:
(
    data.merge(
        p,
        on='subject_id',
        how='left'
    ).fillna(0.0)
    .drop(columns=[
        'subject_id', 'intime',
        'race_BLACK', 'race_HISPANIC/LATINO', 'race_OTHER', 'race_WHITE', 
        'marital_status_SINGLE', 'marital_status_WIDOWED',
    ])
    .rename(columns=mapping)
    # .to_csv('./data_processed/data_first_record_with_commorbidities.csv')
)

Unnamed: 0,los,gender,age,heart_rate,respiratory_rate,hematocrit,rdw,platelet,mcv,mch,...,Other finger(s) amputation status,Acute respiratory failure,Other specified intestinal obstruction,Acute kidney failure,Urinary tract infection,Tachycardia,Severe sepsis,Anemia,Essential (primary) hypertension,Personal history of tobacco use
0,4.105710,1,0.684717,0.28750,0.32,0.425234,0.484848,0.127773,0.770492,0.742015,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.968100,1,0.833860,0.28750,0.38,0.474299,0.579545,0.164153,0.672131,0.680590,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.878240,1,0.514176,0.41250,0.74,0.635514,0.640152,0.007986,0.762295,0.754300,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,5.041060,1,0.393528,0.40625,0.40,0.355140,0.583333,0.062112,0.704918,0.761671,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.374750,0,0.464178,0.11875,0.34,0.436916,0.481061,0.236912,0.754098,0.783784,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647,2.009390,0,0.741280,0.26250,0.42,0.394860,0.522727,0.192547,0.778689,0.764128,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
648,0.757627,0,0.327277,0.47500,0.32,0.292056,0.731061,0.319432,0.614754,0.555283,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
649,1.825230,0,1.000000,0.42500,0.34,0.801402,0.564394,0.206744,0.827869,0.754300,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
650,14.175100,0,0.498847,0.41250,0.42,0.296729,0.564394,0.052351,0.762295,0.796069,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


## Die in ICU

In [19]:
data = pd.read_csv('./data_processed/data_die_in_icu.csv', index_col=0)

print(data.shape)
print(data.head())

(139, 25)
      subject_id               intime       los  gender       age  heart_rate  \
896     10037975  2185-01-17 19:12:12  4.878240       1  0.514176     0.41250   
1145    10135345  2131-09-05 17:16:00  0.942813       0  0.379901     0.26250   
447     10149929  2166-07-27 15:39:59  9.745490       1  0.533906     0.39375   
1072    10251182  2155-05-02 00:30:00  0.670891       1  0.595650     0.28750   
139     10328470  2141-08-12 06:03:35  0.318137       0  0.676981     0.75000   

      respiratory_rate  hematocrit       rdw  platelet  ...  language_ENGLISH  \
896               0.74    0.635514  0.640152  0.007986  ...                 1   
1145              0.34    0.514019  0.503788  0.300799  ...                 1   
447               0.32    0.275701  0.632576  0.106477  ...                 1   
1072              0.34    0.591121  0.553030  0.207631  ...                 1   
139               0.64    0.399533  0.636364  0.417924  ...                 1   

      race_BLACK

In [20]:
icd_ = icd[icd.subject_id.isin(data.subject_id)].loc[:, ['subject_id', 'icd_code', 'admittime']]

icd_ = icd_[icd_.icd_code.isin(icd_codes)]

t = pd.merge(icd_, data.loc[:, ['subject_id', 'intime']], on='subject_id', how='left')

t = t[(t.admittime < t.intime)].loc[:, ['subject_id', 'icd_code']]

p = pd.get_dummies(t, prefix='', prefix_sep='')

p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)

p = p.groupby('subject_id').max().reset_index()

p

1       1
2       0
3       0
4       0
       ..
1903    0
1904    0
1905    0
1906    0
1907    0
Name: 25000, Length: 1431, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1       0
2       0
3       0
4       0
       ..
1903    0
1904    1
1905    1
1906    0
1907    0
Name: 2761, Length: 1431, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1       0
2       0
3       0
4       0
       ..
1903    0
1904    0
1905    0
1906    0
1907    0
Name: 2875, Length: 1431, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1       0
2       0
3       0
4       0
       ..
1903    0
1904    0
1905    0
1906    0
1907    0
Name: 3051, Length: 1431, dtype: int32' has dtype incompatible with bool, please explicit

Unnamed: 0,subject_id,25000,2761,2875,3051,4280,496,51881,56089,5849,5990,7850,99592,D649,I10,V1582
0,10037975,0,0,0,1,0,1,1,0,0,0,0,1,0,0,0
1,10135345,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,10149929,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
3,10251182,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,10328470,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,19735084,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0
114,19848285,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
115,19878969,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1
116,19923690,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0


In [21]:
(
    pd.merge(
        data,
        p,
        on='subject_id',
        how='left'
    ).fillna(0.0)
    .drop(columns=[
        'subject_id', 'intime',
        'race_BLACK', 'race_HISPANIC/LATINO', 'race_OTHER', 'race_WHITE', 
        'marital_status_SINGLE', 'marital_status_WIDOWED',
    ])
    .rename(columns=mapping)
    .to_csv('./data_processed/data_die_in_icu_with_commorbidities.csv')
)

In [30]:
data.columns

Index(['subject_id', 'intime', 'los', 'gender', 'age', 'heart_rate',
       'respiratory_rate', 'hematocrit', 'rdw', 'platelet', 'mcv', 'mch',
       'hemoglobin', 'die_in_icu', 'uc_only', 'cd_only', 'language_ENGLISH',
       'race_BLACK', 'race_HISPANIC/LATINO', 'race_OTHER', 'race_WHITE',
       'marital_status_MARRIED', 'marital_status_SINGLE',
       'marital_status_WIDOWED', 'insurance_Medicare', 'insurance_Other'],
      dtype='object')

In [29]:
data = pd.read_csv('./data_processed/data_first_record_.csv', index_col=0)

print(data.shape)
print(data.head())

(1127, 26)
      subject_id               intime      los  gender       age  heart_rate  \
327     10024331  2141-03-18 19:36:08  4.10571       1  0.684717     0.28750   
1022    10025647  2176-09-22 17:57:15  1.96810       1  0.833860     0.28750   
896     10037975  2185-01-17 19:12:12  4.87824       1  0.514176     0.41250   
908     10056223  2122-09-23 15:08:45  5.04106       1  0.393528     0.40625   
559     10063856  2174-03-08 01:04:16  1.37475       0  0.464178     0.11875   

      respiratory_rate  hematocrit       rdw  platelet  ...  language_ENGLISH  \
327               0.32    0.425234  0.484848  0.127773  ...                 1   
1022              0.38    0.474299  0.579545  0.164153  ...                 1   
896               0.74    0.635514  0.640152  0.007986  ...                 1   
908               0.40    0.355140  0.583333  0.062112  ...                 0   
559               0.34    0.436916  0.481061  0.236912  ...                 1   

      race_BLACK  rac

In [31]:
icd_ = icd[icd.subject_id.isin(data.subject_id)].loc[:, ['subject_id', 'icd_code', 'admittime']]

icd_ = icd_[icd_.icd_code.isin(icd_codes)]

t = pd.merge(icd_, data.loc[:, ['subject_id', 'intime']], on='subject_id', how='left')

t = t[(t.admittime < t.intime)].loc[:, ['subject_id', 'icd_code']]

p = pd.get_dummies(t, prefix='', prefix_sep='')

p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)

p = p.groupby('subject_id').max().reset_index()

p

1        0
2        0
3        0
4        0
        ..
16476    0
16477    0
16478    0
16480    0
16481    0
Name: 25000, Length: 10901, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1        0
2        0
3        0
4        0
        ..
16476    1
16477    0
16478    0
16480    0
16481    1
Name: 2761, Length: 10901, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1        0
2        0
3        0
4        0
        ..
16476    0
16477    0
16478    0
16480    0
16481    0
Name: 2875, Length: 10901, dtype: int32' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  p.iloc[:, 1:] = p.iloc[:, 1:].astype(int)
1        0
2        0
3        0
4        0
        ..
16476    0
16477    0
16478    0
16480    0
16481    0
Name: 3051, Length: 10901, dtype: int32' has d

Unnamed: 0,subject_id,25000,2761,2875,3051,4280,496,51881,56089,5849,5990,7850,99592,D649,I10,V1582
0,10024331,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0
1,10025647,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
2,10037975,0,0,0,1,0,1,1,0,0,0,0,1,0,0,0
3,10056223,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1
4,10063856,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,19878969,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1
543,19890872,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1
544,19923690,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0
545,19940147,0,1,1,0,0,0,1,0,1,1,0,0,0,0,1


In [32]:
(
    data.merge(
        p,
        on='subject_id',
        how='left'
    ).fillna(0.0)
    .drop(columns=[
        'subject_id', 'intime',
        'race_BLACK', 'race_HISPANIC/LATINO', 'race_OTHER', 'race_WHITE', 
        'marital_status_SINGLE', 'marital_status_WIDOWED',
    ])
    .rename(columns=mapping)
    .to_csv('./data_processed/data_first_record_with_commorbidities_.csv')
)