In [13]:
! pip install --upgrade tables keras tensorflow

Requirement already up-to-date: tables in /home/hcinyoung/dev/mdml/venv/lib/python3.6/site-packages (3.6.0)
Requirement already up-to-date: keras in /home/hcinyoung/dev/mdml/venv/lib/python3.6/site-packages (2.3.1)
Requirement already up-to-date: tensorflow in /home/hcinyoung/dev/mdml/venv/lib/python3.6/site-packages (2.0.0)
[33mYou are using pip version 19.0.3, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [14]:
%matplotlib inline

import matplotlib.pyplot as plt
from functools import reduce
import seaborn as sns; sns.set(rc={'figure.figsize':(15,15)})
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sklearn.preprocessing import MinMaxScaler
engine = create_engine('postgresql://postgres:mimic@192.168.1.72:5555/mimic')

In [15]:
def get_mortality_label():
    label = pd.read_sql("""
    select icustay_id, hadm_id, date_trunc('day', outtime) as ts, hospital_expire_flag, thirtyday_expire_flag
    from sepsis3
    where excluded=0
    """, engine)
    label.set_index(['icustay_id', 'hadm_id', 'ts'], inplace=True)
    return label

def get_demo():
    demo = pd.read_sql("""
    select icustay_id, hadm_id, date_trunc('day', intime) as ts
        , age, is_male, race_white, race_black, race_hispanic, race_other
    from sepsis3
    where excluded=0
    """, engine)
    demo.set_index(['icustay_id', 'hadm_id', 'ts'], inplace=True)
    return demo

def get_admit():
    admit = pd.read_sql("""
    select icustay_id, hadm_id, date_trunc('day', intime) as ts, icu_los, hosp_los
    from sepsis3
    where excluded=0
    """, engine)
    admit.set_index(['icustay_id', 'hadm_id', 'ts'], inplace=True)
    return admit

def get_comorbidity():
    com = pd.read_sql('''
    select s.icustay_id, date_trunc('day', admittime) as ts, c.*
    from comorbidity c 
        inner join (select icustay_id, hadm_id from sepsis3 where excluded=0) s 
            on c.hadm_id=s.hadm_id
    ''', engine)
    del com['subject_id']
    del com['admittime']
    com.set_index(['icustay_id', 'hadm_id', 'ts'], inplace=True)
    
    return com

def get_gcs():
    gcs = pd.read_sql('''
    select v.* 
    from gcsdaily v
        inner join (select hadm_id from sepsis3 where excluded=0) s 
            on v.hadm_id=s.hadm_id
    where charttime_by_day is not null
    ''', engine)

    del gcs['subject_id']
    gcs.rename(columns = {'charttime_by_day': 'ts'}, inplace=True)
    gcs.set_index(['icustay_id', 'hadm_id', 'ts'], inplace=True)
    return gcs

def get_vitalsign():
    vital = pd.read_sql('''
    select v.* 
    from vitalsdaily v
        inner join (select hadm_id from sepsis3 where excluded=0) s 
            on v.hadm_id=s.hadm_id
    ''', engine)

    del vital['subject_id']
    vital.rename(columns = {'charttime_by_day': 'ts'}, inplace=True)
    vital.set_index(['icustay_id', 'hadm_id', 'ts'], inplace=True)
    return vital

def get_drug():
    drug = pd.read_sql("""
    select p.icustay_id, p.hadm_id
        , startdate as ts
        , 'prescription' as category
        , drug
        , sum((EXTRACT(EPOCH FROM enddate - startdate))/ 60 / 60 / 24) as duration
    from prescriptions p
        inner join (select hadm_id, icustay_id from sepsis3 where excluded=0) s 
            on p.hadm_id=s.hadm_id and p.icustay_id=s.icustay_id
    group by p.icustay_id, p.hadm_id, ts, drug
    """, engine)
    drug.duration = drug.duration.replace(0, 1) # avoid null of instant prescription
    pivot_drug = pd.pivot_table(drug, 
                                index=['icustay_id', 'hadm_id', 'ts'], 
                                columns=['drug'], 
                                values='duration', 
                                fill_value=0)
    return pivot_drug

def get_lab():
    lab = pd.read_sql("""
    select s.icustay_id, c.hadm_id, date_trunc('day', c.charttime) as ts
        , d.label
        , valuenum
    from labevents c
        inner join (select hadm_id, icustay_id from sepsis3 where excluded=0) s 
            on c.hadm_id=s.hadm_id
        join d_labitems d using (itemid)
    where itemid in (
        50912  -- 크레아티닌(creatinine)
        ,50905, 50906  -- LDL-콜레스테롤(LDL-cholesterol)
        ,50852  -- 당화혈색소(HbA1c/Hemoglobin A1c)
        ,50809, 50931  -- 공복혈당(fasting plasma glucose)
        ,50889  -- C-반응성 단백질(C-reactive protein)
        ,50811, 51222  -- 헤모글로빈(hemoglobin)
        ,50907  -- 총콜레스테롤(total cholesterol)
        ,50945  -- 호모시스테인(Homocysteine)
        ,51006  -- 혈액 요소 질소(blood urea nitrogen)
        ,51000  -- 중성지방(triglyceride)
        ,51105  -- 요산(uric acid)
        ,50904  -- HDL-콜레스테롤(HDL-cholesterol)
        ,51265  -- 혈소판(platelet)
        ,51288  -- 적혈구침강속도(Erythrocyte sedimentation rate)
        ,51214  -- 피브리노겐(fibrinogen)
        ,51301  -- 백혈구(white blood cell)
        ,50963  -- B형 나트륨 이뇨펩타이드(B-type Natriuretic Peptide)
        ,51002, 51003  -- 트로포닌(Troponin)
        ,50908  -- 크레아티닌키나제-MB(Creatine Kinase - Muscle Brain)
        ,50862  -- 알부민(albumin)
        ,50821  -- 동맥 산소분압(arterial pO2)
        ,50818  -- 이산화탄소분압(pCO2)
        ,50820  -- 동맥혈의 산도(arterial PH)
        ,50910  -- 크레아틴키나제(CK)
        ,51237  -- 혈액응고검사(PT (INR)/aPTT) 
        ,50885  -- 빌리루빈(bilirubin)
        ,51144  -- 대상핵세포(band cells)
        ,50863  -- 알칼리 인산염(alkaline phosphatase)
    )
    """, engine)
    
    pivot_lab = pd.pivot_table(lab, 
                                    index=['icustay_id', 'hadm_id', 'ts'], 
                                    columns=['label'], 
                                    values='valuenum', 
                                    # aggfunc=['min', 'max', np.mean]
                                    fill_value=0)
    return pivot_lab


def get_vaso():
    vaso = pd.read_sql("""
    select c.icustay_id, s.hadm_id, date_trunc('day', c.starttime) as ts
        , duration_hours as vaso_duration_hours
    from vasopressordurations c
        inner join (select hadm_id, icustay_id from sepsis3 where excluded=0) s 
            on c.icustay_id=s.icustay_id
    """, engine)
    
    vaso.set_index(['icustay_id', 'hadm_id', 'ts'], inplace=True)
    return vaso

- 패혈증 진단받은 환자수, 입원수

In [16]:
pd.read_sql(
"""
select count(distinct hadm_id), count(distinct icustay_id) from sepsis3 where excluded=0
""", engine)

Unnamed: 0,count,count.1
0,11791,11791


- ICU, 입원 기간의 최소, 최대

In [17]:
pd.read_sql(
"""
select min(icu_los), max(icu_los), min(hosp_los), max(hosp_los) from sepsis3 where excluded=0
""", engine)

Unnamed: 0,min,max,min.1,max.1
0,0.0015,101.739,-0.945139,206.425694


## 라벨
- 사망: 원내 사망, 30일 이내 사망

In [18]:
label = get_mortality_label()
label.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,hospital_expire_flag,thirtyday_expire_flag
icustay_id,hadm_id,ts,Unnamed: 3_level_1,Unnamed: 4_level_1
200021,109307,2114-12-27,0,0
200028,181955,2133-11-01,0,0
200033,198650,2198-08-21,1,1
200061,121149,2134-01-25,0,0
200075,132255,2159-09-25,0,0


## 변수 : 인구통계, 입원, 진단


In [19]:
demo = get_demo()
demo.head()    

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,is_male,race_white,race_black,race_hispanic,race_other
icustay_id,hadm_id,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
200021,109307,2114-12-26,60.8501,1,1,0,0,0
200028,181955,2133-10-29,64.8666,1,1,0,0,0
200033,198650,2198-08-07,67.1445,1,1,0,0,0
200061,121149,2134-01-23,45.7505,1,0,0,0,1
200075,132255,2159-09-23,83.6432,0,1,0,0,0


In [20]:
admit = get_admit()
admit.head()   

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,icu_los,hosp_los
icustay_id,hadm_id,ts,Unnamed: 3_level_1,Unnamed: 4_level_1
200021,109307,2114-12-26,1.1259,1.948611
200028,181955,2133-10-29,2.9038,3.204167
200033,198650,2198-08-07,13.8771,13.722917
200061,121149,2134-01-23,2.0142,2.959722
200075,132255,2159-09-23,2.0708,8.522917


In [21]:
com = get_comorbidity()
com.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,congestive_heart_failure,cardiac_arrhythmias,valvular_disease,pulmonary_circulation,peripheral_vascular,hypertension,paralysis,other_neurological,chronic_pulmonary,hyperlipidemia,...,coagulopathy,obesity,weight_loss,fluid_electrolyte,blood_loss_anemia,deficiency_anemias,alcohol_abuse,drug_abuse,psychoses,depression
icustay_id,hadm_id,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
247247,170252,2170-10-03,1.0,0.0,0.0,0.0,0.0,1,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
293876,186251,2168-07-10,1.0,0.0,0.0,0.0,1.0,0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299666,117029,2173-03-24,1.0,1.0,1.0,1.0,0.0,1,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
246119,126769,2195-12-31,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
298039,135882,2139-10-29,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 변수 : 바이탈사인, 투약, 검사, 승압제

In [22]:
gcs = get_gcs()
gcs.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mingcs,gcsmotor,gcsverbal,gcseyes,endotrachflag
icustay_id,hadm_id,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200021,109307,2114-12-27,8.0,6.0,1.0,1.0,0
200033,198650,2198-08-10,3.0,1.0,1.0,1.0,0
200033,198650,2198-08-12,6.0,4.0,1.0,1.0,0
200033,198650,2198-08-13,3.0,1.0,1.0,1.0,0
200033,198650,2198-08-17,7.0,5.0,1.0,1.0,0


In [23]:
vital = get_vitalsign()
vital.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,heartrate_min,heartrate_max,heartrate_mean,sysbp_min,sysbp_max,sysbp_mean,diasbp_min,diasbp_max,diasbp_mean,meanbp_min,...,resprate_mean,tempc_min,tempc_max,tempc_mean,spo2_min,spo2_max,spo2_mean,glucose_min,glucose_max,glucose_mean
icustay_id,hadm_id,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
247247,170252,2170-10-03,63.0,73.0,68.166667,125.0,148.0,138.5,76.0,101.0,88.833333,88.0,...,18.571429,36.277778,36.277778,36.277778,98.0,100.0,99.0,174.0,174.0,174.0
247247,170252,2170-10-04,54.0,70.0,65.882353,113.0,154.0,128.647059,61.0,96.0,77.647059,73.0,...,17.176471,36.222222,36.666667,36.455556,96.0,100.0,98.235294,163.0,163.0,163.0
293876,186251,2168-07-10,76.0,101.0,85.8125,116.0,169.0,147.2,57.0,98.0,81.933333,72.0,...,19.684211,36.388889,37.555556,37.166667,100.0,100.0,100.0,79.0,176.0,129.0
293876,186251,2168-07-11,69.0,90.0,78.666667,106.0,145.0,125.777778,53.0,83.0,70.111111,65.0,...,19.111111,36.5,37.666667,36.933333,98.0,100.0,99.611111,108.0,147.0,125.25
299666,117029,2173-04-03,50.0,64.0,58.285714,91.0,107.0,100.0,44.0,77.0,62.0,51.0,...,19.6,35.388889,36.444444,35.87037,89.0,97.0,94.0,145.0,145.0,145.0


In [24]:
drug = get_drug()
drug.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,drug,Symbicort,*NF* Abatacept,*NF* Allopurinol Sodium,*NF* Arginine HCl,*NF* Bicalutamide,*NF* Butorphanol Tart. Nasal,*NF* Capecitabine,*NF* Ceftaroline,*NF* Deferasirox,*NF* Erlotinib,...,solifenacin,temazepam,tetrabenazine (Xenazine) 25mg tab,thyroid,thyroid extract SR,timolo,tol,traZODONE,zafirlukast,zz
icustay_id,hadm_id,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
200021,109307,2114-12-27,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200021,109307,2114-12-28,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200028,181955,2133-10-30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
200028,181955,2133-10-31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200028,181955,2133-11-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
lab = get_lab()
lab.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,label,% Hemoglobin A1c,Albumin,Alkaline Phosphatase,Bands,"Bilirubin, Total",C-Reactive Protein,CK-MB Index,"Cholesterol, HDL","Cholesterol, LDL, Calculated","Cholesterol, LDL, Measured",...,Platelet Count,Sedimentation Rate,Triglycerides,Troponin T,Urea Nitrogen,"Uric Acid, Urine",White Blood Cells,pCO2,pH,pO2
icustay_id,hadm_id,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
200021,109307,2114-12-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,210.0,0.0,0.0,0.0,20.0,0.0,6.7,38.0,7.42,367.0
200021,109307,2114-12-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,156.0,0.0,0.0,0.0,15.0,0.0,6.1,36.0,7.41,218.0
200021,109307,2114-12-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,169.0,0.0,0.0,0.0,10.0,0.0,5.1,0.0,0.0,0.0
200028,181955,2133-10-29,0.0,3.2,116.0,0.0,0.75,0.0,0.0,0.0,0.0,0.0,...,131.0,0.0,0.0,0.0,17.5,0.0,17.8,42.555556,7.233333,186.555556
200028,181955,2133-10-30,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,...,83.0,0.0,0.0,0.0,14.0,0.0,14.3,27.333333,7.363333,102.333333


In [26]:
vaso = get_vaso()
vaso.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,vaso_duration_hours
icustay_id,hadm_id,ts,Unnamed: 3_level_1
200028,181955,2133-10-29,20.733333
200033,198650,2198-08-10,1.25
200033,198650,2198-08-10,0.583333
200033,198650,2198-08-10,4.666667
200033,198650,2198-08-11,0.666667


In [27]:
def get_sepsis():
    s = pd.read_sql("""
    select icustay_id, hadm_id, date_trunc('day', intime) as ts
        , sofa, qsofa
    from sepsis3
    where excluded=0
    """, engine)
    
    s.set_index(['icustay_id', 'hadm_id', 'ts'], inplace=True)
    return s
sepsis = get_sepsis()
sepsis.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sofa,qsofa
icustay_id,hadm_id,ts,Unnamed: 3_level_1,Unnamed: 4_level_1
200021,109307,2114-12-26,4,3
200028,181955,2133-10-29,10,2
200033,198650,2198-08-07,1,0
200061,121149,2134-01-23,2,1
200075,132255,2159-09-23,8,2


In [28]:
data_frames = [
    label,
    demo,
    admit,
    com,
    gcs,
    vital,
    # drug,
    lab,
    vaso,
    sepsis
]
df_merged = reduce(lambda  left,right: pd.merge(left, right, how='outer', left_index=True, right_index=True), 
                   data_frames)


In [29]:
df_merged.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,hospital_expire_flag,thirtyday_expire_flag,age,is_male,race_white,race_black,race_hispanic,race_other,icu_los,hosp_los,...,Troponin T,Urea Nitrogen,"Uric Acid, Urine",White Blood Cells,pCO2,pH,pO2,vaso_duration_hours,sofa,qsofa
icustay_id,hadm_id,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
200021,109307,2114-12-26,,,60.8501,1.0,1.0,0.0,0.0,0.0,1.1259,1.948611,...,0.0,20.0,0.0,6.7,38.0,7.42,367.0,,4.0,3.0
200021,109307,2114-12-27,0.0,0.0,,,,,,,,,...,0.0,15.0,0.0,6.1,36.0,7.41,218.0,,,
200021,109307,2114-12-28,,,,,,,,,,,...,0.0,10.0,0.0,5.1,0.0,0.0,0.0,,,
200028,181955,2133-10-29,,,64.8666,1.0,1.0,0.0,0.0,0.0,2.9038,3.204167,...,0.0,17.5,0.0,17.8,42.555556,7.233333,186.555556,20.733333,10.0,2.0
200028,181955,2133-10-30,,,,,,,,,,,...,0.0,14.0,0.0,14.3,27.333333,7.363333,102.333333,,,


- 입원 기간동안 유지되는 정보만 forward fill

In [30]:
ffill_cols = demo.columns.tolist() + admit.columns.tolist() + com.columns.tolist()
df_merged[ffill_cols] = df_merged[ffill_cols].fillna(method='ffill')

- 입원 일차 붙이기

In [31]:
df_merged = df_merged.reset_index()
df_merged['rno'] = df_merged.groupby(['hadm_id', 'icustay_id'])['ts'].rank(ascending=True)
df_merged.set_index(['hadm_id', 'icustay_id', 'rno'], inplace=True)
del df_merged['ts']

In [32]:
df_merged.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,hospital_expire_flag,thirtyday_expire_flag,age,is_male,race_white,race_black,race_hispanic,race_other,icu_los,hosp_los,...,Troponin T,Urea Nitrogen,"Uric Acid, Urine",White Blood Cells,pCO2,pH,pO2,vaso_duration_hours,sofa,qsofa
hadm_id,icustay_id,rno,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
109307,200021,1.0,,,60.8501,1.0,1.0,0.0,0.0,0.0,1.1259,1.948611,...,0.0,20.0,0.0,6.7,38.0,7.42,367.0,,4.0,3.0
109307,200021,2.0,0.0,0.0,60.8501,1.0,1.0,0.0,0.0,0.0,1.1259,1.948611,...,0.0,15.0,0.0,6.1,36.0,7.41,218.0,,,
109307,200021,3.0,,,60.8501,1.0,1.0,0.0,0.0,0.0,1.1259,1.948611,...,0.0,10.0,0.0,5.1,0.0,0.0,0.0,,,
181955,200028,1.0,,,64.8666,1.0,1.0,0.0,0.0,0.0,2.9038,3.204167,...,0.0,17.5,0.0,17.8,42.555556,7.233333,186.555556,20.733333,10.0,2.0
181955,200028,2.0,,,64.8666,1.0,1.0,0.0,0.0,0.0,2.9038,3.204167,...,0.0,14.0,0.0,14.3,27.333333,7.363333,102.333333,,,


- hdf 포맷으로 저장

In [33]:
filename_sepsis = "mimiciii_sepsis.h5"

In [34]:
df_merged.to_hdf(filename_sepsis, key='all')

# Reshaping 시계열

In [73]:
df = pd.read_hdf(filename_sepsis, key='all')

In [36]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,hospital_expire_flag,thirtyday_expire_flag,age,is_male,race_white,race_black,race_hispanic,race_other,icu_los,hosp_los,...,Troponin T,Urea Nitrogen,"Uric Acid, Urine",White Blood Cells,pCO2,pH,pO2,vaso_duration_hours,sofa,qsofa
hadm_id,icustay_id,rno,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
109307,200021,1.0,,,60.8501,1.0,1.0,0.0,0.0,0.0,1.1259,1.948611,...,0.0,20.0,0.0,6.7,38.0,7.42,367.0,,4.0,3.0
109307,200021,2.0,0.0,0.0,60.8501,1.0,1.0,0.0,0.0,0.0,1.1259,1.948611,...,0.0,15.0,0.0,6.1,36.0,7.41,218.0,,,
109307,200021,3.0,,,60.8501,1.0,1.0,0.0,0.0,0.0,1.1259,1.948611,...,0.0,10.0,0.0,5.1,0.0,0.0,0.0,,,
181955,200028,1.0,,,64.8666,1.0,1.0,0.0,0.0,0.0,2.9038,3.204167,...,0.0,17.5,0.0,17.8,42.555556,7.233333,186.555556,20.733333,10.0,2.0
181955,200028,2.0,,,64.8666,1.0,1.0,0.0,0.0,0.0,2.9038,3.204167,...,0.0,14.0,0.0,14.3,27.333333,7.363333,102.333333,,,


In [38]:
hadm_list = df.index.levels[0]
icustay_list = df.index.levels[1]
day_list = df.index.levels[2]
n_icustay = len(icustay_list)
n_hadm = len(hadm_list)
n_day = len(day_list)
n_hadm, n_icustay, n_day

(11791, 12409, 238)

In [39]:
label_cols = ['hospital_expire_flag', 'thirtyday_expire_flag',
             'sofa', 'qsofa']

In [74]:
y = df[label_cols]
x = df.drop(columns=label_cols)

In [164]:
x = x.fillna(x.mean())

In [165]:
x.shape, y.shape

((99105, 98), (99105, 4))

In [167]:
def get_df_by_icustay_id(x, icustay_id):
    idx = pd.IndexSlice
    icustay = x.loc[idx[:, icustay_id, :], :].copy()
    return icustay

inputs = [None] * n_icustay
for i, icustay_id in enumerate(icustay_list):
    h = get_df_by_icustay_id(x, icustay_id)
    inputs[i] = h.fillna(0).values

In [168]:
from keras.preprocessing.sequence import pad_sequences
X = pad_sequences(inputs, maxlen=7, dtype='float32', padding='pre', truncating='post')

In [245]:
y_ = y.reset_index().groupby(['hadm_id', 'icustay_id']).agg('max').fillna(0)
y1 = y_[['hospital_expire_flag']].astype(bool).astype(int).values
y2 = y_[['thirtyday_expire_flag']].astype(bool).astype(int).values
y3_cls = pd.get_dummies(y_.qsofa).values
y3 = y_[['qsofa']].values
y3[y3 < 2.] = 0.
y3[y3 > 0.] = 1.
y4 = pd.get_dummies(y_.sofa).values

In [246]:
X.shape, y1.shape, y2.shape, y3.shape, y3_cls.shape, y4.shape

((12409, 7, 98), (12409, 1), (12409, 1), (12409, 1), (12409, 4), (12409, 22))

- label ratio

In [172]:
y1.sum() / y1.shape[0] 

0.10290917882182286

In [173]:
y2.sum() / y2.shape[0] 

0.13046982029172374

In [218]:
y3.sum(axis=0) / y3.shape[0]

array([0.60415827])

In [247]:
y3_cls_weight = {i: w for i, w in enumerate(y3_cls.sum(axis=0) / y3_cls.shape[0])}

In [248]:
y3_cls_weight

{0: 0.10323152550568136,
 1: 0.29261020227254414,
 2: 0.4917398662261262,
 3: 0.11241840599564833}

In [249]:
y4.sum(axis=0) / y4.shape[0] 

array([0.09694577, 0.18849222, 0.17318076, 0.12378113, 0.11612539,
       0.08687243, 0.06237408, 0.04182448, 0.03030059, 0.02095253,
       0.01789024, 0.012088  , 0.00886453, 0.00588283, 0.00459344,
       0.00354581, 0.0024176 , 0.00112821, 0.00145056, 0.00072528,
       0.00032235, 0.00024176])

In [179]:
y4_cls_weight = {i: w for i, w in enumerate(y4.sum(axis=0) / y4.shape[0])}
y4_cls_weight

{0: 0.0969457651704408,
 1: 0.1884922233862519,
 2: 0.17318075590297366,
 3: 0.12378112660166009,
 4: 0.11612539286002095,
 5: 0.08687243129986301,
 6: 0.06237408332661778,
 7: 0.04182448223063905,
 8: 0.030300588282698042,
 9: 0.02095253445080184,
 10: 0.017890240954146184,
 11: 0.012088000644693368,
 12: 0.008864533806108469,
 13: 0.005882826980417439,
 14: 0.0045934402449834795,
 15: 0.0035458135224433877,
 16: 0.0024176001289386738,
 17: 0.0011282133935047144,
 18: 0.0014505600773632042,
 19: 0.0007252800386816021,
 20: 0.0003223466838584898,
 21: 0.00024176001289386736}

# LSTM

- https://github.com/philipperemy/keras-attention-mechanism 참고

In [180]:
from keras.layers import Multiply, BatchNormalization, SpatialDropout2D, Masking
from keras.layers.core import *
from keras.layers.recurrent import LSTM
from keras.models import *
from keras.optimizers import Adam
import keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score, accuracy_score
import numpy as np

# INPUT_DIM = 2
# TIME_STEPS = 20
TIME_STEPS, INPUT_DIM = X.shape[1:]
# if True, the attention vector is shared across the input_dimensions where the attention is applied.
SINGLE_ATTENTION_VECTOR = False
APPLY_ATTENTION_BEFORE_LSTM = True

def get_activations(model, inputs, print_shape_only=False, layer_name=None):
    # Documentation is available online on Github at the address below.
    # From: https://github.com/philipperemy/keras-visualize-activations
    print('----- activations -----')
    activations = []
    inp = model.input
    if layer_name is None:
        outputs = [layer.output for layer in model.layers]
    else:
        outputs = [layer.output for layer in model.layers if layer.name == layer_name]  # all layer outputs
    funcs = [K.function([inp] + [K.learning_phase()], [out]) for out in outputs]  # evaluation functions
    layer_outputs = [func([inputs, 1.])[0] for func in funcs]
    for layer_activations in layer_outputs:
        activations.append(layer_activations)
        if print_shape_only:
            print(layer_activations.shape)
        else:
            print(layer_activations)
    return activations


def attention_3d_block(inputs):
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = int(inputs.shape[2])
    a = Permute((2, 1))(inputs)
    a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(TIME_STEPS, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
        a = RepeatVector(input_dim)(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    # output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')
    output_attention_mul = Multiply(name='attention_mul')([inputs, a_probs])
    return output_attention_mul


def model_attention_applied_before_lstm():
    inputs = Input(shape=(TIME_STEPS, INPUT_DIM,))
    attention_mul = attention_3d_block(inputs)
    masking = Masking(mask_value=0.0)(attention_mul)
    lstm_units = 256
    attention_mul = LSTM(lstm_units, return_sequences=False)(masking)
    output = Dense(1, activation='sigmoid')(attention_mul)
    model = Model(input=[inputs], output=output)
    return model

def model_lstm_binary():
    inputs = Input(shape=(TIME_STEPS, INPUT_DIM,))
    norm = BatchNormalization()(inputs)
    masking = Masking(mask_value=0.0)(inputs)
    
    lstm_units = 256
    lstm = LSTM(lstm_units, return_sequences=False)(masking)
    
    output = Dense(1, activation='sigmoid')(lstm)
    model = Model(input=[inputs], output=output)
    return model

def model_lstm_classification(output_dim=10):
    inputs = Input(shape=(TIME_STEPS, INPUT_DIM,))
    norm = BatchNormalization()(inputs)
    masking = Masking(mask_value=0.0)(norm)
    lstm_units = 256
    lstm = LSTM(lstm_units, return_sequences=False)(masking)
    output = Dense(output_dim, activation='softmax')(lstm)
    model = Model(input=[inputs], output=output)
    return model

# mortality

In [231]:
X_train, X_test, y_train, y_test = train_test_split(X, y3, test_size=0.3, random_state=13)
# X_train, X_test, y_train, y_test = train_test_split(X, y2, test_size=0.3, random_state=13)

In [235]:
# m = model_attention_applied_before_lstm()
m = model_lstm_binary()
opt = Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999)
m.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
print(m.summary())
m.fit(X_train, 
      y_train, 
      epochs=10, 
      batch_size=64, 
      validation_split=0.1 
      # , class_weight={0: 1., 1: 10.}
     )



Model: "model_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_27 (InputLayer)        (None, 7, 98)             0         
_________________________________________________________________
batch_normalization_16 (Batc (None, 7, 98)             392       
_________________________________________________________________
masking_26 (Masking)         (None, 7, 98)             0         
_________________________________________________________________
lstm_27 (LSTM)               (None, 256)               363520    
_________________________________________________________________
dense_27 (Dense)             (None, 1)                 257       
Total params: 364,169
Trainable params: 363,973
Non-trainable params: 196
_________________________________________________________________
None
Train on 7817 samples, validate on 869 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/1

<keras.callbacks.callbacks.History at 0x7f6d0e4acfd0>

In [237]:
y_pred = m.predict(X_test)
roc_auc_score(y_test.reshape(-1), y_pred.reshape(-1))

0.5107380619585256

# sepsis cls

In [250]:
X_train, X_test, y_train, y_test = train_test_split(X, y3_cls, 
                                                    test_size=0.3, 
                                                    random_state=13)

In [251]:
m = model_lstm_classification(output_dim=y3_cls.shape[1])
opt = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999)
m.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
print(m.summary())
m.fit(X_train, y_train, epochs=10, batch_size=128, 
      validation_split=0.1
     #, class_weight=y3_cls_weight
     )



Model: "model_31"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_31 (InputLayer)        (None, 7, 98)             0         
_________________________________________________________________
batch_normalization_20 (Batc (None, 7, 98)             392       
_________________________________________________________________
masking_30 (Masking)         (None, 7, 98)             0         
_________________________________________________________________
lstm_31 (LSTM)               (None, 256)               363520    
_________________________________________________________________
dense_31 (Dense)             (None, 4)                 1028      
Total params: 364,940
Trainable params: 364,744
Non-trainable params: 196
_________________________________________________________________
None
Train on 7817 samples, validate on 869 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/1

<keras.callbacks.callbacks.History at 0x7f6d096f4fd0>

In [252]:
y_pred = np.argmax(m.predict(X_test), axis=1)
answer = np.argmax(y_test, axis=1)

print('Confusion matrix:')
print(confusion_matrix(answer, y_pred))
print('F1 score: %f', f1_score(answer, y_pred, average='micro'))
print('Accuracy score: %f', accuracy_score(answer, y_pred))


Confusion matrix:
[[   0    8  381    0]
 [   0   16 1075    0]
 [   0   27 1814    0]
 [   0    8  394    0]]
F1 score: %f 0.49153908138597907
Accuracy score: %f 0.49153908138597907
