In [1]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn

In [2]:
patient_df = pd.read_csv('data/allpatients_imputed_df').drop(columns=['Unnamed: 0'])

In [3]:
patient_df.shape

(188453, 42)

In [4]:
patient_df.columns

Index(['pid', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
       'HospAdmTime', 'ICULOS', 'SepsisLabel'],
      dtype='object')

## Extract sliding-window data from patients and create new dataset

In [5]:
sepsis_patients = patient_df.groupby('pid').filter(lambda x: x['SepsisLabel'].any())
normal_patients = patient_df.groupby('pid').filter(lambda x: x['SepsisLabel'].sum() == 0)

In [8]:
# For sepsis patients, get measurements starting with t-17 to t-3, window size = 6.
sepsis_patient_agg = sepsis_patients.groupby('pid')
X_sepsis = pd.DataFrame(data=[])

for pid, data in sepsis_patient_agg:
    data.reset_index(drop=True, inplace=True)
    
    first_sepsis_idx =  data['SepsisLabel'].idxmax()
    start_idx = max(0, first_sepsis_idx - 11)
    end_idx = min(first_sepsis_idx + 20, data.shape[0])
    
    new_data = data.iloc[start_idx:end_idx+1]
    assert new_data.shape[0] <= 21
    X_sepsis = pd.concat([X_sepsis, new_data])

In [18]:
# Split X_sepsis intro train and test sets
sepsis_pids = pd.Series(X_sepsis.pid.unique())
sepsis_pids = sepsis_pids.sample(frac=1).reset_index(drop=True)

train_sepsis_pids = sepsis_pids.iloc[0:int(np.floor(0.8*sepsis_pids.shape[0]))]
test_sepsis_pids = sepsis_pids.iloc[int(np.floor(0.8*sepsis_pids.shape[0])):]

In [19]:
train_sepsis_pids.shape, test_pids.shape

((223,), (56,))

In [20]:
X_sepsis_train = X_sepsis[X_sepsis.pid.isin(train_sepsis_pids)]
X_sepsis_test = X_sepsis[X_sepsis.pid.isin(test_sepsis_pids)]

In [21]:
X_sepsis_train.shape, X_sepsis_test.shape

((3827, 42), (961, 42))

In [91]:
# Create final dataset where we take each patient's windows and stack them into a single vector. 
X = []
y = []

for pid, df in X_sepsis.groupby('pid'):
    # df = df.drop(columns=['pid', 'Unit1', 'Unit2', 'SepsisLabel', 'ICULOS'])
    
    start_idx = 0
    window_end = 6
    
    j = 0
    while window_end + j <= df.shape[0]:
        x_i = df.iloc[start_idx+j:window_end+j]
        assert x_i.shape[0] == 6
        
        x_i = x_i.stack().values
        X.append(x_i)
        y.append(1)
        
        j += 1

for pid, df in X_normal.groupby('pid'):
    df = df.drop(columns=['pid', 'Unit1', 'Unit2', 'SepsisLabel', 'ICULOS'])
    
    start_idx = 0
    window_end = 6
    
    j = 0
    while window_end + j < df.shape[0]:
        x_i = df.iloc[start_idx+j:window_end+j]
        assert x_i.shape[0] == 6
        
        x_i = x_i.stack().values
        X.append(x_i)
        y.append(0)
        
        j += 1

In [94]:
X = np.array(X)

In [95]:
X.shape

(135833, 222)

In [97]:
y = np.array(y)
y.shape

(135833,)

In [98]:
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [99]:
X, y = sklearn.utils.shuffle(X, y)

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [104]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((101874, 222), (101874,), (33959, 222), (33959,))

## Regularized Log. Reg. 

In [107]:
from sklearn.linear_model import LogisticRegressionCV

lr_model = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=500).fit(X_train, y_train)





In [108]:
lr_model.score(X_test, y_test)

0.9795635913896169

In [109]:
y_pred = lr_model.predict_proba(X_test)
roc_auc_score(y_test, y_pred[:,1])

0.7104946762706279

In [None]:
y_hat = lr_model.predict(X_test)
f1_score(y_test, y_hat)