In [2]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn

In [69]:
train_df = pd.read_csv('../data/train_forw_mean_imputed').drop(columns=['Unnamed: 0'])
test_df = pd.read_csv('../data/test_forw_mean_imputed').drop(columns=['Unnamed: 0'])

In [70]:
train_df.shape, test_df.shape

((150976, 42), (37477, 42))

In [71]:
train_df.columns

Index(['pid', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
       'HospAdmTime', 'ICULOS', 'SepsisLabel'],
      dtype='object')

## Extract sliding-window data from patients and create new dataset

In [72]:
train_df.drop(columns=['Unit1', 'Unit2', 'ICULOS'], inplace=True)
test_df.drop(columns=['Unit1', 'Unit2', 'ICULOS'], inplace=True)

In [73]:
X_train = []
y_train = []
window_size = 6

for pid, data in train_df.groupby('pid'):
    data.reset_index(drop=True, inplace=True)
    
    # Check if septic
    if data['SepsisLabel'].max():
        first_sepsis_idx =  data['SepsisLabel'].idxmax() # t-6
        end_idx = min(first_sepsis_idx + 9, data.shape[0]) # t+3
    else:
        end_idx = data.shape[0]
    
    window_start = 0
    while (window_start + window_size) <= end_idx:
        new_data = data.iloc[window_start:window_start + window_size].drop(columns=['pid', 'SepsisLabel'])
        assert new_data.shape[0] == 6
        
        x_i = new_data.stack().values # concat rows into one vector
        X_train.append(x_i)
        
        label = data.iloc[window_start + window_size - 1]['SepsisLabel']
        y_train.append(label)
        
        window_start += 1

In [74]:
X_train, y_train = np.array(X_train), np.array(y_train)

In [75]:
X_train.shape, y_train.shape

((130840, 222), (130840,))

In [76]:
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, precision_score, recall_score

## Regularized Log. Reg. 

In [None]:
from sklearn.linear_model import LogisticRegressionCV

lr_model = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=500, verbose=0).fit(X_train, y_train)

In [68]:
# Given patient df and model, predict at each time step using window size = 6
def predict_patient(data, model, window_size=6):
    X = []
    y_test = []
    
    if data['SepsisLabel'].max():
        first_sepsis_idx =  data['SepsisLabel'].idxmax()
        end_idx = min(first_sepsis_idx + 9, data.shape[0])
    else:
        end_idx = data.shape[0]
        
    window_start = 0
    
    while (window_start + window_size) <= end_idx:
        new_data = data.iloc[window_start:window_start + window_size].drop(columns=['pid', 'SepsisLabel'])
        assert new_data.shape[0] == 6
        
        x_i = new_data.stack().values
        X.append(x_i)
        
        label= data.iloc[window_start + window_size - 1]['SepsisLabel']
        y_test.append(label)
        
        window_start += 1
    
    X, y_test = np.array(X), np.array(y_test)
    return model.score(X, y_test)

predict_patient(test_df[test_df.pid == 'p01455'], lr_model)

1.0

In [78]:
X_test, y_test = [], []

for pid, data in test_df.groupby('pid'):
    data.reset_index(drop=True, inplace=True)
    
    if data['SepsisLabel'].max():
        first_sepsis_idx =  data['SepsisLabel'].idxmax() # t-6
        end_idx = min(first_sepsis_idx + 9, data.shape[0]) # t+3
    else:
        end_idx = data.shape[0]
    
    window_start = 0
    while (window_start + window_size) <= end_idx:
        new_data = data.iloc[window_start:window_start + window_size].drop(columns=['pid', 'SepsisLabel'])
        assert new_data.shape[0] == 6
        
        x_i = new_data.stack().values # concat rows into one vector
        X_test.append(x_i)
        
        label = data.iloc[window_start + window_size - 1]['SepsisLabel']
        y_test.append(label)
        
        window_start += 1

X_test, y_test = np.array(X_test), np.array(y_test)

In [82]:
X_test.shape, y_test.shape

((32441, 222), (32441,))

In [79]:
y_pred = lr_model.predict_proba(X_test)
roc_auc_score(y_test, y_pred[:,1])

0.6952558418756009

In [80]:
y_hat = lr_model.predict(X_test)
f1_score(y_test, y_hat)

  'precision', 'predicted', average, warn_for)


0.0

In [87]:
confusion_matrix(y_test, y_hat)

array([[32004,     0],
       [  437,     0]])

## Random Forest

In [88]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier().fit(X_train, y_train)



In [89]:
y_pred = rf_model.predict_proba(X_test)
roc_auc_score(y_test, y_pred[:,1])

0.6184415020204854

In [90]:
y_hat = rf_model.predict(X_test)
confusion_matrix(y_test, y_hat)

array([[31980,    24],
       [  432,     5]])