In [7]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn

#### Fill in NaN values with mean of the column

In [249]:
# patient_df = patient_df.fillna(patient_df.mean()['HR':'Platelets'])

In [39]:
# patient_df.to_csv('data/allpatients_imputed_df')
patient_df = pd.read_csv('../data/allpatients_forbacmean_imputed')

#### Sample 6 time-steps from each patient to create new dataset X

In [10]:
sepsis_patients = patient_df.groupby('pid').filter(lambda x: x['SepsisLabel'].any())
normal_patients = patient_df.groupby('pid').filter(lambda x: x['SepsisLabel'].sum() == 0)

In [11]:
# For sepsis patients, get the 6 measurements before t_sepsis and create new dataset X_sepsis. 
sepsis_patient_agg = sepsis_patients.groupby('pid')
X_sepsis = pd.DataFrame(data=[])

for pid, data in sepsis_patient_agg:
    data.reset_index(drop=True, inplace=True)
    
    if data.shape[0] >= 13:
        first_sepsis_idx =  data['SepsisLabel'].idxmax()
        start_idx = first_sepsis_idx - 6
        
        if start_idx >= 0:
            new_data = data.iloc[start_idx:first_sepsis_idx]
            assert new_data.shape[0] == 6
            X_sepsis = pd.concat([X_sepsis, new_data])

In [12]:
X_sepsis.shape

(1176, 43)

In [13]:
# For normal patients, randomly sample 6 measurements. 
normal_patient_agg = normal_patients.groupby('pid')
X_normal = pd.DataFrame(data=[])

for pid, data in normal_patient_agg:
    data.reset_index(drop=True, inplace=True)
    rand_idx = np.random.randint(0, high=(data.shape[0] - 6))
    new_data = data.iloc[rand_idx:rand_idx+6]
    X_normal = pd.concat([X_normal, new_data])

In [15]:
# Create final dataset where we take each patient's rows and stack them into a single vector. 
X = np.ndarray((X_sepsis.pid.unique().shape[0] + X_normal.pid.unique().shape[0], 228))
y = [1]*(X_sepsis.pid.unique().shape[0]) + [0]*(X_normal.pid.unique().shape[0])

i = 0
for pid, df in X_sepsis.groupby('pid'):
    df = df.drop(columns=['pid', 'Unit1', 'Unit2', 'SepsisLabel', 'ICULOS'])
    x_i = df.stack().values
    X[i,:] = x_i
    i += 1

for pid, df in X_normal.groupby('pid'):
    df = df.drop(columns=['pid', 'Unit1', 'Unit2', 'SepsisLabel', 'ICULOS'])
    x_i = df.stack().values
    X[i,:] = x_i
    i += 1

In [16]:
X.shape

(4917, 228)

In [17]:
y = np.array(y)

In [18]:
y.shape

(4917,)

In [19]:
y.sum()

196

In [20]:
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [21]:
X, y = sklearn.utils.shuffle(X, y)

In [22]:
X.shape, y.shape

((4917, 228), (4917,))

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [24]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3687, 228), (3687,), (1230, 228), (1230,))

## Regularized Log. Regression with 1 window, size = 6

In [30]:
from sklearn.linear_model import LogisticRegressionCV

lr_model = LogisticRegressionCV(cv=5, penalty='l2', max_iter=500, solver='lbfgs')
lr_model = lr_model.fit(X_train, y_train)



In [31]:
lr_model.score(X_test, y_test)

0.9585365853658536

In [32]:
y_pred = lr_model.predict_proba(X_test)
roc_auc_score(y_test, y_pred[:,1])

0.625688103909927

In [33]:
y_hat = lr_model.predict(X_test)
f1_score(y_test, y_hat)

  'precision', 'predicted', average, warn_for)


0.0

In [34]:
confusion_matrix(y_test, y_hat).ravel()

array([1179,    0,   51,    0])

## SVM

In [278]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

for c in [1, 2, 4, 8, 16]:
    svm_model = SVC(gamma='scale', C=c)
    scores = cross_val_score(svm_model, X_train, y_train, cv=5, scoring='roc_auc')
    avg_score = scores.mean()
    print(c, avg_score)

1 0.7712702508204196
2 0.7711887872850168
4 0.761789889640774
8 0.7579563113425478
16 0.7578881250966877


In [279]:
svm_model = SVC(gamma='scale', C=1, probability=True)
svm_model = svm_model.fit(X_train, y_train)

In [280]:
svm_model.score(X_test, y_test)

0.959349593495935

In [281]:
y_pred = svm_model.predict_proba(X_test)
roc_auc_score(y_test, y_pred[:,1])

0.788728813559322

In [282]:
y_hat = svm_model.predict(X_test)
f1_score(y_test, y_hat)

  'precision', 'predicted', average, warn_for)


0.0

In [283]:
confusion_matrix(y_test, y_hat)

array([[1180,    0],
       [  50,    0]])

## RF

In [40]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model = rf_model.fit(X_train, y_train)



In [41]:
rf_model.score(X_test, y_test)

0.9577235772357724

In [42]:
y_pred = rf_model.predict_proba(X_test)
roc_auc_score(y_test, y_pred[:,1])

0.686249563438607

In [43]:
y_hat = rf_model.predict(X_test)
f1_score(y_test, y_hat)

0.0