In [3]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn

In [20]:
patient_df = pd.read_csv('data/allpatients_imputed_df').drop(columns=['Unnamed: 0'])

In [21]:
patient_df.shape

(188453, 42)

In [22]:
patient_df.columns

Index(['pid', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
       'HospAdmTime', 'ICULOS', 'SepsisLabel'],
      dtype='object')

#### Fill in NaN values with mean of the column

In [24]:
patient_df = patient_df.fillna(patient_df.mean()['HR':'Platelets'])

#### Sample 6 time-steps from each patient to create new dataset X

In [42]:
sepsis_patients = patient_df.groupby('pid').filter(lambda x: x['SepsisLabel'].any())
normal_patients = patient_df.groupby('pid').filter(lambda x: x['SepsisLabel'].sum() == 0)

In [77]:
(sepsis_patients.groupby('pid').ICULOS.agg(['count'])['count'] < 13).sum()

73

In [165]:
sepsis_patient_agg = sepsis_patients.groupby('pid')
X_sepsis = pd.DataFrame(data=[])

for pid, data in sepsis_patient_agg:
    data.reset_index(drop=True, inplace=True)
    
    if data.shape[0] >= 13:
        first_sepsis_idx =  data['SepsisLabel'].idxmax()
        start_idx = first_sepsis_idx - 6
        
        if start_idx >= 0:
            new_data = data.iloc[start_idx:first_sepsis_idx]
            assert new_data.shape[0] == 6
            X_sepsis = pd.concat([X_sepsis, new_data])

In [166]:
X_sepsis.shape

(1176, 42)

In [167]:
normal_patient_agg = normal_patients.groupby('pid')
X_normal = pd.DataFrame(data=[])

for pid, data in normal_patient_agg:
    data.reset_index(drop=True, inplace=True)
    new_data = data.sample(n=6)
    X_normal = pd.concat([X_normal, new_data])

In [79]:
X_normal.pid.unique().shape

(4721,)

In [178]:
X = np.ndarray((X_sepsis.pid.unique().shape[0] + X_normal.pid.unique().shape[0], 228))
y = [1]*(X_sepsis.pid.unique().shape[0]) + [0]*(X_normal.pid.unique().shape[0])

i = 0
for pid, df in X_sepsis.groupby('pid'):
    df = df.drop(columns=['pid', 'Unit1', 'Unit2', 'SepsisLabel'])
    x_i = df.stack().values
    X[i,:] = x_i
    i += 1

for pid, df in X_normal.groupby('pid'):
    df = df.drop(columns=['pid', 'Unit1', 'Unit2', 'SepsisLabel'])
    x_i = df.stack().values
    X[i,:] = x_i
    i += 1

In [179]:
X.shape

(4917, 228)

In [181]:
y = np.array(y)

In [182]:
y.shape

(4917,)

In [184]:
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [189]:
X, y = sklearn.utils.shuffle(X, y)

In [190]:
X.shape, y.shape

((4917, 228), (4917,))

In [194]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [195]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3687, 228), (3687,), (1230, 228), (1230,))

## Regularized Log. Regression

In [204]:
from sklearn.linear_model import LogisticRegressionCV

lr_model = LogisticRegressionCV(cv=5, penalty='l2', max_iter=1000, solver='lbfgs')
lr_model = lr_model.fit(X_train, y_train)





In [205]:
lr_model.score(X_test, y_test)

0.9666666666666667

In [206]:
y_pred = lr_model.predict_proba(X_test)
roc_auc_score(y_test, y_pred[:,1])

0.8119875600791632

In [207]:
y_hat = lr_model.predict(X_test)
f1_score(y_test, y_hat)

0.4057971014492754

In [208]:
confusion_matrix(y_test, y_hat)

array([[1175,    4],
       [  37,   14]])

## SVM

In [216]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

for c in [1, 2, 4, 8, 16]:
    svm_model = SVC(gamma='scale', C=c)
    scores = cross_val_score(svm_model, X_train, y_train, cv=5, scoring='roc_auc')
    avg_score = scores.mean()
    print(c, avg_score)

1 0.779577937717092
2 0.776832503640126
4 0.7590741563549225
8 0.7542063893109027
16 0.7512286026669436


In [227]:
svm_model = SVC(gamma='scale', C=8, probability=True)
svm_model = svm_model.fit(X_train, y_train)

In [228]:
svm_model.score(X_test, y_test)

0.9585365853658536

In [229]:
y_pred = svm_model.predict_proba(X_test)
roc_auc_score(y_test, y_pred[:,1])

0.794209117064977

In [230]:
y_hat = svm_model.predict(X_test)
f1_score(y_test, y_hat)

0.03773584905660377

In [231]:
confusion_matrix(y_test, y_hat)

array([[1178,    1],
       [  50,    1]])

## RF

In [232]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model = rf_model.fit(X_train, y_train)



In [234]:
rf_model.score(X_test, y_test)

0.9707317073170731

In [233]:
y_pred = rf_model.predict_proba(X_test)
roc_auc_score(y_test, y_pred[:,1])

0.814282625688104

In [235]:
y_hat = rf_model.predict(X_test)
f1_score(y_test, y_hat)

0.47058823529411764