In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
# (this is why cells below that list several `.shape` expressions show several outputs).
InteractiveShell.ast_node_interactivity = "all"

In [48]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix
from lifelines import CoxPHFitter
from datautils.dataset import Dataset
from datautils.data import Data
from tqdm import tqdm

In [3]:
# Load the challenge data and build the train/val/test splits that the rest of
# the notebook reads via dataset.train_data / dataset.val_data / dataset.test_data.
# NOTE(review): Dataset is project-local (datautils.dataset); the exact meaning of
# the options below is not visible from this notebook — confirm against that module.
dataset = Dataset("data/small_challenge_data",
                    batchSize=100,
                    train_ratio=0.8,
                    normalize=True,
                    padding=False,
                    imputeForward=False,
                    calculateDelay=False)

Imputation mode = mean
Processing train data...
* Reading data...


100%|██████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:43<00:00, 91.74it/s]


* Processing data...


100%|███████████████████████████████████████████████████████████████████████████| 4000/4000 [00:00<00:00, 18302.75it/s]


* Unpadding...


100%|███████████████████████████████████████████████████████████████████████████| 4000/4000 [00:00<00:00, 43514.46it/s]


Processing val data...
* Reading data...


100%|███████████████████████████████████████████████████████████████████████████████| 500/500 [00:04<00:00, 109.41it/s]


* Processing data...


100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 16761.80it/s]


* Unpadding...


100%|████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 124956.92it/s]


Processing test data...
* Reading data...


100%|███████████████████████████████████████████████████████████████████████████████| 500/500 [00:04<00:00, 116.26it/s]


* Processing data...


100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 16566.49it/s]


* Unpadding...


100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 30941.03it/s]


In [109]:
# Feature names, later used as column labels when the per-row feature vectors
# are expanded into a DataFrame. The last two keys are dropped.
# NOTE(review): presumably the last two entries are not model inputs — confirm against datautils.
columns = list(dataset.train_data.features.keys())[:-2]

# Sanity check: shape of each split's x container.
dataset.train_data.x.shape
dataset.val_data.x.shape
dataset.test_data.x.shape

(4000,)

(500,)

(500,)

In [108]:
# Windowing horizon passed to process_data: the durations it emits lie in
# 1..T+1, and rows whose duration is greater than T get s = 1.
T = 6
def process_data(d: Data, T: int) -> (pd.DataFrame, np.array):
    """Flatten per-record time series into one row per time step with (t, s) targets.

    For every record ``idx`` the rows of ``d.x[idx]`` are emitted in order, each
    paired with a duration ``t`` and a flag ``s``:

    * Records whose labels ``d.y[idx]`` sum to zero: every row gets
      ``t = T + 1`` and ``s = 1``.
    * Other records: ``t`` starts at ``T + 1`` and decreases by one for every
      label-1 step already seen *before* the current row, never going below 1.
      ``s`` is 1 while ``t > T`` and 0 afterwards.

    Parameters
    ----------
    d : Data
        Object exposing ``x`` and ``y``; both are indexed by record, ``x[idx]``
        yields the rows of that record and ``y[idx][i][0]`` the label at step ``i``.
    T : int
        Horizon; the largest duration that is still flagged ``s = 0``.

    Returns
    -------
    (pd.DataFrame, np.ndarray)
        A frame with columns ``"x"`` (the feature row), ``"t"`` and ``"s"``,
        and the flattened labels of all records, in the same row order.
    """
    npa = d.x
    target_npa = d.y

    # Duration assigned to rows with no label-1 step behind them. The original
    # code used a literal 7 in one branch, which only matched T + 1 for T == 6.
    no_event_duration = T + 1

    processed = []
    labels = []

    print("* Processing data...")
    for idx in tqdm(range(npa.shape[0])):
        record = npa[idx]
        record_labels = target_npa[idx]

        if record_labels.sum() == 0:
            processed.extend([row, no_event_duration, 1] for row in record)
        else:
            sepsis_count = 0
            for i in range(record.shape[0]):
                # Count down from T + 1, floored at 1.
                t = max(no_event_duration - sepsis_count, 1)
                s = 1 if t > T else 0
                processed.append([record[i], t, s])
                # The current step's label only affects the rows that follow it.
                sepsis_count += 1 if record_labels[i][0] == 1 else 0

        labels.extend(record_labels.flatten().tolist())

    return (pd.DataFrame(processed, columns=["x", "t", "s"]), np.array(labels))
# Naive windowing:
#             for i in range(df[idx].shape[0]):
#                 window = df[idx][i:i+T]
#                 matches = np.where(window[:,-1]==1)[0]
#                 if matches.size > 0:
#                     t = matches[0] + 1
#                     s = 0
#                 else:
#                     t = T + 1
#                     s = 1
#                 processed.append([df[idx][i][:-1],t,s])

In [51]:
# Turn each split into per-time-step rows (features plus t/s targets) and a
# flat label vector, all with the same horizon T.
X_train, y_train = process_data(d=dataset.train_data, T=T)
X_val, y_val = process_data(d=dataset.val_data, T=T)
X_test, y_test = process_data(d=dataset.test_data, T=T)

* Processing data...


100%|███████████████████████████████████████████████████████████████████████████| 4000/4000 [00:00<00:00, 12699.04it/s]


* Processing data...


100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 4768.67it/s]


In [61]:
# Peek at the first rows: x is the feature vector, t the duration, s the flag.
X_train.head(n=5)

Unnamed: 0,x,t,s
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7,1
1,"[0.32291490086539276, 0.9688360691681855, -1.3...",7,1
2,"[0.32291490086539276, 0.9688360691681855, -1.2...",7,1
3,"[0.26717878210519413, 0.9688360691681855, -0.1...",7,1
4,"[-0.17871016797639477, 0.9688360691681855, 0.4...",7,1


In [65]:
# The Cox fit below uses column "s" as the event indicator, so the flag
# produced by process_data is inverted here (event = 1 - s).
inverse_s = 1 - X_train["s"]

# One column per feature, followed by the event flag, the row weight
# (70 where inverse_s == 1, otherwise 1) and the duration.
X_train_cph = pd.DataFrame(X_train["x"].values.tolist(), columns=columns).assign(
    s=inverse_s,
    w=(inverse_s * 70) + X_train["s"],
    t=X_train["t"],
)

In [68]:
# Penalized Cox proportional-hazards model fitted on the weighted training frame.
cph = CoxPHFitter(penalizer=0.2)
cph.fit(
    X_train_cph,
    duration_col='t',
    event_col='s',
    weights_col='w',
    step_size=0.070,
    show_progress=True,
    robust=False,
)

Iteration 1: norm_delta = 0.02671, step_size = 0.0700, ll = -1688714.33595, newton_decrement = 10523.83932, seconds_since_start = 0.2
Iteration 2: norm_delta = 0.02438, step_size = 0.0700, ll = -1687293.15293, newton_decrement = 8796.79560, seconds_since_start = 0.5
Iteration 3: norm_delta = 0.02237, step_size = 0.0700, ll = -1686105.08229, newton_decrement = 7408.09968, seconds_since_start = 0.8
Iteration 4: norm_delta = 0.02469, step_size = 0.0840, ll = -1685104.50379, newton_decrement = 6268.78761, seconds_since_start = 1.1
Iteration 5: norm_delta = 0.02192, step_size = 0.0823, ll = -1684095.89550, newton_decrement = 5142.63855, seconds_since_start = 1.3
Iteration 6: norm_delta = 0.01953, step_size = 0.0807, ll = -1683284.26357, newton_decrement = 4251.22615, seconds_since_start = 1.5
Iteration 7: norm_delta = 0.02136, step_size = 0.0968, ll = -1682626.14301, newton_decrement = 3537.16676, seconds_since_start = 1.7
Iteration 8: norm_delta = 0.01873, step_size = 0.0949, ll = -1681974

<lifelines.CoxPHFitter: fitted with 150223 observations, 148257 censored>

In [69]:
# Optional proportional-hazards diagnostics, currently disabled:
#cph.check_assumptions(X_train_cph,show_plots=False,plot_n_bootstraps=0)
# Print the fitted model's coefficient table.
cph.print_summary()

<lifelines.CoxPHFitter: fitted with 150223 observations, 148257 censored>
      duration col = 't'
         event col = 's'
       weights col = 'w'
         penalizer = 0.2
number of subjects = 150223
  number of events = 1966
    log-likelihood = -1679252.72
  time fit was run = 2019-04-09 03:06:48 UTC

---
                  coef  exp(coef)  se(coef)      z      p  -log2(p)  lower 0.95  upper 0.95
HR                0.24       1.27      0.00  88.56 <0.005       inf        0.23        0.24
O2Sat            -0.00       1.00      0.00  -1.72   0.09      3.54       -0.01        0.00
Temp              0.10       1.11      0.00  27.51 <0.005    551.20        0.09        0.11
SBP              -0.08       0.93      0.01 -15.17 <0.005    170.26       -0.09       -0.07
MAP               0.09       1.09      0.01  10.63 <0.005     85.19        0.07        0.11
DBP              -0.22       0.80      0.01 -34.45 <0.005    861.46       -0.24       -0.21
Resp              0.04       1.05      0.00  

In [117]:
def get_metrics(ty, py, threshold=0.5):
    """Print AUC, accuracy, confusion matrix, precision, recall and specificity.

    Parameters
    ----------
    ty : array-like
        Ground-truth binary labels (0/1).
    py : array-like
        Predicted scores, one per label. The input is flattened to 1-D, so a
        single-row or single-column DataFrame/array is accepted as well.
    threshold : float, default 0.5
        A score strictly greater than ``threshold`` is classified as positive.
        The same rule is used for the accuracy and for the confusion matrix.

    Returns
    -------
    None
        All metrics are printed. Ratios with a zero denominator print as ``nan``.
    """
    # Work on flat NumPy vectors: iterating a DataFrame would yield its column
    # labels rather than the scores.
    y_true = np.asarray(ty).ravel()
    y_score = np.asarray(py, dtype=float).ravel()
    y_pred = (y_score > threshold).astype(int)

    def ratio(numerator, denominator):
        # Avoid a NumPy runtime warning when a class or prediction is absent.
        return numerator / denominator if denominator else float("nan")

    print('-'*20)
    auc = roc_auc_score(y_true, y_score)
    print(f"AUC = {auc}")
    acc = ((y_pred == y_true).sum() / y_true.shape[0]) * 100
    print(f"Accuracy = {acc}")
    # labels=[0, 1] keeps the matrix 2x2 even if one class never occurs.
    c_m = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(c_m)
    PPV = ratio(c_m[1,1], c_m[1,1] + c_m[0,1])
    print(f"PPV/Precision = {PPV}")
    TPR = ratio(c_m[1,1], c_m[1].sum())
    print(f"TPR/Sensitivity/Recall = {TPR}")
    TNR = ratio(c_m[0,0], c_m[0].sum())
    print(f"TNR/Specificity = {TNR}")
    print('-'*20)

In [116]:
def evaluate(df: pd.DataFrame, ty, columns, threshold=0.5, horizon=None):
    """Score a processed split with the fitted Cox model and print its metrics.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an ``x`` column holding one feature vector per row.
    ty : array-like
        Ground-truth binary labels, one per row of ``df``.
    columns : list
        Column names used to expand ``df.x`` into one column per feature.
    threshold : float, default 0.5
        Decision threshold forwarded to ``get_metrics``.
    horizon : int, optional
        Time at which the survival function is evaluated. Defaults to the
        notebook-level ``T`` (previously a hard-coded 6).

    Notes
    -----
    Uses the notebook-level fitted model ``cph``. The score of a row is
    ``1 - S(horizon)``.
    """
    if horizon is None:
        horizon = T

    cph_df = pd.DataFrame(df.x.values.tolist(), columns=columns)

    survival = cph.predict_survival_function(cph_df, times=[horizon])
    # Flatten to one score per row so the metric code receives a plain 1-D
    # vector rather than a DataFrame.
    preds = 1 - np.asarray(survival, dtype=float).ravel()

    get_metrics(ty, preds, threshold=threshold)

In [119]:
# Report the same metrics for every split with a common 0.5 decision threshold.
for split_label, split_X, split_y in (
    ("Train:", X_train, y_train),
    ("Val:", X_val, y_val),
    ("Test:", X_test, y_test),
):
    print(split_label)
    evaluate(split_X, split_y, columns, threshold=0.5)

Train:
--------------------
AUC = 0.6387214747953789
Accuracy = 79.33205967128868
[[118348  29675]
 [  1373    827]]
PPV/Precision = 0.02711297619828208
TPR/Sensitivity/Recall = 0.3759090909090909
TNR/Specificity = 0.7995243982354093
--------------------
Val:
--------------------
AUC = 0.6063689287216147
Accuracy = 79.23076923076923
[[15166  3858]
 [  138    78]]
PPV/Precision = 0.019817073170731708
TPR/Sensitivity/Recall = 0.3611111111111111
TNR/Specificity = 0.7972035323801514
--------------------
Test:
--------------------
AUC = 0.5460201317822341
Accuracy = 81.5376513954713
[[15420  3363]
 [  143    64]]
PPV/Precision = 0.018675226145316602
TPR/Sensitivity/Recall = 0.30917874396135264
TNR/Specificity = 0.8209551189905766
--------------------
