In [16]:
# Echo the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to display several results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix
from lifelines import CoxPHFitter
from datautils.dataset import Dataset
from datautils.data import Data
from tqdm import tqdm

In [18]:
# Load the challenge files and build the train / val / test splits that are
# exposed below as dataset.train_data, dataset.val_data and dataset.test_data.
dataset = Dataset(
    "data/challenge_data",
    batchSize=100,
    train_ratio=0.8,
    normalize=True,
    padding=False,
    imputeForward=False,
    calculateDelay=False,
)

  0%|          | 32/32269 [00:00<01:42, 314.05it/s]

Imputation mode = mean
Processing train data...
* Reading data...


100%|██████████| 32269/32269 [01:05<00:00, 488.97it/s]
  1%|          | 343/32269 [00:00<00:09, 3427.94it/s]

* Processing data...


100%|██████████| 32269/32269 [00:07<00:00, 4294.54it/s]


* Unpadding...


100%|██████████| 32269/32269 [00:00<00:00, 147083.08it/s]
  1%|          | 36/4034 [00:00<00:11, 358.84it/s]

Processing val data...
* Reading data...


100%|██████████| 4034/4034 [00:07<00:00, 567.59it/s]
100%|██████████| 4034/4034 [00:00<00:00, 27668.42it/s]

* Processing data...



100%|██████████| 4034/4034 [00:00<00:00, 157966.78it/s]
  2%|▏         | 75/4033 [00:00<00:05, 745.38it/s]

* Unpadding...
Processing test data...
* Reading data...


100%|██████████| 4033/4033 [00:06<00:00, 640.42it/s]
100%|██████████| 4033/4033 [00:00<00:00, 25029.01it/s]

* Processing data...



100%|██████████| 4033/4033 [00:00<00:00, 139151.12it/s]

* Unpadding...





In [19]:
# Column names used for the model matrices: every key of `features` except
# the last two.
# NOTE(review): presumably the two trailing keys are not model inputs —
# confirm against the Dataset implementation.
columns = list(dataset.train_data.features.keys())[:-2]

# Shape of each split's `x` array (all three expressions are echoed).
dataset.train_data.x.shape
dataset.val_data.x.shape
dataset.test_data.x.shape

(32269,)

(4034,)

(4033,)

In [20]:
# create windowing system here
# Window length used to derive the duration (`t`) and flag (`s`) columns.
T = 6


def process_data(d: "Data", T: int) -> "tuple[pd.DataFrame, np.ndarray]":
    """Flatten per-record sequences into one survival-style row per time step.

    Parameters
    ----------
    d : Data
        Split to process. ``d.x[k]`` is iterated row by row and ``d.y[k]`` is
        indexed as ``d.y[k][i][0]``, so both are expected to be 2-D arrays
        with one row per time step.
    T : int
        Window length. Rows that have not yet followed a positive label get
        duration ``T + 1`` and flag ``s == 1``.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        A frame with columns ``x`` (the feature row), ``t`` (duration) and
        ``s`` (1 when ``t > T``, else 0), plus the flattened labels of all
        records in the same row order.

    Notes
    -----
    For a record containing positive labels the duration starts at ``T + 1``
    and drops by one after each positive row, never going below 1.
    NOTE(review): because the counter is advanced *after* a row is emitted,
    the first positive row of a record still gets ``t == T + 1`` and
    ``s == 1`` — confirm that this is intended.
    """
    # Duration assigned to rows that lie entirely outside the window.  The
    # all-negative branch used to hard-code 7, which only matched T == 6.
    beyond_window = T + 1

    processed = []
    labels = []

    print("* Processing data...")
    for idx in tqdm(range(d.x.shape[0])):
        record = d.x[idx]
        target = d.y[idx]

        if target.sum() == 0:
            # No positive label anywhere in this record.
            processed.extend([row, beyond_window, 1] for row in record)
        else:
            positives_seen = 0
            for i in range(record.shape[0]):
                t = max(beyond_window - positives_seen, 1)
                s = 1 if t > T else 0
                processed.append([record[i], t, s])
                if target[i][0] == 1:
                    positives_seen += 1

        labels.extend(target.flatten().tolist())

    return (pd.DataFrame(processed, columns=["x", "t", "s"]), np.array(labels))
# Naive windowing:
#             for i in range(df[idx].shape[0]):
#                 window = df[idx][i:i+T]
#                 matches = np.where(window[:,-1]==1)[0]
#                 if matches.size > 0:
#                     t = matches[0] + 1
#                     s = 0
#                 else:
#                     t = T + 1
#                     s = 1
#                 processed.append([df[idx][i][:-1],t,s])

In [21]:
# Flatten each split, in train / val / test order, into a per-row frame and
# its matching label vector.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    process_data(split, T)
    for split in (dataset.train_data, dataset.val_data, dataset.test_data)
)

  0%|          | 0/32269 [00:00<?, ?it/s]

* Processing data...


100%|██████████| 32269/32269 [00:02<00:00, 13294.76it/s]
100%|██████████| 4034/4034 [00:00<00:00, 26516.77it/s]

* Processing data...



100%|█████████▉| 4021/4033 [00:00<00:00, 20522.57it/s]

* Processing data...


100%|██████████| 4033/4033 [00:00<00:00, 20017.36it/s]


In [22]:
# Preview the flattened training rows: `x` is the feature vector, `t` the
# duration and `s` the flag produced by process_data.
X_train.head()

Unnamed: 0,x,t,s
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7,1
1,"[-0.7250015421218633, 0.9549845983461365, 0.0,...",7,1
2,"[-0.37832799104262854, 0.9549845983461365, -0....",7,1
3,"[-0.7250015421218633, 0.9549845983461365, 0.0,...",7,1
4,"[-0.6094436917621183, 0.9549845983461365, 0.0,...",7,1


In [23]:
# process_data sets s == 1 on rows whose duration exceeds the window, so the
# flipped flag is what gets passed to the fitter as the event column.
inverse_s = 1 - X_train["s"]

# One column per feature, followed by the event flag `s`, the row weight `w`
# (70 where the flipped flag is 1, otherwise 1) and the duration `t`.
X_train_cph = pd.DataFrame(X_train["x"].tolist(), columns=columns).assign(
    s=inverse_s,
    w=inverse_s * 70 + X_train["s"],
    t=X_train["t"],
)

In [24]:
# Penalised Cox proportional-hazards fit on the weighted training frame.
cph = CoxPHFitter(penalizer=0.2)
cph.fit(
    X_train_cph,
    duration_col="t",
    event_col="s",
    weights_col="w",
    step_size=0.070,
    show_progress=True,
    robust=False,
)


>>> events = df['s'].astype(bool)
>>> print(df.loc[events, 'FiO2'].var())
>>> print(df.loc[~events, 'FiO2'].var())

A very low variance means that the column FiO2 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression 


Iteration 1: norm_delta = 0.29762, step_size = 0.0700, ll = -19830176.15702, newton_decrement = 79158.93220, seconds_since_start = 0.9
Iteration 2: norm_delta = 0.27390, step_size = 0.0700, ll = -19819483.62577, newton_decrement = 67442.12926, seconds_since_start = 1.9
Iteration 3: norm_delta = 0.25249, step_size = 0.0700, ll = -19810373.60302, newton_decrement = 57545.76550, seconds_since_start = 2.8
Iteration 4: norm_delta = 0.23304, step_size = 0.0840, ll = -19802600.25375, newton_decrement = 49165.87385, seconds_since_start = 3.7
Iteration 5: norm_delta = 0.21177, step_size = 0.1008, ll = -19794688.75476, newton_decrement = 40698.09037, seconds_since_start = 4.6
Iteration 6: norm_delta = 0.18886, step_size = 0.1210, ll = -19786899.39487, newton_decrement = 32424.00882, seconds_since_start = 5.5
Iteration 7: norm_delta = 0.16463, step_size = 0.1452, ll = -19779532.02312, newton_decrement = 24659.17587, seconds_since_start = 6.4
Iteration 8: norm_delta = 0.13955, step_size = 0.1742, 

<lifelines.CoxPHFitter: fitted with 1240275 observations, 1220655 censored>

In [25]:
#cph.check_assumptions(X_train_cph,show_plots=False,plot_n_bootstraps=0)
# Print the fit metadata and the per-covariate coefficient table.
cph.print_summary()

<lifelines.CoxPHFitter: fitted with 1240275 observations, 1220655 censored>
      duration col = 't'
         event col = 's'
       weights col = 'w'
         penalizer = 0.2
number of subjects = 1240275
  number of events = 19620
    log-likelihood = -19755639.57
  time fit was run = 2019-04-10 21:48:38 UTC

---
                  coef exp(coef)  se(coef)      z      p  -log2(p)  lower 0.95  upper 0.95
HR                0.18      1.20      0.00 204.95 <0.005       inf        0.18        0.18
O2Sat            -0.00      1.00      0.00  -1.68   0.09      3.44       -0.00        0.00
Temp              0.12      1.13      0.00  99.16 <0.005       inf        0.12        0.13
SBP               0.04      1.05      0.00  30.43 <0.005    673.02        0.04        0.05
MAP              -0.15      0.86      0.00 -76.72 <0.005       inf       -0.15       -0.14
DBP              -0.04      0.96      0.00 -27.04 <0.005    532.51       -0.05       -0.04
Resp              0.13      1.14      0.00 159.

In [26]:
def get_metrics(ty, py, threshold=0.5):
    """Print AUC, accuracy, confusion matrix, precision, recall and specificity.

    Parameters
    ----------
    ty : array-like
        True binary labels (0 / 1), one per sample.
    py : array-like
        Predicted scores, one per sample.  Any array-like is accepted,
        including a single-row or single-column DataFrame; it is flattened
        to one dimension before use.
    threshold : float, default 0.5
        A sample is predicted positive when its score is strictly greater
        than this value.  The same rule now drives both the accuracy and the
        confusion matrix (accuracy previously always used ``>= 0.5``).

    Raises
    ------
    ValueError
        Propagated from scikit-learn, e.g. when ``ty`` and ``py`` differ in
        length or ``ty`` contains a single class.
    """
    def _ratio(numerator, denominator):
        # Report an undefined rate as NaN instead of dividing by zero.
        return numerator / denominator if denominator else float("nan")

    # Flatten explicitly: iterating a DataFrame yields column labels, not
    # values, which silently corrupted the thresholded predictions.
    y_true = np.asarray(ty).ravel()
    scores = np.asarray(py, dtype=float).ravel()
    y_pred = (scores > threshold).astype(int)

    print('-'*20)
    auc = roc_auc_score(y_true, scores)
    print(f"AUC = {auc}")
    acc = ((y_pred == y_true).sum() / y_true.shape[0]) * 100
    print(f"Accuracy = {acc}")
    # labels=[0, 1] keeps the matrix 2x2 even if a class is never predicted.
    c_m = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(c_m)
    (tn, fp), (fn, tp) = c_m
    PPV = _ratio(tp, tp + fp)
    print(f"PPV/Precision = {PPV}")
    TPR = _ratio(tp, tp + fn)
    print(f"TPR/Sensitivity/Recall = {TPR}")
    TNR = _ratio(tn, tn + fp)
    print(f"TNR/Specificity = {TNR}")
    print('-'*20)

In [27]:
def evaluate(df: pd.DataFrame, ty, columns, threshold=0.5, horizon=None, model=None):
    """Score one processed split with a fitted survival model and print metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``x`` column holds one feature vector per row.
    ty : array-like
        True binary labels aligned with the rows of ``df``.
    columns : list of str
        Feature names, in the order used inside each ``x`` vector.
    threshold : float, default 0.5
        Decision threshold forwarded to ``get_metrics``.
    horizon : number, optional
        Time at which the survival function is evaluated.  Defaults to the
        notebook-level window length ``T`` (previously hard-coded to 6).
    model : fitted model, optional
        Object exposing ``predict_survival_function(X, times=...)``.
        Defaults to the notebook-level ``cph``.
    """
    fitted = cph if model is None else model
    at_time = T if horizon is None else horizon

    cph_df = pd.DataFrame(df.x.values.tolist(), columns=columns)

    # The survival function comes back as a 2-D frame (times x subjects);
    # turn it into a flat vector of event probabilities, one per row of df.
    survival = fitted.predict_survival_function(cph_df, times=[at_time])
    preds = np.asarray(1 - survival, dtype=float).ravel()

    get_metrics(ty, preds, threshold=threshold)

In [28]:
# Report the same metrics for every split, in train / val / test order.
for split_name, split_frame, split_labels in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"{split_name}:")
    evaluate(split_frame, split_labels, columns, threshold=0.5)

Train:
--------------------
AUC = 0.6330696316140252
Accuracy = 66.72044506258693
[[815949 402402]
 [ 10356  11568]]
PPV/Precision = 0.027944053916950505
TPR/Sensitivity/Recall = 0.5276409414340448
TNR/Specificity = 0.669715870057151
--------------------
Val:
--------------------
AUC = 0.6354492241246463
Accuracy = 66.16481415470365
[[100527  50888]
 [  1318   1562]]
PPV/Precision = 0.029780743565300288
TPR/Sensitivity/Recall = 0.5423611111111111
TNR/Specificity = 0.663917049169501
--------------------
Test:
--------------------
AUC = 0.6520365380575449
Accuracy = 66.5960416138036
[[103256  51272]
 [  1386   1726]]
PPV/Precision = 0.03256726668930903
TPR/Sensitivity/Recall = 0.5546272493573264
TNR/Specificity = 0.6682025264029819
--------------------
