In [1]:
import datetime
import math
import os
import shutil

# `plt` must be the pyplot interface: the plotting cells call plt.subplot/plt.plot/etc.,
# none of which exist on the top-level `matplotlib` package.
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from lifelines import CoxPHFitter
from lifelines import KaplanMeierFitter
from lifelines.utils import concordance_index
from scipy import stats
from sklearn.preprocessing import StandardScaler

from preprocessor.nnet_survival import nnet_survival
from preprocessor.tabular import nnet_survival_trainer as st

%load_ext autoreload
%autoreload 2

2023-06-28 13:17:35.943101: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-28 13:17:44.839513: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/biotools/python/3.8.1/lib:/usr/local/biotools/cuda/11.7/lib64:/usr/local/biotools/cuda/11.7/nccl/lib:
2023-06-28 13:17:44.839610: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/biotoo

In [None]:
%%writefile trainer/model.py
import datetime
import os
import shutil
import numpy as np
import tensorflow as tf


In [16]:

# Schema of the task-2 CSV files.
# CSV_COLUMNS gives the full column order; the remaining lists group columns by
# how they are handed to the trainer (numerical, categorical, unwanted).
CSV_COLUMNS = [
    "patientid", "task_1", "task_2", "centerid",
    "age", "weight",
    "tobacco", "alcohol", "performance_status", "hpv_status",
    "surgery", "chemotherapy",
    "relapse", "rfs",
    "gender_m",
]

NUMERICAL_COLUMNS = ["age", "weight"]
CATEGORICAL_COLUMNS = [
    "centerid", "gender_m", "tobacco", "alcohol",
    "performance_status", "hpv_status", "surgery", "chemotherapy",
]
UNWANTED_COLS = ["patientid", "task_1", "task_2"]

# Per-column default values, keyed by column name so the pairing is explicit.
# patientid and centerid default to the string "null", age and weight to 0.0,
# and every other column to the integer 0.
_DEFAULT_BY_COLUMN = {
    "patientid": "null",
    "task_1": [0],
    "task_2": [0],
    "centerid": "null",
    "age": [0.0],
    "weight": [0.0],
    "tobacco": [0],
    "alcohol": [0],
    "performance_status": [0],
    "hpv_status": [0],
    "surgery": [0],
    "chemotherapy": [0],
    "relapse": [0],
    "rfs": [0],
    "gender_m": [0],
}

# Defaults listed in the same order as CSV_COLUMNS.
DEFAULTS = [_DEFAULT_BY_COLUMN[column] for column in CSV_COLUMNS]

# Release any trainer left over from a previous run of this cell before
# building a new one. A bare `del survival_model` raises NameError on a fresh
# kernel (breaking Restart & Run All); popping from the namespace is a no-op
# when the name does not exist yet.
globals().pop("survival_model", None)

# Configure the project trainer with the task-2 train/eval CSVs and the column
# groups defined above.
# NOTE(review): the meaning of each keyword is defined by
# `nnet_survival_trainer.TrainerNNetSurvival`, which is not shown in this
# notebook - confirm units/semantics there. `halflife=1460` equals 4 * 365,
# the same value the standalone model cell computes as `365.*4`.
survival_model = st.TrainerNNetSurvival(
    input_file_train="../data/task2/train_data.csv",
    input_file_eval="../data/task2/test_data.csv",
    output_dir="output",
    unwanted_cols=UNWANTED_COLS,
    numerical_cols=NUMERICAL_COLUMNS,
    categorical_cols=CATEGORICAL_COLUMNS,
    train_batch_size=32,
    eval_batch_size=100,
    num_train_examples=5000,
    num_epochs=500,
    halflife=1460,
)


In [17]:
# Run the trainer's train-and-evaluate routine with the configuration built above.
# NOTE(review): the output saved with this cell ends in
# "ValueError: Unexpected result of `train_function` (Empty logs)", i.e. the run
# did not complete. Presumably the training input yielded no batches - TODO
# confirm against the input pipeline inside TrainerNNetSurvival.
survival_model.train_and_evaluate()


Here is our Wide-and-Deep architecture so far:

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 64)                704       
                                                                 
 dense_10 (Dense)            (None, 32)                2080      
                                                                 
 dropout_6 (Dropout)         (None, 32)                0         
                                                                 
 dense_11 (Dense)            (None, 19)                627       
                                                                 
 activation_3 (Activation)   (None, 19)                0         
                                                                 
Total params: 3,411
Trainable params: 3,411
Non-trainable params: 0
_________________________________________________________________
None

ValueError: Unexpected result of `train_function` (Empty logs). Please use `Model.compile(..., run_eagerly=True)`, or `tf.config.run_functions_eagerly(True)` for more information of where went wrong, or file a issue/bug to `tf.keras`.

In [None]:

###################################################################################
# Stand-alone nnet-survival model on the tabular task-2 data.
# Task: given a patient's covariates, predict a discrete-time survival curve
# using `rfs` as follow-up time and `relapse` as the event indicator.

# Covariates fed to the network, in column order. Defined once so the train
# and test matrices cannot drift apart.
# NOTE(review): `centerid` is listed as categorical (with a string default) in
# the trainer configuration cell but is passed here as a raw value - confirm it
# is numeric in the CSVs, or encode it first.
FEATURE_COLUMNS = [
    'centerid',
    'age',
    'weight',
    'tobacco',
    'alcohol',
    'performance_status',
    'hpv_status',
    'surgery',
    'chemotherapy',
    'gender_m',
]

# Train and test sets come from separate CSV files; extract time & event for both.
train_data = pd.read_csv("../data/task2/train_data.csv")
test_data = pd.read_csv("../data/task2/test_data.csv")
time = train_data['rfs']
event = train_data['relapse']
timeTest = test_data['rfs']
eventTest = test_data['relapse']
x_train = train_data[FEATURE_COLUMNS].values
x_test = test_data[FEATURE_COLUMNS].values

# Interval break points: the times at which an exponential survival curve with
# the given half-life has dropped by 0%, 5%, 10%, ... - so intervals are short
# early on and get longer with follow-up time.
# Time units are presumably days (the calibration plot labels its axis in days).
halflife = 365.*4
breaks = -np.log(1 - np.arange(0.0, 0.96, 0.05)) * halflife / np.log(2)
# Alternative, fixed-width breaks:
#breaks=np.concatenate((np.arange(0,200,10),np.arange(200,1001,25)))

n_intervals = len(breaks) - 1
timegap = breaks[1:] - breaks[:-1]

# Convert (time, event) pairs to the array format consumed by the nnet_survival loss.
y_train_array = nnet_survival.make_surv_array(time, event, breaks)

# Train model
# Seed NumPy and TensorFlow so repeated runs start from the same random state.
np.random.seed(1)
tf.random.set_seed(1)

# Keras classes are referenced through the already-imported `tf` module; the
# bare names Sequential/Dense/Dropout/Activation/EarlyStopping were never
# imported in this notebook and raised NameError.
# The input width is taken from the feature matrix rather than hard-coded.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(64, input_shape=(x_train.shape[1],), activation='relu'))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dropout(0.25))

# Output head: either a single unit followed by nnet_survival's PropHazards
# layer, or (the branch used here) one sigmoid unit per time interval.
prop_hazards = False
if prop_hazards:
    model.add(tf.keras.layers.Dense(1, use_bias=False, kernel_initializer='zeros'))
    model.add(nnet_survival.PropHazards(n_intervals))
else:
    model.add(tf.keras.layers.Dense(n_intervals, kernel_initializer='zeros', bias_initializer='zeros'))
    model.add(tf.keras.layers.Activation('sigmoid'))

model.compile(loss=nnet_survival.surv_likelihood(n_intervals), optimizer=tf.keras.optimizers.Adam())

# No validation split is used: training stops once the *training* loss has not
# improved for 50 consecutive epochs (or after 10000 epochs).
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=50)
history = model.fit(x_train, y_train_array, batch_size=64, epochs=10000, verbose=1, callbacks=[early_stopping])

# Training set results
y_pred = model.predict(x_train, verbose=0)

# Discrimination (C-index).
# The cumulative product of the per-interval outputs is used as the predicted
# survival curve; taking the intervals up to the first break past 365 gives
# each patient's predicted survival at (roughly) one year.
first_break_past_1yr = np.nonzero(breaks > 365)[0][0]
oneyr_surv = np.cumprod(y_pred[:, 0:first_break_past_1yr], axis=1)[:, -1]
print(concordance_index(time, oneyr_surv, event))

# Calibration plot: Kaplan-Meier estimate of the observed data versus the
# model's mean predicted survival curve.
days_plot = 365*4
# Select the style before any axes are created so it applies to this figure.
plt.style.use('default')
fig, ax = plt.subplots()
kmf = KaplanMeierFitter()
actual = []
predicted = []
kmf.fit(time, event_observed=event)
actual.append(ax.plot(kmf.survival_function_.index.values, kmf.survival_function_.KM_estimate,
                      ls='--', c='r', label='Actual (Kaplan-Meier)'))
# Average the predicted curves over patients; prepend 1 for the first break point.
pred_surv = np.mean(np.cumprod(y_pred, axis=1), axis=0)
predicted.append(ax.plot(breaks, np.concatenate(([1], pred_surv)),
                         ls='-', c='b', label='Predicted (mean)'))

ax.set_xticks(np.arange(0, days_plot+0.0001, 200))
ax.set_yticks(np.arange(0, 1.0001, 0.125))
ax.set_xlim([0, days_plot])
ax.set_ylim([0, 1])
ax.set_xlabel('Follow-up time (days)')
ax.set_ylabel('Proportion surviving')
ax.set_title('Training set calibration')
ax.legend()

# Test set results: score the held-out patients with the fitted model.
y_pred = model.predict(x_test, verbose=0)

# Discrimination (C-index), computed as for the training set: cumulative
# product of the interval outputs up to the first break beyond 365, with the
# last column taken as the one-year survival score.
first_break_past_1yr = np.nonzero(breaks > 365)[0][0]
cumulative_surv = np.cumprod(y_pred[:, 0:first_break_past_1yr], axis=1)
oneyr_surv = cumulative_surv[:, -1]
print(concordance_index(timeTest, oneyr_surv, eventTest))

# Note: the example this cell was adapted from also scored a "perfect" model
# built from the true digit label (`y_test`); no such variable exists here.