In [None]:
from __future__ import annotations
import datetime
import os
from copy import deepcopy
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import model_selection

from preprocessor.tabular import utils
from preprocessor.nnet_survival import nnet_survival

%load_ext autoreload
%autoreload 2

In [None]:

# Column names of the input table, in file order.
CSV_COLUMNS = [
    "patientid", "task_1", "task_2", "centerid",
    "age", "weight",
    "tobacco", "alcohol", "performance_status", "hpv_status",
    "surgery", "chemotherapy",
    "relapse", "rfs",
    "gender_m",
]

# Features cast to float32 by the loader.
NUMERICAL_COLUMNS = ["age", "weight"]

# Features cast to str by the loader. The order matters: it fixes the order
# in which the one-hot encodings are produced downstream.
CATEGORICAL_COLUMNS = [
    "centerid",
    "gender_m",
    "tobacco",
    "alcohol",
    "performance_status",
    "hpv_status",
    "surgery",
    "chemotherapy",
]

# Columns removed from the feature table before it is turned into a dataset.
UNWANTED_COLS = ["patientid", "task_1", "task_2"]

# One default per entry of CSV_COLUMNS, in the same order.
# NOTE(review): not referenced by any code visible in this notebook section.
DEFAULTS = [
    "null",  # patientid
    [0],     # task_1
    [0],     # task_2
    "null",  # centerid
    [0.0],   # age
    [0.0],   # weight
    [0],     # tobacco
    [0],     # alcohol
    [0],     # performance_status
    [0],     # hpv_status
    [0],     # surgery
    [0],     # chemotherapy
    [0],     # relapse
    [0],     # rfs
    [0],     # gender_m
]

# Median of the exponential curve used to place the interval boundaries
# (same time unit as the `rfs` column -- presumably days, TODO confirm).
HALFLIFE = 1640.

# Interval boundaries: the times at which an exponential distribution with
# median HALFLIFE reaches cumulative probability 0, 0.05, ..., 0.95, i.e.
# t = -ln(1 - q) * HALFLIFE / ln(2). Yields 20 boundaries -> 19 intervals.
BREAKS = (
    -np.log(1 - np.arange(0.0, 0.96, 0.05)) * HALFLIFE / np.log(2)
)


def load_dataset(input_file, batch_size = 1,
                mode = tf.estimator.ModeKeys.EVAL):
    """Loads a tabular file into a batched `tf.data.Dataset`.

    The table is read with pandas, column names and string values are
    normalised (stripped, lower-cased, spaces replaced by underscores),
    feature columns are cast to the dtypes the model inputs expect, and the
    survival label is built from the `rfs` (time) and `relapse` (event)
    columns with `nnet_survival.make_surv_array` and `BREAKS`.

    Args:
        input_file: str or path, a `.csv`, `.xlsx` or `.xls` file.
        batch_size: int, the number of examples per batch.
        mode: tf.estimator.ModeKeys; in TRAIN mode the dataset is shuffled
            and repeated indefinitely.
    Returns:
        `Dataset` yielding `(features_dict, label)` batches.
    Raises:
        FileExistsError: if `input_file` does not exist.
        ValueError: if the file suffix is not supported.
    """
    if not os.path.exists(input_file):
        # NOTE(review): FileNotFoundError would describe this condition
        # better; the type is kept so existing `except` clauses still match.
        raise FileExistsError(f'{input_file} cannot be found.')
    suffix = Path(input_file).suffix
    if suffix not in ['.csv', '.xlsx', '.xls']:
        raise ValueError(f'{suffix} type file not supported yet.')

    if suffix == '.csv':
        raw_data = pd.read_csv(input_file, header=0)
    else:
        # Both Excel flavours are read the same way. Previously `.xls`
        # passed the suffix check but was never read, leaving an empty frame
        # that failed later with a KeyError.
        raw_data = pd.read_excel(input_file, sheet_name=0, header=0, index_col=0)

    # clean column names
    raw_data.columns = raw_data.columns.str.strip().str.lower().str.replace(' ', '_')
    # clean string values
    df_obj = raw_data.select_dtypes(['object'])
    raw_data[df_obj.columns] = df_obj.apply(
        lambda x: x.str.strip().str.lower().str.replace(' ', '_'))

    # data type
    for c in NUMERICAL_COLUMNS:
        raw_data[c] = raw_data[c].astype('float32')
    for c in CATEGORICAL_COLUMNS:
        raw_data[c] = raw_data[c].astype('str')

    # calculate the training label for nnet survival
    time = raw_data.pop('rfs')
    event = raw_data.pop('relapse')
    label = nnet_survival.make_surv_array(time, event, BREAKS)

    # Drop bookkeeping columns. Missing ones are tolerated because the Excel
    # path uses the first column as the index, which may already have
    # consumed one of them.
    raw_data = raw_data.drop(columns=UNWANTED_COLS, errors='ignore')

    dataset = tf.data.Dataset.from_tensor_slices((dict(raw_data), label))

    # Shuffle and repeat for training
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(buffer_size=1000).repeat()

    # Group examples into batches; `batch_size` was previously ignored, so
    # the dataset yielded single un-batched examples.
    dataset = dataset.batch(batch_size)

    # Overlap input preparation with training; AUTOTUNE lets tf.data pick
    # the buffer size (a literal 1 is not AUTOTUNE).
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

    return dataset

def create_input_layers():
    """Creates dictionary of input layers for each feature.

    Every column in `NUMERICAL_COLUMNS` gets a float32 input and every column
    in `CATEGORICAL_COLUMNS` gets a string input; each input has shape `(1,)`
    and is named after its column.

    Returns:
        Dictionary of `tf.keras.layers.Input` layers keyed by column name,
        categorical columns first, then numerical columns.
    """
    deep_inputs = {
        colname: tf.keras.layers.Input(
            name=colname, shape=(1,), dtype="float32"
        )
        for colname in NUMERICAL_COLUMNS
    }

    wide_inputs = {
        colname: tf.keras.layers.Input(name=colname, shape=(1,), dtype="string")
        for colname in CATEGORICAL_COLUMNS
    }

    # The leftover debug print of the whole dict was removed; inspect the
    # return value (or model.summary()) when the inputs need checking.
    return {**wide_inputs, **deep_inputs}

def transform(inputs, nembeds):
    """Creates dictionaries of transformed inputs for the wide and deep sides.

    Numerical columns are passed through unchanged to the deep side and also
    bucketized and one-hot encoded for the wide side. The bucketized `age`
    and `weight` are hash-crossed and embedded for the deep side. Categorical
    columns are one-hot encoded against a fixed vocabulary for the wide side.

    Args:
        inputs: Dictionary of Keras input tensors keyed by column name; must
            contain every column in `NUMERICAL_COLUMNS` and
            `CATEGORICAL_COLUMNS`.
        nembeds: int, output dimension of the age x weight cross embedding.
    Returns:
        Tuple `(wide, deep)` of dictionaries of transformed Tensors.
    """

    deep = {}
    wide = {}

    # Bucket boundaries per numerical column.
    buckets = {
        "age": np.arange(30, 90, 12).tolist(),
        "weight": np.arange(50, 160, 11).tolist(),
    }
    bucketized = {}

    for numerical_column in NUMERICAL_COLUMNS:
        boundaries = buckets[numerical_column]
        deep[numerical_column] = inputs[numerical_column]
        bucketized[numerical_column] = tf.keras.layers.Discretization(
            boundaries
        )(inputs[numerical_column])
        # k boundaries produce k + 1 bucket ids, hence the + 1.
        wide[f"btk_{numerical_column}"] = tf.keras.layers.CategoryEncoding(
            num_tokens=len(boundaries) + 1, output_mode="one_hot"
        )(bucketized[numerical_column])

    # Number of hash bins for the age x weight cross; the embedding table
    # below must have exactly this many rows.
    # NOTE(review): this is smaller than the number of distinct bucket pairs
    # ((k_age + 1) * (k_weight + 1)), so some pairs share a hash bin --
    # confirm that is intended.
    n_cross_bins = len(buckets["age"]) * len(buckets["weight"])

    # Same layer as before, taken from the stable namespace like the other
    # preprocessing layers in this function.
    crossed = tf.keras.layers.HashedCrossing(
        num_bins=n_cross_bins
    )((bucketized["age"], bucketized["weight"]))

    deep["age_weight_embeds"] = tf.keras.layers.Flatten()(
        tf.keras.layers.Embedding(
            input_dim=n_cross_bins,
            output_dim=nembeds,
        )(crossed)
    )

    # Known values per categorical column; values outside the vocabulary
    # fall into StringLookup's out-of-vocabulary slot.
    # NOTE(review): entries must match the strings produced by the loader
    # exactly (including case) -- verify "centerid" against the data.
    vocab = {
        "centerid": ["True", "False", "Unknown"],
        "gender_m": ["0", "1"],
        "tobacco": ["0", "1", "-1"],
        "alcohol": ["0", "1", "-1"],
        "performance_status": ["0", "1", "2", "3", "4", "-1"],
        "hpv_status": ["0", "1", "-1"],
        "surgery": ["0", "1", "-1"],
        "chemotherapy": ["0", "1"],
    }

    for categorical_column in CATEGORICAL_COLUMNS:
        wide[categorical_column] = tf.keras.layers.StringLookup(
            vocabulary=vocab[categorical_column], output_mode="one_hot"
        )(inputs[categorical_column])

    return wide, deep

def get_model_outputs(wide_inputs, deep_inputs, dnn_hidden_units: str = "64 32"):
    """Stacks the wide-and-deep layers on top of the prepared inputs.

    Args:
        wide_inputs: Dense tensor fed to the wide side of the model.
        deep_inputs: Dense tensor fed to the deep side of the model.
        dnn_hidden_units: Whitespace-separated layer widths for the deep
            side, e.g. "64 32" builds two hidden layers of 64 and 32 units.
    Returns:
        Dense tensor with one sigmoid output per interval
        (`len(BREAKS) - 1` units).
    """
    hidden_widths = list(map(int, dnn_hidden_units.split()))

    # Deep side: a stack of ReLU layers named dnn_1, dnn_2, ...
    deep_out = deep_inputs
    for depth, width in enumerate(hidden_widths, start=1):
        dense = tf.keras.layers.Dense(
            units=width, activation="relu", name=f"dnn_{depth}"
        )
        deep_out = dense(deep_out)

    # Wide side: a single 10-unit ReLU layer (named "linear").
    wide_layer = tf.keras.layers.Dense(
        units=10, activation="relu", name="linear"
    )
    wide_out = wide_layer(wide_inputs)

    # Join both sides, then one more hidden layer followed by dropout.
    merged = tf.keras.layers.Concatenate(name="both")([deep_out, wide_out])
    hidden = tf.keras.layers.Dense(
        units=64, activation="relu", name="dense"
    )(merged)
    regularised = tf.keras.layers.Dropout(0.2)(hidden)

    # Output head: one sigmoid unit per interval between consecutive BREAKS,
    # with weights and biases initialised to zero.
    hazard_head = tf.keras.layers.Dense(
        units=len(BREAKS) - 1,
        activation="sigmoid",
        name="haz",
        kernel_initializer='zeros',
        bias_initializer='zeros',
    )
    return hazard_head(regularised)

def build_wide_deep_model(dnn_hidden_units="64 32", nembeds=3):
    """Builds and compiles the wide and deep model using Keras Functional API.

    Args:
        dnn_hidden_units: Whitespace-separated layer widths for the deep
            side, forwarded to `get_model_outputs`.
        nembeds: int, embedding dimension of the age x weight cross,
            forwarded to `transform`.
    Returns:
        Compiled `tf.keras.models.Model` object.
    """
    # Create input layers
    inputs = create_input_layers()

    # transform raw features for both wide and deep
    wide, deep = transform(inputs, nembeds)

    # Concatenate is documented to take a list of tensors, so materialise the
    # dict views explicitly instead of passing `dict.values()` objects.
    wide_inputs = tf.keras.layers.Concatenate()(list(wide.values()))
    deep_inputs = tf.keras.layers.Concatenate()(list(deep.values()))

    # Get output of model given inputs
    output = get_model_outputs(wide_inputs, deep_inputs, dnn_hidden_units)

    # Build model and compile it all together
    model = tf.keras.models.Model(inputs=inputs, outputs=output)

    n_intervals = len(BREAKS)-1

    # The same likelihood is used as the loss and reported as a metric.
    # NOTE(review): run_eagerly=True disables graph compilation of the train
    # step; presumably needed by the custom loss -- confirm, it is slower.
    model.compile(loss=nnet_survival.surv_likelihood(n_intervals),
                    metrics = nnet_survival.surv_likelihood(n_intervals),
                    optimizer=tf.keras.optimizers.Adam(),
                    run_eagerly=True)

    return model

def build_wide_deep_model_test(n_features=10, prop_hazards=False):
    """Builds a small Sequential baseline model for the survival loss.

    Unlike `build_wide_deep_model`, this is a plain feed-forward network
    (64 -> 32 -> dropout) on a flat numeric feature vector.

    Args:
        n_features: int, length of the input feature vector (default 10,
            the previously hard-coded value).
        prop_hazards: bool, if True use a single bias-free unit followed by
            `nnet_survival.PropHazards`; otherwise (default, the previous
            behaviour) one sigmoid output per interval.
    Returns:
        Compiled `tf.keras.models.Model` object.
    """
    n_intervals = len(BREAKS)-1
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(64, input_shape=(n_features,), activation='relu'))
    model.add(tf.keras.layers.Dense(32, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.25))

    if prop_hazards:
        # This branch was unreachable while the flag was a local constant.
        model.add(tf.keras.layers.Dense(1, use_bias=False, kernel_initializer='zeros'))
        model.add(nnet_survival.PropHazards(n_intervals))
    else:
        model.add(tf.keras.layers.Dense(n_intervals, kernel_initializer='zeros', bias_initializer='zeros'))
        model.add(tf.keras.layers.Activation('sigmoid'))

    model.compile(loss=nnet_survival.surv_likelihood(n_intervals), optimizer=tf.keras.optimizers.Adam())
    return model
     

In [None]:
model = build_wide_deep_model()
print("Here is our Wide-and-Deep architecture so far:\n")
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()


In [None]:

num_epochs = 1000
batch_size = 32
# Total number of training examples to consume over the whole run.
train_examples = 50000

trainds = load_dataset(
    input_file = "../data/task2/train_data.csv",
    batch_size = batch_size,
    mode = tf.estimator.ModeKeys.TRAIN)

evalds = load_dataset(
    input_file = "../data/task2/test_data.csv",
    batch_size = 1000,
    mode = tf.estimator.ModeKeys.EVAL)

# Despite its name, `num_batches` is the number of examples consumed if every
# epoch ran a single step. Dividing `train_examples` by it spreads the
# example budget evenly over `num_epochs` short epochs (1 step each with the
# values above).
num_batches = batch_size * num_epochs
# Never let the floor division produce 0 steps per epoch.
steps_per_epoch = max(1, train_examples // num_batches)

# Weights are written to the same path after every epoch.
checkpoint_path = os.path.join("output", "checkpoints")
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, verbose=1, save_weights_only=True)

history = model.fit(
    trainds,
    validation_data=evalds,
    epochs=num_epochs,
    steps_per_epoch=steps_per_epoch,
    verbose=2,  # 0=silent, 1=progress bar, 2=one line per epoch
    callbacks=[cp_callback])
