## Environment Setup

In [2]:
from argparse import Namespace
from lifelines.utils import concordance_index
import numpy as np
import os
import pandas as pd
# from pysurvival.models.simulations import SimulationModel
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

# from utils import *
# from models import *

### models.py
Code from Hassan

In [3]:
class BasicModel(nn.Module):
    ''' Small fully connected network that maps covariates to one output value'''
    def __init__(self, activation, covariates):
        ''' Initialize BasicModel class

        Args:
            activation: string, name of an activation class in torch.nn (e.g. 'SELU')
            covariates: int, number of covariates, needed for size of first layer

        Raises:
            AttributeError: if torch.nn has no attribute called `activation`
        '''
        super(BasicModel, self).__init__()
        # Network hyper-parameters are fixed here rather than read from a config
        # Fraction of input units to drop in dropout layer
        self.drop = 0.375  # previously 0.401
        # Flag to in/exclude normalization layers
        self.norm = True
        # Sizes of the fully connected layers: input -> 4 -> 1
        # (an earlier layout was [10, 17, 17, 17, 1])
        self.dims = [covariates, 4, 1]
        # Activation type to use
        self.activation = activation
        # Build network using class function (below)
        self.model = self._build_network()

    def _build_network(self):
        ''' Assemble the sequential network described by self.dims

        Every fully connected layer is followed by an optional BatchNorm1d and
        the activation; every layer except the first is preceded by dropout.

        Returns:
            torch.nn.Sequential holding the layers in order
        '''
        # Look the activation class up by attribute name instead of eval(), so
        # the string can only select an attribute of torch.nn and can never be
        # executed as an arbitrary expression.
        activation_cls = getattr(nn, self.activation)

        layers = []
        for i, (n_in, n_out) in enumerate(zip(self.dims, self.dims[1:])):
            if i and self.drop is not None:
                # Add dropout layer (skipped in front of the first layer)
                layers.append(nn.Dropout(self.drop))

            # Add fully connected layer
            layers.append(nn.Linear(n_in, n_out))

            if self.norm:
                # Add batchnormalize layer
                layers.append(nn.BatchNorm1d(n_out))

            # Add activation layer (also applied after the final layer)
            layers.append(activation_cls())

        # Build sequential network from list of layers created in for loop
        return nn.Sequential(*layers)

    def forward(self, X):
        ''' Forward propagation through network

        Args:
            X: data to pass through network

        Returns:
            Output of model (risk prediction)
        '''
        return self.model(X)



    
class NegativeLogLikelihood(nn.Module):
    '''Negative log likelihood loss function from Katzman et al. (2018) DeepSurv model (equation 4)'''
    def __init__(self, gpu):
        ''' Set up the loss

        Args:
            gpu: device on which the pairwise mask tensor is allocated
                 (forwarded as the `device` argument of torch.ones)
        '''
        super().__init__()
        self.device = gpu
        # Norm penalty on the model weights; the decay rate is fixed at 0 here
        self.reg = Regularization(order=2, weight_decay=0)

    def forward(self, risk_pred, y, e, model):
        ''' Compute the loss for one batch

        Args:
            risk_pred: torch.Tensor, predicted values, one row per sample
            y: torch.Tensor, event/censoring times (assumed shape (N, 1) -- TODO confirm)
            e: torch.Tensor, event indicators, same layout as y
            model: torch.nn Module whose weights are passed to the regularizer

        Returns:
            torch.Tensor, negative log likelihood plus the regularization term
        '''
        n_samples = y.shape[0]
        # Pairwise mask: entry [i, j] is zeroed wherever y[j] - y[i] > 0
        at_risk = torch.ones(n_samples, n_samples, device=self.device)
        at_risk[(y.T - y) > 0] = 0

        # Per column: masked sum of exp(risk) divided by the number of kept rows
        masked_hazard = torch.exp(risk_pred) * at_risk
        mean_hazard = torch.sum(masked_hazard, dim=0) / torch.sum(at_risk, dim=0)
        log_hazard = torch.log(mean_hazard).reshape(-1, 1)

        # Only rows with e != 0 contribute; average over the number of events
        event_terms = torch.sum((risk_pred - log_hazard) * e)
        neg_log_loss = -event_terms / torch.sum(e)

        return neg_log_loss + self.reg(model)


class NegativeLogLikelihoodStrat(nn.Module):
    '''Cox negative log likelihood returned together with a stratification penalty'''
    def __init__(self, gpu):
        ''' Set up the loss

        Args:
            gpu: device used for the pairwise mask and the penalty target
        '''
        super().__init__()
        self.device = gpu

    def forward(self, risk_pred, y, e, low, high):
        ''' Compute both loss terms for one batch

        Args:
            risk_pred: torch.Tensor, predicted values, one row per sample
            y: torch.Tensor, event/censoring times (assumed shape (N, 1) -- TODO confirm)
            e: torch.Tensor, event indicators, same layout as y
            low: torch.Tensor, values whose mean is compared against `high`
            high: torch.Tensor, values whose mean is compared against `low`

        Returns:
            (neg_log_loss, strat_loss): tuple of torch.Tensor
        '''
        n_samples = y.shape[0]
        # Pairwise mask: entry [i, j] is zeroed wherever y[j] - y[i] > 0
        at_risk = torch.ones(n_samples, n_samples, device=self.device)
        at_risk[(y.T - y) > 0] = 0

        # Per column: masked sum of exp(risk) divided by the number of kept rows
        masked_hazard = torch.exp(risk_pred) * at_risk
        mean_hazard = torch.sum(masked_hazard, dim=0) / torch.sum(at_risk, dim=0)
        log_hazard = torch.log(mean_hazard).reshape(-1, 1)

        event_terms = torch.sum((risk_pred - log_hazard) * e)
        neg_log_loss = -event_terms / torch.sum(e)

        # Penalty shrinks as the gap between the two group means widens
        group_gap = torch.abs((high.mean() - low.mean()))
        closeness = 1 / (1 + group_gap)
        zero_target = torch.zeros(1).squeeze().to(self.device)
        strat_loss = F.smooth_l1_loss(closeness, zero_target, reduction='none').to(self.device)

        return neg_log_loss, strat_loss


class Regularization(object):
    '''Weighted sum of p-norms over every parameter whose name contains "weight"'''
    def __init__(self, order, weight_decay):
        ''' Store the penalty settings

        Args:
            order: int, norm order number
            weight_decay: float, weight decay rate
        '''
        super().__init__()
        self.weight_decay = weight_decay
        self.order = order

    def __call__(self, model):
        ''' Evaluate the penalty for a model

        Args:
            model: torch.nn Module object

        Returns:
            reg_loss: weight_decay times the summed norms (a torch.Tensor when
            at least one matching parameter exists, otherwise a plain number)
        '''
        # Bias terms (and anything else without 'weight' in its name) are skipped
        weight_norms = (
            torch.norm(param, p=self.order)
            for param_name, param in model.named_parameters()
            if 'weight' in param_name
        )
        return self.weight_decay * sum(weight_norms)


### utils.py
Code from Hassan

In [4]:
class SurvivalDataset(Dataset):
    '''Torch Dataset yielding (covariates, time, event) triples from a dataframe'''
    def __init__(self, dataset, args):
        '''Initialize SurvivalDataset class

        Args:
            dataset: pandas.DataFrame, the first args.covariates columns hold the
                covariates; the time of event and event indicator are read from
                the columns named 'time' and 'event'
            args: Namespace, must provide `covariates` (int, number of covariate
                columns) and `normalize` (bool, whether to min-max scale them)
        '''
        # Get covariates out of dataframe (args.covariates is num of columns containing covariates)
        self.X = dataset.iloc[:, 0:args.covariates].values
        # Get time and event indicator columns out of dataframe
        self.data = list(zip(dataset['time'], dataset['event']))
        self.len = len(dataset)
        print('=> load {} samples'.format(self.len))
        # Normalize covariate data with class function
        if args.normalize:
            self._normalize()

    def _normalize(self):
        '''Normalize X data (covariates) (transform values to range between 0 and 1)

        A column whose values are all equal has zero range; it is mapped to 0
        instead of producing NaN from a 0/0 division. An empty array is left
        untouched because min/max are undefined for it.
        '''
        if self.X.size == 0:
            return
        col_min = self.X.min(axis=0)
        col_range = self.X.max(axis=0) - col_min
        # Substitute 1 for zero ranges so constant columns divide cleanly
        col_range = np.where(col_range == 0, 1, col_range)
        self.X = (self.X - col_min) / col_range

    def __getitem__(self, item):
        '''Getter for single data piece

        Args:
            item: int, index of data to retrieve

        Returns:
            X_tensor: torch.Tensor, covariate values for data item
            y_tensor: torch.Tensor, time of event value for data item
            e_tensor: int torch.Tensor, event indicator value for data item
        '''
        y, e = self.data[item]
        X_tensor = torch.from_numpy(self.X[item])
        e_tensor = torch.Tensor([e]).int()
        y_tensor = torch.Tensor([y])
        return X_tensor, y_tensor, e_tensor

    def __len__(self):
        '''Number of samples in the dataset'''
        return self.len

    

def save_error(train_ci, val_ci, coxLoss, stratLoss, variance, epoch, slname):
    '''Save training and validation statistics to csv file

    The file is created (or truncated) with a header row when epoch is 0 and
    appended to for every other epoch.

        Args:
            train_ci: float, training concordance index for this epoch
            val_ci: float, validation concordance index for this epoch
            coxLoss: float, written to the coxLoss column with 4 decimals
            stratLoss: float, written to the stratLoss column with 4 decimals
            variance: value written to the variance column using its default
                string form
            epoch: int, epoch these stats are from
            slname: string, filename
    '''
    # Format first so a bad value raises before the file is opened or truncated
    row = '{},{:.4f},{:.4f},{:.4f},{:.4f},{}\n'.format(epoch, coxLoss, stratLoss, train_ci, val_ci, variance)
    first_epoch = epoch == 0
    # The context manager closes the handle even if a write fails
    with open(slname, 'w' if first_epoch else 'a') as f:
        if first_epoch:
            f.write('epoch,coxLoss,stratLoss,trainCI,valCI,variance\n')
        f.write(row)


def c_index(risk_pred, y, e):
    '''Concordance index of predictions against observed times

    Args:
        risk_pred: np.ndarray or torch.Tensor, model prediction
        y: np.ndarray or torch.Tensor, times of event e
        e: np.ndarray or torch.Tensor, event indicator

    Returns:
        c_index: float, concordance index
    '''
    def _as_ndarray(values):
        # Arrays pass through untouched; anything else is treated as a tensor
        # that is detached, moved to the CPU and converted
        if isinstance(values, np.ndarray):
            return values
        return values.detach().cpu().numpy()

    # lifelines takes (event times, predicted scores, event observed)
    return concordance_index(_as_ndarray(y), _as_ndarray(risk_pred), _as_ndarray(e))


class AverageMeter(object):
    """Tracks the latest value plus a count-weighted running average"""
    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, average, weighted sum and count"""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the average

        Args:
            val: latest value
            n: weight of this observation (defaults to 1)
        """
        self.val = val
        self.sum += val * n
        self.count += n
        running_total, seen = self.sum, self.count
        self.avg = running_total / seen


def adjust_learning_rate(optimizer, epoch, lr, lr_decay_rate):
    '''Apply inverse-time decay, lr / (1 + epoch * lr_decay_rate), to every parameter group

    Args:
        optimizer: torch.optim object, its param_groups are modified in place
        epoch: int, epoch number
        lr: float, initial learning rate
        lr_decay_rate: float, decay rate to apply to learning rate

    Returns:
        lr: float, learning rate now stored in the first parameter group
    '''
    for group in optimizer.param_groups:
        group.update(lr=lr / (1 + epoch * lr_decay_rate))
    first_group = optimizer.param_groups[0]
    return first_group['lr']



### train.py
Code from Hassan

In [10]:
# Arguments for network
# NOTE: `dropout`, `weight_decay` and `decay_interval` are not read by any code
# visible in this notebook: BasicModel hard-codes its own dropout rate and
# NegativeLogLikelihood builds its regularizer with weight_decay=0.
args = Namespace(activation='SELU',    # activation for fully connected layers
                 batch_size=4000,      # this is currently ignored by the DataLoader
                 covariates=18,        # input size, will be size of first layer
                 decay_interval=400,   # how many epochs pass before weight decay is applied
                 development=True,     # if testing, sends output to separate directory
                 dropout=0.3,          # Dropout rate
                 epochs=500,           # Training epoch count
                 lr=0.001,             # Learning rate
                 normalize=False,      # Whether to normalize covariate data
                 weight_decay=0.0001,  # Decay rate for weights
                 )

best_acc = 0
# Where to allocate all the Tensors (can be 'cpu' or 'cuda')
gpu = torch.device("cpu")

# Setting up output path from model training
# Mac directory
# root_output = '/Users/katyscott/Documents/ICC/Code/cox_experiments'

# Linux directory
root_output = '/media/katy/Data/ICC/Code/cox_experiments'

# Development runs share one 'test' folder; real runs get a folder named
# after the activation, learning rate and batch size
if args.development:
    save_path = 'test'
else:
    save_path = '{}_{}lr_{}b_'.format(args.activation, args.lr, args.batch_size)

out_dir = os.path.join(root_output, save_path)
# exist_ok makes this safe to re-run and avoids the check-then-create race
os.makedirs(out_dir, exist_ok=True)


#### Simulated Data creation

*Note: this cell is currently broken on Linux because pysurvival cannot be installed there. It works on macOS.*

In [None]:
# generate random survival times with exp. distribution
# sim = SimulationModel(survival_distribution='exponential',
#                       risk_type = 'Linear',
#                       censored_parameter = 6,
#                       alpha = 1,
#                       beta = 5)

# train_samples = sim.generate_data(num_samples = 4000,
#                                   num_features = args.covariates,
#                                   feature_weights = [1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

# val_samples = sim.generate_data(num_samples = 500,
#                                 num_features = args.covariates,
#                                 feature_weights = [1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

# train_dataset = SurvivalDataset(train_samples, args)
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_dataset.__len__())

# val_dataset = SurvivalDataset(val_samples, args)
# val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=val_dataset.__len__())


In [11]:
# Load and setup cholangio genetic data
genomic_file = "../Data/MSK_Genomic_Data.csv"

gene_features = pd.read_csv(genomic_file)
# Patient IDs have a space at the end of the name
gene_features['ScoutID'] = gene_features['ScoutID'].str.strip()
# Fixing columns with illegal characters in name
gene_features.rename(columns={'CDKN2A.DEL': 'CDKN2A_DEL', 'TGF-Beta_Pathway': 'TGF_Beta_Pathway'}, inplace=True)

# Get number of covariates = number of genetic columns (every column except ScoutID)
args.covariates = gene_features.shape[1] - 1

labels_file = "../Data/RFS_Scout.xlsx"

rfs_labels = pd.read_excel(labels_file)
# Keep only the ID and label columns and give the labels the names that
# SurvivalDataset reads ('time' and 'event'). rename() returns a new frame, so
# nothing is modified in place on the column selection.
rfs_labels = rfs_labels[['ScoutID', 'RFS', 'RFS_Code']].rename(columns={'RFS': 'time', 'RFS_Code': 'event'})

# Getting intersection of patients with gene features and RFS labels all in one dataframe
genes_and_labels = pd.merge(gene_features, rfs_labels, how='inner', on='ScoutID')

# Removing ScoutID so setup is proper for Survival Dataset generation
genes_and_labels.drop(columns=['ScoutID'], inplace=True)

# Fixed seed keeps the 80/20 train/validation split reproducible
train_genes, val_genes = train_test_split(genes_and_labels, test_size=0.2, random_state=42, shuffle=True)

In [12]:
# Each loader uses a batch size equal to its dataset length, so one batch
# covers the whole split (args.batch_size is not used here)
train_dataset = SurvivalDataset(train_genes, args)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=len(train_dataset))

val_dataset = SurvivalDataset(val_genes, args)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=len(val_dataset))

=> load 88 samples
=> load 23 samples


In [13]:
# Network: activation name and input width come from args, then move to the target device
model = BasicModel(activation=args.activation, covariates=args.covariates)
model = model.to(gpu)

# Loss: DeepSurv negative log likelihood; its mask tensor is allocated on `gpu`
criterion = NegativeLogLikelihood(gpu=gpu)

# Optimizer: Adam over all model parameters with the configured learning rate
optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr)

In [None]:
for epoch in range(0, args.epochs):
    # Fresh meters every epoch so the saved averages are per-epoch values
    coxLossMeter = AverageMeter()
    stratLossMeter = AverageMeter()
    ciMeter = AverageMeter()
    varMeter = AverageMeter()

    # Training
    model.train()
    for X, y, e in train_loader:
        # Get risk prediction from network
        risk_pred = model(X.float().to(gpu))

        # Calculate neg. log likelihood (the loss is given the negated network output)
        cox_loss = criterion(-risk_pred, y.to(gpu), e.to(gpu), model)
        # Stratification loss is not used in this loop; logged as a constant 0
        strat_loss = torch.Tensor([0])
        train_loss = cox_loss

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        coxLossMeter.update(cox_loss.item(), y.size(0))
        stratLossMeter.update(strat_loss.item(), y.size(0))
        # .item() stores a plain float: the meter no longer keeps the autograd
        # graph alive and the CSV receives a number instead of 'tensor(...)'
        varMeter.update(risk_pred.var().item(), y.size(0))

        # Calculate c index (float() accepts both Python and NumPy scalars)
        train_c = c_index(risk_pred, y, e)
        ciMeter.update(float(train_c), y.size(0))

    # Validation
    model.eval()
    ciValMeter = AverageMeter()
    # No gradients are needed for evaluation
    with torch.no_grad():
        for X, y, e in val_loader:
            risk_pred = model(X.float().to(gpu))
            val_c = c_index(risk_pred, y, e)
            ciValMeter.update(float(val_c), y.size(0))

    # The printed values come from the last batch of each loader
    print('Epoch: {} \t Train Loss: {:.4f} \t Train CI: {:.3f} \t Val CI: {:.3f}'.format(epoch, train_loss.item(), train_c, val_c))
    save_error(ciMeter.avg, ciValMeter.avg, coxLossMeter.avg, stratLossMeter.avg, varMeter.avg, epoch, os.path.join(out_dir, 'convergence.csv'))