# Code Annotations/Analysis of Model + Training/Testing

In [35]:
import torch
import torch.nn as nn
# from torch.nn.modules.rnn import GRU, LSTM, RNN
import utils
import os
import sys
import numpy as np
import pandas as pd
from random import SystemRandom
from tqdm import tqdm

from args import args
import torch.optim as optim
from torchdiffeq import odeint_adjoint as odeint

from data_parse import parse_tdm1

## model.py

The ODEFunc class is a neural network responsible for uncovering the underlying differential equation for the dyanmical system.

In [36]:
class ODEFunc(nn.Module):

    #initializes neural network with desired dimensions
    def __init__(self, input_dim, hidden_dim):
        super(ODEFunc, self).__init__()

        #nn.Sequential is a method that allows the creation of layers in the neural network. The method itself acts as a "container" for the layers/modules inside the network.
        self.net = nn.Sequential(
            #layers in a neural network are nothing but a series of linear transformations on our input matrix (y = xAt + b). nn.Linear forms a "linear layer" which applies learnable weights (x) and biases (b) to our input data (At). The dimensionality of data often changes hence the allowance of input_dim and hidden-dim. 
            nn.Linear(input_dim, hidden_dim),
            #nn.SeLu is an activation function. Activation functions determine the weighted "importance" of features in the input data. This adds non-linearity to our model which allows it to be more complex than simple linear regression. SELU specifically allows for self-normalizing neural nets and tackles the vanishing gradient problem.
            nn.SELU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SELU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SELU(),
            nn.Linear(hidden_dim, input_dim)
        )

        # this for loop interates through every linear layer (set of layers is returned by calling self.net.modulesi) in the neural network we defined above and initializes weights/biases of input feature
        for m in self.net.modules():
            if isinstance(m, nn.Linear):
                #if a module (layer) in our network is a linear layer and not an activation function, this randomly initializes our input tensor of weights of each layer (m.weights) with values sampled from a Gaussian distribution with mean 0 and SD 0.001. This mitigates the vanishing/exploding gradient problem.
                nn.init.normal_(m.weight, mean=0, std=0.001)
                # if a module is a linear layer, then the input tensor (tensor containing biases of the layer) are all initialized to 0.5
                nn.init.constant_(m.bias, val=0.5)

    #feeds our input data through our network (self.net)
    def forward(self, t, x):
        # print(x)
        return self.net(x)

The encoder class defines an encoder network following variational autoencoder concept. This means that the inputs are mapped to a distribution rather than a deterministic outcome. 

In [37]:
#defines an encoder network following variational autoencoder concept; this means that the inputs are mapped to a distribution rather than a deterministic outcome. 
class Encoder(nn.Module):

    #initializes attributes of instances of encoder
    def __init__(self, input_dim, output_dim, hidden_dim, device=torch.device("cpu")):
        super(Encoder, self).__init__()

        self.output_dim = output_dim
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.device = device

    #Sets up a sequential layers for network. Encoders analyzes a single element of the input sequence, "retains/encodes" important info about that element, and propogates forward. 
        self.hiddens_to_output = nn.Sequential(
            nn.Linear(self.hidden_dim, self.hidden_dim),
            #ReLU activation function is similar to SELU except it takes on binary values and can result in dead neurons, causing them to not be used for predicing outputs from features.
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.output_dim),
        )
        #utils function that serves similar purpose as for loop in ODEFunc class. However, this function initializes biases as a 0 constant while weights are sampled from gaussian dist.
        utils.init_network_weights(self.hiddens_to_output, std=0.001)

        #nn.GRU applies a "pre-built" GRU to a given input; the GRU "scans" through the time series data in reverse and encodes the relevant data into a 12-element array. This array is fed into the ODEFunc network to define the mean and standard deviation of the latent state distributions which z_t0 is sampled from. 
        # self.rnn = nn.RNN(self.input_dim, self.hidden_dim, nonlinearity="relu").to(device)
        self.rnn = nn.GRU(self.input_dim, self.hidden_dim).to(device)

    #defines forward pass of encoder
    def forward(self, data):
        #permutes data to make necessary dimensional changes
        data = data.permute(1, 0, 2)
        #reverses data to allow GRU to scan through time series data in reverse fashion (why?)
        data = utils.reverse(data)
        #sends input data through GRU
        output_rnn, _ = self.rnn(data)
        #print(output_rnn)
        #takes in the data scanned in reverse (done by GRU) and feeds through 
        outputs = self.hiddens_to_output(output_rnn[-1])
        #print(outputs)
        
        return outputs
    

Classifier initializes and defines a decoder network that takes in the sequence of z_t's outputted by the ODEFunc network. It then generates the predictions from the output of the ODE solver and the first dosing observations.

In [38]:
#initializes and defines a decoder network that takes in the sequence of z_t's outputted by the ODEFunc network. It then generates the predictions from the output of the ODE solver and the first dosing observations.
class Classifier(nn.Module):

    #init method creates another set of sequential modules with 1 fully connected layer and 32 hidden units
    def __init__(self, latent_dim, output_dim):
        super(Classifier, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(latent_dim + 20, 32),
            nn.SELU(),
            nn.Linear(32, output_dim)
        )
        
        #follows same weight and bias initialization protocol as the Encoder class
        utils.init_network_weights(self.net, std=0.001)

    #defines forward pass where z is the sequence of z_t's generated by the output of ODEFunc and cmax_time refers to the dosing information.
    def forward(self, z, cmax_time):
        #repeates dosing information along given dimensions to match up with z
        cmax_time = cmax_time.repeat(z.size(0), 1, 1)
        #joins dosing info with sequence of z_t's and feeds in as input to decoder
        z = torch.cat([z, cmax_time], 2)
        return self.net(z)

## run_train.py

In [39]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tdm1_obj = parse_tdm1(device, phase="train")
input_dim = tdm1_obj["input_dim"]
hidden_dim = 128
latent_dim = 6

encoder = Encoder(input_dim=input_dim, output_dim=2 * latent_dim, hidden_dim=hidden_dim)
ode_func = ODEFunc(input_dim=latent_dim, hidden_dim=16)
classifier = Classifier(latent_dim=latent_dim, output_dim=1)


In [40]:
tdm1_obj

{'train_dataloader': <generator object inf_generator at 0x7f929b498f20>,
 'val_dataloader': <generator object inf_generator at 0x7f929b4b43c0>,
 'n_train_batches': 656,
 'n_val_batches': 56,
 'input_dim': 5}

In [41]:
encoder

Encoder(
  (hiddens_to_output): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=12, bias=True)
  )
  (rnn): GRU(5, 128)
)

In [42]:
ode_func

ODEFunc(
  (net): Sequential(
    (0): Linear(in_features=6, out_features=16, bias=True)
    (1): SELU()
    (2): Linear(in_features=16, out_features=16, bias=True)
    (3): SELU()
    (4): Linear(in_features=16, out_features=16, bias=True)
    (5): SELU()
    (6): Linear(in_features=16, out_features=6, bias=True)
  )
)

In [43]:
classifier

Classifier(
  (net): Sequential(
    (0): Linear(in_features=26, out_features=32, bias=True)
    (1): SELU()
    (2): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [44]:
# batch size defines how many training samples must be done before updating weights/biases of a node during backprop
#epoch defines how many total backward passes we will do
batches_per_epoch = tdm1_obj["n_train_batches"]
#sets L2 norm-squared (MSE) between predicted and actual as loss criterion
criterion = nn.MSELoss().to(device=device)
params = (list(encoder.parameters()) + 
          list(ode_func.parameters()) + 
          list(classifier.parameters()))
#utilizing Adam optimization algorithm rather than SGD to overcome saddlepoints in data. It incorporates the idea of momentum by nudging weights/biases by the average running gradient rather than the gradient itself.
optimizer = optim.Adam(params, lr=args.lr, weight_decay=args.l2)
best_rmse = 0x7fffffff
best_epochs = 0

In [45]:
print(args)

Namespace(data='data.csv', fold=None, model=None, save=None, continue_train=False, random_seed=1000, layer=2, lr=5e-05, l2=0.1, hidden=None, tol=0.0001, epochs=30)


In [47]:
for epoch in range(1, 30):

    for _ in tqdm(range(batches_per_epoch), ascii=True):
        #sets gradients of all parameters to zero. This prevents the incorrect accumulation of gradients that occurs if you call loss.backwards more than once wihtout zeroing out the gradients.
        optimizer.zero_grad()

        #extracts training features and dosing
        ptnms, times, features, labels, cmax_time = tdm1_obj["train_dataloader"].__next__()
        dosing = torch.zeros([features.size(0), features.size(1), latent_dim])
        dosing[:, :, 0] = features[:, :, -2]
        dosing = dosing.permute(1, 0, 2)

        #VAE concept used here. We are taking the output of the encoder and sampling z_0 from a distribution of the latent space variables gathered from estimating the mean and variance from the 12 elements outputted by the encoder. 
        encoder_out = encoder(features)
        qz0_mean, qz0_var = encoder_out[:, :latent_dim], encoder_out[:, latent_dim:]
        z0 = utils.sample_standard_gaussian(qz0_mean, qz0_var)
        
        solves = z0.unsqueeze(0).clone()
        try:
            #this is where the idea of neural-ODE's are used. dosing information and time interval are incorporated into the event from the previous time step. The time interval from the previous time interval and z_i-1 are sent into the ODE Solver function.
            for idx, (time0, time1) in enumerate(zip(times[:-1], times[1:])):
                z0 += dosing[idx]
                time_interval = torch.Tensor([time0 - time0, time1 - time0])
                #ODE Solver function 
                sol = odeint(ode_func, z0, time_interval, rtol=1e-4, atol=1e-4)
                z0 = sol[-1].clone()
                #running sequence of all z_i's at each time step which will eventially be used to predict output in the decoder
                solves = torch.cat([solves, sol[-1:, :]], 0)
        except AssertionError:
            print(times)
            print(time0, time1, time_interval, ptnms)
            continue

        #prediction generation from sequence of z_i's
        preds = classifier(solves, cmax_time)

        # computes MSE on preds vs observations 
        loss = utils.compute_loss_on_train(criterion, labels, preds)
        try: 
            #automatically computes gradients of loss tensor
            loss.backward()
        except RuntimeError:
            print(ptnms)
            print(times)
            continue
        #performs a single parameter update (single optimization step)
        optimizer.step()
    
    idx_not_nan = ~(torch.isnan(labels) | (labels == -1))
    preds = preds.permute(1, 0, 2)[idx_not_nan]
    labels = labels[idx_not_nan]
    print(preds)
    print(labels)

    #torch.no_grad() is used to prevent the automatic calculation of gradients to clearly see unbiased training/validation error 
    with torch.no_grad():
        
        #training error
        train_res = utils.compute_loss_on_test(encoder, ode_func, classifier, args,
            tdm1_obj["train_dataloader"], tdm1_obj["n_train_batches"], 
            device, phase="train")

        #validation error
        validation_res = utils.compute_loss_on_test(encoder, ode_func, classifier, args,
            tdm1_obj["val_dataloader"], tdm1_obj["n_val_batches"], 
            device, phase="validate")
        
        train_loss = train_res["loss"] 
        validation_loss = validation_res["loss"]

        #if the validation loss on the current interation is better than the best running RMSE, then we save the weights and biases of the encoder, ode, classifier, and arguments
        if validation_loss < best_rmse:
            torch.save({'encoder': encoder.state_dict(),
                        'ode': ode_func.state_dict(),
                        'classifier': classifier.state_dict(),
                        'args': args}, ckpt_path)
            best_rmse = validation_loss
            best_epochs = epoch

print("Best rmse: " + string(best_rmse))
        #message = """
        #Epoch {:04d} | Training loss {:.6f} | Training R2 {:.6f} | Validation loss {:.6f} | Validation R2 {:.6f}
        #Best loss {:.6f} | Best epoch {:04d}
        #""".format(epoch, train_loss, train_res["r2"], validation_loss, validation_res["r2"], best_rmse, best_epochs)
        #logger.info(message)

  2%|7                                         | 11/656 [00:02<02:19,  4.62it/s]


KeyboardInterrupt: 

## run_predict.py

In [48]:
########################################################################
#parses input data into feature columns, etc.
tdm1_obj = parse_tdm1(device, phase="test")
input_dim = tdm1_obj["input_dim"]
#represents hidden units of GRU in encoder
hidden_dim = 128 
latent_dim = 6

#instantiates encoder. Output dimension is 12 because 6 elements are used to determine the value of the mean for the distribution of the latent space while the other 6 are used to estimate the variance.
encoder = Encoder(input_dim=input_dim, output_dim=2 * latent_dim, hidden_dim=hidden_dim)
#instantiates governing ODEFunc
ode_func = ODEFunc(input_dim=latent_dim, hidden_dim=16)
#instantiates decoder
classifier = Classifier(latent_dim=latent_dim, output_dim=1)

#loads the model's parameter dictionary
utils.load_model(ckpt_path, encoder, ode_func, classifier, device)

########################################################################
## Predict & Evaluate
#disables gradient calculation, allowing for less memory consumption and faster compute. It is generally used to perform validation/testing because gradients are not required to be computed when testing model performance.
with torch.no_grad():
    #uses compute loss on test
    #the function compute_loss_ is where the ODE solver functions integrate the dosing info and time interval. This is also where the concept of VAE's are used where z_0 is sampled from the latent distribution (which is derived from the mean and variance calculated by the 12 element input array). see page 6 of paper for more specific info. 
    test_res = utils.compute_loss_on_test(encoder, ode_func, classifier, args,
        tdm1_obj["test_dataloader"], tdm1_obj["n_test_batches"], 
        device, phase="test")

eval_results = pd.DataFrame(test_res).drop(columns="loss")
eval_results.to_csv(eval_path, index=False)

with torch.no_grad():
    #uses compute loss on interpolated data. Interpolated data contains estimated "intermediate" values between data points to smooth out the data
    test_res = utils.compute_loss_on_interp(encoder, ode_func, classifier, args,
        tdm1_obj["interp"], tdm1_obj["test_dataloader"], tdm1_obj["n_test_batches"], 
        device, phase="test")

#puts results in a data frame and migrates to csv file
eval_results = pd.DataFrame(test_res).drop(columns="loss")
eval_results.to_csv(eval_path + ".interp", index=False)

with torch.no_grad():
    #uses compute loss on interpolated data without dosing info
    test_res = utils.compute_loss_on_interp(encoder, ode_func, classifier, args,
        tdm1_obj["nodosing"], tdm1_obj["test_dataloader"], tdm1_obj["n_test_batches"], 
        device, phase="test")

eval_results = pd.DataFrame(test_res).drop(columns="loss")
eval_results

NameError: name 'ckpt_path' is not defined