In [1]:
import os
# Single seed shared by every random number generator seeded in this notebook.
seed = 40
# Export the seed as PYTHONHASHSEED.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting it
# here presumably only affects child processes, not this kernel - confirm.
os.environ['PYTHONHASHSEED']=str(seed)

import random
import seaborn as sns
import torch
import torch.nn as nn 
import torch.nn.functional as F
import matplotlib.pyplot as plt 
import matplotlib as mpl
from matplotlib.pyplot import *
style.use('ggplot')
import pandas as pd
import numpy as np
import time
import torch.profiler
import torch.autograd.profiler as profiler
from scipy import stats as st
import sklearn.preprocessing as preprocess
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import PolynomialFeatures
import torch.optim as optim
import optuna



# Seed the three random number generators used by this notebook with the same value:
# Python's `random` module, NumPy's global generator and PyTorch.
# `torch.manual_seed` is the last expression of the cell, so the value it returns is
# what the notebook displays as the cell output.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fc52c69d490>

### Context Integrated RNN - CiRNN and CxAttention

In [2]:
# Predefine 'm' depending on the number of context features
# For the attention computation, see: "Effective Approaches to Attention-based Neural Machine Translation", Luong et al., 2015
# Attention methods: DOT, GENERAL, CONCAT

# Select the compute device: CUDA when PyTorch can see a GPU, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

class ContextGRU(torch.nn.Module):
    """
    Context-integrated GRU cell network with an attention layer (seq-to-vector).

    At every time step the primary input x_t and the hidden state h_t are combined
    with a vector of basis functions G(z_t) of the context input z_t through a
    Kronecker product.  G is the degree-2 polynomial expansion of z_t without the
    bias term: [z0, z1, ..., z0*z0, z0*z1, ...].

    Arguments:
        input_dim   : number of primary input features (n_x)
        hidden_dim  : size of the hidden state (n_h)
        output_dim  : number of outputs (n_y)
        context_dim : number of context features (n_z)
        fc_dim      : hidden units of the fully connected layer after attention (n_l)

    All tensors created inside the module are placed on the device of the module's
    parameters / inputs, so the model runs on CPU as well as on GPU.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, context_dim, fc_dim):
        super(ContextGRU, self).__init__()

        self.n_x = input_dim
        self.n_h = hidden_dim
        self.n_y = output_dim
        self.n_z = context_dim
        # dimension of the basis function vector: n_z linear terms plus
        # n_z*(n_z+1)/2 quadratic terms (9 for 3 context features)
        self.m = self.n_z + self.n_z * (self.n_z + 1) // 2
        self.n_l = fc_dim  # hidden dimension of fully connected layer (attention hidden units)

        # NOTE: layer attribute names and creation order are kept as they were so that
        # saved state_dicts still load and seeded initialisation is reproduced.

        # reset gate components
        self.linear_reset_w1 = nn.Linear(self.n_x * self.m, self.n_h, bias=True)
        self.linear_reset_r1 = nn.Linear(self.n_h, self.n_h, bias=True)

        # update gate components (despite the "reset" in the attribute names,
        # these two layers are the ones used by update_gate)
        self.linear_reset_w2 = nn.Linear(self.n_x * self.m, self.n_h, bias=True)
        self.linear_reset_r2 = nn.Linear(self.n_h, self.n_h, bias=True)
        self.activation_1 = nn.Sigmoid()

        # candidate hidden state components
        self.linear_gate_w3 = nn.Linear(self.n_x * self.m, self.n_h, bias=True)
        self.linear_gate_r3 = nn.Linear(self.n_h, self.n_h, bias=True)
        self.activation_2 = nn.Sigmoid()

        self.activation_3 = nn.Tanh()

        # Attention layer: scores are computed from kron(h_t, G(z_t))
        self.attn = nn.Linear(self.n_h * self.m, self.n_h)
        self.concat_linear = nn.Linear(self.n_h * 2, self.n_h)

        # Fully connected layer with attention units as input
        self.fc_layer = nn.Sequential(
            nn.Linear(self.n_h, self.n_l),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.2)
        )

        # output from FC layer
        self.linear_output = nn.Linear(self.n_l, self.n_y, bias=True)

    @staticmethod
    def _batched_kron(v, G):
        """
        Row-wise Kronecker product for a whole mini-batch.

        Input Arguments:
            v : (n, d) one vector per sample
            G : (m, n) basis function vector of every sample, one column per sample

        Returns:
            (n, d*m) tensor whose row i equals torch.kron(v[i, :], G[:, i])
        """
        basis = G.t().to(dtype=v.dtype)  # (n, m)
        # (n, d, 1) * (n, 1, m) -> (n, d, m); flattening gives index j*m + k,
        # which is exactly the element order of the Kronecker product
        return (v.unsqueeze(2) * basis.unsqueeze(1)).reshape(v.shape[0], -1)

    def reset_gate(self, xg, h):  # xg is the kronecker product of x and basis function G(z)
        """Reset gate r = sigmoid(W1 xg + R1 h)."""
        x_1 = self.linear_reset_w1(xg)
        h_1 = self.linear_reset_r1(h)
        r = self.activation_1(x_1 + h_1)
        return r

    def update_gate(self, xg, h):
        """Update gate s = sigmoid(W2 xg + R2 h)."""
        x_2 = self.linear_reset_w2(xg)
        h_2 = self.linear_reset_r2(h)
        s = self.activation_2(h_2 + x_2)
        return s

    def update_component(self, xg, h, r):
        """Candidate hidden state h_tilda = tanh(W3 xg + r * (R3 h))."""
        x_3 = self.linear_gate_w3(xg)
        h_3 = r * self.linear_gate_r3(h)
        h_tilda = self.activation_3(x_3 + h_3)
        return h_tilda

    def attention(self, h, hg):
        """
        Attention over all time steps, queried with the last hidden state.

        Input Arguments:
            h  : hidden states, (batch_size, seq_len, hidden_dim)
            hg : kronecker products of hidden states and basis functions,
                 (batch_size, seq_len, hidden_dim * m)

        Returns:
            attn_hidden  : (batch_size, hidden_dim)
            attn_weights : (batch_size, seq_len), each row sums to 1
        """
        h_t = h[:, -1, :]  # hidden state at last time step: (batch_size, hidden_dim)
        attn_score = self.attn(hg)  # (batch_size, seq_len, hidden_dim)
        attn_scores = torch.bmm(attn_score, h_t.unsqueeze(2))  # (batch_size, seq_len, 1)
        attn_weights = F.softmax(attn_scores.squeeze(2), dim=1)
        # weighted sum of the hidden states: (batch_size, hidden_dim)
        cxt = torch.bmm(h.transpose(1, 2), attn_weights.unsqueeze(2)).squeeze(2)
        attn_hidden = torch.tanh(self.concat_linear(torch.cat((cxt, h_t), dim=1)))
        return attn_hidden, attn_weights

    def compute_fc_output(self, a):
        """Push the attention output through the fully connected layer."""
        fc_output = self.fc_layer(a)
        return fc_output

    def compute_output(self, o):
        """Map the fully connected layer output to the prediction, (n, n_y)."""
        y_pred = self.linear_output(o)
        return y_pred

    def cell_forward(self, x, h, G):
        """
        Implements a single forward step of the Context GRU-cell

        Input Arguments:
            x (mini-batch): input x at time step t , (n,n_x) : (batch_size, input_dim)
            h : hidden state at time step t-1, (n,n_h) : (batch_size, hidden_dim)
            G : vector of basis functions (m,n)

        Returns:
            h_new: hidden state at time step t, (n,n_h)
            hg   : kronecker product of h_new and G, (n, n_h*m)
        """
        # kronecker product of x and G(zt), computed for the whole batch at once
        xg = self._batched_kron(x, G)

        # Equation 1. reset gate vector
        r = self.reset_gate(xg, h)

        # Equation 2: the update gate
        s = self.update_gate(xg, h)

        # Equation 3: the candidate hidden state
        h_tilda = self.update_component(xg, h, r)

        # Equation 4: the new hidden state
        h_new = (1 - s) * h_tilda + s * h

        # kronecker product of the new hidden state and G (input of the attention layer)
        hg = self._batched_kron(h_new, G)

        return h_new, hg

    def forward(self, x, z):
        """
        Implement the forward propagation of the recurrent neural network

        Input Arguments:
        x (mini_batch): primary input for every time-step in mini-batches of shape (n, T, n_x)
        z (mini_batch): context input for every time-step in mini-batches of shape (n, T, n_z)

        Returns:
            h -- Hidden states for every time-step, tensor of shape (n, T, n_h);
                 h[:, t, :] is the state after processing x[:, t, :]
            y_pred -- Prediction from the last time step, tensor of shape (n, 1, n_y)
            attn_weights -- attention weights over the time steps, tensor of shape (n, T)

        Raises:
            ValueError: if the sequence length T is 0
        """
        n, T, _ = x.shape
        if T < 1:
            raise ValueError("ContextGRU.forward needs at least one time step")

        # the hidden state before the first time step is 0
        h_prev = self.init_hidden(n, 1, self.n_h)[:, 0, :]

        # Collect the state of every step in a list instead of writing into a
        # pre-allocated tensor: this keeps step t at index t (no shifted / overwritten
        # entries) and also works when T == 1.
        h_steps = []
        hg_steps = []
        for t in range(T):
            # basis function vector of the context at this time step, (n, m)
            G = self.apply_basis(z[:, t, :])
            h_prev, hg_curr = self.cell_forward(x[:, t, :], h_prev, torch.t(G))
            h_steps.append(h_prev)
            hg_steps.append(hg_curr)

        h = torch.stack(h_steps, dim=1)    # (n, T, n_h)
        hg = torch.stack(hg_steps, dim=1)  # (n, T, n_h*m)

        # Push through the attention layer
        a, attn_weights = self.attention(h, hg)

        # Push through the fully connected layer
        fc_output = self.compute_fc_output(a)

        # compute the output; a singleton time axis is added (seq-to-vector)
        y_pred = self.compute_output(fc_output).unsqueeze(1)

        return h, y_pred, attn_weights

    def init_hidden(self, n: int, T: int, n_h: int):
        """
        Initialise the hidden state with zeros.

        n   : batch-size
        T   : input sequence length
        n_h : hidden dimension
        Returns a zero tensor of size (n,T,n_h) with the device and dtype of the
        module's parameters.
        """
        reference = next(self.parameters())
        return torch.zeros(n, T, n_h, device=reference.device, dtype=reference.dtype)

    def apply_basis(self, zt):
        '''
        apply the basis function: polynomial degree 2 without bias
        [z0, z1, z2, z0z0, z0z1, z0z2, z1z1, z1z2, z2z2] for 3 context features
        (same ordering as sklearn's PolynomialFeatures(2, include_bias=False)).

        input arguments:
            zt: context vector (n,n_z) for mini-batch of size n and n_z context dim
        Returns:
            G : tensor of basis functions, (n,m), on the same device as zt

        for 3 context features m = 9
        '''
        n_z = zt.shape[1]
        # index pairs (i, j) with i <= j in row-major order
        rows, cols = torch.triu_indices(n_z, n_z, device=zt.device)
        quadratic = zt[:, rows] * zt[:, cols]
        return torch.cat((zt, quadratic), dim=1)

    


Using cuda device


In [3]:
class Optimization:
    """
    Training / evaluation helper around a model whose forward pass has the form
    ``h, yhat, attn_weights = model(x, z)``.

    Arguments:
        model     : the torch module to train
        loss_fn   : callable following the PyTorch convention ``loss_fn(prediction, target)``
        optimizer : torch optimizer built on the model's parameters

    Attributes:
        train_losses / val_losses : mean loss of every epoch run by ``train``
        weights / settings / inputs : lists returned by ``visualise_weights``
    """

    def __init__(self, model, loss_fn, optimizer):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.train_losses = []
        self.val_losses = []
        self.weights = []      # used for visualising weights
        self.settings = []     # saving settings for visualising weights
        self.inputs = []       # saving input for visualising

    def _model_device(self):
        """Device of the model's parameters (CPU when the model has no parameters)."""
        try:
            return next(self.model.parameters()).device
        except (StopIteration, AttributeError):
            return torch.device("cpu")

    def _prepare_batch(self, x, z, y, np_features, nc_features):
        """
        Reshape x / z to (batch, seq_len, features) and move x, z, y to the model's device.

        The leading dimension is taken from the batch itself, so a final batch that is
        smaller than the nominal batch size is handled as well.
        """
        target_device = self._model_device()
        x = x.view([x.shape[0], -1, np_features]).to(target_device, non_blocking=True)
        z = z.view([z.shape[0], -1, nc_features]).to(target_device, non_blocking=True)
        y = y.to(target_device)
        return x, z, y

    def train_step(self, x, y, z):
        """
        Run one optimisation step on a single mini-batch.

        Arguments:
            x : primary input batch
            y : target batch
            z : context input batch

        Returns:
            the loss of the batch as a Python float
        """
        # Sets model to train mode
        self.model.train()

        # Makes predictions
        h, yhat, attn_weights = self.model(x, z)

        # Computes loss (prediction first, target second)
        loss = self.loss_fn(yhat, y)

        # Computes gradients
        loss.backward()

        # Updates parameters and zeroes gradients
        self.optimizer.step()
        self.optimizer.zero_grad()

        return loss.item()

    def train(self, train_loader, val_loader, batch_size, n_epochs=50, np_features=1, nc_features=1):
        '''
        Train the model and validate it after every epoch.

        train_loader / val_loader : iterables yielding (x, z, y) batches
        batch_size  = nominal batch size; kept for backward compatibility, the actual
                      size of every batch is read from the batch itself
        n_epochs    = # epochs
        np_features = # primary input features
        nc_features = # context input features

        Returns the validation loss of the last epoch (NaN when no epoch was run);
        this value can be used by optuna as the objective to optimize.
        '''
        times = []
        validation_loss = float("nan")
        for epoch in range(1, n_epochs + 1):

            start_epoch = time.time()

            batch_losses = []
            for x_batch, z_batch, y_batch in train_loader:
                x_batch, z_batch, y_batch = self._prepare_batch(
                    x_batch, z_batch, y_batch, np_features, nc_features
                )
                batch_losses.append(self.train_step(x_batch, y_batch, z_batch))

            # NOTE: self.weights / self.settings are not populated here; the code that
            # recorded 'linear_reset_w1.weight' every 10 epochs had been commented out.

            training_loss = np.mean(batch_losses)
            self.train_losses.append(training_loss)

            # eval mode is set once per epoch; train_step switches back to train mode
            self.model.eval()
            batch_val_losses = []
            with torch.no_grad():
                for x_val, z_val, y_val in val_loader:
                    x_val, z_val, y_val = self._prepare_batch(
                        x_val, z_val, y_val, np_features, nc_features
                    )
                    h, yhat, attn_weights = self.model(x_val, z_val)
                    batch_val_losses.append(self.loss_fn(yhat, y_val).item())
            validation_loss = np.mean(batch_val_losses)
            self.val_losses.append(validation_loss)

            if (epoch % 5 == 0):
                print(
                    f"[{epoch}/{n_epochs}] Training loss: {training_loss:.4f}\t Validation loss: {validation_loss:.4f}"
                )

            # wait for pending GPU work so the epoch time is accurate; skipped on CPU,
            # where torch.cuda.synchronize() would raise
            if self._model_device().type == "cuda":
                torch.cuda.synchronize()
            times.append(time.time() - start_epoch)

        total_time = sum(times)
        avg_time = total_time / len(times) if times else 0.0

        print(f"Average Training time: {avg_time:.4f} s for epochs {n_epochs}")

        print(f"Total Training time: {total_time:.4f} s for epochs {n_epochs}")

        return validation_loss  # this will be used by optuna to optimize

    def evaluate(self, test_loader, batch_size=1, np_features=1, nc_features=1):
        """
        Run the model over ``test_loader`` without gradient tracking.

        batch_size is kept for backward compatibility; the actual size of every batch
        is read from the batch itself.

        Returns:
            predictions, values, attn_weights : three lists with one numpy array per
            batch (model output, target and attention weights respectively)
        """
        predictions = []
        values = []
        attn_weights = []
        self.model.eval()
        with torch.no_grad():
            for x_test, z_test, y_test in test_loader:
                x_test, z_test, y_test = self._prepare_batch(
                    x_test, z_test, y_test, np_features, nc_features
                )
                h, yhat, attn_wts = self.model(x_test, z_test)
                predictions.append(yhat.detach().cpu().numpy())
                values.append(y_test.detach().cpu().numpy())
                attn_weights.append(attn_wts.detach().cpu().numpy())

        return predictions, values, attn_weights

    def plot_losses(self):
        """Plot the per-epoch training and validation losses."""
        plt.plot(self.train_losses, label="Training loss")
        plt.plot(self.val_losses, label="Validation loss")
        plt.legend()
        plt.title("Losses")
        plt.xlabel("Epochs")
        plt.ylabel("Loss(MSE)")
        plt.show()
        plt.close()

    def save_model(self, model, batch_id, settings_val, path=None):
        """
        Append all parameters of ``model`` to the text file 'FD002_Params.txt'.

        Arguments:
            model        : module whose state_dict is written
            batch_id     : identifier written in the 'Batch' header line
            settings_val : string written in the 'Settings' section
            path         : target directory; when omitted, a module-level variable
                           named ``path`` is used if it exists, otherwise the current
                           working directory
        """
        if path is None:
            path = globals().get("path", os.getcwd())
        file_name = 'FD002_Params.txt'
        file_path = os.path.join(path, file_name)
        state = model.state_dict()
        # the context manager closes the file even if a write fails
        with open(file_path, 'a') as f:
            f.write('Batch' + str(batch_id) + '\n')
            f.write('-------\n')
            for param_tensor, param in state.items():
                param_val = param.cpu().numpy().tolist()
                f.write(param_tensor + "\t" + str(param_val))
                f.write('\n---------------\n')
            f.write('Settings\n')
            f.write(settings_val + '\n')
            f.write('---------------\n')
            f.write('\n')

    def visualise_weights(self):
        """Return the recorded (weights, settings, inputs) lists; no plotting is done here."""
        return self.weights, self.settings, self.inputs
        
            
        


### Example - Training

In [23]:
from torch.utils.data import TensorDataset, DataLoader

batch_size = 128

# transform the arrays into torch tensors
train_features = torch.Tensor(X_train)  #X_train : num_samples, seq_len, num_dim
train_targets = torch.Tensor(Y_train)
train_cx_features = torch.Tensor(Z_train)

val_features = torch.Tensor(X_val)
val_targets = torch.Tensor(Y_val)
val_cx_features = torch.Tensor(Z_val)


# every dataset item is a (primary input, context input, target) triple
train = TensorDataset(train_features,train_cx_features, train_targets)
val = TensorDataset(val_features, val_cx_features,val_targets)


# no shuffling; an incomplete last batch is dropped
train_loader = DataLoader(train, batch_size=batch_size, shuffle=False, drop_last=True)
val_loader = DataLoader(val, batch_size=batch_size, shuffle=False, drop_last=True)

# Peek at the first batch to check the tensor shapes.
# The built-in next() is used because iterators have no .next() method in Python 3
# (the alias was removed from PyTorch's DataLoader iterators).
examples = iter(train_loader)
samples,context,targets = next(examples)
print(samples.shape, context.shape,targets.shape)

torch.Size([128, 15, 6]) torch.Size([128, 15, 3]) torch.Size([128, 1, 1])


In [2]:

# --- dimensions read from the data: size of axis 2 of each array ---
input_dim, output_dim, context_dim = (
    array.shape[2] for array in (X_train, Y_train, Z_train)
)

# --- hyper-parameters ---
hidden_dim, fc_dim = 15, 25   # hidden state size / dimensions of fully connected layer
layer_dim = 1                 # not passed to anything in this cell
dropout = 0.2                 # not passed to anything in this cell
batch_size = 128
n_epochs = 100
learning_rate = 0.0007
weight_decay = 1e-6

# --- model, loss and optimizer ---
model = ContextGRU(input_dim, hidden_dim, output_dim, context_dim, fc_dim).to(device)
params_list = model.parameters()
loss_fn = nn.MSELoss(reduction="mean")
optimizer = optim.Adam(params_list, lr=learning_rate, weight_decay=weight_decay)
#optimizer = optim.RMSprop(params_list, lr=learning_rate, alpha=0.99, eps=1e-08, weight_decay=weight_decay)
#optimizer = optim.SGD(params_list,lr=learning_rate, weight_decay=weight_decay)

# --- training run followed by the loss curves ---
opt = Optimization(model=model, loss_fn=loss_fn, optimizer=optimizer)
opt.train(
    train_loader,
    val_loader,
    batch_size=batch_size,
    n_epochs=n_epochs,
    np_features=input_dim,
    nc_features=context_dim,
)
opt.plot_losses()
#opt.visualise_weights()

### Example - Testing

In [3]:
 
    
# Convert the held-out arrays to tensors: primary input, context input, target.
test_features, test_cx_features, test_targets = (
    torch.Tensor(array) for array in (X_test, Z_test, Y_test)
)

test = TensorDataset(test_features, test_cx_features, test_targets)

#test_loader = DataLoader(test, batch_size=X_test.shape[0], shuffle=False, drop_last=True)
# one sample per batch, in the original order
test_loader_one = DataLoader(test, batch_size=1, shuffle=False)

predictions, values, attn_weights = opt.evaluate(
    test_loader_one,
    batch_size=1,
    np_features=input_dim,
    nc_features=context_dim,
)
# flatten the per-batch arrays into 1-D arrays
vals, preds = (
    np.concatenate(chunks, axis=0).ravel() for chunks in (values, predictions)
)

