# Testing the conditional layer without MAML

In [1]:
#import learn2learn as l2l
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pickle
import sys
import matplotlib.pyplot as plt
from collections import OrderedDict

sys.path.insert(1, "..")

from ts_dataset import TSDataset
from base_models import LSTMModel, FCN
from metrics import torch_mae as mae
from pytorchtools import EarlyStopping

In [2]:

# Experiment configuration.
# NOTE(review): this cell previously assigned "HR", 32 and 13 to dataset_name,
# window_size and input_dim and immediately overwrote them; presumably those
# were the settings for the "HR" dataset -- confirm before reusing them.
dataset_name = "POLLUTION"
window_size = 5
task_size = 50
batch_size = 64
input_dim = 14
output_dim = 1
hidden_dim = 120


In [3]:
def _load_split(split, variant):
    """Load one pickled dataset split from ``../../Data``.

    :param split: split prefix used in the file name ("TRAIN", "VAL" or "TEST")
    :param variant: file-name suffix ("NOML" or "ML")
    :return: the unpickled object

    The file name is built from the module-level ``dataset_name``,
    ``window_size`` and ``task_size`` settings.
    """
    path = "../../Data/{}-{}-W{}-T{}-{}.pickle".format(
        split, dataset_name, window_size, task_size, variant
    )
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    # The context manager closes the file handle once the object is loaded.
    with open(path, "rb") as handle:
        return pickle.load(handle)


train_data = _load_split("TRAIN", "NOML")
train_data_ML = _load_split("TRAIN", "ML")
validation_data = _load_split("VAL", "NOML")
validation_data_ML = _load_split("VAL", "ML")
test_data = _load_split("TEST", "NOML")
test_data_ML = _load_split("TEST", "ML")

In [4]:
def get_task_encoder_input(data_ML):
    """Build the task-encoder input from a dataset's ``x`` and ``y`` arrays.

    Index 0 along the third axis of ``data_ML.x`` is selected, and the result
    is joined with ``data_ML.y`` along axis 2.

    :param data_ML: object exposing ``x`` (4-D array) and ``y`` (3-D array)
    :return: the concatenated array
    """
    first_step_features = data_ML.x[:, :, 0, :]
    pieces = (first_step_features, data_ML.y)
    return np.concatenate(pieces, axis=2)


   
class LSTMDecoder(nn.Module):
    """LSTM decoder that unrolls a latent vector into a fixed-length sequence.

    The latent vector is projected to the LSTM's initial hidden state (the same
    state is used for every layer), the LSTM is run over an all-zero input
    sequence of length ``seq_len``, and every step's hidden state is projected
    to ``output_dim`` features.

    :param batch_size: stored for interface compatibility only; ``forward``
        sizes its tensors from the latent it actually receives
    :param seq_len: number of time steps to generate
    :param output_dim: number of features produced per time step
    :param n_layers: number of stacked LSTM layers
    :param hidden_dim: LSTM hidden size
    :param latent_dim: size of the latent vector passed to ``forward``
    :param device: device the module's parameters are moved to
    """

    def __init__(self, batch_size, seq_len, output_dim, n_layers, hidden_dim, latent_dim, device):

        super(LSTMDecoder, self).__init__()

        self.batch_size = batch_size
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.sequence_length = seq_len

        # The LSTM consumes a 1-feature dummy input; all information comes in
        # through the initial hidden state derived from the latent vector.
        self.lstm = nn.LSTM(1, hidden_dim, n_layers, batch_first=True)

        self.latent_to_hidden = nn.Linear(self.latent_dim, self.hidden_dim)
        self.hidden_to_output = nn.Linear(self.hidden_dim, self.output_dim)

        nn.init.xavier_uniform_(self.latent_to_hidden.weight)
        nn.init.xavier_uniform_(self.hidden_to_output.weight)

        self.to(device)

    def forward(self, latent):
        """Decode ``latent`` into a sequence.

        :param latent: tensor of shape ``(batch, latent_dim)``
        :return: tensor of shape ``(batch, seq_len, output_dim)``
        """
        # Same initial hidden state for every LSTM layer: (n_layers, batch, hidden).
        h_0 = self.latent_to_hidden(latent).unsqueeze(0).repeat(self.n_layers, 1, 1)
        batch = h_0.size(1)

        # Zero inputs / cell state are created here, on the same device and
        # dtype as h_0, so the decoder works for any batch size and keeps
        # working after the module is moved to another device.
        decoder_inputs = h_0.new_zeros(batch, self.sequence_length, 1)
        c_0 = h_0.new_zeros(self.n_layers, batch, self.hidden_dim)

        decoder_output, _ = self.lstm(decoder_inputs, (h_0, c_0))
        return self.hidden_to_output(decoder_output)
    



class Lambda(nn.Module):
    """Map an encoder output to a latent vector via mean / log-variance heads.

    See https://github.com/abhmalik/timeseries-clustering-vae/blob/master/vrae/vrae.py

    :param hidden_dim: size of the encoder output fed to this module
    :param latent_dim: size of the latent vector produced
    """

    def __init__(self, hidden_dim, latent_dim):
        super().__init__()

        self.hidden_dim, self.latent_dim = hidden_dim, latent_dim

        # Two linear heads over the same encoder output.
        self.hidden_to_mean = nn.Linear(hidden_dim, latent_dim)
        self.hidden_to_logvar = nn.Linear(hidden_dim, latent_dim)

        for head in (self.hidden_to_mean, self.hidden_to_logvar):
            nn.init.xavier_uniform_(head.weight)

    def forward(self, cell_output):
        """Compute the latent mean and log-variance and return a latent vector.

        The mean and log-variance are also stored on the module as
        ``latent_mean`` and ``latent_logvar``.

        :param cell_output: encoder output with ``hidden_dim`` features
        :return: a reparameterised sample in training mode, the mean otherwise
        """
        mu = self.hidden_to_mean(cell_output)
        log_var = self.hidden_to_logvar(cell_output)
        self.latent_mean, self.latent_logvar = mu, log_var

        if not self.training:
            return mu

        # Reparameterisation: z = noise * sigma + mu with noise ~ N(0, I).
        sigma = (0.5 * log_var).exp()
        noise = torch.randn_like(sigma)
        return noise * sigma + mu


In [7]:
# The decoder constructor takes the target device, so it has to be chosen
# before the sub-models are built (falls back to CPU when CUDA is unavailable).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Predictor over input windows.
task_model = LSTMModel(batch_size=batch_size, seq_len = window_size, input_dim = input_dim, n_layers = 2, hidden_dim = hidden_dim, output_dim =1)
# Encoder over a whole task; its input has one extra feature (input_dim + 1).
task_encoder = LSTMModel(batch_size=batch_size, seq_len = task_size, input_dim = input_dim+1, n_layers = 1, hidden_dim = hidden_dim, output_dim =1)
# Decoder that reconstructs a task sequence from the latent vector.
task_decoder = LSTMDecoder(batch_size = 1, n_layers =1 , seq_len = task_size, output_dim = input_dim +1,  hidden_dim = hidden_dim, latent_dim = hidden_dim, device = device)
# Mean / log-variance heads producing the latent vector.
lmbd = Lambda(hidden_dim, hidden_dim)

class MultimodalLearner(nn.Module):
    """Predictor whose features are modulated by an encoding of the task.

    :param task_model: object exposing ``encoder(x)`` and ``hidden_dim``
    :param task_encoder: object exposing ``encoder(task)`` and ``hidden_dim``
    :param task_decoder: callable mapping a latent vector to a task reconstruction
    :param lmbd: callable mapping the task encoding to a latent vector; after a
        call it must expose ``latent_mean`` and ``latent_logvar``
    """

    def __init__(self, task_model, task_encoder, task_decoder, lmbd):

        super(MultimodalLearner, self).__init__()

        self.task_model = task_model
        self.task_encoder = task_encoder
        self.task_decoder = task_decoder
        self.lmbd = lmbd
        # Produces one scale and one shift per task-model feature.
        self.modulation_layer = nn.Linear(task_encoder.hidden_dim, task_model.hidden_dim*2)
        self.output_layer = nn.Linear(task_model.hidden_dim, 1)
        # reduction="sum" is the supported spelling of the deprecated
        # size_average=False.
        self.rec_loss = nn.SmoothL1Loss(reduction="sum")

    def conditional_layer(self, x, embedding):
        """Apply an affine modulation ``x * (1 + gamma) + beta`` to ``x``.

        :param x: features to modulate
        :param embedding: tensor whose last dimension holds the gammas followed
            by the betas, each ``x.size(1)`` wide
        :return: the modulated features
        """
        # Apply the affine transformation by default.
        gammas, betas = torch.split(embedding, x.size(1), dim=-1)
        # Offset by one so a zero embedding leaves the scale at identity.
        gammas = gammas + torch.ones_like(gammas)
        x = x*gammas + betas

        return x

    def compute_loss(self, x_decoded, x):
        """Return the VAE-style loss for a task reconstruction.

        Reads ``latent_mean`` / ``latent_logvar`` from ``self.lmbd``, so
        ``self.lmbd`` must have been called beforehand.

        :param x_decoded: reconstruction produced by the decoder
        :param x: reconstruction target
        :return: tuple ``(recon_loss + kl_loss, kl_loss, recon_loss)``
        """
        latent_mean, latent_logvar = self.lmbd.latent_mean, self.lmbd.latent_logvar
        kl_loss = -0.5 * torch.mean(1 + latent_logvar - latent_mean.pow(2) - latent_logvar.exp())
        recon_loss = self.rec_loss(x_decoded, x)

        return recon_loss + kl_loss, kl_loss, recon_loss

    def forward (self, x, task, params=None, embeddings=None):
        """Predict from ``x`` conditioned on ``task``.

        :param x: input passed to ``task_model.encoder``
        :param task: input passed to ``task_encoder.encoder``; also the
            reconstruction target
        :param params: accepted for interface compatibility; currently unused
        :param embeddings: accepted for interface compatibility; currently unused
        :return: ``(output, loss)`` where ``loss`` is the tuple returned by
            :meth:`compute_loss`
        """
        x = self.task_model.encoder(x)
        encoding = self.task_encoder.encoder(task)
        # Calling lmbd also stores the latent mean / log-variance that
        # compute_loss reads below.
        latent = self.lmbd(encoding)
        task_rec = self.task_decoder(latent)

        modulation_embeddings = self.modulation_layer(encoding)
        modulated_output = self.conditional_layer(x, modulation_embeddings)
        output = self.output_layer(modulated_output)

        loss = self.compute_loss(task_rec, task)

        return output, loss

 

In [11]:
def test(data_ML, multimodal_learner, loss_fn):
    """Evaluate the learner on roughly 100 evenly spaced tasks of ``data_ML``.

    For each sampled ``task_id`` the task encoder receives task ``task_id`` and
    the prediction is scored on the windows of task ``task_id + 1``.

    :param data_ML: dataset exposing 4-D ``x`` and matching ``y`` arrays
    :param multimodal_learner: model called as ``model(x, task)`` and returning
        ``(prediction, losses)``
    :param loss_fn: callable ``loss_fn(y_pred, y)`` returning a tensor
    :return: mean of ``loss_fn`` over the evaluated tasks
    :raises ValueError: if ``data_ML`` holds fewer than two tasks
    """
    total_tasks = data_ML.x.shape[0]
    if total_tasks < 2:
        raise ValueError("test() needs at least two tasks (conditioning task + target task)")

    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(multimodal_learner.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    task_data = torch.FloatTensor(get_task_encoder_input(data_ML))
    x_tensor = torch.FloatTensor(data_ML.x)
    y_tensor = torch.FloatTensor(data_ML.y)

    # A step of at least 1 keeps range() valid for fewer than 100 tasks.
    step = max(1, total_tasks // 100)

    count = 0.0
    accum_loss = 0.0

    # Stop at total_tasks - 1 because each iteration reads task_id + 1.
    for task_id in range(0, total_tasks - 1, step):

        task = task_data[task_id:task_id+1].to(device)
        x = x_tensor[task_id+1].to(device)
        y = y_tensor[task_id+1].to(device)

        # The reconstruction / KL losses are not part of the evaluation metric.
        y_pred, _ = multimodal_learner(x, task)

        loss = loss_fn(y_pred, y)
        accum_loss += loss.detach().cpu().numpy()
        count += 1

    return accum_loss/count
        

        
epochs = 500
total_tasks = len(train_data_ML)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

multimodal_learner = MultimodalLearner(task_model, task_encoder, task_decoder, lmbd)
multimodal_learner.to(device)

opt = torch.optim.Adam(multimodal_learner.parameters(), lr = 0.0001)

task_data = torch.FloatTensor(get_task_encoder_input(train_data_ML))
x_tensor = torch.FloatTensor(train_data_ML.x)
y_tensor = torch.FloatTensor(train_data_ML.y)

# reduction="sum" is the supported spelling of the deprecated size_average=False.
# (mae is the alternative training loss; it is used for evaluation below.)
loss_fn = nn.SmoothL1Loss(reduction="sum")

# Each "epoch" is a single optimisation step on one randomly drawn task pair.
for epoch in range(epochs):

    multimodal_learner.train()

    # randint's upper bound is exclusive, so task_id + 1 is always a valid index.
    task_id = np.random.randint(0, total_tasks-1)
    # Condition on task `task_id`, predict on the windows of task `task_id + 1`.
    # .to(device) honours the CPU fallback chosen above.
    task = task_data[task_id:task_id+1].to(device)
    x = x_tensor[task_id+1].to(device)
    y = y_tensor[task_id+1].to(device)

    y_pred, (vrae_loss, kl_loss, rec_loss) = multimodal_learner(x, task)

    # Prediction loss plus the reconstruction + KL loss returned by the model.
    loss = loss_fn(y_pred, y) + vrae_loss

    # All model parameters belong to the optimiser, so one zero_grad suffices.
    opt.zero_grad()
    loss.backward()

    opt.step()

    multimodal_learner.eval()
    with torch.no_grad():
        val_loss = test(validation_data_ML, multimodal_learner, mae)
        test_loss = test(test_data_ML, multimodal_learner, mae)

    print("Train loss:",loss)
    print("Val loss:", val_loss)
    print("Test loss:", test_loss)

Train loss: tensor(155.9806, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.0889653386193372
Test loss: 0.1124294653332027
Train loss: tensor(190.5320, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.08385251000346172
Test loss: 0.10719440502142256
Train loss: tensor(162.5385, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.07954846497270324
Test loss: 0.10285253091576961
Train loss: tensor(131.7786, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.07582304910978391
Test loss: 0.09916017518037616
Train loss: tensor(159.6736, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.07262590542169554
Test loss: 0.09597204983381942
Train loss: tensor(112.8615, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.06944818806257985
Test loss: 0.09314426516948066
Train loss: tensor(196.4892, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.06644067486659402
Test loss: 0.08985814708515559
Train loss: tensor(246.2649, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.063

Train loss: tensor(123.6537, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.03498177371386971
Test loss: 0.04287846912952638
Train loss: tensor(185.8405, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.034964310102874326
Test loss: 0.04285437624790881
Train loss: tensor(473.6626, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.03473196817295892
Test loss: 0.0426219000416522
Train loss: tensor(181.7393, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.034538935949759826
Test loss: 0.04243226031228752
Train loss: tensor(103.1476, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.03438708729864586
Test loss: 0.042288343385230784
Train loss: tensor(202.6670, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.03425983985265096
Test loss: 0.04215385702134359
Train loss: tensor(143.3434, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.034238713562843345
Test loss: 0.04209426499091753
Train loss: tensor(344.3893, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 

Train loss: tensor(269.6441, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.04099651000329426
Test loss: 0.04483531204301237
Train loss: tensor(199.3589, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.04113766596253429
Test loss: 0.04489040986109193
Train loss: tensor(153.0044, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.0411347322609453
Test loss: 0.04482569773266516
Train loss: tensor(162.0400, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.041096681320951095
Test loss: 0.044768704304305636
Train loss: tensor(145.5974, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.04104510504042819
Test loss: 0.04471227821737233
Train loss: tensor(165.7450, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.041097721359914255
Test loss: 0.044790565469624975
Train loss: tensor(110.1301, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.04114386902323791
Test loss: 0.044869217588243505
Train loss: tensor(155.8040, device='cuda:0', grad_fn=<AddBackward0>)
Val loss:

Train loss: tensor(136.3674, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.03864172929454417
Test loss: 0.04259799568369837
Train loss: tensor(155.7238, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.03849276534858204
Test loss: 0.04245240266185881
Train loss: tensor(364.9574, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.03829288160694497
Test loss: 0.0421467916210099
Train loss: tensor(221.6950, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.0386044371340956
Test loss: 0.04244163433218946
Train loss: tensor(304.7272, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.03889625809554543
Test loss: 0.0427226058050695
Train loss: tensor(158.0247, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.039182911006112896
Test loss: 0.04301718533002209
Train loss: tensor(228.8185, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.03946438408678486
Test loss: 0.043259671566509963
Train loss: tensor(117.0640, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.04

Train loss: tensor(333.6547, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.050884396325619446
Test loss: 0.05318004896145056
Train loss: tensor(151.9789, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.04997580422177201
Test loss: 0.053259972169405166
Train loss: tensor(180.1105, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.04921749125101737
Test loss: 0.05350364035567139
Train loss: tensor(165.3530, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.04874281576346783
Test loss: 0.053991477844296115
Train loss: tensor(201.6448, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.04854925520540703
Test loss: 0.054670488922902855
Train loss: tensor(121.2903, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.04849442458223729
Test loss: 0.05559587460195664
Train loss: tensor(243.1237, device='cuda:0', grad_fn=<AddBackward0>)
Val loss: 0.04881928723660253
Test loss: 0.056654431792621564


KeyboardInterrupt: 

In [197]:
vrae_loss

tensor(223.9370, device='cuda:0', grad_fn=<AddBackward0>)