## Import Libraries

In [None]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils
import random
import math
import time

import operator
from queue import PriorityQueue

## Google Drive

In [None]:
import os
!pip install wandb
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under it.
drive.mount('/content/drive')
import sys
# Add the assignment folder on Drive to the module search path.
sys.path.append('./drive/MyDrive/Colab Notebooks/CS6910/Assignment 3/')

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/98/5f/45439b4767334b868e1c8c35b1b0ba3747d8c21be77b79f09eed7aa3c72b/wandb-0.10.30-py2.py3-none-any.whl (1.8MB)
[K     |▏                               | 10kB 23.3MB/s eta 0:00:01[K     |▍                               | 20kB 29.8MB/s eta 0:00:01[K     |▌                               | 30kB 22.8MB/s eta 0:00:01[K     |▊                               | 40kB 16.9MB/s eta 0:00:01[K     |█                               | 51kB 8.3MB/s eta 0:00:01[K     |█                               | 61kB 7.6MB/s eta 0:00:01[K     |█▎                              | 71kB 8.6MB/s eta 0:00:01[K     |█▌                              | 81kB 9.5MB/s eta 0:00:01[K     |█▋                              | 92kB 9.9MB/s eta 0:00:01[K     |█▉                              | 102kB 8.0MB/s eta 0:00:01[K     |██                              | 112kB 8.0MB/s eta 0:00:01[K     |██▏                             | 122kB 8.0MB/s e

## Wandb

In [None]:
!pip install wandb --upgrade
import wandb
!wandb login

Requirement already up-to-date: wandb in /usr/local/lib/python3.7/dist-packages (0.10.30)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Set up device

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

In [None]:
# Display the selected device as the cell output.
device

device(type='cuda')

## Assign model name

In [None]:
# Model identifier; the (commented-out) sweep agent call at the end of the notebook
# uses it as the prefix of the W&B project name.
modelName = 'Encoder_Decoder_attention_Dakshina_Hi'

## Set up Dataset

In [None]:
# Read the train and dev lexicon files in full and split each into a list of lines.
train_tsv_path = r"./drive/MyDrive/Colab Notebooks/CS6910/Assignment 3/hi/lexicons/hi.translit.sampled.train.tsv"
val_tsv_path = r"./drive/MyDrive/Colab Notebooks/CS6910/Assignment 3/hi/lexicons/hi.translit.sampled.dev.tsv"

with open(train_tsv_path, mode='r', encoding="utf8") as train_file:
    train_lines = train_file.read().split("\n")

with open(val_tsv_path, mode='r', encoding="utf8") as val_file:
    val_lines = val_file.read().split("\n")

In [None]:
# Number of raw lines per split; the parsing cells use these as an upper bound.
train_num_samples, val_num_samples = len(train_lines), len(val_lines)

In [None]:
# Parallel lists of source strings and target strings for each split.
train_input_texts, train_target_texts = [], []
val_input_texts, val_target_texts = [], []

# Character vocabularies, filled while parsing the training split.
input_characters, target_characters = set(), set()

In [None]:
## Train Data ##

# Parse every training line into an (input, target) pair and collect both vocabularies.
# The containers are rebuilt from scratch here so the cell can be re-run safely: it
# rebinds `input_characters` / `target_characters` from sets to sorted lists, and a second
# run would otherwise fail on `.add` and duplicate every sample.
train_input_texts = []
train_target_texts = []
input_charset = set()
target_charset = set()

for line in train_lines[:train_num_samples]:
    if not line.strip():
        # Blank lines (e.g. the empty string left behind by the file's final newline)
        # carry no sample; skip them instead of assuming only the last line is blank.
        continue
    target_text, input_text, _ = line.split("\t")
    # "\t" marks start-of-sequence and "\n" end-of-sequence on the target side.
    target_text = "\t" + target_text + "\n"
    train_input_texts.append(input_text)
    train_target_texts.append(target_text)
    input_charset.update(input_text)
    target_charset.update(target_text)

# The space character is used as the padding token on both sides.
input_charset.add(' ')
target_charset.add(' ')
input_characters = sorted(input_charset)
target_characters = sorted(target_charset)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

train_max_encoder_seq_length = max(len(txt) for txt in train_input_texts)
train_max_decoder_seq_length = max(len(txt) for txt in train_target_texts)

print("Number of train samples:", len(train_input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)

print("Max train sequence length for inputs:", train_max_encoder_seq_length)
print("Max train sequence length for outputs:", train_max_decoder_seq_length)

Number of train samples: 44204
Number of unique input tokens: 27
Number of unique output tokens: 66
Max train sequence length for inputs: 20
Max train sequence length for outputs: 21


In [None]:
## Val Data ##

# Parse the validation lines into (input, target) pairs. The lists are rebuilt here so
# that re-running the cell does not append every sample a second time.
val_input_texts = []
val_target_texts = []

for line in val_lines[:val_num_samples]:
    if not line.strip():
        # Skip blank lines (such as the empty string after the file's final newline).
        continue
    target_text, input_text, _ = line.split("\t")
    # Same start ("\t") and end ("\n") markers as the training targets.
    target_text = "\t" + target_text + "\n"
    val_input_texts.append(input_text)
    val_target_texts.append(target_text)

val_max_encoder_seq_length = max(len(txt) for txt in val_input_texts)
val_max_decoder_seq_length = max(len(txt) for txt in val_target_texts)

print("Number of val samples:", len(val_input_texts))

print("Max val sequence length for inputs:", val_max_encoder_seq_length)
print("Max val sequence length for outputs:", val_max_decoder_seq_length)

Number of val samples: 4358
Max val sequence length for inputs: 18
Max val sequence length for outputs: 16


In [None]:
# Character -> integer id lookup tables, following the sorted vocabulary order.
input_token_index = {char: idx for idx, char in enumerate(input_characters)}
target_token_index = {char: idx for idx, char in enumerate(target_characters)}

# Zero-initialised one-hot buffers of shape (samples, time steps, vocabulary size).
# The validation buffers deliberately reuse the *training* maximum lengths so both
# splits share the same sequence dimensions.
encoder_input_data_train = np.zeros(
    shape=(len(train_input_texts), train_max_encoder_seq_length, num_encoder_tokens),
    dtype=np.float32,
)
decoder_input_data_train = np.zeros(
    shape=(len(train_input_texts), train_max_decoder_seq_length, num_decoder_tokens),
    dtype=np.float32,
)

encoder_input_data_val = np.zeros(
    shape=(len(val_input_texts), train_max_encoder_seq_length, num_encoder_tokens),
    dtype=np.float32,
)
decoder_input_data_val = np.zeros(
    shape=(len(val_input_texts), train_max_decoder_seq_length, num_decoder_tokens),
    dtype=np.float32,
)

In [None]:
def _fill_one_hot(buffer, texts, token_index):
    """One-hot encode `texts` into `buffer` in place, padding each row with ' '.

    Args:
        buffer: zero-initialised array indexed as [sample, time step, token id].
        texts: the strings to encode, one per row of `buffer`.
        token_index: mapping from character to token id; must contain ' '.
    """
    pad_id = token_index[' ']
    for row, text in enumerate(texts):
        for step, char in enumerate(text):
            buffer[row, step, token_index[char]] = 1.
        # Pad from the end of the text. Using len(text) instead of the leaked loop
        # variable keeps this correct for empty strings as well.
        buffer[row, len(text):, pad_id] = 1.


## Train Data ##

_fill_one_hot(encoder_input_data_train, train_input_texts, input_token_index)
_fill_one_hot(decoder_input_data_train, train_target_texts, target_token_index)

## Val Data ##

_fill_one_hot(encoder_input_data_val, val_input_texts, input_token_index)
_fill_one_hot(decoder_input_data_val, val_target_texts, target_token_index)

In [None]:
# Convert the one-hot arrays to tensors so they can be wrapped in datasets / data loaders.
# torch.tensor copies the whole array in one call, which yields the same independent
# float32 tensors as stacking per-row copies but without a Python loop over every sample.

encoder_inp_train = torch.tensor(encoder_input_data_train)
decoder_inp_train = torch.tensor(decoder_input_data_train)

encoder_inp_val = torch.tensor(encoder_input_data_val)
decoder_inp_val = torch.tensor(decoder_input_data_val)

In [None]:
batch_size = 64

# Training batches are reshuffled every epoch.
train_dataset = data_utils.TensorDataset(encoder_inp_train, decoder_inp_train)
train_loader = data_utils.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation batches keep the dataset order (no shuffling).
val_dataset = data_utils.TensorDataset(encoder_inp_val, decoder_inp_val)
val_loader = data_utils.DataLoader(val_dataset, batch_size=batch_size)

## Build Model

In [None]:
class Attention(nn.Module):
    """Attention over encoder time steps for a single decoder state.

    Each encoder step is scored as ``v . relu(W [hidden ; encoder_output])`` and the
    scores are normalised with a softmax over the time dimension.
    """

    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        # Re-draw v uniformly in [-1/sqrt(H), 1/sqrt(H)].
        bound = 1. / math.sqrt(self.v.size(0))
        self.v.data.uniform_(-bound, bound)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights with a singleton middle dimension.

        Assumes `hidden` is (N, H) and `encoder_outputs` is (T, N, H), which is how the
        decoder in this notebook calls it; the result is then (N, 1, T).
        """
        num_steps = encoder_outputs.size(0)
        # One copy of the decoder state per encoder step, batch-major.
        tiled_hidden = hidden.repeat(num_steps, 1, 1).transpose(0, 1)
        batch_major_outputs = encoder_outputs.transpose(0, 1)
        energies = self.score(tiled_hidden, batch_major_outputs)
        weights = F.softmax(energies, dim=1)
        return weights.unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        """Compute one unnormalised score per (batch element, time step)."""
        combined = torch.cat([hidden, encoder_outputs], 2)
        # Move the feature dimension in front of the time dimension for the bmm below.
        projected = F.relu(self.attn(combined)).transpose(1, 2)
        batch = encoder_outputs.size(0)
        v_rows = self.v.expand(batch, -1).unsqueeze(1)
        return torch.bmm(v_rows, projected).squeeze(1)

In [None]:
class Encoder(nn.Module):
    """Recurrent encoder over one-hot (optionally embedded) input sequences.

    Args:
        num_encoder_tokens: size of the input vocabulary (one-hot width).
        hid_dim: hidden size of the recurrent layers.
        n_layers: number of stacked recurrent layers.
        dropout: dropout between recurrent layers (only meaningful when n_layers > 1).
        enc_embedding_dim: if non-zero, one-hot inputs are converted to ids and embedded
            to this size before the recurrent layers.
        module: recurrent cell type, one of 'LSTM', 'RNN' or 'GRU'.

    Raises:
        ValueError: if `module` is not a supported cell type.
    """

    def __init__(self,  num_encoder_tokens, hid_dim, n_layers, dropout, enc_embedding_dim = 0, module='LSTM'):
        super(Encoder,self).__init__()

        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.module = module
        self.embedding_dim = enc_embedding_dim
        self.enc_inp = num_encoder_tokens
        if self.embedding_dim != 0:
            self.enc_inp = self.embedding_dim
            self.embedding = nn.Embedding(num_encoder_tokens, self.embedding_dim)

        rnn_classes = {'LSTM': nn.LSTM, 'RNN': nn.RNN, 'GRU': nn.GRU}
        if self.module not in rnn_classes:
            # Fail at construction time rather than later with a missing `self.rnn`.
            raise ValueError(
                "Unsupported cell type %r; expected one of %s" % (self.module, sorted(rnn_classes)))

        # PyTorch applies dropout only between stacked layers and warns when it is
        # non-zero for a single layer, so pass 0 in that case (no numerical effect).
        rnn_dropout = dropout if n_layers > 1 else 0.
        self.rnn = rnn_classes[self.module](self.enc_inp, hid_dim, n_layers, dropout = rnn_dropout)

    def forward(self, inp):
        """Encode a batch.

        Assumes `inp` is a batch-major one-hot tensor (N, T, V), as built in this
        notebook. Returns the recurrent layer's `(outputs, state)`, where `state` is a
        `(hidden, cell)` tuple for LSTM and a single tensor otherwise.
        """
        inp = inp.transpose(0,1)  # time-major for the recurrent layer
        if self.embedding_dim != 0:
            # Recover token ids from the one-hot vectors before the embedding lookup.
            inp = inp.argmax(2)
            inp = self.embedding(inp)

        outputs, hidden_cell = self.rnn(inp)

        return outputs, hidden_cell

In [None]:
class Decoder(nn.Module):
    """Single-step recurrent decoder, optionally with attention over encoder states.

    Args:
        num_decoder_tokens: size of the output vocabulary.
        hid_dim: hidden size of the recurrent layers.
        n_layers: number of stacked recurrent layers.
        dropout: dropout between recurrent layers (only meaningful when n_layers > 1).
        dec_embedding_dim: if non-zero, one-hot inputs are converted to ids and embedded
            to this size before the recurrent layers.
        module: recurrent cell type, one of 'LSTM', 'RNN' or 'GRU'.
        atten: when not False, an `Attention` context vector is concatenated to the
            recurrent input and to the features fed to the output layer.

    Raises:
        ValueError: if `module` is not a supported cell type.
    """

    def __init__(self, num_decoder_tokens, hid_dim, n_layers, dropout, dec_embedding_dim = 0, module='LSTM', atten=False):
        super(Decoder,self).__init__()

        self.output_dim = num_decoder_tokens
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.module = module
        self.atten = atten
        self.embedding_dim = dec_embedding_dim
        self.dec_inp = num_decoder_tokens

        if self.embedding_dim != 0:
            self.dec_inp = self.embedding_dim
            self.embedding = nn.Embedding(num_decoder_tokens, self.embedding_dim)

        rnn_classes = {'LSTM': nn.LSTM, 'RNN': nn.RNN, 'GRU': nn.GRU}
        if module not in rnn_classes:
            # Fail at construction time rather than later with a missing `self.rnn`.
            raise ValueError(
                "Unsupported cell type %r; expected one of %s" % (module, sorted(rnn_classes)))

        # PyTorch applies dropout only between stacked layers and warns when it is
        # non-zero for a single layer, so pass 0 in that case (no numerical effect).
        rnn_dropout = dropout if n_layers > 1 else 0.

        if self.atten == False:
            rnn_input_size = self.dec_inp
            fc_input_size = self.hid_dim
        else:
            self.attention = Attention(self.hid_dim)
            # The context vector (size hid_dim) is appended to the step input and to
            # the recurrent output before the final projection.
            rnn_input_size = self.hid_dim + self.dec_inp
            fc_input_size = self.hid_dim * 2

        self.rnn = rnn_classes[module](rnn_input_size, hid_dim, n_layers, dropout = rnn_dropout)
        self.fc_out = nn.Linear(fc_input_size, self.output_dim)

    def forward(self, inp, hidden_cell, encoder_states):
        """Decode one time step.

        Args:
            inp: step input; assumed (1, N, V) one-hot, as passed by Seq2Seq.
            hidden_cell: previous recurrent state; a `(hidden, cell)` tuple for LSTM,
                a single tensor otherwise.
            encoder_states: encoder outputs; only read when attention is enabled.

        Returns:
            `(prediction, state)` without attention, or
            `(prediction, state, attn_weights)` with attention. `state` has the same
            structure as `hidden_cell`.
        """
        if isinstance(hidden_cell, tuple):
            hidden, cell = hidden_cell[0], hidden_cell[1]
        else:
            hidden, cell = hidden_cell, None

        if self.embedding_dim != 0:
            # Recover token ids from the one-hot vectors before the embedding lookup.
            inp = inp.argmax(2)
            inp = self.embedding(inp)

        # LSTM takes (hidden, cell); RNN and GRU take the hidden tensor only.
        prev_state = (hidden, cell) if self.module == 'LSTM' else hidden

        if self.atten == False:
            output, hidden = self.rnn(inp, prev_state)
            prediction = self.fc_out(output.squeeze(0))
            return prediction, hidden

        # Attend with the hidden state of the top recurrent layer.
        attn_weights = self.attention(hidden[-1], encoder_states)
        context = attn_weights.bmm(encoder_states.transpose(0, 1))
        context = context.transpose(0, 1)
        rnn_inp = torch.cat([inp, context], 2)

        output, hidden = self.rnn(rnn_inp, prev_state)

        output = output.squeeze(0)
        context = context.squeeze(0)
        output = self.fc_out(torch.cat([output, context], 1))

        return output, hidden, attn_weights

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper with a teacher-forced training pass and beam-search inference.

    Args:
        encoder: module returning `(outputs, state)` for a source batch.
        decoder: single-step decoder exposing `atten`, `output_dim`, `hid_dim`, `n_layers`.
        device: device on which intermediate tensors are allocated.
    """

    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.atten = self.decoder.atten

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, to_train, teacher_forcing_ratio = 0.5, beam_width = 3):
        """Run the model.

        Args:
            src: source batch, passed unchanged to the encoder.
            trg: batch-major one-hot targets (N, T, V).
            to_train: if true, run the step-by-step training pass and return logits of
                shape (T, N, V) (row 0 is left at zero); otherwise run beam search and
                return the `(decoded_batch, batch_loss)` pair from `beam_decode`.
            teacher_forcing_ratio: probability, per step, of feeding the gold token
                instead of the model's own prediction.
            beam_width: beam size used at inference time.
        """
        if to_train:
            trg = trg.transpose(0,1)  # time-major: (T, N, V)
            trg_len, batch_size = trg.shape[0], trg.shape[1]
            trg_vocab_size = self.decoder.output_dim

            outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

            enc_output, hidden_cell = self.encoder(src)
            inp = trg[0,:]

            for t in range(1, trg_len):
                if self.atten == False:
                    prediction, hidden_cell = self.decoder(inp.unsqueeze(0), hidden_cell, enc_output)
                else:
                    prediction, hidden_cell, atten_weights = self.decoder(inp.unsqueeze(0), hidden_cell, enc_output)
                outputs[t] = prediction
                teacher_force = random.random() < teacher_forcing_ratio
                top1 = prediction.argmax(1)
                # Per-row one-hot of the predicted token. Indexing with [:, top1] would
                # set every predicted column in *every* row, producing multi-hot inputs.
                top1_one_hot = torch.zeros_like(prediction).scatter_(1, top1.unsqueeze(1), 1.)

                inp = trg[t] if teacher_force else top1_one_hot

            return outputs

        enc_output, hidden_cell = self.encoder(src)
        return self.beam_decode(beam_width, trg, hidden_cell, enc_output)

    def beam_decode(self, beam_width, target_tensor, decoder_hiddens, encoder_outputs=None):
        """Beam-search decode every sequence of the batch independently.

        Args:
            beam_width: number of continuations expanded per popped hypothesis.
            target_tensor: batch-major one-hot gold targets (N, T, V); used for the
                per-step loss and to bound the hypothesis length by T.
            decoder_hiddens: encoder final state (tensor, or `(hidden, cell)` tuple).
            encoder_outputs: encoder outputs indexed as [time, batch, feature].

        Returns:
            `(decoded_batch, batch_loss)`: for each sequence, the list of finished
            hypotheses (token-id lists starting with the start token, best score first)
            and the list of their accumulated cross-entropy losses.
        """
        target_tensor = target_tensor.transpose(0,1)  # time-major: (T, N, V)
        max_len = target_tensor.size(0)
        vocab_size = self.decoder.output_dim
        topk = 1

        SOS_token = target_token_index["\t"]
        EOS_token = target_token_index['\n']

        # The loss is computed for one target at a time. With reduction='sum' an ignored
        # (padding) target contributes exactly 0, whereas the default 'mean' divides by
        # the number of non-ignored targets (zero) and can yield NaN.
        criterion_infer = torch.nn.CrossEntropyLoss(
            ignore_index = target_token_index[' '], reduction = 'sum')

        decoded_batch = []
        batch_loss = []

        for idx in range(target_tensor.size(1)):
            # Slice out the state and encoder outputs of this one sequence (batch size 1).
            if isinstance(decoder_hiddens, tuple):
                decoder_hidden = (
                    decoder_hiddens[0][:, idx, :].contiguous().unsqueeze(1),
                    decoder_hiddens[1][:, idx, :].contiguous().unsqueeze(1))
            else:
                decoder_hidden = decoder_hiddens[:, idx, :].contiguous().unsqueeze(1)

            encoder_output = encoder_outputs[:, idx, :].unsqueeze(1)

            endnodes = []
            number_required = topk

            root = BeamSearchNode(decoder_hidden, None, SOS_token, 0, 1, 0)
            nodes = PriorityQueue()
            # The queue pops the smallest key, so hypotheses are keyed by negated score.
            nodes.put((-root.eval(), root))
            qsize = 1

            while True:
                if qsize > max_len * beam_width:
                    break
                if nodes.empty():
                    # Every live hypothesis hit the length limit (e.g. beam_width == 1);
                    # a blocking get() on the empty queue would hang forever.
                    break

                score, n = nodes.get()
                decoder_hidden = n.h

                if n.wordid == EOS_token and n.prevNode != None:
                    endnodes.append((score, n))
                    if len(endnodes) >= number_required:
                        break
                    else:
                        continue

                decoder_input = torch.zeros((1, 1, vocab_size)).to(self.device)
                decoder_input[0, 0, n.wordid] = 1.

                if self.atten == False:
                    decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_output)
                else:
                    decoder_output, decoder_hidden, atten_weights = self.decoder(decoder_input, decoder_hidden, encoder_output)

                log_softmax = F.log_softmax(decoder_output, dim=1)
                log_prob, indexes = torch.topk(log_softmax, beam_width)

                # Loss of this step against the gold token at the hypothesis' position.
                gold = target_tensor[n.leng, idx].argmax(0).unsqueeze(0)
                loss_at_t = criterion_infer(decoder_output, gold)

                # Children of a hypothesis at the length limit are final, not re-queued.
                at_length_limit = n.leng >= max_len - 1
                nextnodes = []
                for new_k in range(beam_width):
                    token = indexes[0][new_k].item()
                    log_p = log_prob[0][new_k].item()
                    child = BeamSearchNode(decoder_hidden, n, token, n.logp + log_p, n.leng + 1, n.loss + loss_at_t)
                    entry = (-child.eval(), child)
                    if at_length_limit:
                        endnodes.append(entry)
                    else:
                        nextnodes.append(entry)

                for entry in nextnodes:
                    nodes.put(entry)

                # One node was popped and len(nextnodes) were pushed.
                qsize += len(nextnodes) - 1

            if len(endnodes) == 0:
                # Search budget exhausted without a finished hypothesis: take the best live one.
                endnodes = [nodes.get() for _ in range(topk)]

            utterances = []
            utterances_loss = []
            for score, n in sorted(endnodes, key=operator.itemgetter(0)):
                utterances_loss.append(n.loss)
                # Walk the back-pointers to the root, then reverse into reading order.
                utterance = [n.wordid]
                while n.prevNode != None:
                    n = n.prevNode
                    utterance.append(n.wordid)
                utterances.append(utterance[::-1])

            decoded_batch.append(utterances)
            batch_loss.append(utterances_loss)

        return decoded_batch, batch_loss


class BeamSearchNode(object):
    """One hypothesis step in the beam search, linked to its predecessor.

    Attributes are stored as given: decoder state (`h`), back-pointer (`prevNode`),
    token id (`wordid`), accumulated log-probability (`logp`), hypothesis length
    (`leng`) and accumulated loss (`loss`).
    """

    def __init__(self, hiddenstate, previousNode, wordId, logProb, length, loss):
        self.h, self.prevNode = hiddenstate, previousNode
        self.wordid = wordId
        self.logp, self.leng = logProb, length
        self.loss = loss

    def eval(self, alpha=1.0):
        """Length-normalised log-probability (the reward term is currently always 0)."""
        reward = 0
        # The small epsilon keeps the division defined for the root node (leng == 1).
        normalizer = float(self.leng - 1 + 1e-6)
        return self.logp / normalizer + alpha * reward

    # Ordering by length only; used as the tie-breaker when two queue entries share a score.
    def __lt__(self, other):
        return self.leng < other.leng

    def __gt__(self, other):
        return self.leng > other.leng

## Helpers to decode back sentence

In [None]:
# Inverse lookups (token id -> character) for turning decoded ids back into text.
reverse_input_char_index = {idx: char for char, idx in input_token_index.items()}
reverse_target_char_index = {idx: char for char, idx in target_token_index.items()}

## Functions to match output sequence to calculate accuracy

In [None]:
def no_correct_in_batch_train(target, output):
    """Count exactly-matching sequences in a training batch.

    Args:
        target: gold token ids, time-major (T, N).
        output: logits, time-major (T, N, V); the prediction is the argmax over V.

    Returns:
        `(no_correct, batch_size)`. A sequence is correct when every position from 1 up
        to and including the gold end-of-sequence token ('\\n') is predicted correctly.
        Position 0 (the start token) is not compared.
    """
    target = target.transpose(0,1)                 # (N, T)
    predicted = output.argmax(2).transpose(0,1)    # (N, T)
    eos_id = target_token_index['\n']
    batch_size = target.shape[0]
    trgt_length = target.shape[1]

    no_correct = 0
    for seq in range(batch_size):
        for pos in range(1, trgt_length):
            # Compare first, so the end-of-sequence position must match as well;
            # otherwise a prediction that runs past the gold word would count as correct.
            if target[seq, pos] != predicted[seq, pos]:
                break
            if target[seq, pos] == eos_id:
                no_correct += 1
                break
    return no_correct, batch_size

In [None]:
def no_correct_in_batch_infer(target, output):
    """Count exactly-matching sequences among beam-search results.

    Args:
        target: gold token ids, time-major (T, N).
        output: per-sequence lists of decoded hypotheses (lists of token ids that start
            with the start token); only the first hypothesis of each sequence is scored.

    Returns:
        `(no_correct, batch_size)`. A sequence is correct when the hypothesis matches
        the gold ids at every position from 1 up to and including the gold
        end-of-sequence token ('\\n').
    """
    target = target.transpose(0,1)   # (N, T)
    eos_id = target_token_index['\n']
    batch_size = target.shape[0]
    trgt_length = target.shape[1]

    no_correct = 0
    for seq in range(batch_size):
        best = output[seq][0]
        for pos in range(1, trgt_length):
            # A hypothesis that is too short, or differs at this position (including
            # the end-of-sequence position), is not an exact match.
            if pos >= len(best) or target[seq, pos] != best[pos]:
                break
            if target[seq, pos] == eos_id:
                no_correct += 1
                break
    return no_correct, batch_size

## Train Function

In [None]:
def train(model, iterator, optimizer, criterion, clip, teacher_forcing_ratio):
    """Run one training epoch.

    Returns the mean batch loss and the fraction of sequences counted correct by
    `no_correct_in_batch_train`.
    """
    model.train()

    running_loss = 0
    correct_so_far = 0
    seen_so_far = 0

    for src, trg in iterator:
        optimizer.zero_grad()
        src = src.to(device)
        trg = trg.to(device)

        output = model(src, trg, teacher_forcing_ratio = teacher_forcing_ratio, to_train = True).to(device)

        # One-hot, batch-major targets -> time-major token ids.
        trg = trg.transpose(0,1).argmax(2)

        batch_correct, batch_samples = no_correct_in_batch_train(trg, output)
        correct_so_far += batch_correct
        seen_so_far += batch_samples

        # Drop time step 0 (the start token) and flatten time and batch for the loss.
        vocab_size = output.shape[-1]
        flat_logits = output[1:].view(-1, vocab_size)
        flat_targets = trg[1:].reshape(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator), correct_so_far / seen_so_far

## Evaluate function

In [None]:
def evaluate(model, iterator, beam_width):
    """Evaluate the model with beam search.

    Args:
        model: model whose inference call returns `(decoded_batch, batch_loss)`.
        iterator: loader yielding `(src, trg)` batches.
        beam_width: beam size forwarded to the model.

    Returns:
        `(loss, accuracy)` as plain Python floats: the mean over batches of the
        per-token loss, and the fraction of sequences counted correct by
        `no_correct_in_batch_infer`.
    """
    model.eval()

    epoch_loss = 0.0
    total_no_correct = 0
    total_samples = 0
    with torch.no_grad():

        for src, trg in iterator:

            src, trg = src.to(device), trg.to(device)

            decoded, losses = model(src, trg, beam_width = beam_width, to_train = False)

            # One-hot, batch-major targets -> time-major token ids.
            trg = trg.transpose(0,1).argmax(2)
            no_correct, samples = no_correct_in_batch_infer(trg, decoded)
            total_no_correct += no_correct
            total_samples += samples

            batch_loss = 0
            total_chars = 0
            for sample_utterances, sample_losses in zip(decoded, losses):
                for utterance, utterance_loss in zip(sample_utterances, sample_losses):
                    batch_loss = batch_loss + utterance_loss
                    total_chars += len(utterance)

            if total_chars:
                # Convert once per batch so the epoch loss is a float (as in `train`)
                # instead of a tensor that may live on the GPU.
                epoch_loss += float(batch_loss) / total_chars

    return epoch_loss / len(iterator), total_no_correct/total_samples

## Function to calculate time

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (seconds) into whole minutes and seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

## Start Training and evaluating the model

In [None]:
def train_evaluate(model, train_loader,val_loader, optimizer, criterion, CLIP, N_EPOCHS, teacher_forcing_ratio, beam_width):
    """Train for N_EPOCHS, evaluating after every epoch; print and log metrics to W&B.

    Returns the trained model.
    """
    for epoch in range(N_EPOCHS):
        started_at = time.time()

        train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, teacher_forcing_ratio)
        valid_loss, valid_accuracy = evaluate(model, val_loader, beam_width)

        finished_at = time.time()
        epoch_mins, epoch_secs = epoch_time(started_at, finished_at)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        # Perplexity is the exponential of the mean loss.
        train_ppl = math.exp(train_loss)
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {train_ppl:7.3f}')
        print(f'\tTrain Accuracy: {train_accuracy:.3f}')
        valid_ppl = math.exp(valid_loss)
        print(f'\tVal. Loss: {valid_loss:.3f} |  Val. PPL: {valid_ppl:7.3f}')
        print(f'\tVal Accuracy: {valid_accuracy:.3f}')

        metrics = {
            'epoch': epoch,
            'train loss': train_loss,
            'train PPL': train_ppl,
            'train accuracy': train_accuracy,
            'val loss': valid_loss,
            'valid PPL': valid_ppl,
            'val accuracy': valid_accuracy,
        }
        wandb.log(metrics)

    return model

In [None]:
def sp_train():
    """Entry point for one W&B run: build the attention model from the run config and train it."""
    config_defaults = dict([
        ('epochs', 20),
        ('cell_type', 'LSTM'),
        ('no_encoder_decoder_layers', 1),
        ('hidden_layer_size', 256),
        ('dropout', 0.5),
        ('beam_search_size', 3),
        ('teacher_forcing_ratio', 0.5),
    ])

    # Start the run; wandb.config then holds the hyperparameters of this run.
    wandb.init(config=config_defaults)
    config = wandb.config

    # Human-readable run name that encodes the hyperparameters.
    name_parts = [
        "cell:[", config.cell_type,
        "] layers:[", str(config.no_encoder_decoder_layers),
        "] HL_size:[", str(config.hidden_layer_size),
        "] dp:[", str(config.dropout),
        "] beam_width:[", str(config.beam_search_size),
        "] tfr:[", str(config.teacher_forcing_ratio),
        "]",
    ]
    wandb.run.name = "".join(name_parts)

    teacher_forcing_ratio = config.teacher_forcing_ratio
    CLIP = 1
    # Embedding size 0 means the one-hot vectors are fed to the recurrent layers directly.
    enc_embedding_dim = 0
    dec_embedding_dim = 0

    enc = Encoder(
        num_encoder_tokens,
        config.hidden_layer_size,
        n_layers = config.no_encoder_decoder_layers,
        enc_embedding_dim = enc_embedding_dim,
        dropout = config.dropout,
        module = config.cell_type,
    )
    dec = Decoder(
        num_decoder_tokens,
        config.hidden_layer_size,
        n_layers = config.no_encoder_decoder_layers,
        dec_embedding_dim = dec_embedding_dim,
        dropout = config.dropout,
        module = config.cell_type,
        atten = True,
    )
    model = Seq2Seq(enc, dec, device).to(device)

    learning_rate = 1e-3
    optimizer = optim.Adam(model.parameters(), lr = learning_rate)

    # Padding positions (the ' ' token) do not contribute to the training loss.
    criterion = nn.CrossEntropyLoss(ignore_index = target_token_index[' '])

    train_evaluate(
        model, train_loader, val_loader, optimizer, criterion, CLIP,
        config.epochs, teacher_forcing_ratio, config.beam_search_size,
    )

In [None]:
# W&B sweep definition: random search over the listed values, maximising 'val accuracy'.
sweep_config = dict(
    method='random',
    metric=dict(name='val accuracy', goal='maximize'),
    parameters=dict(
        epochs=dict(values=[20]),
        cell_type=dict(values=['GRU', 'LSTM']),
        no_encoder_decoder_layers=dict(values=[1, 2, 3]),
        hidden_layer_size=dict(values=[64, 256, 512]),
        dropout=dict(values=[0.20, 0.50]),
        beam_search_size=dict(values=[3, 4]),
        teacher_forcing_ratio=dict(values=[0.2, 0.5]),
    ),
)

In [None]:
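# Disabled: attaches an agent to the previously created sweep 'g9ym6ooa'.
# NOTE(review): the saved output below appears to come from that earlier sweep
# (its dropout/epochs/beam-size values are not in `sweep_config`) - confirm before relying on it.
#wandb.agent(sweep_id = 'g9ym6ooa', project=modelName+"_Sweep_1", function = sp_train)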

[34m[1mwandb[0m: Agent Starting Run: 21j0x79y with config:
[34m[1mwandb[0m: 	beam_search_size: 2
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.6573622668013036
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_layer_size: 512
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 2
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.27609870077288134


Epoch: 01 | Time: 1m 21s
	Train Loss: 2.215 | Train PPL:   9.162
	Train Accuracy: 0.024
	Val. Loss: 1.146 |  Val. PPL:   3.147
	Val Accuracy: 0.093
Epoch: 02 | Time: 1m 25s
	Train Loss: 1.130 | Train PPL:   3.094
	Train Accuracy: 0.141
	Val. Loss: 0.876 |  Val. PPL:   2.400
	Val Accuracy: 0.187
Epoch: 03 | Time: 1m 18s
	Train Loss: 0.911 | Train PPL:   2.487
	Train Accuracy: 0.223
	Val. Loss: 0.799 |  Val. PPL:   2.223
	Val Accuracy: 0.281
Epoch: 04 | Time: 1m 19s
	Train Loss: 0.779 | Train PPL:   2.178
	Train Accuracy: 0.284
	Val. Loss: 0.829 |  Val. PPL:   2.290
	Val Accuracy: 0.309
Epoch: 05 | Time: 1m 18s
	Train Loss: 0.681 | Train PPL:   1.975
	Train Accuracy: 0.330
	Val. Loss: 0.860 |  Val. PPL:   2.362
	Val Accuracy: 0.341
Epoch: 06 | Time: 1m 18s
	Train Loss: 0.606 | Train PPL:   1.833
	Train Accuracy: 0.373
	Val. Loss: 0.888 |  Val. PPL:   2.430
	Val Accuracy: 0.372
Epoch: 07 | Time: 1m 17s
	Train Loss: 0.554 | Train PPL:   1.740
	Train Accuracy: 0.405
	Val. Loss: 0.913 |  Val

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,14.0
train loss,0.31189
train PPL,1.366
train accuracy,0.59035
val loss,1.09596
valid PPL,2.99206
val accuracy,0.39927
_runtime,1196.0
_timestamp,1620844752.0
_step,14.0


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train loss,█▄▃▃▂▂▂▂▂▁▁▁▁▁▁
train PPL,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▂▃▄▅▅▆▆▆▇▇▇███
val loss,█▃▁▂▂▃▃▄▄▅▅▅▆▆▇
valid PPL,█▂▁▂▂▃▃▃▃▅▄▅▅▆▇
val accuracy,▁▃▅▆▆▇▇▇▇▇▇▇███
_runtime,▁▂▂▃▃▄▄▅▅▅▆▆▇▇█
_timestamp,▁▂▂▃▃▄▄▅▅▅▆▆▇▇█
_step,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█


[34m[1mwandb[0m: Agent Starting Run: xye116q4 with config:
[34m[1mwandb[0m: 	beam_search_size: 4
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.6303525822456986
[34m[1mwandb[0m: 	epochs: 14
[34m[1mwandb[0m: 	hidden_layer_size: 256
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 1
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.44932953528102826


  "num_layers={}".format(dropout, num_layers))


Epoch: 01 | Time: 1m 2s
	Train Loss: 2.186 | Train PPL:   8.898
	Train Accuracy: 0.033
	Val. Loss: 1.179 |  Val. PPL:   3.251
	Val Accuracy: 0.110
Epoch: 02 | Time: 1m 4s
	Train Loss: 1.062 | Train PPL:   2.891
	Train Accuracy: 0.155
	Val. Loss: 1.053 |  Val. PPL:   2.866
	Val Accuracy: 0.239
Epoch: 03 | Time: 1m 4s
	Train Loss: 0.839 | Train PPL:   2.314
	Train Accuracy: 0.230
	Val. Loss: 1.065 |  Val. PPL:   2.900
	Val Accuracy: 0.281
Epoch: 04 | Time: 1m 3s
	Train Loss: 0.732 | Train PPL:   2.080
	Train Accuracy: 0.278
	Val. Loss: 1.078 |  Val. PPL:   2.939
	Val Accuracy: 0.305
Epoch: 05 | Time: 1m 4s
	Train Loss: 0.658 | Train PPL:   1.931
	Train Accuracy: 0.315
	Val. Loss: 1.037 |  Val. PPL:   2.820
	Val Accuracy: 0.340
Epoch: 06 | Time: 1m 4s
	Train Loss: 0.603 | Train PPL:   1.827
	Train Accuracy: 0.344
	Val. Loss: 1.053 |  Val. PPL:   2.865
	Val Accuracy: 0.357
Epoch: 07 | Time: 1m 5s
	Train Loss: 0.556 | Train PPL:   1.744
	Train Accuracy: 0.376
	Val. Loss: 1.076 |  Val. PPL: 

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,13.0
train loss,0.35994
train PPL,1.43325
train accuracy,0.5221
val loss,1.14178
valid PPL,3.13234
val accuracy,0.37678
_runtime,904.0
_timestamp,1620845660.0
_step,13.0


0,1
epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇█
train loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁
train PPL,█▂▂▂▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▄▅▅▅▆▆▇▇▇███
val loss,█▂▂▃▁▂▃▃▃▂▃▅▅▆
valid PPL,█▂▂▃▁▂▃▃▃▂▃▅▅▆
val accuracy,▁▄▅▆▇▇████████
_runtime,▁▂▂▃▃▄▄▅▅▆▆▇▇█
_timestamp,▁▂▂▃▃▄▄▅▅▆▆▇▇█
_step,▁▂▂▃▃▄▄▅▅▆▆▇▇█


[34m[1mwandb[0m: Agent Starting Run: 0pbsf6yu with config:
[34m[1mwandb[0m: 	beam_search_size: 4
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3706694889169139
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_layer_size: 512
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 1
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5212485851830646


  "num_layers={}".format(dropout, num_layers))


Epoch: 01 | Time: 1m 11s
	Train Loss: 1.818 | Train PPL:   6.162
	Train Accuracy: 0.071
	Val. Loss: 1.048 |  Val. PPL:   2.851
	Val Accuracy: 0.191
Epoch: 02 | Time: 1m 11s
	Train Loss: 0.875 | Train PPL:   2.399
	Train Accuracy: 0.221
	Val. Loss: 1.058 |  Val. PPL:   2.880
	Val Accuracy: 0.285
Epoch: 03 | Time: 1m 10s
	Train Loss: 0.675 | Train PPL:   1.964
	Train Accuracy: 0.300
	Val. Loss: 1.015 |  Val. PPL:   2.760
	Val Accuracy: 0.336
Epoch: 04 | Time: 1m 9s
	Train Loss: 0.577 | Train PPL:   1.782
	Train Accuracy: 0.355
	Val. Loss: 1.040 |  Val. PPL:   2.828
	Val Accuracy: 0.349
Epoch: 05 | Time: 1m 11s
	Train Loss: 0.510 | Train PPL:   1.665
	Train Accuracy: 0.401
	Val. Loss: 1.022 |  Val. PPL:   2.779
	Val Accuracy: 0.380
Epoch: 06 | Time: 1m 11s
	Train Loss: 0.445 | Train PPL:   1.560
	Train Accuracy: 0.443
	Val. Loss: 1.045 |  Val. PPL:   2.845
	Val Accuracy: 0.383
Epoch: 07 | Time: 1m 9s
	Train Loss: 0.404 | Train PPL:   1.497
	Train Accuracy: 0.475
	Val. Loss: 1.091 |  Val. 

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,14.0
train loss,0.19863
train PPL,1.21973
train accuracy,0.68229
val loss,1.24819
valid PPL,3.48404
val accuracy,0.39858
_runtime,1085.0
_timestamp,1620846750.0
_step,14.0


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train loss,█▄▃▃▂▂▂▂▂▁▁▁▁▁▁
train PPL,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▄▄▅▅▆▆▆▇▇▇███
val loss,▂▂▁▂▁▂▃▄▃▅▆▇███
valid PPL,▂▂▁▂▁▂▃▃▃▅▆▇███
val accuracy,▁▄▆▆▇▇▇██▇█████
_runtime,▁▁▂▂▃▃▄▄▅▅▆▆▇▇█
_timestamp,▁▁▂▂▃▃▄▄▅▅▆▆▇▇█
_step,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█


[34m[1mwandb[0m: Agent Starting Run: kr2boabt with config:
[34m[1mwandb[0m: 	beam_search_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.5128859912270034
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_layer_size: 256
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.33569141820857873


Epoch: 01 | Time: 1m 22s
	Train Loss: 2.746 | Train PPL:  15.583
	Train Accuracy: 0.001
	Val. Loss: 1.910 |  Val. PPL:   6.750
	Val Accuracy: 0.005
Epoch: 02 | Time: 1m 24s
	Train Loss: 1.699 | Train PPL:   5.470
	Train Accuracy: 0.035
	Val. Loss: 1.115 |  Val. PPL:   3.049
	Val Accuracy: 0.106
Epoch: 03 | Time: 1m 19s
	Train Loss: 1.199 | Train PPL:   3.315
	Train Accuracy: 0.121
	Val. Loss: 1.055 |  Val. PPL:   2.872
	Val Accuracy: 0.243
Epoch: 04 | Time: 1m 14s
	Train Loss: 0.979 | Train PPL:   2.663
	Train Accuracy: 0.181
	Val. Loss: 1.062 |  Val. PPL:   2.893
	Val Accuracy: 0.259
Epoch: 05 | Time: 1m 17s
	Train Loss: 0.868 | Train PPL:   2.381
	Train Accuracy: 0.222
	Val. Loss: 1.044 |  Val. PPL:   2.840
	Val Accuracy: 0.300
Epoch: 06 | Time: 1m 15s
	Train Loss: 0.797 | Train PPL:   2.219
	Train Accuracy: 0.254
	Val. Loss: 1.081 |  Val. PPL:   2.947
	Val Accuracy: 0.309
Epoch: 07 | Time: 1m 15s
	Train Loss: 0.731 | Train PPL:   2.078
	Train Accuracy: 0.284
	Val. Loss: 1.041 |  Val

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,14.0
train loss,0.511
train PPL,1.66696
train accuracy,0.41446
val loss,1.07061
valid PPL,2.91716
val accuracy,0.41326
_runtime,1163.0
_timestamp,1620847918.0
_step,14.0


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train loss,█▅▃▂▂▂▂▂▁▁▁▁▁▁▁
train PPL,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▂▃▄▅▅▆▆▆▇▇▇███
val loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁
valid PPL,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val accuracy,▁▃▅▅▆▆▇▇▇▇▇████
_runtime,▁▂▂▃▃▄▄▅▅▆▆▇▇██
_timestamp,▁▂▂▃▃▄▄▅▅▆▆▇▇██
_step,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█


[34m[1mwandb[0m: Agent Starting Run: ayfuw1g0 with config:
[34m[1mwandb[0m: 	beam_search_size: 4
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3163892689028457
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layer_size: 512
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2755690541018389


Epoch: 01 | Time: 1m 41s
	Train Loss: 2.422 | Train PPL:  11.271
	Train Accuracy: 0.013
	Val. Loss: 1.241 |  Val. PPL:   3.459
	Val Accuracy: 0.066
Epoch: 02 | Time: 1m 40s
	Train Loss: 1.164 | Train PPL:   3.203
	Train Accuracy: 0.142
	Val. Loss: 0.859 |  Val. PPL:   2.361
	Val Accuracy: 0.217
Epoch: 03 | Time: 1m 36s
	Train Loss: 0.861 | Train PPL:   2.366
	Train Accuracy: 0.247
	Val. Loss: 0.907 |  Val. PPL:   2.476
	Val Accuracy: 0.320
Epoch: 04 | Time: 1m 34s
	Train Loss: 0.705 | Train PPL:   2.023
	Train Accuracy: 0.318
	Val. Loss: 0.975 |  Val. PPL:   2.651
	Val Accuracy: 0.327
Epoch: 05 | Time: 1m 36s
	Train Loss: 0.619 | Train PPL:   1.856
	Train Accuracy: 0.368
	Val. Loss: 0.944 |  Val. PPL:   2.570
	Val Accuracy: 0.368
Epoch: 06 | Time: 1m 34s
	Train Loss: 0.558 | Train PPL:   1.748
	Train Accuracy: 0.408
	Val. Loss: 0.998 |  Val. PPL:   2.714
	Val Accuracy: 0.373
Epoch: 07 | Time: 1m 34s
	Train Loss: 0.504 | Train PPL:   1.655
	Train Accuracy: 0.444
	Val. Loss: 0.971 |  Val

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,9.0
train loss,0.39572
train PPL,1.48545
train accuracy,0.52357
val loss,1.00769
valid PPL,2.73927
val accuracy,0.39559
_runtime,973.0
_timestamp,1620848896.0
_step,9.0


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train loss,█▄▃▂▂▂▁▁▁▁
train PPL,█▂▂▁▁▁▁▁▁▁
train accuracy,▁▃▄▅▆▆▇▇██
val loss,█▁▂▃▃▄▃▃▃▄
valid PPL,█▁▂▃▂▃▃▃▃▃
val accuracy,▁▄▆▆▇▇████
_runtime,▁▂▃▃▄▅▆▆▇█
_timestamp,▁▂▃▃▄▅▆▆▇█
_step,▁▂▃▃▄▅▆▆▇█


[34m[1mwandb[0m: Agent Starting Run: pdargzx0 with config:
[34m[1mwandb[0m: 	beam_search_size: 4
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.6874028282409137
[34m[1mwandb[0m: 	epochs: 9
[34m[1mwandb[0m: 	hidden_layer_size: 256
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 1
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.6838079996770052


  "num_layers={}".format(dropout, num_layers))


Epoch: 01 | Time: 1m 1s
	Train Loss: 2.093 | Train PPL:   8.113
	Train Accuracy: 0.040
	Val. Loss: 1.244 |  Val. PPL:   3.468
	Val Accuracy: 0.154
Epoch: 02 | Time: 1m 4s
	Train Loss: 0.935 | Train PPL:   2.548
	Train Accuracy: 0.180
	Val. Loss: 1.280 |  Val. PPL:   3.595
	Val Accuracy: 0.246
Epoch: 03 | Time: 1m 3s
	Train Loss: 0.724 | Train PPL:   2.062
	Train Accuracy: 0.257
	Val. Loss: 1.177 |  Val. PPL:   3.246
	Val Accuracy: 0.308
Epoch: 04 | Time: 1m 4s
	Train Loss: 0.610 | Train PPL:   1.840
	Train Accuracy: 0.307
	Val. Loss: 1.203 |  Val. PPL:   3.331
	Val Accuracy: 0.346
Epoch: 05 | Time: 1m 4s
	Train Loss: 0.547 | Train PPL:   1.727
	Train Accuracy: 0.342
	Val. Loss: 1.209 |  Val. PPL:   3.349
	Val Accuracy: 0.368
Epoch: 06 | Time: 1m 4s
	Train Loss: 0.497 | Train PPL:   1.644
	Train Accuracy: 0.374
	Val. Loss: 1.219 |  Val. PPL:   3.385
	Val Accuracy: 0.375
Epoch: 07 | Time: 1m 5s
	Train Loss: 0.457 | Train PPL:   1.579
	Train Accuracy: 0.405
	Val. Loss: 1.173 |  Val. PPL: 

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,8.0
train loss,0.40277
train PPL,1.49597
train accuracy,0.45043
val loss,1.18794
valid PPL,3.28032
val accuracy,0.3894
_runtime,583.0
_timestamp,1620849484.0
_step,8.0


0,1
epoch,▁▂▃▄▅▅▆▇█
train loss,█▃▂▂▂▁▁▁▁
train PPL,█▂▂▁▁▁▁▁▁
train accuracy,▁▃▅▆▆▇▇██
val loss,▆█▁▃▃▄▁▄▂
valid PPL,▆█▁▃▃▄▁▄▂
val accuracy,▁▄▅▇▇▇███
_runtime,▁▂▃▄▄▅▆▇█
_timestamp,▁▂▃▄▄▅▆▇█
_step,▁▂▃▄▅▅▆▇█


[34m[1mwandb[0m: Agent Starting Run: h3794x4v with config:
[34m[1mwandb[0m: 	beam_search_size: 4
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.6901369288202513
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	hidden_layer_size: 512
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.37735781077633646


Epoch: 01 | Time: 1m 33s
	Train Loss: 1.907 | Train PPL:   6.734
	Train Accuracy: 0.043
	Val. Loss: 0.980 |  Val. PPL:   2.664
	Val Accuracy: 0.154
Epoch: 02 | Time: 1m 34s
	Train Loss: 1.117 | Train PPL:   3.056
	Train Accuracy: 0.140
	Val. Loss: 0.836 |  Val. PPL:   2.307
	Val Accuracy: 0.200
Epoch: 03 | Time: 1m 33s
	Train Loss: 0.976 | Train PPL:   2.653
	Train Accuracy: 0.181
	Val. Loss: 0.787 |  Val. PPL:   2.198
	Val Accuracy: 0.235
Epoch: 04 | Time: 1m 34s
	Train Loss: 0.904 | Train PPL:   2.468
	Train Accuracy: 0.209
	Val. Loss: 0.761 |  Val. PPL:   2.140
	Val Accuracy: 0.236
Epoch: 05 | Time: 1m 32s
	Train Loss: 0.852 | Train PPL:   2.344
	Train Accuracy: 0.230
	Val. Loss: 0.756 |  Val. PPL:   2.130
	Val Accuracy: 0.258
Epoch: 06 | Time: 1m 31s
	Train Loss: 0.817 | Train PPL:   2.264
	Train Accuracy: 0.245
	Val. Loss: 0.746 |  Val. PPL:   2.108
	Val Accuracy: 0.263
Epoch: 07 | Time: 1m 31s
	Train Loss: 0.786 | Train PPL:   2.195
	Train Accuracy: 0.257
	Val. Loss: 0.766 |  Val

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,6.0
train loss,0.78599
train PPL,2.19458
train accuracy,0.25681
val loss,0.76594
valid PPL,2.15102
val accuracy,0.25195
_runtime,653.0
_timestamp,1620850142.0
_step,6.0


0,1
epoch,▁▂▃▅▆▇█
train loss,█▃▂▂▁▁▁
train PPL,█▂▂▁▁▁▁
train accuracy,▁▄▆▆▇██
val loss,█▄▂▁▁▁▂
valid PPL,█▄▂▁▁▁▂
val accuracy,▁▄▆▆██▇
_runtime,▁▂▃▅▆▇█
_timestamp,▁▂▃▅▆▇█
_step,▁▂▃▅▆▇█


[34m[1mwandb[0m: Agent Starting Run: ldac1jfk with config:
[34m[1mwandb[0m: 	beam_search_size: 4
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.5804100617503596
[34m[1mwandb[0m: 	epochs: 14
[34m[1mwandb[0m: 	hidden_layer_size: 256
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2785522200213755


Epoch: 01 | Time: 1m 32s
	Train Loss: 2.825 | Train PPL:  16.856
	Train Accuracy: 0.001
	Val. Loss: 2.005 |  Val. PPL:   7.426
	Val Accuracy: 0.002
Epoch: 02 | Time: 1m 28s
	Train Loss: 1.832 | Train PPL:   6.248
	Train Accuracy: 0.026
	Val. Loss: 1.200 |  Val. PPL:   3.321
	Val Accuracy: 0.062
Epoch: 03 | Time: 1m 21s
	Train Loss: 1.318 | Train PPL:   3.735
	Train Accuracy: 0.090
	Val. Loss: 1.008 |  Val. PPL:   2.741
	Val Accuracy: 0.116
Epoch: 04 | Time: 1m 26s
	Train Loss: 1.119 | Train PPL:   3.061
	Train Accuracy: 0.145
	Val. Loss: 0.874 |  Val. PPL:   2.396
	Val Accuracy: 0.190
Epoch: 05 | Time: 1m 21s
	Train Loss: 1.001 | Train PPL:   2.721
	Train Accuracy: 0.182
	Val. Loss: 0.875 |  Val. PPL:   2.399
	Val Accuracy: 0.250
Epoch: 06 | Time: 1m 20s
	Train Loss: 0.888 | Train PPL:   2.431
	Train Accuracy: 0.221
	Val. Loss: 0.931 |  Val. PPL:   2.537
	Val Accuracy: 0.305
Epoch: 07 | Time: 1m 17s
	Train Loss: 0.814 | Train PPL:   2.257
	Train Accuracy: 0.247
	Val. Loss: 0.957 |  Val

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,13.0
train loss,0.58741
train PPL,1.79933
train accuracy,0.37307
val loss,0.99345
valid PPL,2.70054
val accuracy,0.37517
_runtime,1134.0
_timestamp,1620851281.0
_step,13.0


0,1
epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇█
train loss,█▅▃▃▂▂▂▂▁▁▁▁▁▁
train PPL,█▃▂▂▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▁▃▄▄▅▆▆▇▇▇███
val loss,█▃▂▁▁▁▂▂▁▂▂▂▂▂
valid PPL,█▂▁▁▁▁▁▁▁▁▁▁▁▁
val accuracy,▁▂▃▄▅▆▇▇▇▇▇███
_runtime,▁▂▂▃▃▄▄▅▅▆▆▇▇█
_timestamp,▁▂▂▃▃▄▄▅▅▆▆▇▇█
_step,▁▂▂▃▃▄▄▅▅▆▆▇▇█


[34m[1mwandb[0m: Agent Starting Run: wyzlovjw with config:
[34m[1mwandb[0m: 	beam_search_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.30590148758973007
[34m[1mwandb[0m: 	epochs: 9
[34m[1mwandb[0m: 	hidden_layer_size: 256
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.46174312442511345


Epoch: 01 | Time: 1m 42s
	Train Loss: 2.736 | Train PPL:  15.432
	Train Accuracy: 0.002
	Val. Loss: 1.706 |  Val. PPL:   5.505
	Val Accuracy: 0.008
Epoch: 02 | Time: 1m 23s
	Train Loss: 1.511 | Train PPL:   4.532
	Train Accuracy: 0.067
	Val. Loss: 1.226 |  Val. PPL:   3.408
	Val Accuracy: 0.161
Epoch: 03 | Time: 1m 18s
	Train Loss: 0.986 | Train PPL:   2.681
	Train Accuracy: 0.182
	Val. Loss: 1.139 |  Val. PPL:   3.123
	Val Accuracy: 0.258
Epoch: 04 | Time: 1m 16s
	Train Loss: 0.805 | Train PPL:   2.237
	Train Accuracy: 0.246
	Val. Loss: 1.100 |  Val. PPL:   3.003
	Val Accuracy: 0.321
Epoch: 05 | Time: 1m 18s
	Train Loss: 0.714 | Train PPL:   2.043
	Train Accuracy: 0.288
	Val. Loss: 1.106 |  Val. PPL:   3.022
	Val Accuracy: 0.341
Epoch: 06 | Time: 1m 18s
	Train Loss: 0.649 | Train PPL:   1.914
	Train Accuracy: 0.318
	Val. Loss: 1.047 |  Val. PPL:   2.850
	Val Accuracy: 0.359
Epoch: 07 | Time: 1m 18s
	Train Loss: 0.600 | Train PPL:   1.823
	Train Accuracy: 0.348
	Val. Loss: 1.073 |  Val

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,8.0
train loss,0.52669
train PPL,1.69333
train accuracy,0.39146
val loss,1.07425
valid PPL,2.92779
val accuracy,0.4011
_runtime,734.0
_timestamp,1620852020.0
_step,8.0


0,1
epoch,▁▂▃▄▅▅▆▇█
train loss,█▄▂▂▂▁▁▁▁
train PPL,█▂▂▁▁▁▁▁▁
train accuracy,▁▂▄▅▆▇▇██
val loss,█▃▂▂▂▁▁▁▁
valid PPL,█▂▂▁▁▁▁▁▁
val accuracy,▁▄▅▇▇▇███
_runtime,▁▂▃▄▅▅▆▇█
_timestamp,▁▂▃▄▅▅▆▇█
_step,▁▂▃▄▅▅▆▇█


[34m[1mwandb[0m: Agent Starting Run: 996tqiqb with config:
[34m[1mwandb[0m: 	beam_search_size: 2
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.38245465291467134
[34m[1mwandb[0m: 	epochs: 14
[34m[1mwandb[0m: 	hidden_layer_size: 256
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.44424613228301557


Epoch: 01 | Time: 1m 42s
	Train Loss: 2.769 | Train PPL:  15.939
	Train Accuracy: 0.001
	Val. Loss: 1.901 |  Val. PPL:   6.693
	Val Accuracy: 0.004
Epoch: 02 | Time: 1m 22s
	Train Loss: 1.750 | Train PPL:   5.756
	Train Accuracy: 0.033
	Val. Loss: 1.280 |  Val. PPL:   3.596
	Val Accuracy: 0.114
Epoch: 03 | Time: 1m 16s
	Train Loss: 1.166 | Train PPL:   3.208
	Train Accuracy: 0.126
	Val. Loss: 1.135 |  Val. PPL:   3.111
	Val Accuracy: 0.230
Epoch: 04 | Time: 1m 13s
	Train Loss: 0.942 | Train PPL:   2.564
	Train Accuracy: 0.193
	Val. Loss: 1.094 |  Val. PPL:   2.987
	Val Accuracy: 0.287
Epoch: 05 | Time: 1m 14s
	Train Loss: 0.811 | Train PPL:   2.250
	Train Accuracy: 0.246
	Val. Loss: 1.097 |  Val. PPL:   2.994
	Val Accuracy: 0.320
Epoch: 06 | Time: 1m 16s
	Train Loss: 0.730 | Train PPL:   2.075
	Train Accuracy: 0.279
	Val. Loss: 1.110 |  Val. PPL:   3.036
	Val Accuracy: 0.342
Epoch: 07 | Time: 1m 15s
	Train Loss: 0.672 | Train PPL:   1.959
	Train Accuracy: 0.311
	Val. Loss: 1.043 |  Val

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,13.0
train loss,0.44861
train PPL,1.56614
train accuracy,0.44946
val loss,1.13089
valid PPL,3.09842
val accuracy,0.41625
_runtime,1096.0
_timestamp,1620853121.0
_step,13.0


0,1
epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇█
train loss,█▅▃▂▂▂▂▂▁▁▁▁▁▁
train PPL,█▃▂▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▁▃▄▅▅▆▆▇▇▇▇██
val loss,█▃▂▁▁▂▁▁▁▁▂▁▁▂
valid PPL,█▂▁▁▁▁▁▁▁▁▁▁▁▁
val accuracy,▁▃▅▆▆▇▇▇██████
_runtime,▁▂▂▃▃▄▄▅▅▆▆▇▇█
_timestamp,▁▂▂▃▃▄▄▅▅▆▆▇▇█
_step,▁▂▂▃▃▄▄▅▅▆▆▇▇█


[34m[1mwandb[0m: Agent Starting Run: 8qs4zadd with config:
[34m[1mwandb[0m: 	beam_search_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3406860935726627
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_layer_size: 256
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 2
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3162365032802797


Epoch: 01 | Time: 1m 18s
	Train Loss: 2.627 | Train PPL:  13.831
	Train Accuracy: 0.005
	Val. Loss: 1.456 |  Val. PPL:   4.287
	Val Accuracy: 0.028
Epoch: 02 | Time: 1m 11s
	Train Loss: 1.309 | Train PPL:   3.701
	Train Accuracy: 0.099
	Val. Loss: 1.118 |  Val. PPL:   3.059
	Val Accuracy: 0.175
Epoch: 03 | Time: 1m 11s
	Train Loss: 0.976 | Train PPL:   2.653
	Train Accuracy: 0.190
	Val. Loss: 1.060 |  Val. PPL:   2.888
	Val Accuracy: 0.272
Epoch: 04 | Time: 1m 10s
	Train Loss: 0.832 | Train PPL:   2.298
	Train Accuracy: 0.246
	Val. Loss: 0.986 |  Val. PPL:   2.679
	Val Accuracy: 0.320
Epoch: 05 | Time: 1m 11s
	Train Loss: 0.741 | Train PPL:   2.098
	Train Accuracy: 0.290
	Val. Loss: 0.991 |  Val. PPL:   2.693
	Val Accuracy: 0.329
Epoch: 06 | Time: 1m 11s
	Train Loss: 0.670 | Train PPL:   1.955
	Train Accuracy: 0.327
	Val. Loss: 0.996 |  Val. PPL:   2.708
	Val Accuracy: 0.340
Epoch: 07 | Time: 1m 10s
	Train Loss: 0.617 | Train PPL:   1.853
	Train Accuracy: 0.360
	Val. Loss: 1.021 |  Val

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,14.0
train loss,0.37667
train PPL,1.45743
train accuracy,0.52755
val loss,1.07732
valid PPL,2.93679
val accuracy,0.40799
_runtime,1085.0
_timestamp,1620854212.0
_step,14.0


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁
train PPL,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▂▃▄▅▅▆▆▆▇▇▇███
val loss,█▃▂▁▁▁▂▂▁▁▂▂▂▂▂
valid PPL,█▃▂▁▁▁▁▁▁▁▂▂▂▂▂
val accuracy,▁▄▆▆▇▇▇▇███████
_runtime,▁▂▂▂▃▃▄▄▅▅▆▆▇▇█
_timestamp,▁▂▂▂▃▃▄▄▅▅▆▆▇▇█
_step,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█


[34m[1mwandb[0m: Agent Starting Run: xkfmrhf7 with config:
[34m[1mwandb[0m: 	beam_search_size: 2
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2712930749063079
[34m[1mwandb[0m: 	epochs: 14
[34m[1mwandb[0m: 	hidden_layer_size: 256
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7349997779313252


Epoch: 01 | Time: 1m 26s
	Train Loss: 2.029 | Train PPL:   7.607
	Train Accuracy: 0.039
	Val. Loss: 1.036 |  Val. PPL:   2.819
	Val Accuracy: 0.111
Epoch: 02 | Time: 1m 19s
	Train Loss: 0.992 | Train PPL:   2.697
	Train Accuracy: 0.177
	Val. Loss: 1.078 |  Val. PPL:   2.937
	Val Accuracy: 0.277
Epoch: 03 | Time: 1m 17s
	Train Loss: 0.737 | Train PPL:   2.089
	Train Accuracy: 0.263
	Val. Loss: 1.135 |  Val. PPL:   3.111
	Val Accuracy: 0.320
Epoch: 04 | Time: 1m 18s
	Train Loss: 0.624 | Train PPL:   1.866
	Train Accuracy: 0.310
	Val. Loss: 1.200 |  Val. PPL:   3.322
	Val Accuracy: 0.338
Epoch: 05 | Time: 1m 17s
	Train Loss: 0.558 | Train PPL:   1.746
	Train Accuracy: 0.347
	Val. Loss: 1.163 |  Val. PPL:   3.201
	Val Accuracy: 0.374
Epoch: 06 | Time: 1m 17s
	Train Loss: 0.508 | Train PPL:   1.663
	Train Accuracy: 0.374
	Val. Loss: 1.170 |  Val. PPL:   3.222
	Val Accuracy: 0.375
Epoch: 07 | Time: 1m 13s
	Train Loss: 0.468 | Train PPL:   1.596
	Train Accuracy: 0.402
	Val. Loss: 1.187 |  Val

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,13.0
train loss,0.33415
train PPL,1.39675
train accuracy,0.51432
val loss,1.25068
valid PPL,3.49272
val accuracy,0.41831
_runtime,1092.0
_timestamp,1620855312.0
_step,13.0


0,1
epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇█
train loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁
train PPL,█▂▂▂▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▄▅▆▆▆▇▇▇▇███
val loss,▁▂▄▆▅▅▆▅▇█▇▇▇█
valid PPL,▁▂▄▆▅▅▆▅▇█▇▇▇█
val accuracy,▁▅▆▆▇▇▇█▇█████
_runtime,▁▂▂▃▃▄▄▅▅▆▆▇▇█
_timestamp,▁▂▂▃▃▄▄▅▅▆▆▇▇█
_step,▁▂▂▃▃▄▄▅▅▆▆▇▇█


[34m[1mwandb[0m: Agent Starting Run: 3vscmtfx with config:
[34m[1mwandb[0m: 	beam_search_size: 4
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.27909888445142167
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_layer_size: 256
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2690853302397402


Epoch: 01 | Time: 1m 38s
	Train Loss: 2.782 | Train PPL:  16.157
	Train Accuracy: 0.001
	Val. Loss: 1.874 |  Val. PPL:   6.515
	Val Accuracy: 0.002
Epoch: 02 | Time: 1m 27s
	Train Loss: 1.714 | Train PPL:   5.554
	Train Accuracy: 0.036
	Val. Loss: 1.176 |  Val. PPL:   3.241
	Val Accuracy: 0.066
Epoch: 03 | Time: 1m 28s
	Train Loss: 1.196 | Train PPL:   3.308
	Train Accuracy: 0.124
	Val. Loss: 1.013 |  Val. PPL:   2.753
	Val Accuracy: 0.188
Epoch: 04 | Time: 1m 23s
	Train Loss: 0.956 | Train PPL:   2.601
	Train Accuracy: 0.199
	Val. Loss: 1.018 |  Val. PPL:   2.767
	Val Accuracy: 0.279
Epoch: 05 | Time: 1m 20s
	Train Loss: 0.834 | Train PPL:   2.302
	Train Accuracy: 0.250
	Val. Loss: 0.994 |  Val. PPL:   2.702
	Val Accuracy: 0.307
Epoch: 06 | Time: 1m 19s
	Train Loss: 0.744 | Train PPL:   2.105
	Train Accuracy: 0.291
	Val. Loss: 0.999 |  Val. PPL:   2.715
	Val Accuracy: 0.335
Epoch: 07 | Time: 1m 20s
	Train Loss: 0.690 | Train PPL:   1.994
	Train Accuracy: 0.320
	Val. Loss: 0.946 |  Val

In [None]:
# Register the sweep definition with W&B under this model's "<modelName>_Sweep_1"
# project and keep the returned sweep ID for the agent call.
sweep_id = wandb.sweep(
    sweep=sweep_config,
    project=modelName + "_Sweep_1",
)

Create sweep with ID: ms9uhn1q
Sweep URL: https://wandb.ai/rayanz/Encoder_Decoder_attention_Dakshina_Hi_Sweep_1/sweeps/ms9uhn1q


In [None]:
# Launch an agent for the sweep: it calls `sp_train` once per configuration it
# receives. No `count` is given, so it runs until the sweep ends or the cell is interrupted.
wandb.agent(sweep_id, function=sp_train)

[34m[1mwandb[0m: Agent Starting Run: va572bml with config:
[34m[1mwandb[0m: 	beam_search_size: 4
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_layer_size: 512
[34m[1mwandb[0m: 	no_encoder_decoder_layers: 2
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch: 01 | Time: 1m 33s
	Train Loss: 2.128 | Train PPL:   8.400
	Train Accuracy: 0.038
	Val. Loss: 1.072 |  Val. PPL:   2.921
	Val Accuracy: 0.100


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
