## Importing libraries and training data

In [9]:
# Install required packages
!pip install wandb
!pip install GPUtil



In [10]:
import zipfile
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import gc
import random
import math
import wandb
from GPUtil import showUtilization as gpu_usage
from numba import cuda

wandb.login(key='4734e60951ce310dbe17484eeeb5b3366b54850f')

# zip_file_path = '/kaggle/input/aksharantar-sampled/aksharantar_sampled.zip'
# extracted_folder_path = '/kaggle/input/aksharantar-sampled'

# with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#     zip_ref.extractall(extracted_folder_path)

# extracted_folder_contents = os.listdir(extracted_folder_path)
# print("Contents of extracted folder:", extracted_folder_contents)

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Trained on: " + str(device))

train_dataset = pd.read_csv('/kaggle/input/aksharantar-sampled/aksharantar_sampled/hin/hin_train.csv', names=['English', 'Hindi'], header=None)
test_dataset = pd.read_csv('/kaggle/input/aksharantar-sampled/aksharantar_sampled/hin/hin_test.csv', names=['English', 'Hindi'], header=None)
val_dataset = pd.read_csv('/kaggle/input/aksharantar-sampled/aksharantar_sampled/hin/hin_valid.csv', names=['English', 'Hindi'], header=None)


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Trained on: cuda




In [11]:
def clear_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()
    torch.cuda.empty_cache()
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)
    print("GPU Usage after emptying the cache")
    gpu_usage()

def split_into_tokens(word):
    tokens = []
    for x in word:
        tokens.append(x)
    return tokens

def encode_english(word):
    tokens = []
    for x in word:
        tokens.append(eng_dict[x])
    for x in range(len(tokens), max_english_length):
        tokens.append(eng_dict['<pad>'])
    return tokens

def encode_hindi(word):
    tokens = []
    for x in word:
        tokens.append(hin_dict[x])
    tokens.append(hin_dict['<eow>'])
    for x in range(len(tokens), max_hindi_length + 1):
        tokens.append(hin_dict['<pad>'])
    return tokens

def encode_test_english(word):
    tokens = []
    for x in word:
        tokens.append(eng_dict[x])
    for x in range(len(tokens), test_max_english_length):
        tokens.append(eng_dict['<pad>'])
    return tokens

def encode_test_hindi(word):
    tokens = []
    for x in word:
        tokens.append(hin_dict[x])
    tokens.append(hin_dict['<eow>'])
    for x in range(len(tokens), test_max_hindi_length):
        tokens.append(hin_dict['<pad>'])
    return tokens

def encode_val_english(word):
    tokens = []
    for x in word:
        tokens.append(eng_dict[x])
    for x in range(len(tokens), val_max_english_length):
        tokens.append(eng_dict['<pad>'])
    return tokens

def encode_val_hindi(word):
    tokens = []
    for x in word:
        tokens.append(hin_dict[x])
    tokens.append(hin_dict['<eow>'])
    for x in range(len(tokens), val_max_hindi_length):
        tokens.append(hin_dict['<pad>'])
    return tokens

def get_word(characters):
    return "".join(characters)

def calculate_accuracy(target, predictions, flag):
    total = 0
    for x in range(len(target)):
        if torch.equal(target[x], predictions[x]):
            total += 1
    return total

def translate_predictions(target, predictions, df):
    i = len(df)
    for x in range(len(predictions)):
        original = []
        for y in target[x]:
            if y != 1:
                original.append(y)
            else:
                break
        predicted = []
        for y in predictions[x]:
            if y != 1:
                predicted.append(y)
            else:
                break
        df.loc[i, ['Original']] = get_word([reverse_hin[x.item()] for x in original])
        df.loc[i, ['Predicted']] = get_word([reverse_hin[x.item()] for x in predicted])
        i += 1
    return df

def valevaluate_att(attention,val_eng_word,val_hin_word,encoder,decoder,batchsize,hidden_size,char_embed_size,no_of_layers):
    with torch.no_grad():
        total_loss = 0
        total_acc = 0
        for x in range(0,len(val_dataset),batchsize):
            loss = 0
            input_tensor = val_eng_word[x:x+batchsize].to(device)
#             en_hidden = torch.zeros(2*no_of_layers,batchsize,hidden_size).to(device)
            if(input_tensor.size()[0] < batchsize):
                break
            en_hidden = torch.zeros(2*no_of_layers,batchsize,hidden_size).to(device)
            en_cell = torch.zeros(2*no_of_layers,batchsize,hidden_size).to(device)
            output,(hidden,cell) = encoder.forward(input_tensor,en_hidden,en_cell)
            del(input_tensor)
            del(en_hidden)
            del(en_cell)
            output = torch.split(output,[hidden_size,hidden_size],dim = 2)
            output = torch.add(output[0],output[1])/2
            input2 = []
            for y in range(batchsize):
                input2.append([0])
            input2 = torch.tensor(input2).to(device)
            hidden = hidden.resize(2,no_of_layers,batchsize,hidden_size)
            hidden1 = torch.add(hidden[0],hidden[1])/2
#             hidden1 = hidden[0]
            cell = cell.resize(2,no_of_layers,batchsize,hidden_size)
            cell1 = torch.add(cell[0],cell[1])/2
#             cell1 = cell[0]
            OGhidden = hidden1
            predicted = []
            predictions = []
            if(attention == True):
                temp = output
            else:
                temp = OGhidden
            for i in range(val_max_hindi_length):
                output1,(hidden1,cell1) = decoder.forward(input2,hidden1,cell1,temp,False)
                predicted.append(output1)
                output2 = decoder.softmax(output1)
                output3 = torch.argmax(output2,dim = 2)
                predictions.append(output3)
                input2 = output3
            predicted = torch.cat(tuple(x for x in predicted),dim =1).to(device).resize(val_max_hindi_length*batchsize,len(hin_dict))
            predictions = torch.cat(tuple(x for x in predictions),dim =1).to(device)
            total_acc += calculate_accuracy(val_hin_word[x:x+batchsize].to(device),predictions,x)
            loss  = nn.CrossEntropyLoss(reduction = 'sum')(predicted,val_hin_word[x:x+batchsize].reshape(-1).to(device))
            with torch.no_grad():
                total_loss += loss.item()
#             print(loss.item())
        validation_loss = total_loss/(len(val_dataset)*val_max_hindi_length)
        validation_accuracy = (total_acc/len(val_dataset))*100
        del(predictions)
        del(predicted)
        del(input2)
        del(output1)
        del(output2)
        del(output3)
        del(hidden1)
        del(cell1)
        del(OGhidden)
        del(output)
        del(cell)
        return validation_loss,validation_accuracy

In [12]:
split_into_tokens(train_dataset.iloc[0]['Hindi'])

max_english_length = 0
max_hindi_length = 0
test_max_english_length = 0
test_max_hindi_length = 0

for x in range(len(test_dataset)):
    temp = 0
    for y in test_dataset.iloc[x]['English']:
        temp += 1
    test_max_english_length = max(test_max_english_length, temp)

for x in range(len(test_dataset)):
    temp = 0
    for y in test_dataset.iloc[x]['Hindi']:
        temp += 1
    test_max_hindi_length = max(test_max_hindi_length, temp)

val_max_english_length = 0
val_max_hindi_length = 0

for x in range(len(val_dataset)):
    temp = 0
    for y in val_dataset.iloc[x]['English']:
        temp += 1
    val_max_english_length = max(val_max_english_length, temp)

for x in range(len(val_dataset)):
    temp = 0
    for y in val_dataset.iloc[x]['Hindi']:
        temp += 1
    val_max_hindi_length = max(val_max_hindi_length, temp)

english_vocab = []
for x in range(len(train_dataset)):
    temp = 0
    for y in train_dataset.iloc[x]['English']:
        temp += 1
        if y not in english_vocab:
            english_vocab.append(y)
    if temp > max_english_length:
        max_english_length = max(max_english_length, temp)

hindi_vocab = []
for x in range(len(train_dataset)):
    temp = 0
    for y in train_dataset.iloc[x]['Hindi']:
        temp += 1
        if y not in hindi_vocab:
            hindi_vocab.append(y)
    max_hindi_length = max(temp, max_hindi_length)
for x in range(len(test_dataset)):
    for y in test_dataset.iloc[x]['Hindi']:
        if y not in hindi_vocab:
            hindi_vocab.append(y)

english_vocab = sorted(english_vocab)
hindi_vocab = sorted(hindi_vocab)

eng_dict = {}
reverse_eng = {}

for x in range(len(english_vocab)):
    eng_dict[english_vocab[x]] = x + 3
    reverse_eng[x + 3] = english_vocab[x]
eng_dict['<sow>'] = 0
eng_dict['<eow>'] = 1
eng_dict['<pad>'] = 2
reverse_eng[0] = '<sow>'
reverse_eng[1] = '<eow>'
reverse_eng[2] = '<pad>'

hin_dict = {}
reverse_hin = {}
for x in range(len(hindi_vocab)):
    hin_dict[hindi_vocab[x]] = x + 3
    reverse_hin[x + 3] = hindi_vocab[x]
hin_dict['<sow>'] = 0
hin_dict['<eow>'] = 1
hin_dict['<pad>'] = 2
reverse_hin[0] = '<sow>'
reverse_hin[1] = '<eow>'
reverse_hin[2] = '<pad>'

encode_english(train_dataset.iloc[0]['English'])

eng_words = []
hin_words = []
for x in range(len(train_dataset)):
    eng_words.append(encode_english(train_dataset.iloc[x]['English']))
    hin_words.append(encode_hindi(train_dataset.iloc[x]['Hindi']))
eng_words = torch.tensor(eng_words)
hin_words = torch.tensor(hin_words)
max_hindi_length

max_hindi_length += 1
test_max_hindi_length += 1
val_max_hindi_length += 1
max_hindi_length

val_eng_words = []
val_hin_words = []
for x in range(len(val_dataset)):
    val_eng_words.append(encode_val_english(val_dataset.iloc[x]['English']))
    val_hin_words.append(encode_val_hindi(val_dataset.iloc[x]['Hindi']))
val_eng_words = torch.tensor(val_eng_words)
val_hin_words = torch.tensor(val_hin_words)

test_eng_words = []
test_hin_words = []
for x in range(len(test_dataset)):
    test_eng_words.append(encode_test_english(test_dataset.iloc[x]['English']))
    test_hin_words.append(encode_test_hindi(test_dataset.iloc[x]['Hindi']))
test_eng_words = torch.tensor(test_eng_words)
test_hin_words = torch.tensor(test_hin_words)

In [13]:
class Encoder(nn.Module):
    def __init__(self, char_embed_size, hidden_size, no_of_layers, dropout, rnn):
        super(Encoder, self).__init__()
        self.layer = no_of_layers
        self.rnn = rnn
        self.embedding = nn.Embedding(len(eng_dict), char_embed_size).to(device)
        self.embedding.weight.requires_grad = True
        self.drop = nn.Dropout(dropout)
        self.LSTM = nn.LSTM(char_embed_size, hidden_size, self.layer, batch_first=True, bidirectional=True).to(device)
        self.RNN = nn.RNN(char_embed_size, hidden_size, self.layer, batch_first=True, bidirectional=True).to(device)
        self.GRU = nn.GRU(char_embed_size, hidden_size, self.layer, batch_first=True, bidirectional=True).to(device)

    def forward(self, input, hidden, cell):
        embedded = self.embedding(input)
        embedded1 = self.drop(embedded)
        cell1 = cell
        if self.rnn == 'RNN':
            output, hidden1 = self.RNN(embedded1, hidden)
        elif self.rnn == 'LSTM':
            output, (hidden1, cell1) = self.LSTM(embedded1, (hidden, cell))
        elif self.rnn == 'GRU':
            output, hidden1 = self.GRU(embedded1, hidden)
        return output, (hidden1, cell1)

class DecoderWithAttention(nn.Module):
    def __init__(self,char_embed_size,hidden_size,no_of_layers,dropout,batchsize,rnn):
        super(DecoderWithAttention,self).__init__()
        self.layer = no_of_layers
        self.batchsize = batchsize
        self.hidden_size = hidden_size
        self.rnn = rnn
        self.embedding = nn.Embedding(len(hin_dict),char_embed_size).to(device)
        self.drop = nn.Dropout(dropout)
        self.embedding.weight.requires_grad = True
        self.U = nn.Linear(hidden_size,hidden_size,bias = False).to(device)
        self.W = nn.Linear(hidden_size,hidden_size,bias = False).to(device)
        self.V = nn.Linear(hidden_size,1,bias = False).to(device)

        self.LSTM = nn.LSTM(char_embed_size + hidden_size,hidden_size,self.layer,batch_first = True).to(device)
        self.RNN = nn.RNN(char_embed_size + hidden_size,hidden_size,self.layer,batch_first = True).to(device)
        self.GRU = nn.GRU(char_embed_size + hidden_size,hidden_size,self.layer,batch_first = True).to(device)
        self.linear = nn.Linear(hidden_size,len(hin_dict),bias=True).to(device)
        self.softmax = nn.Softmax(dim = 2).to(device)
    def forward(self,input,hidden,cell,encoder_outputs,matrix):
        embedded = self.embedding(input)
        temp1 = self.U(encoder_outputs)
        temp2 = self.W(hidden[-1])
        s1 = temp2.size()[0]
        s2 = temp2.size()[1]
        add = temp1 + temp2.resize(s1,1,s2)
        tanh = F.tanh(add)
        ejt = self.V(tanh)
        ajt = nn.Softmax(dim = 1)(ejt)
        ct = torch.zeros(self.batchsize,1,self.hidden_size).to(device)
        ct = torch.bmm(ajt.transpose(1,2),encoder_outputs)
        final_input = torch.cat((embedded,ct),dim = 2)
        final_input = self.drop(final_input)
        cell1 = cell
        if(self.rnn == 'LSTM'):
            output,(hidden1,cell1) = self.LSTM(final_input,(hidden,cell))
        elif(self.rnn == 'RNN'):
            output,hidden1 = self.RNN(final_input,hidden)
        elif(self.rnn == 'GRU'):
            output,hidden1 = self.GRU(final_input,hidden)
        output1 = self.linear(output)
        if(matrix == True):
            return ajt,output1,(hidden1,cell1)
        return output1,(hidden1,cell1)


def val_evaluate_attention(attention, val_eng_words, val_hin_words, encoder, decoder, batch_size, hidden_size, char_embed_size, no_of_layers):
    with torch.no_grad():
        total_loss = 0
        total_acc = 0
        for x in range(0, len(val_dataset), batch_size):
            loss = 0
            input_tensor = val_eng_words[x:x + batch_size].to(device)
            if input_tensor.size()[0] < batch_size:
                break
            en_hidden = torch.zeros(2 * no_of_layers, batch_size, hidden_size).to(device)
            en_cell = torch.zeros(2 * no_of_layers, batch_size, hidden_size).to(device)
            output, (hidden, cell) = encoder.forward(input_tensor, en_hidden, en_cell)
            del input_tensor
            del en_hidden
            del en_cell
            output = torch.split(output, [hidden_size, hidden_size], dim=2)
            output = torch.add(output[0], output[1]) / 2
            input2 = []
            for y in range(batch_size):
                input2.append([0])
            input2 = torch.tensor(input2).to(device)
            hidden = hidden.resize(2, no_of_layers, batch_size, hidden_size)
            hidden1 = torch.add(hidden[0], hidden[1]) / 2
            cell = cell.resize(2, no_of_layers, batch_size, hidden_size)
            cell1 = torch.add(cell[0], cell[1]) / 2
            OGhidden = hidden1
            predicted = []
            predictions = []
            if attention:
                temp = output
            else:
                temp = OGhidden
            for i in range(val_max_hindi_length):
                output1, (hidden1, cell1) = decoder.forward(input2, hidden1, cell1, temp, False)
                predicted.append(output1)
                output2 = decoder.softmax(output1)
                output3 = torch.argmax(output2, dim=2)
                predictions.append(output3)
                input2 = output3
            predicted = torch.cat(tuple(x for x in predicted), dim=1).to(device).resize(val_max_hindi_length * batch_size, len(hin_dict))
            predictions = torch.cat(tuple(x for x in predictions), dim=1).to(device)
            total_acc += calculate_accuracy(val_hin_words[x:x + batch_size].to(device), predictions, x)
            loss = nn.CrossEntropyLoss(reduction='sum')(predicted, val_hin_words[x:x + batch_size].reshape(-1).to(device))
            with torch.no_grad():
                total_loss += loss.item()
        validation_loss = total_loss / (len(val_dataset) * val_max_hindi_length)
        validation_accuracy = (total_acc / len(val_dataset)) * 100
        del predictions
        del predicted
        del input2
        del output1
        del output2
        del output3
        del hidden1
        del cell1
        del OGhidden
        del output
        del cell
        return validation_loss, validation_accuracy

In [14]:
def attention_train(batchsize,hidden_size,char_embed_size,no_of_layers,dropout,epochs,rnn):
    gc.collect()
    torch.autograd.set_detect_anomaly(True)
    encoder = Encoder(char_embed_size,hidden_size,no_of_layers,dropout,rnn).to(device)
    decoder = DecoderWithAttention(char_embed_size,hidden_size,no_of_layers,dropout,batchsize,rnn).to(device)
    # print(encoder.parameters)
    # print(decoder.parameters)
    opt_encoder = optim.Adam(encoder.parameters(),lr = 0.001)
    opt_decoder  = optim.Adam(decoder.parameters(),lr = 0.001)
    teacher_ratio = 0.5
    epoch_count = 0
    for _ in range(epochs):
        torch.cuda.empty_cache()
        total_loss = 0
        total_acc = 0
        for x in range(0,len(train_dataset),batchsize):
            loss = 0
            opt_encoder.zero_grad()
            opt_decoder.zero_grad()
            input_tensor = eng_words[x:x+batchsize].to(device)
            en_hidden = torch.zeros(2*no_of_layers,batchsize,hidden_size).to(device)
            en_cell = torch.zeros(2*no_of_layers,batchsize,hidden_size).to(device)
            if(input_tensor.size()[0] < batchsize):
                break
            output,(hidden,cell) = encoder.forward(input_tensor,en_hidden,en_cell)
            output = torch.split(output,[hidden_size,hidden_size],dim = 2)
            output = torch.add(output[0],output[1])/2
            input2 = []
            for y in range(batchsize):
                input2.append([0])
            input2 = torch.tensor(input2).to(device)
            hidden = hidden.resize(2,no_of_layers,batchsize,hidden_size)
            hidden1 = torch.add(hidden[0],hidden[1])/2
            cell = cell.resize(2,no_of_layers,batchsize,hidden_size)
            cell1 = torch.add(cell[0],cell[1])/2
            predicted = []
            predictions = []
#             use_teacher_forcing = True if random.random() < teacher_ratio else False
            for i in range(max_hindi_length):
                use_teacher_forcing = True if random.random() < teacher_ratio else False
                output1,(hidden1,cell1) = decoder.forward(input2,hidden1,cell1,output,False)
                predicted.append(output1)
                output2 = decoder.softmax(output1)
                output3 = torch.argmax(output2,dim = 2)
                predictions.append(output3)
                if(use_teacher_forcing):
                    input2 = hin_words[x:x+batchsize,i].to(device).resize(batchsize,1)
                else:
                    input2 = hin_words[x:x+batchsize,i].to(device).resize(batchsize,1)

            predicted = torch.cat(tuple(x for x in predicted),dim =1).to(device).resize(max_hindi_length*batchsize,len(hin_dict))
            predictions = torch.cat(tuple(x for x in predictions),dim =1).to(device)
            total_acc += calculate_accuracy(hin_words[x:x+batchsize].to(device),predictions,x)
            loss  = nn.CrossEntropyLoss(reduction = 'sum')(predicted,hin_words[x:x+batchsize].reshape(-1).to(device))
            with torch.no_grad():
                total_loss += loss.item()
            loss.backward(retain_graph = True)
            torch.nn.utils.clip_grad_norm_(encoder.parameters(),max_norm = 1)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(),max_norm = 1)
            opt_encoder.step()
            opt_decoder.step()
        del(input_tensor)
        del(en_hidden)
        del(en_cell)
        del(predictions)
        del(predicted)
        del(input2)
        del(output1)
        del(output2)
        del(output3)
        del(hidden)
        del(hidden1)
        del(cell1)
        del(output)
        del(cell)
        training_loss = total_loss/(51200*max_hindi_length)
        training_accuracy = total_acc/512
        validation_loss,validation_accuracy = val_evaluate_attention(True,val_eng_words,val_hin_words,encoder,decoder,batchsize,hidden_size,char_embed_size,no_of_layers)
        wandb.log({'training_accuracy': training_accuracy, 'validation_accuracy': validation_accuracy, 'training_loss': training_loss, 'validation_loss' : validation_loss,'epoch': epoch_count + 1})
        print("Epoch: " + str(epoch_count + 1) + "/" + str(epochs) + "; Train loss: " + str(training_loss) + "; Val loss: " + str(validation_loss))
        epoch_count += 1    
    return encoder,decoder

In [15]:
def with_attention():
    wandb.init(project='CS6910_assignment_3')
    config = wandb.config
    wandb.run.name = "withatt_ctype_{}_nlayers_{}_hsize_{}_drop_{}_emb_{}_bs{}".format(config.cell_type,config.no_of_layers,config.hidden_size,config.dropout,config.input_embedding_size,config.batchsize)
    hidden_size = config.hidden_size
    char_embed_size = config.input_embedding_size
    no_of_layers = config.no_of_layers
    epochs = 10
    batchsize = config.batchsize
    dropout = config.dropout
    rnn = config.cell_type
    Encoder1,Decoder1 = attention_train(batchsize,hidden_size,char_embed_size,no_of_layers,dropout,epochs,rnn)
    clear_gpu_cache()

In [None]:
sweep_configuration = {
    'method': 'bayes',
    'name': 'sweep attention LSTM + GRU only',
    'metric': {
      'name': 'validation_accuracy',
      'goal': 'maximize'
    },
    'parameters': {
        'batchsize': {
            'values': [32,64,128,256]
        },
        'input_embedding_size': {
            'values': [16,32,64,256]
        },
        'no_of_layers': {
            'values': [1,2,3]
        },
        'hidden_size': {
            'values': [16,32,64,256]
        },
        'cell_type': {
            'values': ['LSTM','GRU']
        },
        'dropout': {
            'values': [0.2,0.3]
        },
    }
}
sweep_id = wandb.sweep(sweep = sweep_configuration,project = 'CS6910_assignment_3')
wandb.agent(sweep_id,function=with_attention,count = 50)
wandb.finish()

Create sweep with ID: pedrhuvz
Sweep URL: https://wandb.ai/sumanta_roy/CS6910_assignment_3/sweeps/pedrhuvz


[34m[1mwandb[0m: Agent Starting Run: vrqv3ok4 with config:
[34m[1mwandb[0m: 	batchsize: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hidden_size: 16
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	no_of_layers: 1




Epoch: 1/10; Train loss: 1.7747661163693382; Val loss: 2.301727669579642
Epoch: 2/10; Train loss: 1.1627933686120169; Val loss: 1.788969942501613
Epoch: 3/10; Train loss: 1.082102480388823; Val loss: 1.6121135581107366
Epoch: 4/10; Train loss: 1.0176423504239036; Val loss: 1.566360592842102
Epoch: 5/10; Train loss: 0.9618668842315674; Val loss: 1.5154785968008495
Epoch: 6/10; Train loss: 0.905125060081482; Val loss: 1.4739671179226466
Epoch: 7/10; Train loss: 0.8485791704768226; Val loss: 1.4543180210249764
Epoch: 8/10; Train loss: 0.7945302142415728; Val loss: 1.4223367742129736
Epoch: 9/10; Train loss: 0.7458493103299823; Val loss: 1.3835825721422832
Epoch: 10/10; Train loss: 0.7021054748126438; Val loss: 1.3604274619193304
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 17% |  2% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 17% |  1% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▁▁▁▁▁▂▂▅█
training_loss,█▄▃▃▃▂▂▂▁▁
validation_accuracy,▁▁▁▁▁▂▃▄▆█
validation_loss,█▄▃▃▂▂▂▁▁▁

0,1
epoch,10.0
training_accuracy,0.75977
training_loss,0.70211
validation_accuracy,2.49023
validation_loss,1.36043


[34m[1mwandb[0m: Agent Starting Run: 6wygk7eo with config:
[34m[1mwandb[0m: 	batchsize: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	input_embedding_size: 32
[34m[1mwandb[0m: 	no_of_layers: 2




Epoch: 1/10; Train loss: 1.3379509730566115; Val loss: 1.3913389586267018
Epoch: 2/10; Train loss: 0.9205929716428121; Val loss: 1.3099210858345032
Epoch: 3/10; Train loss: 0.7744047242119199; Val loss: 1.2486665461744582
Epoch: 4/10; Train loss: 0.6758992040724981; Val loss: 1.2193769770009177
Epoch: 5/10; Train loss: 0.5976025608040038; Val loss: 1.1770711825007485
Epoch: 6/10; Train loss: 0.5189938893772307; Val loss: 1.1391471823056538
Epoch: 7/10; Train loss: 0.4415196726151875; Val loss: 1.1106396927720024
Epoch: 8/10; Train loss: 0.3809294908671152; Val loss: 1.0908050451959883
Epoch: 9/10; Train loss: 0.3396337008760089; Val loss: 1.084274634009316
Epoch: 10/10; Train loss: 0.31030986371494473; Val loss: 1.1026212204070318
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 23% |  2% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 23% |  1% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▁▁▂▂▃▄▆▇█
training_loss,█▅▄▃▃▂▂▁▁▁
validation_accuracy,▁▁▂▃▃▄▆▇██
validation_loss,█▆▅▄▃▂▂▁▁▁

0,1
epoch,10.0
training_accuracy,13.69141
training_loss,0.31031
validation_accuracy,19.01855
validation_loss,1.10262


[34m[1mwandb[0m: Agent Starting Run: yntwky48 with config:
[34m[1mwandb[0m: 	batchsize: 256
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 16
[34m[1mwandb[0m: 	no_of_layers: 1




Epoch: 1/10; Train loss: 1.3638707833063035; Val loss: 1.4011720645995367
Epoch: 2/10; Train loss: 0.8035332171122233; Val loss: 1.0582206930433
Epoch: 3/10; Train loss: 0.5112077084041777; Val loss: 1.0163647844677879
Epoch: 4/10; Train loss: 0.3873922610282898; Val loss: 0.9995161692301432
Epoch: 5/10; Train loss: 0.3155114165941874; Val loss: 1.0046458414622716
Epoch: 6/10; Train loss: 0.27079136144547233; Val loss: 1.0402381760733468
Epoch: 7/10; Train loss: 0.24319689546312603; Val loss: 1.046334805942717
Epoch: 8/10; Train loss: 0.22308077426183792; Val loss: 1.0359540383021038
Epoch: 9/10; Train loss: 0.2079809774671282; Val loss: 1.0141500575201852
Epoch: 10/10; Train loss: 0.1969833695320856; Val loss: 1.0322040546508062
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 42% |  6% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 42% |  2% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▁▃▄▅▆▇▇██
training_loss,█▅▃▂▂▁▁▁▁▁
validation_accuracy,▁▂▄▅▆▆▇▇██
validation_loss,█▂▁▁▁▂▂▂▁▂

0,1
epoch,10.0
training_accuracy,26.61719
training_loss,0.19698
validation_accuracy,29.32129
validation_loss,1.0322


[34m[1mwandb[0m: Agent Starting Run: 3psnze77 with config:
[34m[1mwandb[0m: 	batchsize: 32
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	input_embedding_size: 16
[34m[1mwandb[0m: 	no_of_layers: 3




Epoch: 1/10; Train loss: 1.337843587398529; Val loss: 1.5348521357490903
Epoch: 2/10; Train loss: 1.0077932354665937; Val loss: 1.5175286964291619
Epoch: 3/10; Train loss: 0.8507906155075345; Val loss: 1.4401241526717232
Epoch: 4/10; Train loss: 0.7323460574660983; Val loss: 1.3545040041208267
Epoch: 5/10; Train loss: 0.6410456160136632; Val loss: 1.3507182669071924
Epoch: 6/10; Train loss: 0.5664463019654864; Val loss: 1.2889437093621208
Epoch: 7/10; Train loss: 0.5106777611516771; Val loss: 1.3024553280501139
Epoch: 8/10; Train loss: 0.46559121160280137; Val loss: 1.315871034349714
Epoch: 9/10; Train loss: 0.42810612479845683; Val loss: 1.3202813352857317
Epoch: 10/10; Train loss: 0.397463353233678; Val loss: 1.3356822565907525
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 23% |  2% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 23% |  1% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▁▁▁▂▃▄▆▇█
training_loss,█▆▄▃▃▂▂▂▁▁
validation_accuracy,▁▁▁▂▃▅▆▆▇█
validation_loss,██▅▃▃▁▁▂▂▂

0,1
epoch,10.0
training_accuracy,9.71094
training_loss,0.39746
validation_accuracy,14.2334
validation_loss,1.33568


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: fyg7q7m0 with config:
[34m[1mwandb[0m: 	batchsize: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	input_embedding_size: 32
[34m[1mwandb[0m: 	no_of_layers: 2




Epoch: 1/10; Train loss: 1.2155037666502453; Val loss: 1.3604648822829837
Epoch: 2/10; Train loss: 0.7749453155199687; Val loss: 1.1801527355398451
Epoch: 3/10; Train loss: 0.5473746209769021; Val loss: 1.0760080204123543
Epoch: 4/10; Train loss: 0.37642782415662496; Val loss: 1.033672156788054
Epoch: 5/10; Train loss: 0.2917591534058253; Val loss: 1.0293024082978566
Epoch: 6/10; Train loss: 0.249310804775783; Val loss: 1.0280204585620336
Epoch: 7/10; Train loss: 0.22424986825102852; Val loss: 1.0353834416185106
Epoch: 8/10; Train loss: 0.2068283222544761; Val loss: 1.0270743668079376
Epoch: 9/10; Train loss: 0.19461841075193315; Val loss: 1.0505528208755313
Epoch: 10/10; Train loss: 0.1857262777856418; Val loss: 1.0482041147493182
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 38% |  2% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 38% |  1% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▁▂▃▅▆▇▇██
training_loss,█▅▃▂▂▁▁▁▁▁
validation_accuracy,▁▂▃▅▆▇▇▇██
validation_loss,█▄▂▁▁▁▁▁▁▁

0,1
epoch,10.0
training_accuracy,27.96094
training_loss,0.18573
validation_accuracy,31.29883
validation_loss,1.0482


[34m[1mwandb[0m: Agent Starting Run: wwfr80uh with config:
[34m[1mwandb[0m: 	batchsize: 256
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 16
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	no_of_layers: 1




Epoch: 1/10; Train loss: 2.146350613548642; Val loss: 1.549318137623015
Epoch: 2/10; Train loss: 1.2255677332196917; Val loss: 1.4601462455022902
Epoch: 3/10; Train loss: 1.1044542775835309; Val loss: 1.5160544202441262
Epoch: 4/10; Train loss: 1.0434682532719204; Val loss: 1.4997261478787376
Epoch: 5/10; Train loss: 0.9967922941843669; Val loss: 1.4922943228767032
Epoch: 6/10; Train loss: 0.9565222236088344; Val loss: 1.4537266379310971
Epoch: 7/10; Train loss: 0.9222978006090436; Val loss: 1.506903432664417
Epoch: 8/10; Train loss: 0.8941067904517764; Val loss: 1.4789768173581077
Epoch: 9/10; Train loss: 0.8649613094329834; Val loss: 1.4559007031576974
Epoch: 10/10; Train loss: 0.8346983823322115; Val loss: 1.4697615816479637
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 21% |  2% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 21% |  1% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▁▁▁▂▂▃▄▅█
training_loss,█▃▂▂▂▂▁▁▁▁
validation_accuracy,▁▁▁▂▂▃▄▅██
validation_loss,█▁▆▄▄▁▅▃▁▂

0,1
epoch,10.0
training_accuracy,0.17578
training_loss,0.8347
validation_accuracy,0.65918
validation_loss,1.46976


[34m[1mwandb[0m: Agent Starting Run: getyaa7r with config:
[34m[1mwandb[0m: 	batchsize: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 32
[34m[1mwandb[0m: 	no_of_layers: 3




Epoch: 1/10; Train loss: 1.0158823046230134; Val loss: 1.1113069142614092
Epoch: 2/10; Train loss: 0.3176867071219853; Val loss: 1.0081427977198647
Epoch: 3/10; Train loss: 0.19741426059177944; Val loss: 0.9784522170112246
Epoch: 4/10; Train loss: 0.16193443369297755; Val loss: 0.9773990909258524
Epoch: 5/10; Train loss: 0.1414427906558627; Val loss: 0.9688491863863808
Epoch: 6/10; Train loss: 0.12673408156349544; Val loss: 0.9680306201889401
Epoch: 7/10; Train loss: 0.11522323320309322; Val loss: 0.9703780270758129
Epoch: 8/10; Train loss: 0.10539295642148881; Val loss: 1.0076872564497448
Epoch: 9/10; Train loss: 0.0968469129715647; Val loss: 1.0197817598070418
Epoch: 10/10; Train loss: 0.08918691210803531; Val loss: 1.0568595982733227
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 41% |  6% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 41% |  2% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▃▅▅▆▆▇▇██
training_loss,█▃▂▂▁▁▁▁▁▁
validation_accuracy,▁▅▆▇▇▇████
validation_loss,█▃▂▁▁▁▁▃▄▅

0,1
epoch,10.0
training_accuracy,53.51953
training_loss,0.08919
validation_accuracy,40.60059
validation_loss,1.05686


[34m[1mwandb[0m: Agent Starting Run: 95wdx3nm with config:
[34m[1mwandb[0m: 	batchsize: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 3




Epoch: 1/10; Train loss: 0.5186356178209895; Val loss: 1.0410206204368955
Epoch: 2/10; Train loss: 0.18645897791499183; Val loss: 1.0257859492585772
Epoch: 3/10; Train loss: 0.1501499311980747; Val loss: 1.0185971103963398
Epoch: 4/10; Train loss: 0.13160768854476157; Val loss: 0.9988633720647722
Epoch: 5/10; Train loss: 0.11922839728139696; Val loss: 1.03445602953434
Epoch: 6/10; Train loss: 0.10913293165110406; Val loss: 1.058715290256909
Epoch: 7/10; Train loss: 0.0998160669136615; Val loss: 1.0782522630123865
Epoch: 8/10; Train loss: 0.09302934360646066; Val loss: 1.0930682974202293
Epoch: 9/10; Train loss: 0.08680859345055762; Val loss: 1.142883912438438
Epoch: 10/10; Train loss: 0.08081687214828673; Val loss: 1.1544455119541712
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 38% |  4% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 38% |  2% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▅▆▆▇▇██
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▅▆▇██████
validation_loss,▃▂▂▁▃▄▅▅▇█

0,1
epoch,10.0
training_accuracy,56.06055
training_loss,0.08082
validation_accuracy,40.74707
validation_loss,1.15445


[34m[1mwandb[0m: Agent Starting Run: 2j0p1lz6 with config:
[34m[1mwandb[0m: 	batchsize: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 3




Epoch: 1/10; Train loss: 0.5155969780115854; Val loss: 1.0288983654408228
Epoch: 2/10; Train loss: 0.1865424759047372; Val loss: 0.9803512287991387
Epoch: 3/10; Train loss: 0.15095895551499866; Val loss: 1.0216624970946993
Epoch: 4/10; Train loss: 0.1326508119063718; Val loss: 1.0243370298828398
Epoch: 5/10; Train loss: 0.12020358087761061; Val loss: 1.0229703074409848
Epoch: 6/10; Train loss: 0.10970122564406622; Val loss: 1.0127306403148741
Epoch: 7/10; Train loss: 0.10194096240259352; Val loss: 1.071108450492223
Epoch: 8/10; Train loss: 0.09515728292720659; Val loss: 1.0778072760218667
Epoch: 9/10; Train loss: 0.08845917538517997; Val loss: 1.1270292372930617
Epoch: 10/10; Train loss: 0.08360384830761523; Val loss: 1.2024068066052027
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 38% |  4% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 38% |  2% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▆▆▇▇▇██
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▄▅▇▇███▇▇
validation_loss,▃▁▂▂▂▂▄▄▆█

0,1
epoch,10.0
training_accuracy,54.98242
training_loss,0.0836
validation_accuracy,39.47754
validation_loss,1.20241


[34m[1mwandb[0m: Agent Starting Run: np6k5w0g with config:
[34m[1mwandb[0m: 	batchsize: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	no_of_layers: 3




Epoch: 1/10; Train loss: 0.6117887920708883; Val loss: 1.062359719049363
Epoch: 2/10; Train loss: 0.1983212365564846; Val loss: 0.9883103122313818
Epoch: 3/10; Train loss: 0.1571667357569649; Val loss: 0.9648783348855519
Epoch: 4/10; Train loss: 0.13731499408682188; Val loss: 0.9962117572625478
Epoch: 5/10; Train loss: 0.12325662312763078; Val loss: 0.9863328202849343
Epoch: 6/10; Train loss: 0.1124983755889393; Val loss: 1.0466563602288563
Epoch: 7/10; Train loss: 0.1029634181587469; Val loss: 1.0494012045008796
Epoch: 8/10; Train loss: 0.09443883616299856; Val loss: 1.0708822026139213
Epoch: 9/10; Train loss: 0.08723695637924331; Val loss: 1.1274541878984088
Epoch: 10/10; Train loss: 0.08135175990561644; Val loss: 1.1503402150812603
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 39% |  4% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 39% |  2% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▅▆▆▇▇██
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▅▆▇█▇████
validation_loss,▅▂▁▂▂▄▄▅▇█

0,1
epoch,10.0
training_accuracy,56.13086
training_loss,0.08135
validation_accuracy,40.40527
validation_loss,1.15034


[34m[1mwandb[0m: Agent Starting Run: 1bfers1m with config:
[34m[1mwandb[0m: 	batchsize: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 2




Epoch: 1/10; Train loss: 0.3670325856762273; Val loss: 1.0516122175114495
Epoch: 2/10; Train loss: 0.17324430533463048; Val loss: 1.0494093412444705
Epoch: 3/10; Train loss: 0.1501543305317561; Val loss: 1.040504966818151
Epoch: 4/10; Train loss: 0.13716285576777798; Val loss: 1.068502412488063
Epoch: 5/10; Train loss: 0.12959277160110927; Val loss: 1.1231218668676557
Epoch: 6/10; Train loss: 0.12342819657708917; Val loss: 1.1646583027073316
Epoch: 7/10; Train loss: 0.11809746777017911; Val loss: 1.2183088511228561
Epoch: 8/10; Train loss: 0.11466942246116343; Val loss: 1.2050637698599271
Epoch: 9/10; Train loss: 0.11220779922391687; Val loss: 1.25040568624224
Epoch: 10/10; Train loss: 0.10996306467623938; Val loss: 1.23500798926467
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 36% |  3% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 36% |  2% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▆▇▇▇███
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▄▆▆▇▇▇███
validation_loss,▁▁▁▂▄▅▇▆█▇

0,1
epoch,10.0
training_accuracy,45.9043
training_loss,0.10996
validation_accuracy,39.33105
validation_loss,1.23501


[34m[1mwandb[0m: Agent Starting Run: ftyuim49 with config:
[34m[1mwandb[0m: 	batchsize: 256
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 3




Epoch: 1/10; Train loss: 0.958378494807652; Val loss: 1.2049621230080014
Epoch: 2/10; Train loss: 0.34283692802701676; Val loss: 0.989008176894415
Epoch: 3/10; Train loss: 0.19618044597761972; Val loss: 0.9535602785292125
Epoch: 4/10; Train loss: 0.1547136878967285; Val loss: 0.9446884677523658
Epoch: 5/10; Train loss: 0.13283246710186913; Val loss: 0.938572625319163
Epoch: 6/10; Train loss: 0.11807130336761475; Val loss: 0.9457208939961025
Epoch: 7/10; Train loss: 0.10567332920574006; Val loss: 0.9836999802362352
Epoch: 8/10; Train loss: 0.09602345895199549; Val loss: 0.998662204969497
Epoch: 9/10; Train loss: 0.08591805117470877; Val loss: 1.0278277170090449
Epoch: 10/10; Train loss: 0.07718030804679507; Val loss: 1.038930206071763
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 50% | 10% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 50% |  3% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▃▄▅▆▆▇▇██
training_loss,█▃▂▂▁▁▁▁▁▁
validation_accuracy,▁▅▇▇▇█████
validation_loss,█▂▁▁▁▁▂▃▃▄

0,1
epoch,10.0
training_accuracy,58.67383
training_loss,0.07718
validation_accuracy,41.18652
validation_loss,1.03893


[34m[1mwandb[0m: Agent Starting Run: jad5pjv7 with config:
[34m[1mwandb[0m: 	batchsize: 256
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 32
[34m[1mwandb[0m: 	no_of_layers: 3




Epoch: 1/10; Train loss: 1.2799189889998663; Val loss: 1.302054314386277
Epoch: 2/10; Train loss: 0.5774129948161897; Val loss: 1.0423006273451305
Epoch: 3/10; Train loss: 0.2803301818030221; Val loss: 0.9989715076628185
Epoch: 4/10; Train loss: 0.2029778684888567; Val loss: 0.9650936410540626
Epoch: 5/10; Train loss: 0.16890299848147802; Val loss: 0.9650169497444516
Epoch: 6/10; Train loss: 0.148468112661725; Val loss: 0.9697323356355939
Epoch: 7/10; Train loss: 0.1348655144941239; Val loss: 0.9411189329056513
Epoch: 8/10; Train loss: 0.12303543493861244; Val loss: 0.9766071807770502
Epoch: 9/10; Train loss: 0.11309551633539654; Val loss: 0.9505949133918399
Epoch: 10/10; Train loss: 0.10408415757474446; Val loss: 0.9529586292448498
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 52% |  9% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 52% |  3% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▂▄▅▆▆▇▇██
training_loss,█▄▂▂▁▁▁▁▁▁
validation_accuracy,▁▄▆▆▇▇▇███
validation_loss,█▃▂▁▁▂▁▂▁▁

0,1
epoch,10.0
training_accuracy,48.32031
training_loss,0.10408
validation_accuracy,40.2832
validation_loss,0.95296


[34m[1mwandb[0m: Agent Starting Run: 64i5wwcz with config:
[34m[1mwandb[0m: 	batchsize: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 2




Epoch: 1/10; Train loss: 0.6201989846570152; Val loss: 0.9636650482813517
Epoch: 2/10; Train loss: 0.2016810998746327; Val loss: 0.9522435296149481
Epoch: 3/10; Train loss: 0.1547781035729817; Val loss: 0.9912030469803583
Epoch: 4/10; Train loss: 0.13470659761201767; Val loss: 0.9462260717437381
Epoch: 5/10; Train loss: 0.12114624818166098; Val loss: 0.9595169368244353
Epoch: 6/10; Train loss: 0.10992670534622101; Val loss: 0.9661697106701987
Epoch: 7/10; Train loss: 0.10038591884431385; Val loss: 1.014244712534405
Epoch: 8/10; Train loss: 0.09295136692978087; Val loss: 1.0223027354195005
Epoch: 9/10; Train loss: 0.08649801907085237; Val loss: 1.0568319303648812
Epoch: 10/10; Train loss: 0.07965672590902874; Val loss: 1.0884724401292347
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 40% |  5% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 40% |  3% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▅▆▆▇▇██
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▄▆▇▇█████
validation_loss,▂▁▃▁▂▂▄▅▆█

0,1
epoch,10.0
training_accuracy,56.56445
training_loss,0.07966
validation_accuracy,40.69824
validation_loss,1.08847


[34m[1mwandb[0m: Agent Starting Run: nc0zguur with config:
[34m[1mwandb[0m: 	batchsize: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	no_of_layers: 1




Epoch: 1/10; Train loss: 0.5386910382055101; Val loss: 0.9777080367008845
Epoch: 2/10; Train loss: 0.20235102284522283; Val loss: 0.9732629330385298
Epoch: 3/10; Train loss: 0.16570977269184023; Val loss: 0.9658126476265135
Epoch: 4/10; Train loss: 0.1491251165384338; Val loss: 0.960847645288422
Epoch: 5/10; Train loss: 0.13840423987025308; Val loss: 0.9749725099120822
Epoch: 6/10; Train loss: 0.12948696251426425; Val loss: 1.0028580242679233
Epoch: 7/10; Train loss: 0.12334535761958076; Val loss: 1.0063465911717642
Epoch: 8/10; Train loss: 0.11802629120293118; Val loss: 1.014306814188049
Epoch: 9/10; Train loss: 0.11368407899425144; Val loss: 1.0150713792869024
Epoch: 10/10; Train loss: 0.10918038774813925; Val loss: 1.0396944752761297
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 36% |  2% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 36% |  1% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▆▆▇▇▇██
training_loss,█▃▂▂▁▁▁▁▁▁
validation_accuracy,▁▄▅▆▇▇▇███
validation_loss,▂▂▁▁▂▅▅▆▆█

0,1
epoch,10.0
training_accuracy,46.49023
training_loss,0.10918
validation_accuracy,41.08887
validation_loss,1.03969


[34m[1mwandb[0m: Agent Starting Run: 1fvup832 with config:
[34m[1mwandb[0m: 	batchsize: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 2




Epoch: 1/10; Train loss: 0.45339496199573787; Val loss: 1.0020088269597007
Epoch: 2/10; Train loss: 0.17720372798896972; Val loss: 1.0006278448161625
Epoch: 3/10; Train loss: 0.14646566152572632; Val loss: 0.977571482459704
Epoch: 4/10; Train loss: 0.13075669527053832; Val loss: 0.9911122911033177
Epoch: 5/10; Train loss: 0.11978391412468184; Val loss: 1.017619102483704
Epoch: 6/10; Train loss: 0.1111371679249264; Val loss: 1.055724778345653
Epoch: 7/10; Train loss: 0.10340989598206111; Val loss: 1.0608312970116025
Epoch: 8/10; Train loss: 0.09737099477222988; Val loss: 1.0936229342506045
Epoch: 9/10; Train loss: 0.09206387638336136; Val loss: 1.1092562178770702
Epoch: 10/10; Train loss: 0.08792289871190276; Val loss: 1.1613833563668388
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 39% |  3% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 34% |  2% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▆▆▇▇▇██
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▃▅▇▇▇▇██▇
validation_loss,▂▂▁▂▃▄▄▅▆█

0,1
epoch,10.0
training_accuracy,53.22266
training_loss,0.08792
validation_accuracy,40.01465
validation_loss,1.16138


[34m[1mwandb[0m: Agent Starting Run: ndx5hlt1 with config:
[34m[1mwandb[0m: 	batchsize: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 2




Epoch: 1/10; Train loss: 0.43936059020814444; Val loss: 1.0058133013191677
Epoch: 2/10; Train loss: 0.17540163559573038; Val loss: 0.9654469057208016
Epoch: 3/10; Train loss: 0.14663533535741624; Val loss: 0.9767429466758456
Epoch: 4/10; Train loss: 0.13223732678663164; Val loss: 1.001878592939604
Epoch: 5/10; Train loss: 0.12108598644534747; Val loss: 1.0180911932672774
Epoch: 6/10; Train loss: 0.11246381693652698; Val loss: 1.0169577038004285
Epoch: 7/10; Train loss: 0.10488331005686806; Val loss: 1.0538003494342167
Epoch: 8/10; Train loss: 0.09812661916727111; Val loss: 1.0609458876507623
Epoch: 9/10; Train loss: 0.09363905334756488; Val loss: 1.0903985699017842
Epoch: 10/10; Train loss: 0.08854457122229395; Val loss: 1.1285157671996526
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 40% |  3% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 40% |  2% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▆▆▇▇▇██
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▄▆▆▆▇▇██▇
validation_loss,▃▁▁▃▃▃▅▅▆█

0,1
epoch,10.0
training_accuracy,52.82422
training_loss,0.08854
validation_accuracy,40.64941
validation_loss,1.12852


[34m[1mwandb[0m: Agent Starting Run: 9lni96vc with config:
[34m[1mwandb[0m: 	batchsize: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	no_of_layers: 1




Epoch: 1/10; Train loss: 0.5533937226874488; Val loss: 0.9454148780731928
Epoch: 2/10; Train loss: 0.20369253619795755; Val loss: 0.957376587958563
Epoch: 3/10; Train loss: 0.16778540548824128; Val loss: 0.9655162968805858
Epoch: 4/10; Train loss: 0.14998812837969688; Val loss: 0.9521584730772745
Epoch: 5/10; Train loss: 0.1394532889340605; Val loss: 0.9827076267628443
Epoch: 6/10; Train loss: 0.13109067840235575; Val loss: 0.97848841547966
Epoch: 7/10; Train loss: 0.12459303285394396; Val loss: 1.0202348147119795
Epoch: 8/10; Train loss: 0.11859152337624913; Val loss: 1.016376283906755
Epoch: 9/10; Train loss: 0.11419226103595324; Val loss: 1.0240000599906558
Epoch: 10/10; Train loss: 0.11038983597641899; Val loss: 1.0409615628776097
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 37% |  2% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 33% |  1% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▆▆▇▇███
training_loss,█▂▂▂▁▁▁▁▁▁
validation_accuracy,▁▃▅▆▇▇▇███
validation_loss,▁▂▂▁▄▃▆▆▇█

0,1
epoch,10.0
training_accuracy,45.7793
training_loss,0.11039
validation_accuracy,40.30762
validation_loss,1.04096


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ha74myoa with config:
[34m[1mwandb[0m: 	batchsize: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	no_of_layers: 2




Epoch: 1/10; Train loss: 0.7157838568800972; Val loss: 1.0035415646575747
Epoch: 2/10; Train loss: 0.2656262223919233; Val loss: 0.9527370638790584
Epoch: 3/10; Train loss: 0.17445646654991875; Val loss: 0.9273336785180228
Epoch: 4/10; Train loss: 0.14328916147351264; Val loss: 0.9645422356469291
Epoch: 5/10; Train loss: 0.1247944303779375; Val loss: 0.9647666641644069
Epoch: 6/10; Train loss: 0.11038345462509563; Val loss: 0.9828856253907794
Epoch: 7/10; Train loss: 0.09838524822677885; Val loss: 1.0126632168179466
Epoch: 8/10; Train loss: 0.08793043094021934; Val loss: 1.0371121175232387
Epoch: 9/10; Train loss: 0.07860412675000372; Val loss: 1.0922290959528513
Epoch: 10/10; Train loss: 0.0705422149988867; Val loss: 1.115759776461692
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 40% |  3% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 34% |  3% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▃▄▅▆▆▇▇██
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▅▇▇██████
validation_loss,▄▂▁▂▂▃▄▅▇█

0,1
epoch,10.0
training_accuracy,60.53125
training_loss,0.07054
validation_accuracy,39.72168
validation_loss,1.11576


[34m[1mwandb[0m: Agent Starting Run: oy6lq1z2 with config:
[34m[1mwandb[0m: 	batchsize: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 3




Epoch: 1/10; Train loss: 0.6924664845353081; Val loss: 1.001958585920788
Epoch: 2/10; Train loss: 0.21269008860701608; Val loss: 1.033870015825544
Epoch: 3/10; Train loss: 0.15882505856809162; Val loss: 0.9752534088634309
Epoch: 4/10; Train loss: 0.13503180932430994; Val loss: 0.9747013705117362
Epoch: 5/10; Train loss: 0.1197173745007742; Val loss: 0.978735910994666
Epoch: 6/10; Train loss: 0.10684297950494857; Val loss: 1.003050105912345
Epoch: 7/10; Train loss: 0.0970028357278733; Val loss: 1.0345639983812969
Epoch: 8/10; Train loss: 0.08858817212638401; Val loss: 1.0508189854167758
Epoch: 9/10; Train loss: 0.0808310462179638; Val loss: 1.1133715340069361
Epoch: 10/10; Train loss: 0.07351395834059943; Val loss: 1.1181519145057315
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 41% |  6% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 37% |  3% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▅▆▆▇▇██
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▄▆▇▇█████
validation_loss,▂▄▁▁▁▂▄▅██

0,1
epoch,10.0
training_accuracy,59.16992
training_loss,0.07351
validation_accuracy,39.86816
validation_loss,1.11815


[34m[1mwandb[0m: Agent Starting Run: bghn7fp8 with config:
[34m[1mwandb[0m: 	batchsize: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 32
[34m[1mwandb[0m: 	no_of_layers: 2




Epoch: 1/10; Train loss: 0.48548930667695545; Val loss: 1.108253582247666
Epoch: 2/10; Train loss: 0.19174757966328235; Val loss: 1.0278046028245063
Epoch: 3/10; Train loss: 0.16059590639812607; Val loss: 1.0455223719278972
Epoch: 4/10; Train loss: 0.14575245029869532; Val loss: 1.0687929959524245
Epoch: 5/10; Train loss: 0.13396974384429908; Val loss: 1.0914011665043377
Epoch: 6/10; Train loss: 0.12564770918871676; Val loss: 1.115010145519461
Epoch: 7/10; Train loss: 0.11940333520727499; Val loss: 1.1386059004636038
Epoch: 8/10; Train loss: 0.11411089130810328; Val loss: 1.1818877660802432
Epoch: 9/10; Train loss: 0.10966681008182821; Val loss: 1.184365553515298
Epoch: 10/10; Train loss: 0.1057312702635924; Val loss: 1.2214661790501504
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 37% |  3% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 37% |  2% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▆▆▇▇▇██
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▆▆▇▇█████
validation_loss,▄▁▂▂▃▄▅▇▇█

0,1
epoch,10.0
training_accuracy,47.70117
training_loss,0.10573
validation_accuracy,38.59863
validation_loss,1.22147


[34m[1mwandb[0m: Agent Starting Run: yrzz8l3o with config:
[34m[1mwandb[0m: 	batchsize: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 2




Epoch: 1/10; Train loss: 0.4539048217875617; Val loss: 1.0541053712368011
Epoch: 2/10; Train loss: 0.17926600344124294; Val loss: 0.9924970155670529
Epoch: 3/10; Train loss: 0.14804209727616538; Val loss: 0.9888348543927783
Epoch: 4/10; Train loss: 0.13119130479437965; Val loss: 0.98372896867139
Epoch: 5/10; Train loss: 0.12028065190428779; Val loss: 1.0126442724750155
Epoch: 6/10; Train loss: 0.11118928351572582; Val loss: 1.0476687195755185
Epoch: 7/10; Train loss: 0.10421738245657512; Val loss: 1.05659314706212
Epoch: 8/10; Train loss: 0.09792839185112999; Val loss: 1.0678860474200476
Epoch: 9/10; Train loss: 0.09293231988237018; Val loss: 1.127703920716331
Epoch: 10/10; Train loss: 0.08768059780200323; Val loss: 1.1212360248679207
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 40% |  3% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 40% |  2% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▆▆▇▇▇██
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▅▅▆▇█▇▇██
validation_loss,▄▁▁▁▂▄▅▅██

0,1
epoch,10.0
training_accuracy,53.24414
training_loss,0.08768
validation_accuracy,40.7959
validation_loss,1.12124


[34m[1mwandb[0m: Agent Starting Run: sm4v44j3 with config:
[34m[1mwandb[0m: 	batchsize: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	no_of_layers: 1




Epoch: 1/10; Train loss: 0.7350108537787483; Val loss: 0.937432926325571
Epoch: 2/10; Train loss: 0.25301550300348374; Val loss: 0.9320667669886634
Epoch: 3/10; Train loss: 0.19034224251906076; Val loss: 0.9623797677812123
Epoch: 4/10; Train loss: 0.16431383388383047; Val loss: 0.9338191804431734
Epoch: 5/10; Train loss: 0.14958240960325514; Val loss: 0.9387460606438773
Epoch: 6/10; Train loss: 0.13941053938297998; Val loss: 0.9453738218262082
Epoch: 7/10; Train loss: 0.13163380303553174; Val loss: 0.9373318794227782
Epoch: 8/10; Train loss: 0.12485050143230529; Val loss: 0.9606181567623502
Epoch: 9/10; Train loss: 0.11926749185437248; Val loss: 0.9481423071452549
Epoch: 10/10; Train loss: 0.11464781301362174; Val loss: 1.0092644492785137
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 40% |  4% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 40% |  1% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▆▆▇▇▇██
training_loss,█▃▂▂▁▁▁▁▁▁
validation_accuracy,▁▄▆▆▇▇████
validation_loss,▁▁▄▁▂▂▁▄▂█

0,1
epoch,10.0
training_accuracy,44.41211
training_loss,0.11465
validation_accuracy,39.62402
validation_loss,1.00926


[34m[1mwandb[0m: Agent Starting Run: lso4wtxf with config:
[34m[1mwandb[0m: 	batchsize: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	no_of_layers: 2




Epoch: 1/10; Train loss: 0.735563026098978; Val loss: 1.0079281273342313
Epoch: 2/10; Train loss: 0.22417288825625464; Val loss: 0.9920039943286351
Epoch: 3/10; Train loss: 0.16826446990172067; Val loss: 0.9653114832582927
Epoch: 4/10; Train loss: 0.14522489987668538; Val loss: 0.9751508661678859
Epoch: 5/10; Train loss: 0.13026488001857486; Val loss: 0.9902770207041786
Epoch: 6/10; Train loss: 0.11926758689539774; Val loss: 0.983922663189116
Epoch: 7/10; Train loss: 0.11030333683604286; Val loss: 0.9533527067729405
Epoch: 8/10; Train loss: 0.10202255961440858; Val loss: 1.0209732538177854
Epoch: 9/10; Train loss: 0.09426234600089846; Val loss: 1.0067495079267592
Epoch: 10/10; Train loss: 0.08825853438604446; Val loss: 1.0446907480557759
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 35% |  4% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 35% |  2% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▆▆▆▇▇██
training_loss,█▂▂▂▁▁▁▁▁▁
validation_accuracy,▁▅▆▇▇▇██▇█
validation_loss,▅▄▂▃▄▃▁▆▅█

0,1
epoch,10.0
training_accuracy,53.40039
training_loss,0.08826
validation_accuracy,40.33203
validation_loss,1.04469


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: h5952pxl with config:
[34m[1mwandb[0m: 	batchsize: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	no_of_layers: 1




Epoch: 1/10; Train loss: 0.4173388080724648; Val loss: 1.0016557623942692
Epoch: 2/10; Train loss: 0.1880168030304568; Val loss: 0.9841734291542144
Epoch: 3/10; Train loss: 0.16049752294307662; Val loss: 1.0105686297728902
Epoch: 4/10; Train loss: 0.14688188734508695; Val loss: 1.0330255237363635
Epoch: 5/10; Train loss: 0.13740189769083544; Val loss: 1.0943386104135286
Epoch: 6/10; Train loss: 0.1301170121523596; Val loss: 1.1063668515001024
Epoch: 7/10; Train loss: 0.12474006969304312; Val loss: 1.0929949099109286
Epoch: 8/10; Train loss: 0.12066956014505455; Val loss: 1.1120949320140339
Epoch: 9/10; Train loss: 0.11641118454791251; Val loss: 1.132937079739003
Epoch: 10/10; Train loss: 0.11291069267761139; Val loss: 1.1794332954145612
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 29% |  2% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 29% |  1% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▆▆▇▇▇██
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▄▅▆▇▇▇▇██
validation_loss,▂▁▂▃▅▅▅▆▆█

0,1
epoch,10.0
training_accuracy,45.16406
training_loss,0.11291
validation_accuracy,39.50195
validation_loss,1.17943


[34m[1mwandb[0m: Agent Starting Run: oik1dw4t with config:
[34m[1mwandb[0m: 	batchsize: 256
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	no_of_layers: 3




Epoch: 1/10; Train loss: 1.1118851947784423; Val loss: 1.253683090209961
Epoch: 2/10; Train loss: 0.432751206216358; Val loss: 1.0605395237604778
Epoch: 3/10; Train loss: 0.22875294344765798; Val loss: 0.9568558079855782
Epoch: 4/10; Train loss: 0.17400633318083628; Val loss: 0.9582951012111846
Epoch: 5/10; Train loss: 0.1477882877417973; Val loss: 0.9315567697797503
Epoch: 6/10; Train loss: 0.13123257756233214; Val loss: 0.9557452372142247
Epoch: 7/10; Train loss: 0.11817537529127939; Val loss: 0.9246803295044672
Epoch: 8/10; Train loss: 0.10791690721398309; Val loss: 1.0081891616185505
Epoch: 9/10; Train loss: 0.09874811805429913; Val loss: 0.9821443273907616
Epoch: 10/10; Train loss: 0.08916984629063379; Val loss: 0.9809337116423107
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 55% | 10% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 55% |  3% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▂▄▅▆▆▇▇██
training_loss,█▃▂▂▁▁▁▁▁▁
validation_accuracy,▁▄▆▇▇▇█▇██
validation_loss,█▄▂▂▁▂▁▃▂▂

0,1
epoch,10.0
training_accuracy,53.35547
training_loss,0.08917
validation_accuracy,42.30957
validation_loss,0.98093


[34m[1mwandb[0m: Agent Starting Run: wq1qmfu1 with config:
[34m[1mwandb[0m: 	batchsize: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	no_of_layers: 2




Epoch: 1/10; Train loss: 0.5293782014506204; Val loss: 1.001643065895353
Epoch: 2/10; Train loss: 0.18869357338973455; Val loss: 0.9844796267293748
Epoch: 3/10; Train loss: 0.15285218150842758; Val loss: 0.9805244434447515
Epoch: 4/10; Train loss: 0.13545332757490022; Val loss: 0.9874035127106167
Epoch: 5/10; Train loss: 0.12261549782185327; Val loss: 1.0216592976025172
Epoch: 6/10; Train loss: 0.11259799295947666; Val loss: 1.0044834706045331
Epoch: 7/10; Train loss: 0.10425758564046451; Val loss: 1.03802475191298
Epoch: 8/10; Train loss: 0.09799816838332585; Val loss: 1.0650010385683604
Epoch: 9/10; Train loss: 0.09147952306128683; Val loss: 1.0948370673826762
Epoch: 10/10; Train loss: 0.08629855267703533; Val loss: 1.1330885518164862
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 34% |  3% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 34% |  2% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▆▆▇▇▇██
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▄▆▇▇▇████
validation_loss,▂▁▁▁▃▂▄▅▆█

0,1
epoch,10.0
training_accuracy,53.97852
training_loss,0.0863
validation_accuracy,41.21094
validation_loss,1.13309


[34m[1mwandb[0m: Agent Starting Run: 1j3oa0tw with config:
[34m[1mwandb[0m: 	batchsize: 256
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 3




Epoch: 1/10; Train loss: 0.966731390271868; Val loss: 1.114686029297965
Epoch: 2/10; Train loss: 0.3142979792186192; Val loss: 0.955573110353379
Epoch: 3/10; Train loss: 0.19056012971060618; Val loss: 0.9233794325873965
Epoch: 4/10; Train loss: 0.1524955758026668; Val loss: 0.9506671939577375
Epoch: 5/10; Train loss: 0.13180987761134194; Val loss: 0.9483182487033662
Epoch: 6/10; Train loss: 0.11726421631517864; Val loss: 0.975596961520967
Epoch: 7/10; Train loss: 0.10530837422325498; Val loss: 0.9603224879219419
Epoch: 8/10; Train loss: 0.09515092520486741; Val loss: 0.9669331596011207
Epoch: 9/10; Train loss: 0.08524814137390682; Val loss: 1.022066632906596
Epoch: 10/10; Train loss: 0.077130128854797; Val loss: 1.0725236847287132
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 59% | 10% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 59% |  3% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▃▄▅▆▆▇▇██
training_loss,█▃▂▂▁▁▁▁▁▁
validation_accuracy,▁▅▆▇▇▇████
validation_loss,█▂▁▂▂▃▂▃▅▆

0,1
epoch,10.0
training_accuracy,58.1543
training_loss,0.07713
validation_accuracy,40.06348
validation_loss,1.07252


[34m[1mwandb[0m: Agent Starting Run: n9a2jn9m with config:
[34m[1mwandb[0m: 	batchsize: 256
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 2




Epoch: 1/10; Train loss: 0.8560391462416876; Val loss: 1.031551378113883
Epoch: 2/10; Train loss: 0.28304686705271404; Val loss: 0.9468044269652593
Epoch: 3/10; Train loss: 0.18271166392735072; Val loss: 0.9489333799907139
Epoch: 4/10; Train loss: 0.15140215998604184; Val loss: 0.9023098264421735
Epoch: 5/10; Train loss: 0.1330444886570885; Val loss: 0.9460565930321103
Epoch: 6/10; Train loss: 0.11994333485762278; Val loss: 0.9309921264648438
Epoch: 7/10; Train loss: 0.10998118065652393; Val loss: 0.9437835159755888
Epoch: 8/10; Train loss: 0.10132282401834215; Val loss: 0.9592479978288923
Epoch: 9/10; Train loss: 0.09358535985151926; Val loss: 0.9873813844862438
Epoch: 10/10; Train loss: 0.08679280198755718; Val loss: 1.0520003920509702
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 52% |  7% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 52% |  3% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▃▅▅▆▆▇▇██
training_loss,█▃▂▂▁▁▁▁▁▁
validation_accuracy,▁▅▆▇▇█████
validation_loss,▇▃▃▁▃▂▃▄▅█

0,1
epoch,10.0
training_accuracy,54.13281
training_loss,0.08679
validation_accuracy,39.52637
validation_loss,1.052


[34m[1mwandb[0m: Agent Starting Run: v6xogr34 with config:
[34m[1mwandb[0m: 	batchsize: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	no_of_layers: 2




Epoch: 1/10; Train loss: 0.7164269676094963; Val loss: 0.9493304519426256
Epoch: 2/10; Train loss: 0.22460083314350673; Val loss: 0.9652248620986938
Epoch: 3/10; Train loss: 0.16802109522478922; Val loss: 0.951663491271791
Epoch: 4/10; Train loss: 0.14491477322010768; Val loss: 0.9390208919843038
Epoch: 5/10; Train loss: 0.12994267721970876; Val loss: 0.9313154348305294
Epoch: 6/10; Train loss: 0.1184908865463166; Val loss: 0.925822662455695
Epoch: 7/10; Train loss: 0.10873695810635885; Val loss: 0.9606014348211742
Epoch: 8/10; Train loss: 0.10138276064679737; Val loss: 0.9982157065754845
Epoch: 9/10; Train loss: 0.09372421552737555; Val loss: 1.0110974396978105
Epoch: 10/10; Train loss: 0.08742343796151025; Val loss: 1.0093359379541307
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 40% |  5% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 40% |  2% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▄▅▅▆▆▇▇██
training_loss,█▃▂▂▁▁▁▁▁▁
validation_accuracy,▁▄▆▆▇█████
validation_loss,▃▄▃▂▁▁▄▇██

0,1
epoch,10.0
training_accuracy,53.55078
training_loss,0.08742
validation_accuracy,41.99219
validation_loss,1.00934


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: rd2bwngt with config:
[34m[1mwandb[0m: 	batchsize: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	no_of_layers: 3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111359387780087, max=1.0)…



Epoch: 1/10; Train loss: 0.8446222006706965; Val loss: 1.008161383015769
Epoch: 2/10; Train loss: 0.24656102007343655; Val loss: 0.9673175499552772
Epoch: 3/10; Train loss: 0.1745856924284072; Val loss: 0.9394309066590809
Epoch: 4/10; Train loss: 0.14751173834005993; Val loss: 0.9612573612303961
Epoch: 5/10; Train loss: 0.12954976453667597; Val loss: 0.9410982586088634
Epoch: 6/10; Train loss: 0.11617617775996526; Val loss: 0.9498451919782729
Epoch: 7/10; Train loss: 0.10454790108260655; Val loss: 0.9705173004241217
Epoch: 8/10; Train loss: 0.09569753205492383; Val loss: 0.9831406417347136
Epoch: 9/10; Train loss: 0.08719843650148028; Val loss: 1.0267029844579243
Epoch: 10/10; Train loss: 0.07907635228974479; Val loss: 1.0600490059171404
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 39% |  6% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 39% |  3% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▃▅▅▆▆▇▇██
training_loss,█▃▂▂▁▁▁▁▁▁
validation_accuracy,▁▅▆▇▇█████
validation_loss,▅▃▁▂▁▂▃▄▆█

0,1
epoch,10.0
training_accuracy,57.34766
training_loss,0.07908
validation_accuracy,40.57617
validation_loss,1.06005


[34m[1mwandb[0m: Agent Starting Run: kzf5te9g with config:
[34m[1mwandb[0m: 	batchsize: 256
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 3




Epoch: 1/10; Train loss: 0.9895656061172485; Val loss: 1.1032550561995733
Epoch: 2/10; Train loss: 0.3612326726459322; Val loss: 0.9833807945251465
Epoch: 3/10; Train loss: 0.20565238282794043; Val loss: 0.9249402738752819
Epoch: 4/10; Train loss: 0.1581867493901934; Val loss: 0.930955137525286
Epoch: 5/10; Train loss: 0.13449913359823681; Val loss: 0.9290117422739664
Epoch: 6/10; Train loss: 0.11817624943596976; Val loss: 0.9509167387371972
Epoch: 7/10; Train loss: 0.10514981752350216; Val loss: 0.9905677693230766
Epoch: 8/10; Train loss: 0.0941530555486679; Val loss: 0.9680205470039731
Epoch: 9/10; Train loss: 0.08384213833581834; Val loss: 1.0180594126383464
Epoch: 10/10; Train loss: 0.0747570622534979; Val loss: 1.0635312682106381
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 57% |  9% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 57% |  3% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▂▄▅▆▆▇▇██
training_loss,█▃▂▂▁▁▁▁▁▁
validation_accuracy,▁▅▆▇▇▇████
validation_loss,█▃▁▁▁▂▄▃▅▆

0,1
epoch,10.0
training_accuracy,59.12305
training_loss,0.07476
validation_accuracy,40.50293
validation_loss,1.06353


[34m[1mwandb[0m: Agent Starting Run: csj5noqt with config:
[34m[1mwandb[0m: 	batchsize: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 3




Epoch: 1/10; Train loss: 0.6967387835184733; Val loss: 1.0060071860040938
Epoch: 2/10; Train loss: 0.21287803229831515; Val loss: 0.9996362356912523
Epoch: 3/10; Train loss: 0.15986574964863914; Val loss: 0.9792629877726237
Epoch: 4/10; Train loss: 0.13659471812702362; Val loss: 1.0071044195265997
Epoch: 5/10; Train loss: 0.12024541918720517; Val loss: 0.9939982138928913
Epoch: 6/10; Train loss: 0.1072023396264939; Val loss: 1.0312626446996416
Epoch: 7/10; Train loss: 0.09643002781130018; Val loss: 1.037904455548241
Epoch: 8/10; Train loss: 0.08757181580577578; Val loss: 1.0526733370054335
Epoch: 9/10; Train loss: 0.07866008809634617; Val loss: 1.0852615634600322
Epoch: 10/10; Train loss: 0.07135803709427516; Val loss: 1.0847898381096976
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 42% |  6% |
|  1 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 42% |  3% |
|  1 |  0% |  0% |


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
training_accuracy,▁▃▅▅▆▆▇▇██
training_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▅▆▇▇█████
validation_loss,▃▂▁▃▂▄▅▆██

0,1
epoch,10.0
training_accuracy,60.48438
training_loss,0.07136
validation_accuracy,40.9668
validation_loss,1.08479


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ylxw70oc with config:
[34m[1mwandb[0m: 	batchsize: 256
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	no_of_layers: 3




Epoch: 1/10; Train loss: 0.9861873619897025; Val loss: 1.1181894484020414
