# Install libraries

In [None]:
!pip install -q wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.5/263.5 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h

# Import Libraries

In [None]:
import json
import random
from tqdm import tqdm
import pprint
import wandb
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import torch.optim as optim


In [None]:
# Authenticate this session with Weights & Biases before any run or sweep is created.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Print the name of every CUDA device visible to PyTorch (prints nothing when there is none).
for i in range(torch.cuda.device_count()):
   print(torch.cuda.get_device_properties(i).name)

Tesla V100-SXM2-16GB


In [None]:
def load_file(file_path):
    """Load a dataset file and split every line into a (function, derivative) pair.

    Each non-empty line is expected to have the form ``<function>=<derivative>``.
    Blank lines (for example a trailing newline at the end of the file) are skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A pair ``(functions, derivatives)`` of equally long tuples of strings,
        in file order. Both tuples are empty when the file has no usable line.

    Raises:
        ValueError: If a non-empty line does not contain exactly one "=".
    """
    functions, derivatives = [], []

    # The context manager guarantees the handle is closed, even on error.
    with open(file_path, "r") as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            line = raw_line.strip()
            if not line:
                continue

            parts = line.split("=")
            if len(parts) != 2:
                # Failing loudly is safer than silently misaligning the pairs.
                raise ValueError(
                    f"{file_path}:{line_number}: expected '<function>=<derivative>', got {line!r}"
                )

            functions.append(parts[0])
            derivatives.append(parts[1])

    return tuple(functions), tuple(derivatives)

In [None]:
TRAIN_FILE = "train.txt"

functions, derivatives = load_file(TRAIN_FILE)

# Contiguous split in file order (no shuffling): 70% train, 20% validation,
# and whatever remains (about 10%) for test.
train_size = int(0.7 * len(functions))
val_size = int(0.2 * len(functions))
test_size = len(functions) - train_size - val_size

# Inputs (functions) for each partition.
train_functions = functions[:train_size]
val_functions = functions[train_size:train_size + val_size]
test_functions = functions[train_size + val_size:]

# Targets (derivatives), sliced with the same boundaries so pairs stay aligned.
train_derivatives = derivatives[:train_size]
val_derivatives = derivatives[train_size:train_size + val_size]
test_derivatives = derivatives[train_size + val_size:]

In [None]:
# Load the pretrained tokenizer that ships with the "google/byt5-small" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

In [None]:
# Register "<SOS>" as the tokenizer's beginning-of-sequence (bos) special token.
# This grows the vocabulary, so len(tokenizer) must be read after this cell.

special_tokens_dict = {'bos_token': '<SOS>'}
# add_special_tokens returns how many tokens were newly added; the value is not used afterwards.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Show the special tokens and their ids to confirm "<SOS>" was registered.
print(tokenizer.all_special_tokens)
print(tokenizer.all_special_ids)

['<SOS>', '</s>', '<unk>', '<pad>', '<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_45>', '<extra_id_46>', '<extra_id_47>', '<extra_id_48>', '<extra_id_49>', '<extra_id_50>', '<extra_id_51>', '<extra_id_52>', '<extra_id_53>', '<extra_id_54>', '<extra_id_55>', '<extra_id_56>', '<ext

# Creating Dataset




In [None]:
class MathDataset(Dataset):
    """Dataset of (function, derivative) string pairs encoded as token-id tensors.

    Every string is prefixed with "<SOS>" and then tokenized, padded and
    truncated to ``max_length`` ids.
    """

    def __init__(self, functions, derivatives, tokenizer, max_length=30):
        """Store the raw strings, the tokenizer and the fixed sequence length."""
        self.tokenizer = tokenizer
        self.functions = functions
        self.derivatives = derivatives
        self.max_length = max_length

    def __len__(self):
        """Number of (function, derivative) pairs."""
        return len(self.functions)

    def _encode(self, text):
        """Prefix ``text`` with <SOS> and return its ids as a 1-D tensor."""
        encoded = self.tokenizer(
            "<SOS>" + text,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )
        # return_tensors="pt" yields a leading batch dimension of 1; drop it.
        return encoded.input_ids.squeeze(0)

    def __getitem__(self, idx):
        """Return ``(src_input_ids, tgt_input_ids)`` for the pair at ``idx``."""
        pair = (self.functions[idx], self.derivatives[idx])
        src_input_ids, tgt_input_ids = (self._encode(text) for text in pair)
        return src_input_ids, tgt_input_ids

# One dataset per split, all sharing the same tokenizer and the default max_length of MathDataset.
train_dataset = MathDataset(train_functions, train_derivatives, tokenizer)
val_dataset = MathDataset(val_functions, val_derivatives, tokenizer)
test_dataset = MathDataset(test_functions, test_derivatives, tokenizer)


# # print first 3 samples

# for i in range(3):
#     print("Sample", i)
#     print("Function:", train_functions[i])
#     print("Derivative:", train_derivatives[i])
#     print("Tokenized Function:", tokenizer.decode(train_dataset[i][0]))
#     print("Tokenized Function IDs:", train_dataset[i][0])
#     print("Tokenized Derivative:", tokenizer.decode(train_dataset[i][1]))
#     print("Tokenized Derivative IDs:", train_dataset[i][1])


# Encoder-Decoder with Attention Model Architecture

In [None]:
class Encoder(nn.Module):
    """LSTM encoder: embeds source token ids and runs them through a stacked LSTM.

    Args:
        input_size: Size of the source vocabulary (number of embedding rows).
        embedding_size: Dimension of each token embedding.
        hidden_size: Hidden dimension of the LSTM.
        num_layers: Number of stacked LSTM layers.
        p: Dropout probability applied to the embeddings and, when there is
            more than one layer, between LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        # nn.LSTM only applies dropout between stacked layers. With a single
        # layer a non-zero value has no effect and just triggers a UserWarning,
        # so pass 0 in that case.
        inter_layer_dropout = p if num_layers > 1 else 0.0
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=inter_layer_dropout)

    def forward(self, x):
        """Encode a batch of source sequences.

        Args:
            x: Token ids of shape (seq_length, N) where N is the batch size.

        Returns:
            ``(encoder_states, hidden, cell)``: the per-step outputs of shape
            (seq_length, N, hidden_size) plus the final hidden and cell states
            returned by the LSTM.
        """
        # embedding shape: (seq_length, N, embedding_size)
        embedding = self.dropout(self.embedding(x))

        # encoder_states shape: (seq_length, N, hidden_size)
        encoder_states, (hidden, cell) = self.rnn(embedding)

        return encoder_states, hidden, cell

class Attention(nn.Module):
    """Additive attention over the encoder outputs.

    Scores each encoder step against the decoder state, normalises the scores
    over the sequence dimension and returns the weighted sum of encoder outputs.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.key_layer = nn.Linear(hidden_size, hidden_size)
        self.query_layer = nn.Linear(hidden_size, hidden_size)
        self.energy_layer = nn.Linear(hidden_size, 1)
        # dim=0 is the sequence axis of the (seq_length, N) score matrix.
        self.softmax = nn.Softmax(dim=0)

    def forward(self, hidden, encoder_outputs):
        """Compute the context vector for one decoding step.

        Args:
            hidden: Decoder state used as the query; it is tiled along a new
                leading axis to match the encoder sequence length.
            encoder_outputs: Tensor of shape (seq_length, N, hidden_size).

        Returns:
            ``(context_vector, attention_weights)`` with shapes
            (N, hidden_size) and (seq_length, N).
        """
        num_steps, _, _ = encoder_outputs.shape

        # One copy of the query per encoder step.
        tiled_query = hidden.repeat(num_steps, 1, 1)

        projected = self.key_layer(encoder_outputs) + self.query_layer(tiled_query)
        scores = self.energy_layer(torch.tanh(projected)).squeeze(2)  # (seq_length, N)
        attention_weights = self.softmax(scores)  # (seq_length, N)

        # Weighted sum over the sequence axis -> (N, hidden_size)
        context_vector = torch.einsum("sn,snh->nh", attention_weights, encoder_outputs)

        return context_vector, attention_weights

class Decoder(nn.Module):
    """Single-step LSTM decoder with optional attention over the encoder outputs.

    Args:
        input_size: Size of the target vocabulary (number of embedding rows).
        embedding_size: Dimension of each token embedding.
        hidden_size: Hidden dimension of the LSTM (must match the encoder's).
        output_size: Number of output classes produced by the final linear layer.
        num_layers: Number of stacked LSTM layers.
        p: Dropout probability applied to the embeddings and, when there is
            more than one layer, between LSTM layers.
        use_attention: When True, a context vector of size ``hidden_size`` is
            concatenated to the embedding before the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p, use_attention=True):
        super().__init__()
        self.use_attention = use_attention
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.fc = nn.Linear(hidden_size, output_size)

        # With attention the LSTM input is [embedding ; context vector].
        rnn_input_size = embedding_size + (hidden_size if use_attention else 0)
        # nn.LSTM only applies dropout between stacked layers. With a single
        # layer a non-zero value has no effect and just triggers a UserWarning.
        inter_layer_dropout = p if num_layers > 1 else 0.0
        self.rnn = nn.LSTM(rnn_input_size, hidden_size, num_layers, dropout=inter_layer_dropout)

        if use_attention:
            self.attention = Attention(hidden_size)

    def forward(self, x, hidden, cell, encoder_outputs=None):
        """Decode one time step.

        Args:
            x: Token ids for the current step, shape (N,).
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.
            encoder_outputs: Encoder outputs of shape
                (seq_length, N, hidden_size); required when attention is on.

        Returns:
            ``(predictions, hidden, cell, attention_weights)`` where
            ``predictions`` has shape (N, output_size) and
            ``attention_weights`` is None when attention is disabled.

        Raises:
            ValueError: If attention is enabled but ``encoder_outputs`` is None.
        """
        x = x.unsqueeze(0)  # x shape: (1, N)
        embedding = self.dropout(self.embedding(x))  # embedding shape: (1, N, embedding_size)

        if self.use_attention:
            if encoder_outputs is None:
                # Without this check the LSTM would fail later with an opaque
                # input-size mismatch.
                raise ValueError("encoder_outputs is required when use_attention=True")
            # hidden[-1] is the hidden state of the top (last) LSTM layer; it is the attention query.
            context_vector, attention_weights = self.attention(hidden[-1], encoder_outputs)
            rnn_input = torch.cat((embedding, context_vector.unsqueeze(0)), dim=2)
        else:
            rnn_input = embedding
            attention_weights = None

        outputs, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        predictions = self.fc(outputs).squeeze(0)  # (N, output_size)

        return predictions, hidden, cell, attention_weights


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one token at a time.

    Args:
        encoder: Module returning ``(encoder_outputs, hidden, cell)`` for a
            source tensor of shape (seq_length, N).
        decoder: Module called as ``decoder(x, hidden, cell, encoder_outputs)``
            and returning ``(logits, hidden, cell, attention_weights)``. It
            must expose its output layer as ``decoder.fc``.
        use_attention: Whether the encoder outputs are passed to the decoder.
    """

    def __init__(self, encoder, decoder, use_attention=True):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.use_attention = use_attention

    def _decode_step(self, x, hidden, cell, encoder_outputs):
        """Run the decoder for one step, passing encoder outputs only with attention."""
        memory = encoder_outputs if self.use_attention else None
        output, hidden, cell, _ = self.decoder(x, hidden, cell, memory)
        return output, hidden, cell

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Decode ``target`` step by step with stochastic teacher forcing.

        Args:
            source: Source token ids, shape (source_len, N).
            target: Target token ids, shape (target_len, N); ``target[0]`` is
                fed as the first decoder input.
            teacher_force_ratio: Probability, per step, of feeding the ground
                truth token instead of the model's own prediction.

        Returns:
            Logits of shape (target_len, N, vocab_size). Row 0 is left as
            zeros because no prediction is made for the first token.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Size the buffer from the decoder's own output layer rather than from
        # a notebook-global tokenizer, so the model works on its own.
        target_vocab_size = self.decoder.fc.out_features
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(source.device)

        encoder_outputs, hidden, cell = self.encoder(source)

        x = target[0]  # Initial input to the decoder

        for t in range(1, target_len):
            output, hidden, cell = self._decode_step(x, hidden, cell, encoder_outputs)

            outputs[t] = output
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

    def predict(self, source, eos_token, max_length=30):
        """Greedily decode a single source sequence.

        Args:
            source: Source token ids of shape (source_len, 1). The first
                source token is used as the first decoder input, which assumes
                the source starts with the start-of-sequence symbol.
            eos_token: Token id that stops decoding once produced.
            max_length: Maximum number of tokens to generate.

        Returns:
            List of predicted token ids, including ``eos_token`` if reached.

        Raises:
            ValueError: If ``source`` is not a single sequence of shape
                (source_len, 1).
        """
        if source.dim() != 2 or source.shape[1] != 1:
            # Each step's prediction is read as one scalar, so only one sequence fits.
            raise ValueError("predict expects source of shape (source_len, 1)")

        outputs = []
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            encoder_outputs, hidden, cell = self.encoder(source)
            inputs = source[0, :]  # Assuming the start symbol is the first input

            for _ in range(max_length):
                output, hidden, cell = self._decode_step(inputs, hidden, cell, encoder_outputs)

                best_guess = output.argmax(1)
                token_id = best_guess.item()
                outputs.append(token_id)
                inputs = best_guess
                if token_id == eos_token:
                    break

        return outputs


# Create Train/Val Dataloader

In [None]:
def prepare_data_loaders(batch_size):
    """Build the training and validation DataLoaders.

    Reads the module-level ``train_dataset`` and ``val_dataset``. Only the
    training loader shuffles its samples.

    Args:
        batch_size: Number of samples per batch for both loaders.

    Returns:
        ``(train_loader, val_loader)``.
    """
    loaders = (
        DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
        DataLoader(val_dataset, batch_size=batch_size, shuffle=False),
    )
    return loaders

# Initialize Model

In [None]:
def initialize_model_and_optim(config, device):
    """Build the seq2seq model and its loss function from a config object.

    Despite the name, no optimizer is created here; see ``get_optimizer``.

    Args:
        config: Object exposing the hyperparameters as attributes
            (``input_size_encoder``, ``encoder_embedding_size``,
            ``input_size_decoder``, ``decoder_embedding_size``, ``hidden_size``,
            ``output_size``, ``num_layers``, ``enc_dropout``, ``dec_dropout``,
            ``use_attention``, ``pad_idx``).
        device: Device the encoder, decoder and wrapper are moved to.

    Returns:
        ``(model, criterion)`` where ``criterion`` is a cross-entropy loss that
        ignores targets equal to ``config.pad_idx``.
    """
    encoder_kwargs = dict(
        input_size=config.input_size_encoder,
        embedding_size=config.encoder_embedding_size,
        hidden_size=config.hidden_size,
        num_layers=config.num_layers,
        p=config.enc_dropout,
    )
    decoder_kwargs = dict(
        input_size=config.input_size_decoder,
        embedding_size=config.decoder_embedding_size,
        hidden_size=config.hidden_size,
        output_size=config.output_size,
        num_layers=config.num_layers,
        p=config.dec_dropout,
        use_attention=config.use_attention,
    )

    encoder_net = Encoder(**encoder_kwargs).to(device)
    decoder_net = Decoder(**decoder_kwargs).to(device)
    seq2seq = Seq2Seq(encoder_net, decoder_net, config.use_attention).to(device)

    # Padding positions contribute nothing to the loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=config.pad_idx)

    return seq2seq, loss_fn

# Choose optimizer



In [None]:
def get_optimizer(config, model):
    """Create the optimizer selected by ``config.optimizer_type``.

    Args:
        config: Object exposing ``learning_rate`` and ``optimizer_type`` as
            attributes and a dict-style ``get`` (used for the optional
            ``momentum`` of SGD, default 0.9).
        model: Module whose parameters will be optimised.

    Returns:
        An ``optim.Adam`` for "adam" or an ``optim.SGD`` for "sgd".

    Raises:
        ValueError: For any other optimizer type.
    """
    lr = config.learning_rate
    kind = config.optimizer_type

    if kind == "adam":
        return optim.Adam(model.parameters(), lr=lr)

    if kind == "sgd":
        # Fall back to a momentum of 0.9 when the config does not define one.
        return optim.SGD(model.parameters(), lr=lr, momentum=config.get("momentum", 0.9))

    raise ValueError("Unsupported optimizer type")

# Train Loop

In [None]:
def _batch_loss(model, criterion, source, target, device):
    """Run one batch through the model and return its loss tensor.

    Shared by the training and validation loops so both feed the model the
    same tensor layout.
    """
    # The DataLoader yields (batch_size, seq_len), but the model reads
    # source.shape[1] as the batch size and target.shape[0] as the target
    # length, so both tensors must be (seq_len, batch_size).
    source = source.to(device).transpose(0, 1)
    target = target.to(device).transpose(0, 1)

    # output shape: (trg_len, batch_size, output_dim)
    output = model(source, target)

    # Drop step 0 (the start token, for which nothing is predicted) and
    # flatten time and batch so CrossEntropyLoss sees (tokens, classes).
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    return criterion(output, target)


def train_model(config=None):
    """Train and validate the seq2seq model inside a W&B run.

    Works both as a sweep target (``config=None``, hyperparameters supplied by
    the sweep agent) and for a single run (``config`` given as a dict).
    Per-epoch average training and validation losses are printed and logged to
    W&B as ``train_loss`` and ``val_loss``.

    Args:
        config: Optional dict of hyperparameters passed to ``wandb.init``. The
            values actually used are read back from ``wandb.config``.

    Notes:
        * Validation calls the model with its default ``teacher_force_ratio``,
          so ``val_loss`` is partly teacher-forced and slightly stochastic.
        * When ``config`` has a truthy ``save_model`` entry, ``save_model`` is
          called after the last epoch. It is defined in a later notebook cell,
          which must have been executed before training starts.
    """
    with wandb.init(project="math-derivatives", config=config):
        config = wandb.config

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        train_loader, val_loader = prepare_data_loaders(config.batch_size)
        model, criterion = initialize_model_and_optim(config, device)
        optimizer = get_optimizer(config, model)
        num_epochs = config.epochs

        for epoch in range(1, num_epochs + 1):
            print(f'Epoch [{epoch} / {num_epochs}]')

            # ---- training ----
            model.train()
            total_train_loss = 0
            progress_bar = tqdm(train_loader, total=len(train_loader), desc='Training')
            for source, target in progress_bar:
                optimizer.zero_grad()
                loss = _batch_loss(model, criterion, source, target, device)
                total_train_loss += loss.item()

                # Backward pass
                loss.backward()

                # Clip to avoid exploding gradient issues, makes sure grads are
                # within a healthy range
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

                # Gradient descent step
                optimizer.step()

            average_train_loss = total_train_loss / len(train_loader)  # Compute the average training loss
            print(f'Average Training Loss: {average_train_loss:.4f}')
            wandb.log({"train_loss": average_train_loss, "epoch": epoch})

            # ---- validation ----
            model.eval()
            total_val_loss = 0
            progress_bar = tqdm(val_loader, total=len(val_loader), desc='Validation')
            # No gradients are needed here; skipping them saves memory and time.
            with torch.no_grad():
                for source, target in progress_bar:
                    loss = _batch_loss(model, criterion, source, target, device)
                    total_val_loss += loss.item()

            average_val_loss = total_val_loss / len(val_loader)
            print(f'Average Validation Loss: {average_val_loss:.4f}')
            wandb.log({"val_loss": average_val_loss, "epoch": epoch})

        # Saving is opt-in; the sweep config has no "save_model" key.
        if config.get("save_model", False):
            save_model(model, config)


In [None]:
""""
Save model
Not for wandb sweep
"""

def save_model(model, config, model_dir="models", model_date="12-03-24", day_version=1):
    """Save the model weights and its hyperparameters, then upload both to W&B.

    Two files are written into ``model_dir``:
    ``{model_date}_v{day_version}_seq2seq.pth`` (the ``state_dict``) and
    ``{model_date}_v{day_version}_seq2seq_params.json`` (the config).

    Args:
        model: Module whose ``state_dict`` is saved.
        config: Hyperparameters, either a plain mapping or a ``wandb.config``
            object.
        model_dir: Output directory; created if it does not exist.
        model_date: Date tag used in the file names.
        day_version: Version counter used in the file names.
    """
    import os

    # open() and torch.save() do not create missing directories.
    os.makedirs(model_dir, exist_ok=True)

    model_path = f"{model_dir}/{model_date}_v{day_version}_seq2seq.pth"
    model_params_path = f"{model_dir}/{model_date}_v{day_version}_seq2seq_params.json"

    # wandb.config is not a plain dict and json cannot serialise it directly,
    # so convert it first (as_dict() when available, dict() otherwise).
    params = config.as_dict() if hasattr(config, "as_dict") else dict(config)

    with open(model_params_path, 'w') as f:
        json.dump(params, f)

    torch.save(model.state_dict(), model_path)
    wandb.save(model_path)
    wandb.save(model_params_path)



# Run WandB Sweep


In [None]:
"""
Hyperparameter tuning using wandb
"""

# Random search that minimises the logged validation loss.
sweep_config = dict(
    method='random',
    metric=dict(name='val_loss', goal='minimize'),
)


# Hyperparameters to sweep/tune
# (the three vocabulary-sized entries have a single candidate: len(tokenizer))

parameters_dict = {
    name: {'values': candidates}
    for name, candidates in [
        ('input_size_encoder', [len(tokenizer)]),
        ('encoder_embedding_size', [128, 256, 512, 1024]),
        ('input_size_decoder', [len(tokenizer)]),
        ('decoder_embedding_size', [128, 256, 512, 1024]),
        ('output_size', [len(tokenizer)]),
        ('hidden_size', [256, 512, 1024]),
        ('num_layers', [1, 2, 3]),
        ('use_attention', [True, False]),
    ]
}

# sweep_config holds a reference to parameters_dict, so the update below is
# visible through sweep_config as well.
sweep_config["parameters"] = parameters_dict


# Other parameters that remain constant

parameters_dict.update(
    (name, {'values': [constant]})
    for name, constant in [
        ('enc_dropout', 0.5),
        ('dec_dropout', 0.5),
        ('epochs', 5),
        ('batch_size', 64),
        ('pad_idx', tokenizer.pad_token_id),
        ('optimizer_type', "adam"),
        ('learning_rate', 0.001),
    ]
)

pprint.pprint(sweep_config)

{'method': 'random',
 'metric': {'goal': 'minimize', 'name': 'val_loss'},
 'parameters': {'batch_size': {'values': [64]},
                'dec_dropout': {'values': [0.5]},
                'decoder_embedding_size': {'values': [128, 256, 512, 1024]},
                'enc_dropout': {'values': [0.5]},
                'encoder_embedding_size': {'values': [128, 256, 512, 1024]},
                'epochs': {'values': [5]},
                'hidden_size': {'values': [256, 512, 1024]},
                'input_size_decoder': {'values': [385]},
                'input_size_encoder': {'values': [385]},
                'learning_rate': {'values': [0.001]},
                'num_layers': {'values': [1, 2, 3]},
                'optimizer_type': {'values': ['adam']},
                'output_size': {'values': [385]},
                'pad_idx': {'values': [0]},
                'use_attention': {'values': [True, False]}}}


In [None]:
# Register the sweep described by sweep_config in the "math-derivatives" project; the returned id is handed to wandb.agent.
sweep_id = wandb.sweep(sweep_config, project="math-derivatives")

In [None]:
# Start a sweep agent that calls train_model for sweep runs; count=5 caps the number of runs.
wandb.agent(sweep_id, train_model, count=5)


[34m[1mwandb[0m: Agent Starting Run: btseg2zc with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dec_dropout: 0.5
[34m[1mwandb[0m: 	decoder_embedding_size: 512
[34m[1mwandb[0m: 	enc_dropout: 0.5
[34m[1mwandb[0m: 	encoder_embedding_size: 512
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	input_size_decoder: 385
[34m[1mwandb[0m: 	input_size_encoder: 385
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer_type: adam
[34m[1mwandb[0m: 	output_size: 385
[34m[1mwandb[0m: 	pad_idx: 0
[34m[1mwandb[0m: 	use_attention: True
[34m[1mwandb[0m: Currently logged in as: [33msidb98[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch [1 / 5]


Training: 100%|██████████| 10938/10938 [16:56<00:00, 10.76it/s]


Average Training Loss: 0.3150


Validation: 100%|██████████| 3125/3125 [04:21<00:00, 11.93it/s]


Average Validation Loss: 8.4518
Epoch [2 / 5]


Training: 100%|██████████| 10938/10938 [17:14<00:00, 10.58it/s]


Average Training Loss: 0.0701


Validation: 100%|██████████| 3125/3125 [04:36<00:00, 11.32it/s]


Average Validation Loss: 8.4051
Epoch [3 / 5]


Training: 100%|██████████| 10938/10938 [17:40<00:00, 10.31it/s]


Average Training Loss: 0.0365


Validation: 100%|██████████| 3125/3125 [04:27<00:00, 11.70it/s]


Average Validation Loss: 8.1993
Epoch [4 / 5]


Training: 100%|██████████| 10938/10938 [17:17<00:00, 10.54it/s]


Average Training Loss: 0.0245


Validation: 100%|██████████| 3125/3125 [04:29<00:00, 11.59it/s]


Average Validation Loss: 8.0788
Epoch [5 / 5]


Training: 100%|██████████| 10938/10938 [17:22<00:00, 10.50it/s]


Average Training Loss: 0.0185


Validation: 100%|██████████| 3125/3125 [04:28<00:00, 11.66it/s]


Average Validation Loss: 8.2395


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▃▃▅▅▆▆██
train_loss,█▂▁▁▁
val_loss,█▇▃▁▄

0,1
epoch,5.0
train_loss,0.01851
val_loss,8.23949


[34m[1mwandb[0m: Agent Starting Run: tyein8n3 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dec_dropout: 0.5
[34m[1mwandb[0m: 	decoder_embedding_size: 512
[34m[1mwandb[0m: 	enc_dropout: 0.5
[34m[1mwandb[0m: 	encoder_embedding_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_size_decoder: 385
[34m[1mwandb[0m: 	input_size_encoder: 385
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer_type: adam
[34m[1mwandb[0m: 	output_size: 385
[34m[1mwandb[0m: 	pad_idx: 0
[34m[1mwandb[0m: 	use_attention: False


Epoch [1 / 5]


Training: 100%|██████████| 10938/10938 [10:18<00:00, 17.68it/s]


Average Training Loss: 0.6405


Validation: 100%|██████████| 3125/3125 [02:33<00:00, 20.29it/s]


Average Validation Loss: 9.0517
Epoch [2 / 5]


Training: 100%|██████████| 10938/10938 [10:18<00:00, 17.69it/s]


Average Training Loss: 0.1902


Validation: 100%|██████████| 3125/3125 [02:34<00:00, 20.29it/s]


Average Validation Loss: 7.6406
Epoch [3 / 5]


Training: 100%|██████████| 10938/10938 [10:19<00:00, 17.66it/s]


Average Training Loss: 0.1142


Validation: 100%|██████████| 3125/3125 [02:35<00:00, 20.16it/s]


Average Validation Loss: 7.5002
Epoch [4 / 5]


Training: 100%|██████████| 10938/10938 [10:19<00:00, 17.66it/s]


Average Training Loss: 0.0735


Validation: 100%|██████████| 3125/3125 [02:34<00:00, 20.18it/s]


Average Validation Loss: 7.6362
Epoch [5 / 5]


Training: 100%|██████████| 10938/10938 [10:20<00:00, 17.63it/s]


Average Training Loss: 0.0483


Validation: 100%|██████████| 3125/3125 [02:34<00:00, 20.18it/s]


Average Validation Loss: 7.0866


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▃▃▅▅▆▆██
train_loss,█▃▂▁▁
val_loss,█▃▂▃▁

0,1
epoch,5.0
train_loss,0.04825
val_loss,7.08665


[34m[1mwandb[0m: Agent Starting Run: yzjz892r with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dec_dropout: 0.5
[34m[1mwandb[0m: 	decoder_embedding_size: 128
[34m[1mwandb[0m: 	enc_dropout: 0.5
[34m[1mwandb[0m: 	encoder_embedding_size: 256
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_size_decoder: 385
[34m[1mwandb[0m: 	input_size_encoder: 385
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer_type: adam
[34m[1mwandb[0m: 	output_size: 385
[34m[1mwandb[0m: 	pad_idx: 0
[34m[1mwandb[0m: 	use_attention: False


Epoch [1 / 5]


Training:  18%|█▊        | 1937/10938 [01:49<08:20, 17.97it/s][34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


# Run on a Single Set of Hyperparameters

In [None]:
# Hyperparameters for one stand-alone training run (no sweep).
# The three vocabulary-sized entries are taken from the tokenizer, so they
# already account for the added "<SOS>" token.
config = {
    'input_size_encoder':len(tokenizer),
    'encoder_embedding_size': 256,
    'input_size_decoder': len(tokenizer),
    'decoder_embedding_size': 512,
    'output_size': len(tokenizer),
    'hidden_size': 512,
    'num_layers': 2,
    'enc_dropout': 0.5,
    'dec_dropout': 0.5,
    'use_attention': True,
    'epochs': 7,
    'batch_size': 64,
    'pad_idx': tokenizer.pad_token_id,
    'optimizer_type': "adam",
    'learning_rate': 0.001,
    # Makes train_model call save_model after the last epoch; the cell that
    # defines save_model must have been run first, otherwise a NameError is raised.
    'save_model' :True,
}

train_model(config)   # Starts training immediately; comment this line out to skip training

# Close any W&B run that is still active.
wandb.finish()

Epoch [1 / 7]


Training: 100%|██████████| 10938/10938 [18:01<00:00, 10.11it/s]


Average Training Loss: 0.3383


Validation: 100%|██████████| 3125/3125 [04:44<00:00, 10.99it/s]


Average Validation Loss: 8.5401
Epoch [2 / 7]


Training: 100%|██████████| 10938/10938 [18:31<00:00,  9.84it/s]


Average Training Loss: 0.0603


Validation: 100%|██████████| 3125/3125 [04:48<00:00, 10.82it/s]


Average Validation Loss: 7.1883
Epoch [3 / 7]


Training: 100%|██████████| 10938/10938 [18:38<00:00,  9.78it/s]


Average Training Loss: 0.0301


Validation: 100%|██████████| 3125/3125 [05:02<00:00, 10.33it/s]


Average Validation Loss: 6.4590
Epoch [4 / 7]


Training: 100%|██████████| 10938/10938 [18:47<00:00,  9.70it/s]


Average Training Loss: 0.0203


Validation: 100%|██████████| 3125/3125 [04:47<00:00, 10.86it/s]


Average Validation Loss: 6.8734
Epoch [5 / 7]


Training: 100%|██████████| 10938/10938 [18:23<00:00,  9.91it/s]


Average Training Loss: 0.0160


Validation: 100%|██████████| 3125/3125 [04:50<00:00, 10.78it/s]


Average Validation Loss: 7.2889
Epoch [6 / 7]


Training: 100%|██████████| 10938/10938 [18:41<00:00,  9.75it/s]


Average Training Loss: 0.0137


Validation: 100%|██████████| 3125/3125 [04:49<00:00, 10.80it/s]


Average Validation Loss: 7.2291
Epoch [7 / 7]


Training: 100%|██████████| 10938/10938 [19:03<00:00,  9.57it/s]


Average Training Loss: 0.0122


Validation: 100%|██████████| 3125/3125 [04:46<00:00, 10.90it/s]
Traceback (most recent call last):
  File "<ipython-input-29-4a4d022f91d5>", line 77, in train_model
    save_model(model, config)
NameError: name 'save_model' is not defined


Average Validation Loss: 7.3972


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▅▅▆▆▇▇██
train_loss,█▂▁▁▁▁▁
val_loss,█▃▁▂▄▄▄

0,1
epoch,7.0
train_loss,0.01217
val_loss,7.39725


NameError: name 'save_model' is not defined