In [1]:
import os
import sys
sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(f'..{os.sep}utils'))))
sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname( '..'))))
from utils.constants import *
import torch
import torch.nn as nn
from transformer_v2 import Transformer
from utils.function_utils import *
from func_load_model import *
from utils.optimizer_n_scheduler import *
from utils.logging_tensorboard import create_summary_writer, log_loss, log_learning_rate, log_gradients, log_attention_weights
from utils.distributions import *

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 64
max_len = MODEL_MAX_SEQ_LEN
d_model = MODEL_DIM
num_layers = MODEL_N_LAYERS
num_heads = MODEL_N_HEADS
dropout = MODEL_DROPOUT
num_epochs = 10
learning_rate = 1e-4
warmup_steps = 2000
weight_decay = 1e-4
VOCAB_SIZE = 32_000
d_ff = MODEL_FF
label_smoothing = MODEL_LABEL_SMOTHING

NUM_PHRASES = 1_000_000

n=0
LOGGING_FILE = f'runs{os.sep}translation_experiment_{n}'

In [3]:
tokenizer = load_tokenizer()
model = Transformer(VOCAB_SIZE,
                    VOCAB_SIZE, 
                    d_model, 
                    num_heads, 
                    num_layers, 
                    d_ff, 
                    dropout, 
                    max_len).to(device)

In [4]:
optimizer, scheduler = create_optimizer_and_scheduler(model, d_model, warmup_steps, learning_rate, weight_decay)

In [5]:
writer = create_summary_writer(LOGGING_FILE)

In [6]:
sentence_pairs = load_dataset(FILE_PATH, limit=NUM_PHRASES)
split_idx = int(len(sentence_pairs) * 0.9)
train_sentence_pairs = sentence_pairs[:split_idx]
val_sentence_pairs = sentence_pairs[split_idx:]

In [7]:
train_dataset = preprocess_data(train_sentence_pairs, tokenizer, max_len)
val_dataset = preprocess_data(val_sentence_pairs, tokenizer, max_len)

In [8]:
train_dataloader = create_dataloader(train_dataset, batch_size, tokenizer, shuffle=True, num_workers=0)
val_dataloader = create_dataloader(val_dataset, batch_size, tokenizer, shuffle=False, num_workers=0)

In [9]:
pad_idx = tokenizer.token_to_id("<pad>")
criterion = LabelSmoothingKLDivergenceLoss(label_smoothing, VOCAB_SIZE, ignore_index=pad_idx)

In [10]:
def train(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, tgt_vocab, pad_idx, device, log_interval=100):
    global_step = 0

    for epoch in range(num_epochs):
        model.train()
        for batch_idx, (src, tgt) in enumerate(train_loader):
            src, tgt = src.to(device), tgt.to(device)
            src_mask, tgt_mask = generate_masks(src, tgt, pad_idx)
            
            optimizer.zero_grad()

            output = model(src, tgt[:, :-1], src_mask, tgt_mask[:, :-1, :-1])
            _, loss = criterion(output, tgt[:, 1:])
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            scheduler.step()

            # Log loss, learning rate, weights, and attention weights to TensorBoard
            log_loss(writer, loss, global_step)
            log_learning_rate(writer, scheduler.learning_rate(), global_step)
            global_step += 1

        # Evaluate the model on the validation set after each epoch
        val_loss = evaluate_model(model, val_loader, criterion, device, pad_idx)
        print(f"Epoch: {epoch + 1}, Validation Loss: {val_loss:.4f}")

        bleu_score = evaluate_metrics(model, val_loader, tgt_vocab, pad_idx, device)
        print(f"Epoch: {epoch + 1}, BLEU Score: {bleu_score:.4f}")
        
        log_attention_weights(writer, model.attention_weights, global_step)

        # Save the model checkpoint after each epoch
        save_checkpoint(model, optimizer, scheduler, epoch, "checkpoint.pt")

In [11]:
train(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, num_epochs, VOCAB_SIZE, VOCAB_SIZE, device)

RuntimeError: The size of tensor a (93) must match the size of tensor b (92) at non-singleton dimension 3