In [1]:
import os
import sys
# dirname('..<sep>utils') is '..', so this inserts the absolute path of the parent
# directory (the one that contains `utils`), not the `utils` directory itself.
sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(f'..{os.sep}utils'))))
# dirname('..') is the empty string, and abspath('') is the current working directory,
# so this inserts the notebook's working directory (ahead of the entry added above).
sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname( '..'))))
from utils.constants import *
import torch
import torch.nn as nn
from transformer_v2 import Transformer
from utils.function_utils import *
from func_load_model_old import *
from utils.optimizer_n_scheduler import *
from utils.logging_tensorboard import create_summary_writer, log_loss, log_learning_rate, log_gradients, log_attention_weights
from utils.distributions import *
from torch.cuda.amp import GradScaler, autocast
from data_funcs import *

In [2]:
# os.cpu_count() returns None when the number of CPUs cannot be determined.
# Fall back to 0 so num_workers is always an int (for a torch DataLoader, 0 means
# "load in the main process" -- presumably create_dataloader forwards this value).
num_workers = os.cpu_count() or 0

In [3]:
!nvidia-smi

Thu May  4 22:44:09 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.14                 Driver Version: 531.14       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti    WDDM | 00000000:29:00.0  On |                  N/A |
| 48%   44C    P0               45W / 200W|    508MiB /  8192MiB |      6%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
# --- Hardware -------------------------------------------------------------
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# --- Model architecture (values come from utils.constants) ----------------
d_model = MODEL_DIM
d_ff = MODEL_FF
num_layers = MODEL_N_LAYERS
num_heads = MODEL_N_HEADS
dropout = MODEL_DROPOUT
max_len = MODEL_MAX_SEQ_LEN
VOCAB_SIZE = 64_000

# --- Optimisation ---------------------------------------------------------
batch_size = 16
num_epochs = 10
learning_rate = 1e-4
warmup_steps = 2000
weight_decay = 1e-4
label_smoothing = MODEL_LABEL_SMOTHING

# --- Data -----------------------------------------------------------------
FILE_PATH = 'data/en-pt.txt'
NUM_PHRASES = 1_000_000

# --- Logging --------------------------------------------------------------
# `n` is the experiment number embedded in the TensorBoard run directory name.
n = 1
LOGGING_FILE = f'runs{os.sep}translation_experiment_{n}'

## OLD-PREPROCESSING

In [5]:
# Tokenizer for the old preprocessing path (it also supplies the <pad> id further down).
tokenizer = load_tokenizer()

# The first two arguments are the source and target vocabulary sizes;
# this path passes the same VOCAB_SIZE for both.
model = Transformer(
    VOCAB_SIZE, VOCAB_SIZE,
    d_model, num_heads, num_layers, d_ff,
    dropout, max_len,
)
model = model.to(device)

In [6]:
# Build the optimizer together with its learning-rate scheduler for `model`.
optimizer, scheduler = create_optimizer_and_scheduler(
    model,
    d_model,
    warmup_steps,
    learning_rate,
    weight_decay,
)

In [7]:
# Create the TensorBoard summary writer for this run. The training loop logs
# through this module-level `writer` (it is not passed to train() as an argument).
writer = create_summary_writer(LOGGING_FILE)

In [8]:
# Load the raw sentence pairs (limit=NUM_PHRASES) and run preprocess_text on both sides.
sentence_pairs = load_dataset(FILE_PATH, limit=NUM_PHRASES)
preprocessed_pairs = [
    (preprocess_text(en), preprocess_text(pt))
    for en, pt in sentence_pairs
]

# Sequential (unshuffled) split: the leading 90% of the pairs are used for
# training and the remaining tail for validation.
split_idx = int(len(preprocessed_pairs) * 0.9)
train_sentence_pairs, val_sentence_pairs = (
    preprocessed_pairs[:split_idx],
    preprocessed_pairs[split_idx:],
)

In [9]:
# Run preprocess_data (same tokenizer, same max_len) over the train split, then the validation split.
train_dataset, val_dataset = (
    preprocess_data(pairs, tokenizer, max_len)
    for pairs in (train_sentence_pairs, val_sentence_pairs)
)

In [10]:
# Only the training loader is shuffled; validation keeps the dataset order.
train_dataloader = create_dataloader(
    train_dataset, batch_size, tokenizer,
    shuffle=True, num_workers=num_workers,
)
val_dataloader = create_dataloader(
    val_dataset, batch_size, tokenizer,
    shuffle=False, num_workers=num_workers,
)

In [11]:
# Id of the "<pad>" token; passed to the loss as ignore_index.
pad_idx = tokenizer.token_to_id("<pad>")
criterion = LabelSmoothingKLDivergenceLoss(
    label_smoothing,
    VOCAB_SIZE,
    ignore_index=pad_idx,
)

In [None]:
def train(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, tgt_vocab, pad_idx, device, log_interval=100):
    """Train ``model`` with gradient accumulation; validate and checkpoint after every epoch.

    Besides its arguments, this function reads notebook-level names: ``writer``
    (summary writer used by the ``log_*`` helpers), ``tokenizer`` (forwarded to
    ``evaluate_metrics``) and the ``MODEL_N_LAYERS`` / ``MODEL_N_HEADS`` constants.

    Args:
        model: Network called as ``model(src, tgt, src_mask, tgt_mask)``; with
            ``return_attention=True`` it returns a 4-tuple whose last three items
            are the encoder, decoder-self and decoder-encoder attention weights.
        train_loader: Sized iterable of ``(src, tgt)`` batches.
        val_loader: Batches forwarded to ``evaluate_model`` / ``evaluate_metrics``.
        criterion: Callable returning a 2-tuple whose second item is the loss.
        optimizer: Optimizer stepped once per accumulation group.
        scheduler: Object exposing ``step()`` and ``learning_rate()``.
        num_epochs: Number of passes over ``train_loader``.
        tgt_vocab: Accepted for interface compatibility; not used in the body.
        pad_idx: Padding id forwarded to ``generate_masks`` and the evaluators.
        device: Device the batches are moved to.
        log_interval: Currently unused; progress is printed every 10 batches.
    """
    # Number of micro-batches whose gradients are summed before one optimizer step.
    accumulation_steps = 4
    global_step = 0

    def apply_accumulated_gradients():
        # Clip, step optimizer and scheduler, then clear gradients for the next group.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    for epoch in range(num_epochs):
        print('Starting epoch: ', epoch+1)
        model.train()
        optimizer.zero_grad()
        num_batches = len(train_loader)
        for batch_idx, (src, tgt) in enumerate(train_loader):
            src, tgt = src.to(device), tgt.to(device)
            src_mask, tgt_mask = generate_masks(src, tgt, pad_idx)
            if batch_idx == 0:
                # This extra forward pass exists only for logging. Running it under
                # no_grad avoids building (and keeping alive) an autograd graph that
                # is never back-propagated, which would otherwise cost GPU memory.
                with torch.no_grad():
                    _, enc_attention_weights, dec_self_attention_weights, dec_enc_attention_weights = model(src, tgt, src_mask, tgt_mask, return_attention=True)
                attention_weights = {
                    "encoder": enc_attention_weights,
                    "decoder_self": dec_self_attention_weights,
                    "decoder_enc_dec": dec_enc_attention_weights
                }
                log_attention_weights(writer, attention_weights, MODEL_N_LAYERS, MODEL_N_HEADS, global_step)

            # NOTE(review): the same `tgt` is used as decoder input and as the loss
            # target; confirm the input/label shift is handled by the model, the
            # criterion or the data pipeline.
            output = model(src, tgt, src_mask, tgt_mask)
            _, loss = criterion(output, tgt)
            # Normalise so the accumulated gradient is the mean over the group rather
            # than the sum; `loss` itself stays unscaled for logging and printing.
            (loss / accumulation_steps).backward()
            # Also step on the final batch so a trailing, incomplete group is applied
            # instead of being wiped by the zero_grad() at the start of the next epoch.
            is_last_batch = (batch_idx + 1) == num_batches
            if (batch_idx + 1) % accumulation_steps == 0 or is_last_batch:
                apply_accumulated_gradients()
            # Log loss and learning rate to TensorBoard for every batch
            log_loss(writer, loss, global_step)
            log_learning_rate(writer, scheduler.learning_rate(), global_step)
            global_step += 1

            if (batch_idx + 1) % 10 == 0:
                print(f"Epoch {epoch + 1}/{num_epochs} | Batch {batch_idx + 1}/{num_batches} | Train Loss: {loss.item():.4f}")

        # Evaluate the model on the validation set after each epoch
        val_loss = evaluate_model(model, val_loader, criterion, device, pad_idx)
        print(f"Epoch: {epoch + 1} | Validation Loss: {val_loss:.4f}")

        bleu_score = evaluate_metrics(model, val_loader, pad_idx, tokenizer, device)
        print(f"Epoch: {epoch + 1}, BLEU Score: {bleu_score:.4f}")

        # Save the model checkpoint after each epoch
        save_checkpoint(model, optimizer, scheduler, epoch, f"checkpoints{os.sep}checkpoint_epoch_{epoch+1}_val_loss_{val_loss:.4f}.pt")

In [None]:
# Positional order follows train(): ..., num_epochs, tgt_vocab, pad_idx, device.
# The pad_idx slot must receive the tokenizer's "<pad>" id (pad_idx), not the
# vocabulary size, because train() forwards it to the mask and evaluation helpers.
train(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, num_epochs, VOCAB_SIZE, pad_idx, device)

## NEW-PREPROCESSING

In [5]:
# New preprocessing path: load_data returns the two loaders, the source/target
# padding ids and the source/target vocabularies in a single call.
(
    train_dataloader,
    test_dataloader,
    pad_idx_src,
    pad_idx_tgt,
    src_vocab,
    tgt_vocab,
) = load_data(
    FILE_PATH,
    language_direction=LanguageDirection.PT2EN.name,
    limit=NUM_PHRASES,
    batch_size=batch_size,
    max_len=max_len,
)

In [6]:
# Vocabulary sizes come from the vocabularies returned by load_data.
src_vocab_size = len(src_vocab)
tgt_vocab_size = len(tgt_vocab)

model = Transformer(
    src_vocab_size, tgt_vocab_size,
    d_model, num_heads, num_layers, d_ff,
    dropout, max_len,
)
model = model.to(device)

optimizer, scheduler = create_optimizer_and_scheduler(
    model, d_model, warmup_steps, learning_rate, weight_decay,
)

# Summary writer used by the log_* helpers inside train().
writer = create_summary_writer(LOGGING_FILE)

# The loss ignores target padding positions via ignore_index.
criterion = LabelSmoothingKLDivergenceLoss(
    label_smoothing,
    tgt_vocab_size,
    ignore_index=pad_idx_tgt,
)

In [7]:
def train(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, pad_idx_src,pad_idx_tgt, tgt_vocab, device, log_interval=100):
    """Train ``model`` with gradient accumulation; validate and checkpoint after every epoch.

    Besides its arguments, this function reads notebook-level names: ``writer``
    (summary writer used by the ``log_*`` helpers), ``max_len`` (forwarded to
    ``evaluate_metrics``) and the ``MODEL_N_LAYERS`` / ``MODEL_N_HEADS`` constants.

    Args:
        model: Network called as ``model(src, tgt, src_mask, tgt_mask)``; with
            ``return_attention=True`` it returns a 4-tuple whose last three items
            are the encoder, decoder-self and decoder-encoder attention weights.
        train_loader: Sized iterable of ``(src, tgt)`` batches.
        val_loader: Batches forwarded to ``evaluate_model`` / ``evaluate_metrics``.
        criterion: Callable returning a 2-tuple whose second item is the loss.
        optimizer: Optimizer stepped once per accumulation group.
        scheduler: Object exposing ``step()`` and ``learning_rate()``.
        num_epochs: Number of passes over ``train_loader``.
        pad_idx_src: Source padding id forwarded to the mask and evaluation helpers.
        pad_idx_tgt: Target padding id forwarded to the mask and evaluation helpers.
        tgt_vocab: Target vocabulary forwarded to ``evaluate_metrics``.
        device: Device the batches are moved to.
        log_interval: Currently unused; progress is printed every 1000 batches.
    """
    # Number of micro-batches whose gradients are summed before one optimizer step.
    accumulation_steps = 4
    global_step = 0

    for epoch in range(num_epochs):
        print('Starting epoch: ', epoch+1)
        model.train()
        optimizer.zero_grad()
        num_batches = len(train_loader)
        for batch_idx, (src, tgt) in enumerate(train_loader):
            src, tgt = src.to(device), tgt.to(device)
            src_mask, tgt_mask = generate_masks_new(src, tgt, pad_idx_src, pad_idx_tgt)
            if batch_idx == 0:
                # This extra forward pass exists only for logging. Running it under
                # no_grad avoids building (and keeping alive) an autograd graph that
                # is never back-propagated, which would otherwise cost GPU memory.
                with torch.no_grad():
                    _, enc_attention_weights, dec_self_attention_weights, dec_enc_attention_weights = model(src, tgt, src_mask, tgt_mask, return_attention=True)
                attention_weights = {
                    "encoder": enc_attention_weights,
                    "decoder_self": dec_self_attention_weights,
                    "decoder_enc_dec": dec_enc_attention_weights
                }
                log_attention_weights(writer, attention_weights, MODEL_N_LAYERS, MODEL_N_HEADS, global_step)

            # NOTE(review): the same `tgt` is used as decoder input and as the loss
            # target; confirm the input/label shift is handled by the model, the
            # criterion or the data pipeline.
            output = model(src, tgt, src_mask, tgt_mask)
            _, loss = criterion(output, tgt)
            # Normalise so the accumulated gradient is the mean over the group rather
            # than the sum; `loss` itself stays unscaled for logging and printing.
            (loss / accumulation_steps).backward()
            # Also step on the final batch so a trailing, incomplete group is applied
            # instead of being wiped by the zero_grad() at the start of the next epoch.
            is_last_batch = (batch_idx + 1) == num_batches
            if (batch_idx + 1) % accumulation_steps == 0 or is_last_batch:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                # Log loss and learning rate to TensorBoard once per optimizer step
                log_loss(writer, loss, global_step)
                log_learning_rate(writer, scheduler.learning_rate(), global_step)
            global_step += 1

            if (batch_idx + 1) % 1000 == 0:
                print(f"Epoch {epoch + 1}/{num_epochs} | Batch {batch_idx + 1}/{num_batches} | Train Loss: {loss.item():.4f}")

        # Evaluate the model on the validation set after each epoch
        val_loss = evaluate_model(model, val_loader, criterion, device, pad_idx_src, pad_idx_tgt)
        print(f"Epoch: {epoch + 1} | Validation Loss: {val_loss:.4f}")

        # TODO: Implement a model forward function without tgt_mask
        # May use greedy decoding or beam search

        bleu_score = evaluate_metrics(model, val_loader, pad_idx_src, pad_idx_tgt, tgt_vocab, max_len, device)
        print(f"Epoch: {epoch + 1}, BLEU Score: {bleu_score:.4f}")

        # Save the model checkpoint after each epoch
        save_checkpoint(model, optimizer, scheduler, epoch, f"checkpoints{os.sep}checkpoint_epoch_{epoch+1}_val_loss_{val_loss:.4f}.pt")

In [8]:
# Start training on the new preprocessing path; test_dataloader is used as the validation loader.
train(
    model,
    train_dataloader,
    test_dataloader,
    criterion,
    optimizer,
    scheduler,
    num_epochs,
    pad_idx_src,
    pad_idx_tgt,
    tgt_vocab,
    device,
)

Starting epoch:  1


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.59 GiB (GPU 0; 8.00 GiB total capacity; 5.09 GiB already allocated; 0 bytes free; 7.02 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF