In [1]:
# https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset
from typing import Generator
import pandas as pd
import torch
from torch import Tensor, optim
from tqdm import tqdm
from src.transformer import Transformer
from src.processor import SentenceProcessor, ListPairedSentences, make_generator_v2
from src.light import LitTransformer
from transformers import get_linear_schedule_with_warmup
from collections import deque

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
class RollingAverage:
    """Running mean over the most recent ``window_size`` values.

    Values are kept in a bounded ``deque`` and a running sum is maintained so
    the full-window average is O(1). The running sum is updated incrementally,
    so it can accumulate small floating-point drift over very long runs.

    Args:
        window_size: Maximum number of values retained. Must be >= 1, or
            ``None`` for an unbounded window.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """

    def __init__(self, window_size=1000):
        # A zero-length window used to construct fine and then fail with an
        # IndexError on the first add(); reject it up front instead.
        if window_size is not None and window_size < 1:
            raise ValueError(f"window_size must be >= 1 or None, got {window_size}")
        self.window_size = window_size
        self.losses = deque(maxlen=window_size)
        self.sum = 0.0

    def add(self, loss_value):
        """Record a new value, evicting the oldest one when the window is full.

        Args:
            loss_value: A Python number or a single-element tensor (converted
                with ``.item()``).
        """
        if torch.is_tensor(loss_value):
            loss_value = loss_value.item()

        # The deque drops its leftmost element on append once full, so remove
        # that element's contribution from the running sum first.
        if len(self.losses) == self.window_size:
            self.sum -= self.losses[0]

        self.losses.append(loss_value)
        self.sum += loss_value

    def avg(self, last_n=None):
        """Return the mean of the last ``last_n`` values (all values if None).

        Args:
            last_n: How many of the most recent values to average. Values
                larger than the number stored are clamped to what is stored.

        Returns:
            The mean as a float, or ``0.0`` when nothing has been recorded.

        Raises:
            ValueError: If ``last_n`` is not positive while values are stored.
        """
        if not self.losses:
            return 0.0

        if last_n is None:
            return self.sum / len(self.losses)

        # last_n == 0 previously divided by zero (``[-0:]`` selects everything)
        # and negative values produced a sign-flipped average of the wrong slice.
        if last_n <= 0:
            raise ValueError(f"last_n must be a positive integer, got {last_n}")

        n = min(last_n, len(self.losses))
        last_values = list(self.losses)[-n:]
        return sum(last_values) / n

    def __len__(self):
        """Number of values currently stored in the window."""
        return len(self.losses)

In [3]:
def get_first_masked_token(mask: torch.Tensor) -> Tensor:
    """Return, per batch row, the index of the first position whose mask value is 0.

    Args:
        mask: Mask tensor, expected to be shaped (bs, 1, 1, sequence_length),
            where 0 marks a masked position and any non-zero value an
            unmasked one.

    Returns:
        An int32 tensor of shape (bs,). Rows that contain no masked position
        get ``sequence_length`` (one past the last valid index).
    """
    squeezed_mask = mask.squeeze(1).squeeze(1)  # -> (bs, sequence_length)
    is_masked = squeezed_mask == 0
    # argmax over 0/1 flags yields the first 1; it also yields 0 when a row
    # has no masked position at all, which is corrected on the next line.
    first_masked_indices = is_masked.int().argmax(dim=1)
    # Detect "nothing masked" from the predicate itself rather than comparing
    # the row sum with the length: the sum test only holds for exact 0/1
    # masks and breaks for non-binary weights or rounded low-precision sums.
    first_masked_indices[~is_masked.any(dim=1)] = squeezed_mask.size(1)
    return first_masked_indices.to(dtype=torch.int32)

def mask_last_token(current_mask: torch.Tensor) -> torch.Tensor:
    """Zero out the last unmasked position of every row, in place.

    The position is the one immediately before the first masked index
    reported by ``get_first_masked_token``. The input tensor is modified and
    the same tensor object is returned.
    """
    batch_rows = torch.arange(current_mask.size(0))
    # Step back one slot from the first masked index; clamp at 0 so a row
    # whose first masked index is already 0 does not produce index -1.
    last_visible = torch.clamp(get_first_masked_token(current_mask) - 1, min=0)
    current_mask[batch_rows, 0, 0, last_visible] = 0
    return current_mask

In [4]:
def get_page(csv_path: str, page: int, rows_per_page: int):
    """Read one fixed-size page of rows from the CSV as a DataFrame.

    Args:
        csv_path: Path of the CSV file to read.
        page: Zero-based page number.
        rows_per_page: Number of rows in each page.

    Returns:
        A DataFrame with columns ``en`` and ``fr`` holding at most
        ``rows_per_page`` rows.
    """
    # The first line of the file (presumably the header row) is always
    # skipped, followed by every row belonging to the preceding pages.
    lines_to_skip = page * rows_per_page + 1
    return pd.read_csv(
        csv_path,
        header=None,
        names=["en", "fr"],
        skiprows=lines_to_skip,
        nrows=rows_per_page,
    )

def make_generator(csv_path: str, rows_per_page: int) -> Generator[ListPairedSentences, None, None]:
    """Yield consecutive pages of sentence pairs from the CSV.

    The file is streamed once with pandas' chunked reader. Re-opening the
    file and skipping ``page * rows_per_page`` lines for every page made the
    total work quadratic in the number of pages and gave no clean stop at
    end of file.

    Args:
        csv_path: Path of the CSV file; its first line is skipped and the two
            columns are read as ``en`` and ``fr``.
        rows_per_page: Number of rows per yielded page (must be >= 1).

    Yields:
        ``ListPairedSentences(fr_sentences, en_sentences)`` for each page, in
        file order. The generator finishes when the file is exhausted.
    """
    # The context manager closes the file when the generator is exhausted
    # or closed early.
    with pd.read_csv(
        csv_path,
        skiprows=1,  # first line of the file, as in get_page
        header=None,
        names=["en", "fr"],
        chunksize=rows_per_page,
    ) as reader:
        for page in reader:
            fr_sentences = page["fr"].to_list()
            en_sentences = page["en"].to_list()
            yield ListPairedSentences(fr_sentences, en_sentences)

In [5]:
# Experiment configuration: builds the tokenizing processor, the Transformer,
# and the Lightning wrapper used by the training and translation cells below.
# Model weights and the Lightning module are both cast to this dtype.
dtype=torch.bfloat16
csv_path = "archive/en-fr.csv"
# presumably 200 is the maximum sequence length (processor.sequence_length is
# what the model receives as max_sequence_len) — TODO confirm against SentenceProcessor
processor = SentenceProcessor(200, "bert-base-uncased")
d_model = 512
model = Transformer(vocab_size=processor.vocab_size, max_sequence_len=processor.sequence_length, d_model=d_model).to(DEVICE).to(dtype)

# Optimizer hyper-parameters.
# NOTE(review): this `optimizer` (and the `scheduler` built from it below) is a
# separate instance from whatever LitTransformer creates from the optimizer
# class + kwargs it is given; no cell visible here uses them — confirm they
# are still needed.
learning_rate = 3e-4
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop sizes.
num_epochs = 1
batch_size = 48
break_at = None # None means no early-break limit; not referenced by the cells visible here
# Computing get_num_steps(csv_path, batch_size) takes too long, so a cached
# value is used (22520376 is presumably the number of rows in the CSV — TODO confirm).
num_steps = 22520376 // batch_size

# NOTE(review): the Trainer cell uses accumulate_grad_batches=4; if the
# scheduler is stepped once per optimizer step, num_training_steps may be
# about 4x too large — confirm against LitTransformer's scheduler setup.
num_training_steps = num_epochs * num_steps
num_warmup_steps = 10_000

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)
loss_tracker = RollingAverage(window_size=1000)

# The Lightning wrapper receives the optimizer/scheduler factories together
# with their keyword arguments rather than already-built instances.
lit_model = LitTransformer(
    model, 
    processor,
    optimizer = optim.Adam,
    optimizer_kwargs= { 'lr': learning_rate },
    scheduler=get_linear_schedule_with_warmup,
    scheduler_kwargs={
        "num_warmup_steps": num_warmup_steps,
        "num_training_steps": num_training_steps
    }
).to(DEVICE).to(dtype)

In [None]:
import lightning as L

class CSVDataset(torch.utils.data.IterableDataset):
    """Iterable dataset that streams items from a CSV via ``make_generator_v2``.

    Attributes are stored as given; the generator is only created when the
    dataset is iterated, so each iteration starts a fresh generator.
    """

    def __init__(self, csv_path: str, batch_size: int):
        """Remember the CSV location and the batch size handed to the generator."""
        self.csv_path, self.batch_size = csv_path, batch_size

    def __iter__(self):
        """Return a new generator over the CSV for this pass."""
        batches = make_generator_v2(self.csv_path, self.batch_size)
        return batches

# Train for `num_epochs` passes of at most `num_steps` batches each.
# limit_train_batches bounds the epoch explicitly because the dataset is an
# IterableDataset streamed from the CSV.
# accumulate_grad_batches=4 makes the optimizer step once every 4 batches.
trainer = L.Trainer(
    limit_train_batches=num_steps,
    max_epochs=num_epochs,
    accumulate_grad_batches=4,
)
# Reuse `csv_path` from the configuration cell instead of repeating the
# literal path, so the training data and the spot-check cells cannot drift
# apart. The dataset is passed directly, without a DataLoader;
# NOTE(review): presumably make_generator_v2 already yields whole batches —
# confirm.
trainer.fit(
    model=lit_model,
    train_dataloaders=CSVDataset(csv_path, batch_size)
)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4070 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type             | Params | Mode 
------------------------------------------------------
0 | _model   | Transformer      | 91.1 M | train
1 | _loss_fn | CrossEntropyLoss | 0      | train
------------------------------------------------------
91.1 M    Tr

Training: |          | 0/? [00:00<?, ?it/s]

In [None]:
# Walk the CSV one sentence pair at a time (rows_per_page=1) so individual
# translations can be spot-checked in the next cell.
sentences = make_generator(csv_path, 1)

In [None]:
# Pull the next pair and print, in order: the French source list, the English
# reference list, and the model's translation of the first French sentence.
# Re-running this cell advances the generator to the following pair.
sentence = next(sentences)
print(sentence.fr) 
print(sentence.en)
print(lit_model.translate(sentence.fr[0]))

['La découverte du spectre de la lumière blanche Des codes dans la lumière Le spectre électromagnétique Les spectres d’émission Les spectres d’absorption Les années-lumière La pollution lumineuse']
['The white light spectrum Codes in the light The electromagnetic spectrum Emission spectra Absorption spectra Light-years Light pollution']
the white light spectrum codes in the light the electromagnetic spectrum emission spectra absorption spectra light - years light pollution
