In [None]:
import torch
import torch.nn as nn
from torch.utils.data import random_split, Dataset, DataLoader
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from pathlib import Path
from tqdm import tqdm
import random
from torch.utils.data import Subset
import warnings
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
import math
import os

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
config = {
        "batch_size": 8,
        "num_epochs": 20,
        "lr": 10**-4,
        "seq_len": 350,
        "d_model": 512,
        "datasource": 'opus_books',
        "lang_src": "en",
        "lang_tgt": "it",
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": "latest",
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"
}

In [4]:
def get_all_sentences(dataset:load_dataset, language:str):
    """
    Generator function that extracts all sentences in a specified language from a translation dataset.

    Args:
        dataset (load_dataset): A Hugging Face `load_dataset` object representing the dataset containing 
            translations in different languages.
        language (str): The target language for which sentences are to be extracted. This should correspond 
            to a key in the `translation` field of the dataset.

    Yields:
        str: Sentences in the specified language from the dataset.

    Example:
        >>> from datasets import load_dataset
        >>> dataset = load_dataset("opus_books", "en-fr")
        >>> language = "en"
        >>> sentences = get_all_sentences(dataset['train'], language)
        >>> for sentence in list(sentences)[:5]:
        ...     print(sentence)
    """
    
    for item in dataset:
        yield item['translation'][language]

def build_tokenizer(config, dataset:load_dataset, language:str) -> Tokenizer:
    """
    Builds or loads a tokenizer for a specified language using the Hugging Face `Tokenizers` library.

    Args:
        config (dict): A configuration dictionary containing the `tokenizer_file` key, which specifies 
            the file path template for saving/loading the tokenizer. The file path should include a 
            placeholder for the language.
        dataset (load_dataset): A Hugging Face `load_dataset` object representing the dataset containing 
            translations in different languages.
        language (str): The target language for which the tokenizer is being built or loaded. This should 
            correspond to a key in the `translation` field of the dataset.

    Returns:
        Tokenizer: A `Tokenizer` object built for the specified language.

    Example:
        >>> from tokenizers import Tokenizer
        >>> from datasets import load_dataset
        >>> config = {"tokenizer_file": "tokenizer_{language}.json"}
        >>> dataset = load_dataset("opus_books", "en-fr")
        >>> language = "en"
        >>> tokenizer = build_tokenizer(config, dataset['train'], language)
        >>> print(tokenizer.get_vocab_size())
    """
    
    tokenizer_path = Path(config['tokenizer_file'].format(language))
    if not Path.exists(tokenizer_path):
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
        tokenizer.train_from_iterator(get_all_sentences(dataset, language), trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    print(f"Tokenizer language : {language} Build Complete.")
    
    return tokenizer

In [5]:
raw_dataset = load_dataset(f"{config['datasource']}", f"{config['lang_src']}-{config['lang_tgt']}", split='train')

tokenizer_src = build_tokenizer(config, raw_dataset, config['lang_src'])
tokenizer_tgt = build_tokenizer(config, raw_dataset, config['lang_tgt'])

README.md:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/5.73M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32332 [00:00<?, ? examples/s]

Tokenizer language : en Build Complete.
Tokenizer language : it Build Complete.


In [6]:
raw_dataset[10]

{'id': '10',
 'translation': {'en': '"Jane, I don\'t like cavillers or questioners; besides, there is something truly forbidding in a child taking up her elders in that manner.',
  'it': '— Jane, non mi piace di essere interrogata. Sta male, del resto, che una bimba tratti così i suoi superiori.'}}

In [7]:
def causal_mask(size:int)->bool:
    """
    Creates a causal mask for autoregressive models.

    Args:
        size (int): The size of the square matrix for the mask.

    Returns:
        bool: A boolean tensor of shape (1, size, size), where the upper triangular part above the diagonal 
              is masked (False) and the rest is unmasked (True).

    Example:
        >>> import torch
        >>> mask = causal_mask(5)
        >>> print(mask)
    """
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    return mask == 0

class BilingualDataset(Dataset):
    """
    A custom PyTorch dataset class for bilingual translation tasks.

    Args:
        dataset (Dataset): A Hugging Face `Dataset` object containing translation data with `translation` fields.
        tokenizer_src (Tokenizer): Tokenizer for the source language.
        tokenizer_tgt (Tokenizer): Tokenizer for the target language.
        src_lang (str): Source language key in the `translation` field of the dataset.
        tgt_lang (str): Target language key in the `translation` field of the dataset.
        seq_len (int): Fixed sequence length for the input and output tensors.

    Returns:
        dict: A dictionary containing the following keys:
            - "encoder_input" (torch.Tensor): Tokenized and padded source sentence (seq_len).
            - "decoder_input" (torch.Tensor): Tokenized and padded target sentence with `<SOS>` (seq_len).
            - "encoder_mask" (torch.Tensor): Boolean mask for the encoder input (1, 1, seq_len).
            - "decoder_mask" (torch.Tensor): Boolean mask for the decoder input (1, seq_len, seq_len).
            - "label" (torch.Tensor): Tokenized and padded target sentence with `<EOS>` (seq_len).
            - "src_text" (str): Original source sentence.
            - "tgt_text" (str): Original target sentence.

    Example:
        >>> from tokenizers import Tokenizer
        >>> from datasets import load_dataset
        >>> dataset = load_dataset("opus_books", "en-fr")
        >>> src_tokenizer = Tokenizer.from_file("tokenizer_en.json")
        >>> tgt_tokenizer = Tokenizer.from_file("tokenizer_fr.json")
        >>> bilingual_ds = BilingualDataset(
        ...     dataset=dataset['train'],
        ...     tokenizer_src=src_tokenizer,
        ...     tokenizer_tgt=tgt_tokenizer,
        ...     src_lang="en",
        ...     tgt_lang="fr",
        ...     seq_len=32
        ... )
        >>> sample = bilingual_ds[0]
        >>> print(sample["encoder_input"])
        >>> print(sample["decoder_input"])
        >>> print(sample["label"])
    """
    def __init__(self, dataset:Dataset, tokenizer_src:Tokenizer, tokenizer_tgt:Tokenizer, src_lang:str, tgt_lang:str, seq_len:int):
        super().__init__()
        self.seq_len = seq_len

        self.ds = dataset
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self)->int:
        return len(self.ds)

    def __getitem__(self, idx)->dict[str:torch.Tensor]:
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Transform the text into tokens
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # Add sos, eos and padding to each sentence
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2  # We will add <s> and </s>
        # We will only add <s>, and </s> only on the label
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        # Make sure the number of padding tokens is not negative. If it is, the sentence is too long
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        # Add <s> and </s> token
        encoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(enc_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * enc_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )

        # Add only <s> token
        decoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )

        # Add only </s> token
        label = torch.cat(
            [
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )

        # Double check the size of the tensors to make sure they are all seq_len long
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), # (1, 1, seq_len)
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)), # (1, seq_len) & (1, seq_len, seq_len),
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

In [8]:
train_size = int(0.9 * len(raw_dataset))
val_size = len(raw_dataset) - train_size
train_ds_raw, val_ds_raw = random_split(raw_dataset, [train_size, val_size])

train_dataset = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
val_dataset = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

In [9]:
len(train_dataset)

29098

In [None]:
def get_random_subset(dataset:Dataset, fraction:float):
    """
    Get a random subset of a dataset.
    
    Args:
        dataset (Dataset): The original dataset.
        fraction (float): Fraction of the dataset to keep (0 < fraction <= 1).
    
    Returns:
        Subset: A smaller dataset containing the random subset.
    """
    assert 0 < fraction <= 1, "Fraction must be in the range (0, 1]."
    
    total_size = len(dataset)
    subset_size = int(total_size * fraction)
    random_indices = random.sample(range(total_size), subset_size)
    return Subset(dataset, random_indices)

# Example usage
train_fraction = 0.65  # Use 65% of the training dataset
val_fraction = 0.65    # Use 65% of the validation dataset

train_dataset = get_random_subset(train_dataset, train_fraction)
val_dataset = get_random_subset(val_dataset, val_fraction)

In [11]:
len(train_dataset), len(val_dataset)

(18913, 2102)

In [12]:
train_dataset[10]

{'encoder_input': tensor([   2,  540,  123,   40, 1646,   76,    3,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           

In [13]:
max_len_src = 0
max_len_tgt = 0

for item in raw_dataset:
    src_ids = tokenizer_src.encode(item['translation'][config['lang_src']]).ids
    tgt_ids = tokenizer_tgt.encode(item['translation'][config['lang_tgt']]).ids
    max_len_src = max(max_len_src, len(src_ids))
    max_len_tgt = max(max_len_tgt, len(tgt_ids))

print(f'Max length of source sentence: {max_len_src}')
print(f'Max length of target sentence: {max_len_tgt}')

Max length of source sentence: 309
Max length of target sentence: 274


In [14]:
train_dataloader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=True)

In [15]:
for batch in train_dataloader:
    print(f"Encoder Input Shape->: {batch['encoder_input'].shape}")
    print(f"Decoder Input Shape->: {batch['decoder_input'].shape}")
    print(f"Label Shape->: {batch['label'].shape}")
    print(f"Encoder Mask Shape->: {batch['encoder_mask'].shape}")
    print(f"Decoder Mask Shape->: {batch['decoder_mask'].shape}")
    print(f"Source Text(en)->: {batch['src_text']}")
    print(f"Target Text(it)->: {batch['tgt_text']}")
    break

Encoder Input Shape->: torch.Size([8, 350])
Decoder Input Shape->: torch.Size([8, 350])
Label Shape->: torch.Size([8, 350])
Encoder Mask Shape->: torch.Size([8, 1, 1, 350])
Decoder Mask Shape->: torch.Size([8, 1, 350, 350])
Source Text(en)->: ['A prince ought to have no other aim or thought, nor select anything else for his study, than war and its rules and discipline; for this is the sole art that belongs to him who rules, and it is of such force that it not only upholds those who are born princes, but it often enables men to rise from a private station to that rank. And, on the contrary, it is seen that when princes have thought more of ease than of arms they have lost their states.', 'He was on a thoroughbred dark bay, which was obviously heated by galloping, and he was using the reins to hold it in.', "Of the fanatic's burning eternity I have no fear: there is not a future state worse than this present one--let me break away, and go home to God!'", 'And as this point is worthy of n

In [None]:
class LayerNormalization(nn.Module):

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features)) # alpha is a learnable parameter
        self.bias = nn.Parameter(torch.zeros(features)) # bias is a learnable parameter

    def forward(self, x):
        # x: (batch, seq_len, hidden_size)
         # Keep the dimension for broadcasting
        mean = x.mean(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # Keep the dimension for broadcasting
        std = x.std(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # eps is to prevent dividing by zero or when std is very small
        return self.alpha * (x - mean) / (std + self.eps) + self.bias

class FeedForwardBlock(nn.Module):

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # w1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # w2 and b2

    def forward(self, x):
        # (batch, seq_len, d_model) --> (batch, seq_len, d_ff) --> (batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

class InputEmbeddings(nn.Module):

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # (batch, seq_len) --> (batch, seq_len, d_model)
        # Multiply by sqrt(d_model) to scale the embeddings according to the paper
        return self.embedding(x) * math.sqrt(self.d_model)
    
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Create a matrix of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)
        # Create a vector of shape (seq_len)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        # Create a vector of shape (d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # (d_model / 2)
        # Apply sine to even indices
        pe[:, 0::2] = torch.sin(position * div_term) # sin(position * (10000 ** (2i / d_model))
        # Apply cosine to odd indices
        pe[:, 1::2] = torch.cos(position * div_term) # cos(position * (10000 ** (2i / d_model))
        # Add a batch dimension to the positional encoding
        pe = pe.unsqueeze(0) # (1, seq_len, d_model)
        # Register the positional encoding as a buffer
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False) # (batch, seq_len, d_model)
        return self.dropout(x)

class ResidualConnection(nn.Module):
    
        def __init__(self, features: int, dropout: float) -> None:
            super().__init__()
            self.dropout = nn.Dropout(dropout)
            self.norm = LayerNormalization(features)
    
        def forward(self, x, sublayer):
            return x + self.dropout(sublayer(self.norm(x)))

class MultiHeadAttentionBlock(nn.Module):

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model # Embedding vector size
        self.h = h # Number of heads
        # Make sure d_model is divisible by h
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h # Dimension of vector seen by each head
        self.w_q = nn.Linear(d_model, d_model, bias=False) # Wq
        self.w_k = nn.Linear(d_model, d_model, bias=False) # Wk
        self.w_v = nn.Linear(d_model, d_model, bias=False) # Wv
        self.w_o = nn.Linear(d_model, d_model, bias=False) # Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]
        # Just apply the formula from the paper
        # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Write a very low value (indicating -inf) to the positions where mask == 0
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim=-1) # (batch, h, seq_len, seq_len) # Apply softmax
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # (batch, h, seq_len, seq_len) --> (batch, h, seq_len, d_k)
        # return attention scores which can be used for visualization
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.w_q(q) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        # Calculate attention
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)
        
        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # Multiply by Wo
        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)  
        return self.w_o(x)

class EncoderBlock(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x
    
class Encoder(nn.Module):

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

class DecoderBlock(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x
    
class Decoder(nn.Module):

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)

class ProjectionLayer(nn.Module):

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> None:
        # (batch, seq_len, d_model) --> (batch, seq_len, vocab_size)
        return self.proj(x)
    
class Transformer(nn.Module):

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        # (batch, seq_len, d_model)
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)
    
    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        # (batch, seq_len, d_model)
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)
    
    def project(self, x):
        # (batch, seq_len, vocab_size)
        return self.projection_layer(x)
    
def build_transformer(src_vocab_size: int, 
                      tgt_vocab_size: int, 
                      src_seq_len: int, 
                      tgt_seq_len: int, 
                      d_model: int=512,
                      N: int=6,
                      h: int=8,
                      dropout: float=0.1,
                      d_ff: int=2048) -> Transformer:
    
    # Create the embedding layers
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # Create the positional encoding layers
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)
    
    # Create the encoder blocks
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(d_model, encoder_self_attention_block, feed_forward_block, dropout)
        encoder_blocks.append(encoder_block)

    # Create the decoder blocks
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_block = DecoderBlock(d_model, decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
        decoder_blocks.append(decoder_block)
    
    # Create the encoder and decoder
    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))
    
    # Create the projection layer
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)
    
    # Create the transformer
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)
    
    # Initialize the parameters
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    
    return transformer

In [17]:
vocab_src_len = tokenizer_src.get_vocab_size()
vocab_tgt_len = tokenizer_tgt.get_vocab_size()

model = build_transformer(vocab_src_len, vocab_tgt_len, config["seq_len"], config['seq_len'], d_model=config['d_model']).to(device)
model

Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (self_attention_block): MultiHeadAttentionBlock(
          (w_q): Linear(in_features=512, out_features=512, bias=False)
          (w_k): Linear(in_features=512, out_features=512, bias=False)
          (w_v): Linear(in_features=512, out_features=512, bias=False)
          (w_o): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_block): FeedForwardBlock(
          (linear_1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (residual_connections): ModuleList(
          (0-1): 2 x ResidualConnection(
            (dropout): Dropout(p=0.1, inplace=False)
            (norm): LayerNormalization()
          )
        )
      )
    )
    (norm): LayerNormalization

In [29]:
def greedy_decode(model: build_transformer, 
                  source: torch.Tensor, 
                  source_mask: torch.Tensor,
                  tokenizer_tgt: Tokenizer, 
                  max_len: int,
                  device: torch.device)->torch.Tensor:
    """
    Decodes a sequence from the source using a greedy decoding approach with a transformer model.

    Args:
        model ("build_transformer"): The transformer model to be used for encoding and decoding.
        source (torch.Tensor): The input tensor representing the source sequence. Shape: `(batch_size, seq_len)`.
        source_mask (torch.Tensor): A mask for the source input sequence. Shape: `(batch_size, 1, seq_len)`.
        tokenizer_tgt ("Tokenizer"): The tokenizer for the target language. Must provide `token_to_id` for special tokens.
        max_len (int): The maximum length for the decoded sequence.
        device (torch.device): The device (CPU or GPU) to perform the decoding on.

    Returns:
        torch.Tensor: A tensor representing the decoded sequence, excluding padding. Shape: `(seq_len,)`.

    Example:
        >>> from tokenizers import Tokenizer
        >>> import torch
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> src_tensor = torch.tensor([[1, 2, 3, 4]], dtype=torch.int64)
        >>> src_mask = torch.tensor([[[1, 1, 1, 1]]], dtype=torch.int64)
        >>> tgt_tokenizer = Tokenizer.from_file("tokenizer_tgt.json")
        >>> transformer_model = build_transformer()  # Example transformer model
        >>> decoded_seq = greedy_decode(transformer_model, src_tensor, src_mask, tgt_tokenizer, max_len=20, device=device)
        >>> print(decoded_seq)
    """
    
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # Precompute the encoder output and reuse it for every step
    encoder_output = model.encode(source, source_mask)
    # Initialize the decoder input with the sos token
    decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)
    while True:
        if decoder_input.size(1) == max_len:
            break

        # build mask for target
        decoder_mask = causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)

        # calculate output
        out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

        # get next token
        prob = model.project(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        decoder_input = torch.cat(
            [decoder_input, torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)], dim=1
        )

        if next_word == eos_idx:
            break

    return decoder_input.squeeze(0)


def eval_model(model: build_tokenizer,
               greedy_decode: callable,
               val_dataloader: torch.utils.data.DataLoader, 
               tokenizer_src: Tokenizer, 
               tokenizer_tgt: Tokenizer,
               max_len: int, 
               device: torch.device, 
               num_examples:int=2) -> None:
    """
    Evaluates a transformer model by generating predictions on a validation dataset using greedy decoding.

    Args:
        model ("build_transformer"): The transformer model to be evaluated.
        greedy_decode (callable): The function for performing greedy decoding on the model.
        val_dataloader (torch.utils.data.DataLoader): The DataLoader for the validation dataset.
        tokenizer_src ("Tokenizer"): Tokenizer for the source language.
        tokenizer_tgt ("Tokenizer"): Tokenizer for the target language.
        max_len (int): Maximum length for the decoded sequence.
        device (torch.device): The device (CPU or GPU) to run the evaluation on.
        num_examples (int, optional): Number of examples to print during evaluation. Defaults to 2.

    Returns:
        None: Prints the source text, expected target text, and predicted output to the console.

    Example:
        >>> from tokenizers import Tokenizer
        >>> from torch.utils.data import DataLoader
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> val_loader = DataLoader(val_dataset, batch_size=1)
        >>> src_tokenizer = Tokenizer.from_file("tokenizer_src.json")
        >>> tgt_tokenizer = Tokenizer.from_file("tokenizer_tgt.json")
        >>> eval_model(transformer_model, greedy_decode, val_loader, src_tokenizer, tgt_tokenizer, max_len=50, device=device)
    """
    
    model.eval()
    count = 0

    source_texts = []
    expected = []
    predicted = []

    try:
        # get the console window width
        with os.popen('stty size', 'r') as console:
            _, console_width = console.read().split()
            console_width = int(console_width)
    except:
        # If we can't get the console width, use 80 as default
        console_width = 80

    with torch.no_grad():
        for batch in val_dataloader:
            count += 1
            encoder_input = batch["encoder_input"].to(device) # (b, seq_len)
            encoder_mask = batch["encoder_mask"].to(device) # (b, 1, 1, seq_len)

            # check that the batch size is 1
            assert encoder_input.size(
                0) == 1, "Batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_tgt, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)
            
            # Print the source, target and model output
            print('-'*console_width)
            print(f"{f'SOURCE: ':>12}{source_text}")
            print(f"{f'TARGET: ':>12}{target_text}")
            print(f"{f'PREDICTED: ':>12}{model_out_text}")

            if count == num_examples:
                print('-'*console_width)
                break

def translate_english_to_italian(model:build_transformer,
                                 tokenizer_src: Tokenizer,
                                 tokenizer_tgt: Tokenizer,
                                 english_sentence: str,
                                 max_len: int,
                                 device: torch.device) -> str:
    """
    Translates an English sentence into Italian using a trained transformer model.

    Args:
        model ("build_transformer"): The trained transformer model.
        tokenizer_src ("Tokenizer"): Tokenizer for the source language (English).
        tokenizer_tgt ("Tokenizer"): Tokenizer for the target language (Italian).
        english_sentence (str): The English sentence to translate.
        max_len (int): Maximum length for the translated sentence.
        device (torch.device): Device (CPU or GPU) to use for the translation.

    Returns:
        str: The translated sentence in Italian.
    """
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        # Tokenize the English sentence
        src_tokens = tokenizer_src.encode(english_sentence).ids
        src_tensor = torch.tensor([src_tokens], dtype=torch.int64).to(device)
        
        # Create a source mask
        src_mask = (src_tensor != tokenizer_src.token_to_id("[PAD]")).unsqueeze(1).unsqueeze(1).to(device)

        # Perform greedy decoding to get the Italian translation
        translated_tokens = greedy_decode(
            model=model,
            source=src_tensor,
            source_mask=src_mask,
            tokenizer_tgt=tokenizer_tgt,
            max_len=max_len,
            device=device,
        )

        # Decode the translated tokens to get the Italian sentence
        italian_sentence = tokenizer_tgt.decode(translated_tokens.detach().cpu().numpy())
    
    return italian_sentence

In [35]:
def train_model(model: build_tokenizer,
                greedy_decode: callable,
                tokenizer_src:Tokenizer,
                tokenizer_tgt:Tokenizer,
                val_dataloader: torch.utils.data.DataLoader,
                train_dataloader: torch.utils.data.DataLoader,
                optimizer: torch.optim,
                loss_fn: nn.CrossEntropyLoss,
                english_sentence:str,
                config: dict, 
                device: torch.device) -> None:
    """
    Trains a transformer model for a sequence-to-sequence task using the provided dataset and configuration.

    Args:
        model ("build_transformer"): The transformer model to train.
        greedy_decode (callable): Function for greedy decoding during validation.
        tokenizer_src ("Tokenizer"): Tokenizer for the source language.
        tokenizer_tgt ("Tokenizer"): Tokenizer for the target language.
        val_dataloader (torch.utils.data.DataLoader): DataLoader for the validation dataset.
        train_dataloader (torch.utils.data.DataLoader): DataLoader for the training dataset.
        optimizer (torch.optim.Optimizer): Optimizer for updating model parameters.
        loss_fn (nn.CrossEntropyLoss): Loss function for computing the training loss.
        config (dict): Configuration dictionary containing hyperparameters such as `num_epochs` and `seq_len`.
        device (torch.device): Device (CPU or GPU) to use for training.

    Returns:
        None: The function trains the model in-place and prints progress and evaluation results after each epoch.

    Example:
        >>> from tokenizers import Tokenizer
        >>> from torch.utils.data import DataLoader
        >>> import torch.nn as nn
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> config = {"num_epochs": 10, "seq_len": 50}
        >>> train_model(
        ...     model=transformer_model,
        ...     greedy_decode=greedy_decode,
        ...     tokenizer_src=src_tokenizer,
        ...     tokenizer_tgt=tgt_tokenizer,
        ...     val_dataloader=val_loader,
        ...     train_dataloader=train_loader,
        ...     optimizer=torch.optim.Adam(transformer_model.parameters(), lr=0.001),
        ...     loss_fn=nn.CrossEntropyLoss(),
        ...     config=config,
        ...     device=device,
        ... )
    """
    
    # Define the device
    device = "cuda" if torch.cuda.is_available() else "mps" if torch.has_mps or torch.backends.mps.is_available() else "cpu"
    print("Using device:", device)
    if (device == 'cuda'):
        print(f"Device name: {torch.cuda.get_device_name(device.index)}")
        print(f"Device memory: {torch.cuda.get_device_properties(device.index).total_memory / 1024 ** 3} GB")
    elif (device == 'mps'):
        print(f"Device name: <mps>")
    else:
        print("NOTE: If you have a GPU, consider using it for training.")
        print("On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
        print("On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")

    initial_epoch = 0

    for epoch in range(initial_epoch, config['num_epochs']):
        torch.cuda.empty_cache()
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:

            encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # Compare the output with the label
            label = batch['label'].to(device) # (B, seq_len)

            # Compute the loss using a simple cross entropy
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})
            
            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

        # Run validation at the end of every epoch
        eval_model(model, greedy_decode ,val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device)
        
        predicted_italian_sentence = translate_english_to_italian(
                model=model,
                tokenizer_src=tokenizer_src,
                tokenizer_tgt=tokenizer_tgt,
                english_sentence=english_sentence,
                max_len=50,
                device=device,
            )
        
        print()
        print("-"*150)
        print(f"English Sentence: {english_sentence}")
        print(f"Actual Italian Sentence: {'come stai amico mio?'}")
        print(f"Predicted Italian Sentence: {predicted_italian_sentence}")
        print()

if __name__ == '__main__':
    
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id('[PAD]'), label_smoothing=0.1).to(device)
    english_sentence = "how are you my friend?"
    warnings.filterwarnings("ignore")
    train_model(
            model=model,
            greedy_decode=greedy_decode,
            tokenizer_src=tokenizer_src,
            tokenizer_tgt=tokenizer_tgt,
            val_dataloader=val_dataloader,
            train_dataloader=train_dataloader,
            optimizer=optimizer,
            loss_fn=loss_fn,
            english_sentence=english_sentence,
            config=config,
            device=device,
           )

Using device: cuda
Device name: Tesla P100-PCIE-16GB
Device memory: 15.887939453125 GB


Processing Epoch 00: 100%|██████████| 2365/2365 [09:47<00:00,  4.03it/s, loss=6.550]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: CHAPTER XII
    TARGET: XII
 PREDICTED: 
--------------------------------------------------------------------------------
    SOURCE: The doctor answered him, and then began talking about the scenes in the city Duma.
    TARGET: Il dottore aveva risposto, ma poi s’era messo a parlare sui disordini del consiglio di stato.
 PREDICTED: Il suo momento , e si , e il suo suo suo suo suo suo suo suo suoi .
--------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------------------------------------------------------
English Sentence: how are you my friend?
Actual Italian Sentence: come stai amico mio?
Predicted Italian Sentence: Ma non è ?



Processing Epoch 01: 100%|██████████| 2365/2365 [09:46<00:00,  4.04it/s, loss=5.231]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: 'They say that one who has been best man more than ten times never marries, and I wanted to be one for the tenth time to make myself safe, but was too late,' Count Sinyavin was saying to the pretty young Princess Charskaya, who had designs on him.
    TARGET: — Dicono che non si sposa chi fa da compare d’anello più di dieci volte, e io volevo farlo per la decima volta per acquietarmi, ma il posto era occupato — diceva il conte Sinjavin alla graziosa principessa carskaja, che aveva delle mire su di lui.
 PREDICTED: — E che non è mai mai mai mai mai mai mai mai mai mai mai mai mai mai mai , ma non è mai mai mai mai mai mai mai mai , ma non è mai , e che non si , e che non si , e che non si , e che non si , e che non si .
--------------------------------------------------------------------------------
    SOURCE: I think he was swearing, but am not certain; however, he was pronouncing some formula

Processing Epoch 02: 100%|██████████| 2365/2365 [09:45<00:00,  4.04it/s, loss=4.038]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: There was a pause, and the mother exchanged glances with her daughter.
    TARGET: Seguì un silenzio. La madre e la figlia si guardarono ancora una volta.
 PREDICTED: Era un ’ altra cosa , e la moglie , si mise a lei .
--------------------------------------------------------------------------------
    SOURCE: He saw that she was saying what she forced herself to utter and not what she wished to say.
    TARGET: Egli sentiva che Anna diceva quello che s’era imposta di dire, non quello che avrebbe voluto dire.
 PREDICTED: Egli era un ’ amore , ma non si poteva nulla , e non poteva nulla .
--------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------------------------------------------------------
English Sentence: how are you my friend?
Actual Italian Sentence: come stai ami

Processing Epoch 03: 100%|██████████| 2365/2365 [09:46<00:00,  4.03it/s, loss=6.192]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: I have a vague recollection of having been woke up at least a dozen times during the night by Harris wandering about the boat with the lantern, looking for his clothes.
    TARGET: Ho un vago ricordo d’essermi svegliato almeno una dozzina di volte durante la notte, per colpa di Harris che andava in giro nella barca con la lanterna, cercando i suoi panni.
 PREDICTED: Io mi piace un po ’ di nuovo , che non si , il fiume , con la chiusa di , e il sole , il sole .
--------------------------------------------------------------------------------
    SOURCE: And George said: "Not at all;" that it was his fault; and Harris said no, it was his.
    TARGET: E Giorgio disse: — No, niente, non è colpa tua; — e Harris osservò che infatti, era sua.
 PREDICTED: E Giorgio disse : “ Non c ’ era nulla , ma non si mise a parlare , e non si mise a parlare a lei .
---------------------------------------------------

Processing Epoch 04: 100%|██████████| 2365/2365 [09:46<00:00,  4.03it/s, loss=4.588]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: 'No, not now, later!' he said.
    TARGET: — No, non ora, dopo! — egli disse.
 PREDICTED: — No , non ne parliamo più .
--------------------------------------------------------------------------------
    SOURCE: It was very bitter, and he felt ashamed; yet mixed with the bitterness and the shame he felt a sense of joy and emotion at the greatness of his own humility.
    TARGET: Quanta amarezza, quanta vergogna! ma insieme con quest’amarezza e con questa vergogna erano in lui la gioia e la commozione che gli venivano dall’altezza della propria umiltà.
 PREDICTED: Era così , e , senza rispondere , ma , senza , e , con la sua vita , e la sua vita , la sua felicità , la propria felicità .
--------------------------------------------------------------------------------

----------------------------------------------------------------------------------------------------------------------------------

Processing Epoch 05: 100%|██████████| 2365/2365 [09:46<00:00,  4.04it/s, loss=2.985]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: 'I think I'll take him,' replied Vronsky.
    TARGET: — Credo che lo comprerò — rispose Vronskij.
 PREDICTED: — Io penso — disse Vronskij .
--------------------------------------------------------------------------------
    SOURCE: Ignorant midwives murder the babies, and the people remain steeped in ignorance, at the mercy of every village clerk; while you have in your power the means of helping them, and yet are not helping because you do not consider it important!'
    TARGET: Queste mammane fanno morir di fame i bambini e il popolo marcisce nell’ignoranza e rimane in potere di un qualsiasi scribacchino, mentre tu hai in mano i mezzi per riparare a questo, e non te ne dài pensiero perché, secondo te, la cosa non è importante.
 PREDICTED: e i nostri , e i nostri tempi , in campagna , in campagna , in campagna , non si , e non si , e non si .
--------------------------------------------------

Processing Epoch 06: 100%|██████████| 2365/2365 [09:46<00:00,  4.03it/s, loss=4.089]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Karenin halted and turned pale.
    TARGET: Aleksej Aleksandrovic si fermò e impallidì.
 PREDICTED: Aleksej Aleksandrovic si alzò e si alzò le spalle .
--------------------------------------------------------------------------------
    SOURCE: It's true Alesha did not stand very well: he kept turning round to see the back of his jacket; but nevertheless he was wonderfully sweet.
    TARGET: È vero che Alëša non stava proprio del tutto composto: non faceva che voltarsi per rimirarsi il dietro del giubbetto; tuttavia era straordinariamente aggraziato.
 PREDICTED: È vero che non c ’ era un , ma , dopo aver guardato il cavallo , ma era un cavallo , e lei era un ’ altra cosa .
--------------------------------------------------------------------------------

-----------------------------------------------------------------------------------------------------------------------------------------------

Processing Epoch 07: 100%|██████████| 2365/2365 [09:46<00:00,  4.03it/s, loss=4.391]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Accordingly, we went on board, took the arms which were left on board out of her, and whatever else we found there—which was a bottle of brandy, and another of rum, a few biscuit-cakes, a horn of powder, and a great lump of sugar in a piece of canvas (the sugar was five or six pounds): all which was very welcome to me, especially the brandy and sugar, of which I had had none left for many years.
    TARGET: Detto fatto! Venuti alla scialuppa ne levammo l’armi che v’erano state lasciate entro, e quant’altre minuzie vi ritrovammo; cioè un fiaschetto d’acquavite, uno di rum, una piccola provvigione di biscotto, un fiaschetto di polvere, un gran pane di zucchero del peso di cinque libbre, avvolto in un pezzo di canovaccio, tutte cose capitate in buon punto per me, massime l’acquavite e lo zucchero, di cui non vedeva da molti anni il vestigio.
 PREDICTED: a bordo il vascello , e ci diede la terra , 

Processing Epoch 08: 100%|██████████| 2365/2365 [09:46<00:00,  4.03it/s, loss=3.850]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Oblonsky, half asleep, refused to budge so early.
    TARGET: Oblonskij, nel sonno, si rifiutava di andare via così presto.
 PREDICTED: Stepan Arkad ’ ic , , , a lungo il tempo .
--------------------------------------------------------------------------------
    SOURCE: Knowing that he is a kind and generous man – that I am not worth his little finger – nevertheless I hate him!
    TARGET: Lo credi che, pur sapendo che egli è un uomo buono, eccellente, che io non valgo una sua unghia, tuttavia, io lo odio?
 PREDICTED: che è un uomo e non ho paura di un uomo che io non ho paura .
--------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------------------------------------------------------
English Sentence: how are you my friend?
Actual Italian Sentence: come stai amico mio?


Processing Epoch 09: 100%|██████████| 2365/2365 [09:46<00:00,  4.03it/s, loss=4.340]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.
    TARGET: Aguzzò gli occhi, e cercò di fissare il fondo, per scoprire qualche cosa; ma in fondo era buio pesto e non si scopriva nulla. Guardò le pareti del pozzo e s'accorse che erano rivestite di scaffali di biblioteche; e sparse qua e là di mappe e quadri, sospesi a chiodi.
 PREDICTED: In questo momento si , e si sentì che per lo stesso momento si ; ma ora era già in modo che , per lui si , si con gli occhi di un ’ altra parte , si , e si con gli occhi e si , si e si delle sue .
--------------------------------------------------------------------------------
    SOURCE: "Come, Jane--come hither."
    TARGET: — Venite, Jane, 

Processing Epoch 10: 100%|██████████| 2365/2365 [09:46<00:00,  4.03it/s, loss=2.924]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: 'Oh dear, oh dear!' he muttered despairingly, as he recalled the most painful details of the quarrel.
    TARGET: “Ahi, ahi, ahi!” ripeteva con disperazione, ricordando le impressioni più penose per lui di quella rottura.
 PREDICTED: — Ah , ahi , ahi ! — gridò lei , con chiarezza che le idee avevano espresso in mente .
--------------------------------------------------------------------------------
    SOURCE: Linen I had none left but what was mere rags; I had goat’s hair, but neither knew how to weave it or spin it; and had I known how, here were no tools to work it with.
    TARGET: Biancheria io non ne avea che non fosse ridotta a veri cenci: aveva del pelo di capra; nè certo sapeva come si facesse nè a filarlo nè a tesserlo.
 PREDICTED: Non avevo mai potuto , ma che cosa era , perchè il collo non sapeva come se fosse , nè sapeva io avessi potuto ; e quanto io avessi potuto con tanta fatica

Processing Epoch 11: 100%|██████████| 2365/2365 [09:46<00:00,  4.03it/s, loss=3.010]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: The baby had taken the breast.
    TARGET: Il bambino s’era attaccato al petto.
 PREDICTED: La bambina aveva le sue mani .
--------------------------------------------------------------------------------
    SOURCE: 'Do you know that our brother Nicholas is here again?'
    TARGET: — Sai, Nikolaj è di nuovo qui.
 PREDICTED: — Sapete che il fratello è venuto a cominciare a cominciare a parlare ?
--------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------------------------------------------------------
English Sentence: how are you my friend?
Actual Italian Sentence: come stai amico mio?
Predicted Italian Sentence: Come siete mio amico ?



Processing Epoch 12: 100%|██████████| 2365/2365 [09:46<00:00,  4.03it/s, loss=2.474]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: As if just awakened from a dream, it was long before he could collect his thoughts.
    TARGET: Come se si fosse svegliato da un sonno, Levin stentò a tornare in sé.
 PREDICTED: Come se l ’ espressione di Pilato , il sogno era stato molto strano , era rimasto in tempo .
--------------------------------------------------------------------------------
    SOURCE: "If I were you, I should cut down those limes, but it must be done when the sap rises.
    TARGET: A criterio mio, codesto bosco di tigli, lo taglierei. Basta farlo quando è in succhio.
 PREDICTED: — Se vi , vi a questi alberi , ma credo che si a pregare , .
--------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------------------------------------------------------
English Sentence: how are you my friend?
Actual Ita

Processing Epoch 13: 100%|██████████| 2365/2365 [09:46<00:00,  4.03it/s, loss=2.722]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: "Voila, Monsieur Rochester, qui revient!"
    TARGET: — Ecco il signor Rochester!
 PREDICTED: — , signorina , buona febbre .
--------------------------------------------------------------------------------
    SOURCE: I then returned: "You are not without sense, cousin Eliza; but what you have, I suppose, in another year will be walled up alive in a French convent.
    TARGET: — Voi pure non ne mancate, Elisa, ma quando penso che fra un anno il vostro buon senso vi avrà rinchiuso fra le mura di un convento francese.... del resto queste cose non mi riguardano, e se vi conviene, basta.
 PREDICTED: " Allora mi avete detto , non siete accorto e neppure senza che avete avuto un vecchio , ma che vi ho visto come un giovane come una .
--------------------------------------------------------------------------------

---------------------------------------------------------------------------------------

Processing Epoch 14: 100%|██████████| 2365/2365 [09:46<00:00,  4.03it/s, loss=3.399]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: All that night and morning Levin had lived quite unconsciously, and felt quite outside the conditions of material existence.
    TARGET: Tutta quella notte e la mattina seguente Levin aveva vissuto inconsciamente e si era sentito del tutto fuori della vita materiale.
 PREDICTED: Tutta la notte e Levin aveva passato il pensiero di Levin , sempre particolarmente tranquillo e chiara della gente .
--------------------------------------------------------------------------------
    SOURCE: "Mrs. Fairfax will smile you a calm welcome, to be sure," said I; "and little Adele will clap her hands and jump to see you: but you know very well you are thinking of another than they, and that he is not thinking of you."
    TARGET: — La signora Fairfax, — dicevo a me stessa, — ti darà, sorridendo dolcemente, il benvenuto, Adele batterà le mani e ti salterà incontro, ma tu sai che pensi a un'altra persona e che

Processing Epoch 15: 100%|██████████| 2365/2365 [09:46<00:00,  4.03it/s, loss=3.794]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Nicholas had heard, but the words had no effect on him; his look remained reproachful and strained.
    TARGET: Nikolaj aveva sentito; ma queste parole non produssero nessuna impressione su di lui. Il suo sguardo era sempre teso e accusatore.
 PREDICTED: Nikolaj era sorpreso , ma non aveva avuto il pensiero che gli era uscito , e si era messo a gridare a un tratto a un tratto .
--------------------------------------------------------------------------------
    SOURCE: At that moment Agatha Mikhaylovna came in with some jam.
    TARGET: In quel momento entrò Agaf’ja Michajlovna con la marmellata.
 PREDICTED: In quel momento , in quel momento , con un certo prezzo .
--------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------------------------------------------------------


Processing Epoch 16: 100%|██████████| 2365/2365 [09:46<00:00,  4.03it/s, loss=2.796]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: "You did right to hold fast to each other," I said: as if the monster- splinters were living things, and could hear me.
    TARGET: — Fate bene di tenervi unite ancora, — dissi alle due parti dell'albero, come se potessero ascoltarmi.
 PREDICTED: — Mi rispose , — vi , — risposi , — anche se vi le mie forze .
--------------------------------------------------------------------------------
    SOURCE: But he was suddenly struck by the calm and cheerful expression of Katavasov's face, and felt so sorry to lose the spiritual condition which he was evidently spoiling by his conversation, that recollecting his resolution he ceased speaking.
    TARGET: Ma l’espressione calma e allegra del viso di Katavasov lo colpì, a un tratto, e gli venne pietà dello stato d’animo proprio, che, evidentemente, egli turbava con quella conversazione; si ricordò del suo proposito e si fermò.
 PREDICTED: Ma Levin era in

Processing Epoch 17: 100%|██████████| 2365/2365 [09:45<00:00,  4.04it/s, loss=2.425]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: He also said he was ready for bed.
    TARGET: Aggiunse che era pronto per andare a letto.
 PREDICTED: E tutto andava dicendo che il letto era pronto .
--------------------------------------------------------------------------------
    SOURCE: You follow me, sur."
    TARGET: Seguitemi, signore.
 PREDICTED: Mi .
--------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------------------------------------------------------
English Sentence: how are you my friend?
Actual Italian Sentence: come stai amico mio?
Predicted Italian Sentence: Come state ?



Processing Epoch 18: 100%|██████████| 2365/2365 [09:45<00:00,  4.04it/s, loss=3.053]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: "What foreign country was he going to, Bessie?"
    TARGET: — E in qual paese andava, Bessie?
 PREDICTED: — Che c ' è di partire , Bessie ?
--------------------------------------------------------------------------------
    SOURCE: VRONSKY DID NOT EVEN TRY TO SLEEP that night.
    TARGET: Per tutta quella notte Vronskij non tentò neppure d’addormentarsi.
 PREDICTED: Vronskij non conosceva né quella notte .
--------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------------------------------------------------------
English Sentence: how are you my friend?
Actual Italian Sentence: come stai amico mio?
Predicted Italian Sentence: Come mai , amica mia ?



Processing Epoch 19: 100%|██████████| 2365/2365 [09:45<00:00,  4.04it/s, loss=2.027]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: And this consciousness was at first so painful, the fear lest that helpless being should suffer was so strong, that it quite hid the strange feeling of unreasoning joy and even pride which he experienced when the baby sneezed.
    TARGET: E questa coscienza era così tormentosa nei primi tempi, il terrore che quell’essere impotente soffrisse era così forte, che proprio per questo non avvertiva lo strano sentimento di spensierata gioia e perfino di orgoglio ch’egli aveva provato proprio nel momento in cui il bambino aveva starnutito.
 PREDICTED: E la coscienza era così facile il più piccolo , che per essere costretto a , così come se il terrore avesse fatto la umiliazione a quell ’ incubo opprimente della propria umiliazione .
--------------------------------------------------------------------------------
    SOURCE: Therefore goodness is beyond the chain of cause and effect.
    TARGET: Perciò,

In [36]:
config = {
        "batch_size": 8,
        "num_epochs": 6,
        "lr": 10**-3,
        "seq_len": 350,
        "d_model": 512,
        "datasource": 'opus_books',
        "lang_src": "en",
        "lang_tgt": "it",
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": "latest",
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"
}

In [37]:
train_model(
            model=model,
            greedy_decode=greedy_decode,
            tokenizer_src=tokenizer_src,
            tokenizer_tgt=tokenizer_tgt,
            val_dataloader=val_dataloader,
            train_dataloader=train_dataloader,
            optimizer=optimizer,
            loss_fn=loss_fn,
            english_sentence=english_sentence,
            config=config,
            device=device,
           )

Using device: cuda
Device name: Tesla P100-PCIE-16GB
Device memory: 15.887939453125 GB


Processing Epoch 00: 100%|██████████| 2365/2365 [09:45<00:00,  4.04it/s, loss=2.246]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: "No."
    TARGET: — No.
 PREDICTED: — No .
--------------------------------------------------------------------------------
    SOURCE: Should any little accidental disappointment of the appetite occur, such as the spoiling of a meal, the under or the over dressing of a dish, the incident ought not to be neutralised by replacing with something more delicate the comfort lost, thus pampering the body and obviating the aim of this institution; it ought to be improved to the spiritual edification of the pupils, by encouraging them to evince fortitude under temporary privation.
    TARGET: Se accade loro un piccolo incidente, un pasto guastato, per esempio, non si deve paralizzare l'effetto dell'azione. Voi dimenticate lo scopo di questa istituzione e certi avvenimenti dovrebbero esser cagione di edificazione per le alunne; sarebbe quello il momento di predicare la forza d'animo nelle privazioni del

Processing Epoch 01: 100%|██████████| 2365/2365 [09:45<00:00,  4.04it/s, loss=1.926]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Turn that Dormouse out of court!
    TARGET: Fuori quel Ghiro!
 PREDICTED: il Ghiro !
--------------------------------------------------------------------------------
    SOURCE: What would happen to her and to her son, toward whom his feelings had changed as they had toward her, no longer occupied his mind.
    TARGET: Tutto quello che sarebbe poi accaduto di lei e del figlio, verso il quale, così come verso di lei, si eran mutati i suoi sentimenti, non lo interessava più.
 PREDICTED: Ma cosa , per quanto avrebbe potuto amare il figlio , in cui era accaduto fra i suoi sentimenti , non era più nulla di prima .
--------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------------------------------------------------------
English Sentence: how are you my friend?
Actual Italian 

Processing Epoch 02: 100%|██████████| 2365/2365 [09:45<00:00,  4.04it/s, loss=1.543]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: I did not care to go out of sight of the boat, fearing the coming of canoes with savages down the river; but the boy seeing a low place about a mile up the country, rambled to it, and by-and-by I saw him come running towards me.
    TARGET: Non mi piacea di perdere di vista la scialuppa, per paura che alcuni canotti di selvaggi scendessero lungo il fiume; ma il ragazzo scorgendo una valletta lontana circa un miglio dal luogo ove eravamo, si trasse fin là, nè andò guari che il vidi tornare a me correndo come il vento. Pensai fosse inseguìto da qualche uomo, o spaventato da qualche fiera, onde gli corsi incontro per aiutarlo; ma quando gli fui più vicino, vidi alcun che pendergli dalle spalle.
 PREDICTED: Non m ’ importa d ’ esser venuta a vedere la vista di vedere veder la barca , che , verso il pesce , aveva una specie di metri lontano dal bosco , e in un paese , mi sentii che lontano .
-------

Processing Epoch 03: 100%|██████████| 2365/2365 [09:44<00:00,  4.04it/s, loss=2.030]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: I speak particularly of the young ladies. St. John's eyes, though clear enough in a literal sense, in a figurative one were difficult to fathom.
    TARGET: Parlo particolarmente delle due ragazze, perché gli occhi di Saint-John, benché fossero chiari nel significato vero della parola, erano inesplorabili.
 PREDICTED: Io parlo sinceramente , tutti i signori di Saint - John , ma quella differenza che amava una spiegazione , anche qualunque fosse il pensiero diversa da una cosa più vicina , che amava .
--------------------------------------------------------------------------------
    SOURCE: He went down to see what it was.
    TARGET: Stepan Arkad’ic uscì a vedere.
 PREDICTED: di vedere quello che era avvenuto .
--------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------

Processing Epoch 04: 100%|██████████| 2365/2365 [09:45<00:00,  4.04it/s, loss=1.950]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: The wind sighed low in the firs: all was moorland loneliness and midnight hush.
    TARGET: Il vento stormiva dolcemente fra gli abeti; intorno a me non vidi altro che solitudine.
 PREDICTED: La violenza , il vento più mite s ' era di riposo e di gliene aveva .
--------------------------------------------------------------------------------
    SOURCE: But the time came when I understood that I could no longer deceive myself that I am alive, and cannot be blamed because God made me so, that I want to love and to live.
    TARGET: Ma è venuto poi il momento in cui ho compreso, in cui non mi è stato più possibile ingannare me stessa, in cui ho sentito che ero viva, che non avevo colpa se Dio mi aveva fatto così per l’amore e per la vita.
 PREDICTED: Ma , dopo che non son venuto più a lungo , non posso vivere in che a Dio non posso vivere , e a Dio mi sarei felice la felicità che l ’ amore e io vo

Processing Epoch 05: 100%|██████████| 2365/2365 [09:45<00:00,  4.04it/s, loss=2.047]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: She wore a white dress trimmed with wide embroidery, and as she sat in a corner of the verandah behind some plants, did not hear Vronsky coming.
    TARGET: Vestiva un abito bianco con un largo ricamo; sedeva in un angolo della terrazza di là dai fiori e non aveva avvertito l’avvicinarsi di lui.
 PREDICTED: in un fuggevole vestito , e si preparava a guardare dietro , in un angolo , le finestre si fermò dinanzi a Vronskij che non portava le assi da Vronskij .
--------------------------------------------------------------------------------
    SOURCE: From that day Montmorency regarded the kettle with a mixture of awe, suspicion, and hate.
    TARGET: Da quel giorno Montmorency guardò il calderino con un misto di timore, di sospetto e di odio.
 PREDICTED: Dal giorno che Montmorency aveva un certo buon senso di rimprovero , senza ragione , e ne .
---------------------------------------------------

In [None]:
def calculate_bleu_score(
                        model: build_transformer,
                        dataloader: torch.utils.data.DataLoader,
                        tokenizer_src: Tokenizer,
                        tokenizer_tgt: Tokenizer,
                        max_len: int,
                        device: torch.device) -> float:
    """
    Calculate the average BLEU score for a model on a given dataset with a progress bar.

    Args:
        model (callable): The trained transformer model.
        dataloader (torch.utils.data.DataLoader): Dataloader containing validation or test data.
        tokenizer_src (Tokenizer): Tokenizer for the source language (English).
        tokenizer_tgt (Tokenizer): Tokenizer for the target language (Italian).
        max_len (int): Maximum length for the generated sentences.
        device (torch.device): Device to run the model on.

    Returns:
        float: The average BLEU score over the dataset.
    """
    model.eval()
    total_bleu_score = 0.0
    count = 0

    # Initialize the progress bar
    with tqdm(total=len(dataloader), desc="Calculating BLEU Score", unit="batch") as pbar:
        with torch.no_grad():
            for batch in dataloader:
                encoder_input = batch["encoder_input"].to(device)
                encoder_mask = batch["encoder_mask"].to(device)
                target_text = batch["tgt_text"]  # List of actual Italian sentences
                
                # Generate the translation using the model
                generated_tokens = translate_english_to_italian(
                    model=model,
                    tokenizer_src=tokenizer_src,
                    tokenizer_tgt=tokenizer_tgt,
                    english_sentence=batch["src_text"][0],  # Assuming batch size = 1
                    max_len=max_len,
                    device=device,
                )

                # Decode the generated tokens to text
                generated_text = generated_tokens.split()  # Split into words
                reference_text = [target_text[0].split()]  # Reference text as a list of words
                
                # Calculate BLEU score for this sentence
                bleu_score = sentence_bleu(reference_text, generated_text)
                total_bleu_score += bleu_score
                count += 1

                # Update the progress bar
                pbar.set_postfix({"BLEU (running avg)": f"{(total_bleu_score / count):.4f}"})
                pbar.update(1)

    # Calculate average BLEU score
    average_bleu_score = total_bleu_score / count if count > 0 else 0.0
    return average_bleu_score

In [42]:
average_train_bleu = calculate_bleu_score(
    model=model,
    dataloader=train_dataloader,
    tokenizer_src=tokenizer_src,
    tokenizer_tgt=tokenizer_tgt,
    max_len=config['seq_len'],
    device=device,
)

average_val_bleu = calculate_bleu_score(
    model=model,
    dataloader=val_dataloader,
    tokenizer_src=tokenizer_src,
    tokenizer_tgt=tokenizer_tgt,
    max_len=config['seq_len'],
    device=device,
)

print(f"Train Average BLEU Score: {average_train_bleu:.7f}")
print(f"Val Average BLEU Score: {average_val_bleu:.7f}")

Calculating BLEU Score: 100%|██████████| 2365/2365 [09:14<00:00,  4.27batch/s, BLEU (running avg)=0.4594]
Calculating BLEU Score: 100%|██████████| 2102/2102 [08:10<00:00,  4.28batch/s, BLEU (running avg)=0.4034]

Train Average BLEU Score: 0.4593545
Val Average BLEU Score: 0.4034139





In [None]:
torch.save(model.state_dict(), 'english_to_italian.pth')

In [76]:
model.load_state_dict(torch.load('english_to_italian.pth'))
model

Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (self_attention_block): MultiHeadAttentionBlock(
          (w_q): Linear(in_features=512, out_features=512, bias=False)
          (w_k): Linear(in_features=512, out_features=512, bias=False)
          (w_v): Linear(in_features=512, out_features=512, bias=False)
          (w_o): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_block): FeedForwardBlock(
          (linear_1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (residual_connections): ModuleList(
          (0-1): 2 x ResidualConnection(
            (dropout): Dropout(p=0.1, inplace=False)
            (norm): LayerNormalization()
          )
        )
      )
    )
    (norm): LayerNormalization