# Transformer from Scratch - English to Norwegian Translation

This notebook implements a full Transformer architecture from the ground up in PyTorch, inspired by the seminal paper ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762). The model is trained on the `en-no` subset of the [Helsinki-NLP / opus_books](https://huggingface.co/datasets/Helsinki-NLP/opus_books) dataset to perform English → Norwegian translation.

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

import math
from pathlib import Path

from tqdm import tqdm

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from torch.utils.tensorboard import SummaryWriter

import torchmetrics



### Hyperparameters

In [2]:
## config.py

def get_config():
    """Return a fresh dictionary of default settings for a training run.

    Keys:
        batch_size, num_epochs, lr: optimisation settings.
        seq_len: fixed length every tokenised sentence is padded to.
        d_model: width of the embedding / hidden vectors.
        lang_src, lang_tgt: language codes of the translation pair.
        model_folder, model_basename: where checkpoints go and how they are prefixed.
        preload: epoch tag of a checkpoint to resume from, or None to start fresh.
        tokenizer_file: filename template; ``{0}`` is replaced by a language code.
        experiment_name: TensorBoard log directory.
    """
    config = dict(
        batch_size=16,
        num_epochs=20,
        lr=0.0001,
        seq_len=150,
        d_model=512,
        lang_src="en",  # set source language here
        lang_tgt="no",  # set target language here
        model_folder="weights",
        model_basename="transformer1_",
        preload=None,
        tokenizer_file="tokenizer_{0}.json",
        experiment_name="runs/transformer_model",
    )
    return config

def get_weights_file_path(config, epoch: str):
    """Build the relative path of the checkpoint file for a given epoch tag.

    Args:
        config: Mapping providing 'model_folder' and 'model_basename'.
        epoch: Tag appended to the basename (before the ``.pt`` suffix).

    Returns:
        The path ``./<model_folder>/<model_basename><epoch>.pt`` as a string.
    """
    folder, basename = config['model_folder'], config['model_basename']
    return str(Path('.').joinpath(folder, f"{basename}{epoch}.pt"))

## Building blocks

### Input Embeddings
In NLP, words/tokens are represented as integers (token IDs). These IDs are arbitrary labels with no numeric meaning (ID 7 is not "closer" to ID 8 than to ID 500), so they are first converted into continuous dense vectors via embeddings.

The embedding layer:
- Maps each token ID to a learnable vector of dimension `d_model`
- Allows the model to learn semantic meaning - similar words/tokens get similar embeddings during training

Refer to section 3.4 of the paper.

In [3]:
class InputEmbeddings(nn.Module):
    """Learnable lookup table from token IDs to vectors of width ``d_model``."""

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model, self.vocab_size = d_model, vocab_size
        # One trainable row of size d_model per vocabulary entry.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Embed token indices and scale the result by sqrt(d_model).

        Args:
            x: Tensor of token indices, shape (batch_size, seq_len)

        Returns:
            Scaled embeddings, shape (batch_size, seq_len, d_model)
        """
        # Section 3.4 of the paper: multiplying by sqrt(d_model) keeps the
        # embeddings on a scale comparable to the positional encodings.
        scale = math.sqrt(self.d_model)
        token_vectors = self.embedding(x)
        return token_vectors * scale


### Positional Encoding
Transformers process tokens in parallel, with no built-in sense of order. But language is sequential:

"The cat sat on the mat" =/= "The mat cat on the sat"

Positional information is injected into the input to create awareness of the token position, using Positional Encoding. They have the same dimension `d_model` as the input embeddings.

Refer to section 3.5 of the paper.

In [4]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    The encoding table is computed once in ``__init__`` following section 3.5
    of "Attention Is All You Need":

        PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

    Args:
        d_model: Width of the embedding vectors (even or odd).
        seq_len: Number of positions to precompute.
        dropout: Dropout probability applied after the encodings are added.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Table of shape (seq_len, d_model), filled column-wise below.
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions 0, 1, ..., seq_len - 1 -> shape (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # 1 / 10000^(2i / d_model), computed in log space; one entry per sin/cos pair.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -math.log(10000.0) / d_model)
        angles = position * div_term  # (seq_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)  # sine on even feature indices
        # Cosine on odd feature indices. When d_model is odd there is one fewer
        # odd column than even column, so the angle table is trimmed to match.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Add a leading batch dimension -> (1, seq_len, d_model) so the table
        # broadcasts against inputs of shape (batch_size, seq_len, d_model).
        pe = pe.unsqueeze(0)

        # A buffer is saved in the state dict and follows the module across
        # devices, but it is not a parameter and receives no gradient.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.shape[1]`` positions, then apply dropout.

        Args:
            x: Tensor of shape (batch_size, length, d_model).

        Returns:
            Tensor of the same shape as ``x``.
        """
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)
        

### Add + Norm Layer

- Layer Norm normalizes each sample’s hidden vector across dimensions, improving training dynamics and convergence.
- LayerNorm is preferred over BatchNorm because:
    1. It doesn't depend on batch statistics.
    2. Works well for variable-length sequences.

In [5]:
class LayerNorm(nn.Module):
    """Normalises the last dimension of its input with a learnable scalar scale and shift.

    Args:
        epsilon: Small constant added to the standard deviation to avoid
            division by zero.
    """

    def __init__(self, epsilon: float = 10**-6):
        super().__init__()
        self.epsilon = epsilon
        # Single-element parameters, broadcast over every feature.
        self.bias = nn.Parameter(torch.zeros(1))   # additive shift
        self.alpha = nn.Parameter(torch.ones(1))   # multiplicative scale

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + epsilon) + bias`` over the last axis."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # torch's default std is used here (Bessel-corrected).
        spread = x.std(dim=-1, keepdim=True) + self.epsilon
        scaled = self.alpha * centered
        return scaled / spread + self.bias


### Feed Forward layer

Discussed in section 3.3 of the paper, and equation 2

In [6]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature width.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)   # W1, b1
        self.dropout = nn.Dropout(p=dropout)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)   # W2, b2

    def forward(self, x):
        """Map (batch, seq_len, d_model) -> (batch, seq_len, d_ff) -> (batch, seq_len, d_model)."""
        hidden = torch.relu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)

### Multi-Head Attention

Discussed in section 3.2.2 of the paper, this is the core innovation of the transformer model.
It is multiple scaled dot-product attention heads run in parallel, followed by a linear projection.

Refer to equation 1 for calculating attention scores, and then two equations in section 3.2.2 for multihead attention scores

Steps involved:
1. Project input `(Q, K, V)` into h subspaces using `W_q`, `W_k`, `W_v`.
2. Compute scaled dot-product attention for each head.
3. Concatenate all head outputs.
4. Pass through a final `W_o` projection layer.

In [7]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention (section 3.2.2 of the paper).

    Args:
        d_model: Feature width of the inputs and outputs; must be divisible by ``h``.
        h: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, h, dropout):
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model is not divisible by h"

        # Each head works on a d_k-wide slice of the projected features.
        self.d_k = d_model // h

        self.w_q = nn.Linear(d_model, d_model)  # query projection
        self.w_k = nn.Linear(d_model, d_model)  # key projection
        self.w_v = nn.Linear(d_model, d_model)  # value projection

        self.w_o = nn.Linear(d_model, d_model)  # output projection
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute softmax(Q K^T / sqrt(d_k)) V (equation 1).

        Args:
            query, key, value: Tensors whose last dimension is d_k.
            mask: Optional tensor; positions where ``mask == 0`` are excluded.
            dropout: Optional dropout module applied to the attention weights.

        Returns:
            Tuple ``(weighted_values, attention_weights)``.
        """
        scale = math.sqrt(query.shape[-1])

        # (..., q_len, d_k) @ (..., d_k, k_len) -> (..., q_len, k_len)
        scores = (query @ key.transpose(-2, -1)) / scale

        if mask is not None:
            # A very negative score becomes ~0 weight after the softmax.
            scores.masked_fill_(mask == 0, -1e9)

        weights = scores.softmax(dim=-1)

        if dropout is not None:
            weights = dropout(weights)

        return (weights @ value), weights

    def _split_heads(self, projected):
        """Reshape (batch, seq_len, d_model) -> (batch, h, seq_len, d_k)."""
        batch, length = projected.shape[0], projected.shape[1]
        return projected.view(batch, length, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask):
        """Project the inputs, attend per head, merge the heads and project again.

        The attention weights of the most recent call are kept on
        ``self.attention_scores``.
        """
        # Linear projections keep the shape (batch, seq_len, d_model).
        query = self.w_q(q)
        key = self.w_k(k)
        value = self.w_v(v)

        query = self._split_heads(query)
        key = self._split_heads(key)
        value = self._split_heads(value)

        x, self.attention_scores = MultiHeadAttention.attention(query, key, value, mask, self.dropout)

        # (batch, h, seq_len, d_k) -> (batch, seq_len, h, d_k) -> (batch, seq_len, d_model)
        merged = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        return self.w_o(merged)

### Create skip connection (residual connection)

Addition is element-wise, and the result layer is normalized

`Output = LayerNorm(x + Sublayer(x))`

In [8]:
class SkipConnection(nn.Module):
    """Residual wrapper computing ``norm(x + dropout(sublayer(x)))`` (post-norm).

    Args:
        dropout: Dropout probability applied to the sub-layer output.
    """

    def __init__(self, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNorm()

    def forward(self, x, sublayer):
        """Run ``sublayer`` on ``x``, add the result back to ``x`` and normalise.

        Args:
            x: Input tensor.
            sublayer: Callable taking ``x`` and returning a tensor of the same shape.
        """
        residual = x
        update = self.dropout(sublayer(x))
        return self.norm(residual + update)

## Creating Encoder block
Creating the Encoder block, and stacking N of them together

In [9]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward network.

    Each of the two sub-layers is wrapped in its own ``SkipConnection``.

    Args:
        self_attention_block: Attention module called with the layer input as
            query, key and value.
        feed_forward_block: Position-wise feed-forward module.
        dropout: Dropout probability handed to both skip connections.
    """

    def __init__(self,
                 self_attention_block: MultiHeadAttention,
                 feed_forward_block: FeedForward,
                 dropout: float):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0 wraps self-attention, index 1 wraps the feed-forward network.
        self.residual_connections = nn.ModuleList([SkipConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        """Apply both sub-layers; ``src_mask`` hides padding positions from attention."""
        attention_residual, feed_forward_residual = self.residual_connections

        def self_attend(hidden):
            return self.self_attention_block(hidden, hidden, hidden, src_mask)

        x = attention_residual(x, self_attend)
        return feed_forward_residual(x, self.feed_forward_block)


### Creating the Encoder object (N encoder blocks stacked)

Each block:
- Applies multi-head attention to every position with all other positions (self-attention)
- Applies a feedforward network
- Each sub-layer is wrapped with residual + layer norm

After all blocks, it applies a final layer norm.

In [10]:
class Encoder(nn.Module):
    """Stack of encoder layers followed by a final ``LayerNorm``.

    Args:
        layers: Modules applied in order; each is called as ``layer(x, mask)``.
    """

    def __init__(self, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.norm = LayerNorm()

    def forward(self, x, mask):
        """Pass ``x`` through every layer in turn, then normalise the result."""
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return self.norm(hidden)

## Creating Decoder Block

Uses the same classes as the Encoder block for the most part.
The key difference comes in with the Masked Multi-Head Attention (uses self-attention), and an additional, cross-attention Multi-Head Attention block.

Refer to section 3.1 in the paper


In [11]:
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is wrapped in its own ``SkipConnection``.

    Args:
        self_attention_block: Attention over the decoder's own hidden states.
        cross_attention_block: Attention whose keys and values are the encoder output.
        feed_forward_block: Position-wise feed-forward module.
        dropout: Dropout probability handed to the three skip connections.
    """

    def __init__(self,
                 self_attention_block: MultiHeadAttention,
                 cross_attention_block: MultiHeadAttention,
                 feed_forward_block: FeedForward,
                 dropout: float):
        super().__init__()

        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block

        # Index 0: self-attention, 1: cross-attention, 2: feed-forward.
        self.residual_connections = nn.ModuleList([SkipConnection(dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Apply the three sub-layers in order.

        Args:
            x: Decoder hidden states.
            encoder_output: Output of the encoder stack.
            src_mask: Mask used for cross-attention over the encoder output.
            tgt_mask: Mask used for the decoder's self-attention.
        """
        self_residual, cross_residual, feed_forward_residual = self.residual_connections

        def self_attend(hidden):
            return self.self_attention_block(hidden, hidden, hidden, tgt_mask)

        def cross_attend(hidden):
            # Query comes from the decoder; key and value come from the encoder.
            return self.cross_attention_block(hidden, encoder_output, encoder_output, src_mask)

        x = self_residual(x, self_attend)
        x = cross_residual(x, cross_attend)
        return feed_forward_residual(x, self.feed_forward_block)


### Creating Decoder object (N decoder blocks stacked)



In [12]:
class Decoder(nn.Module):
    """Stack of decoder layers followed by a final ``LayerNorm``.

    Args:
        layers: Modules applied in order; each is called as
            ``layer(x, encoder_output, src_mask, tgt_mask)``.
    """

    def __init__(self, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.norm = LayerNorm()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Pass ``x`` through every layer in turn, then normalise the result."""
        hidden = x
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, encoder_output, src_mask, tgt_mask)

        return self.norm(hidden)
        


## Final Linear Layer
Projects each decoder output vector (of size `d_model`) onto the target vocabulary, producing a log-probability for every token.

In [13]:
class ProjectionLayer(nn.Module):
    """Linear map from ``d_model`` features to vocabulary log-probabilities.

    Args:
        d_model: Width of the incoming feature vectors.
        vocab_size: Number of output classes (vocabulary entries).
    """

    def __init__(self,
                 d_model: int,
                 vocab_size: int):
        super().__init__()

        self.proj = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, x):
        """Return log-softmax scores: (..., d_model) -> (..., vocab_size)."""
        logits = self.proj(x)
        return torch.log_softmax(logits, dim=-1)


## Transformer Block

We now have all the components we need to put together and create the Transformer block.

In [14]:
class Transformer(nn.Module):
    """Container tying together embeddings, positional encodings, encoder, decoder and projection.

    The model is driven through three separate steps rather than a single
    ``forward``: :meth:`encode`, :meth:`decode` and :meth:`project`.
    """

    def __init__(self,
                 encoder: Encoder,
                 decoder: Decoder,
                 src_embed: InputEmbeddings,
                 tgt_embed: InputEmbeddings,
                 src_pos: PositionalEncoding,
                 tgt_pos: PositionalEncoding,
                 projection_layer: ProjectionLayer
                 ):
        super().__init__()

        # Assignment order fixes the sub-module registration order (and hence
        # the order in which ``parameters()`` yields weights).
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed the source tokens, add positions and run the encoder stack."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output, src_mask, tgt, tgt_mask):
        """Embed the target tokens, add positions and run the decoder stack."""
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Map decoder outputs to vocabulary scores via the projection layer."""
        return self.projection_layer(x)
    

### Function to build transformer

Given all hyperparameters, this function builds the transformer and initializes the parameters with some initial values

In [15]:
def build_transformer(src_vocab_size: int,
                      tgt_vocab_size: int,
                      src_seq_len: int,
                      tgt_seq_len: int,
                      d_model: int = 512,   # as in the paper
                      N: int = 6,           # number of encoder/decoder blocks
                      h: int = 8,           # number of attention heads
                      dropout: float = 0.1,
                      d_ff: int = 2048
) -> Transformer:
    """Assemble a ``Transformer`` and Xavier-initialise its weight matrices.

    Args:
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary.
        src_seq_len: Number of source positions to precompute encodings for.
        tgt_seq_len: Number of target positions to precompute encodings for.
        d_model: Embedding / hidden width.
        N: Number of encoder blocks and of decoder blocks.
        h: Number of attention heads per attention module.
        dropout: Dropout probability used throughout.
        d_ff: Hidden width of the feed-forward sub-layers.

    Returns:
        The constructed model.
    """
    # Sub-modules are created in a fixed order so that seeded runs draw their
    # initial random weights in the same sequence.
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # N independent encoder layers: self-attention + feed-forward.
    encoder_blocks = [
        EncoderBlock(MultiHeadAttention(d_model, h, dropout),
                     FeedForward(d_model, d_ff, dropout),
                     dropout)
        for _ in range(N)
    ]

    # N independent decoder layers: self-attention + cross-attention + feed-forward.
    decoder_blocks = [
        DecoderBlock(MultiHeadAttention(d_model, h, dropout),
                     MultiHeadAttention(d_model, h, dropout),
                     FeedForward(d_model, d_ff, dropout),
                     dropout)
        for _ in range(N)
    ]

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Only tensors with more than one dimension (weight matrices, embedding
    # tables) are re-initialised; biases and scalar norm parameters are left alone.
    for weight in transformer.parameters():
        if weight.dim() > 1:
            nn.init.xavier_uniform_(weight)

    return transformer

# Creating English - Norwegian translation Transformer

1. Using the `en-no` subset of the opus_books dataset from HuggingFace (Available here: https://huggingface.co/datasets/Helsinki-NLP/opus_books)
2. Use word-level tokenizer from HuggingFace - Splits sentence into tokens (Build vocabulary) (Documentation here: https://github.com/huggingface/tokenizers)

### Tokenizer

In [16]:
def get_all_sentences(dataset, lang):
    """Lazily yield the ``lang`` side of every translation pair in ``dataset``.

    Each item is expected to look like ``{'translation': {lang: text, ...}}``.
    """
    yield from (record['translation'][lang] for record in dataset)

def build_tokenizer(config, dataset, lang):
    """Load the word-level tokenizer for ``lang`` from disk, training and saving it first if absent.

    Args:
        config: Mapping whose 'tokenizer_file' entry is a filename template
            with a ``{0}`` placeholder for the language code.
        dataset: Iterable of translation items used for training when no
            saved tokenizer exists.
        lang: Language code selecting both the file and the training text.

    Returns:
        The loaded or newly trained ``Tokenizer``.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Reuse a previously saved tokenizer when one is available.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Words missing from the vocabulary are mapped to [UNK].
    tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()
    # Special tokens: unknown word, padding, start of sentence, end of sentence.
    # min_frequency=2: a word must occur at least twice to enter the vocabulary.
    trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
    tokenizer.train_from_iterator(get_all_sentences(dataset, lang), trainer=trainer)
    tokenizer.save(str(tokenizer_path))

    return tokenizer

## Dataset

### Custom Dataset Class

Use a custom Dataset class, following PyTorch documentation: https://docs.pytorch.org/tutorials/beginner/data_loading_tutorial.html

In [17]:
class TranslationDataset(Dataset):
    """Wraps a translation dataset and turns each pair into fixed-length model tensors.

    Args:
        dataset: Indexable collection of items shaped like
            ``{'translation': {src_lang: text, tgt_lang: text}}``.
        tokenizer_src: Tokenizer used for the source text.
        tokenizer_tgt: Tokenizer used for the target text; also the source of
            the [SOS], [EOS] and [PAD] ids.
        src_lang: Key of the source language inside ``'translation'``.
        tgt_lang: Key of the target language inside ``'translation'``.
        seq_len: Length every returned sequence is padded to.

    Raises:
        ValueError: If the target tokenizer lacks one of the special tokens.
    """

    def __init__(self, dataset, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()

        self.dataset = dataset
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.seq_len = seq_len

        # Resolve and validate the special-token ids BEFORE building tensors, so a
        # missing token gives a clear error rather than a failure inside torch.tensor.
        # NOTE(review): the same ids are used for source-side padding, which assumes
        # both tokenizers assign identical ids to the special tokens - confirm.
        special_ids = {}
        for token in ("[SOS]", "[EOS]", "[PAD]"):
            token_id = tokenizer_tgt.token_to_id(token)
            if token_id is None:
                raise ValueError(f"{token} not found in tokenizer")
            special_ids[token] = token_id

        # Kept as 1-element int64 tensors so they can be concatenated directly.
        self.sos_token = torch.tensor([special_ids["[SOS]"]], dtype=torch.int64)
        self.eos_token = torch.tensor([special_ids["[EOS]"]], dtype=torch.int64)
        self.pad_token = torch.tensor([special_ids["[PAD]"]], dtype=torch.int64)

    def __len__(self):
        """Number of translation pairs."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Tokenise, frame with special tokens and pad the pair at ``index``.

        Returns:
            Dict with 'encoder_input', 'decoder_input' and 'label' (each of shape
            (seq_len,)), 'encoder_mask' of shape (1, 1, seq_len), 'decoder_mask'
            of shape (1, seq_len, seq_len), and the raw 'src_text' / 'tgt_text'.

        Raises:
            ValueError: If either sentence does not fit into ``seq_len`` once the
                special tokens are added.
        """
        src_target_pair = self.dataset[index]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # Encoder input carries both SOS and EOS; decoder input carries only SOS
        # and the label only EOS, hence the different offsets.
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError('Sentence is too long')

        enc_ids = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_ids = torch.tensor(dec_input_tokens, dtype=torch.int64)
        # repeat(n) on the 1-element pad tensor yields n pad ids (empty for n == 0).
        enc_padding = self.pad_token.repeat(enc_num_padding_tokens)
        dec_padding = self.pad_token.repeat(dec_num_padding_tokens)

        # [SOS] source tokens [EOS] [PAD]...
        encoder_input = torch.cat([self.sos_token, enc_ids, self.eos_token, enc_padding])

        # [SOS] target tokens [PAD]...   (no EOS: this is what the decoder is fed)
        decoder_input = torch.cat([self.sos_token, dec_ids, dec_padding])

        # target tokens [EOS] [PAD]...   (what the decoder should predict)
        label = torch.cat([dec_ids, self.eos_token, dec_padding])

        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            # 1 for real tokens, 0 for padding -> (1, 1, seq_len)
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            # padding mask (1, 1, seq_len) & causal mask (1, seq_len, seq_len):
            # a position may attend only to non-pad positions at or before itself
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text
        }

def causal_mask(size):
    """Return a boolean mask of shape (1, size, size) that is True on and below the diagonal.

    Entry [0, i, j] is True exactly when j <= i, i.e. position i may look at
    position j but never at a later one.
    """
    lower_triangle = torch.tril(torch.ones(1, size, size))
    return lower_triangle.bool()

### Create dataset

- Download dataset 
- Create the training and validation datasets (90/10 random split)
- Send to DataLoader to be used in training loop

In [18]:
def get_dataset(config):
    """Download the corpus, build both tokenizers and return train/validation loaders.

    Args:
        config: Mapping providing 'lang_src', 'lang_tgt', 'seq_len',
            'batch_size' and whatever ``build_tokenizer`` reads.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)``.
        The validation loader uses a batch size of 1.
    """
    src_lang, tgt_lang = config["lang_src"], config["lang_tgt"]

    # Dataset name plus language-pair subset, e.g. "en-es".
    dataset_raw = load_dataset('Helsinki-NLP/opus_books', f'{src_lang}-{tgt_lang}', split='train')

    tokenizer_src = build_tokenizer(config, dataset_raw, src_lang)
    tokenizer_tgt = build_tokenizer(config, dataset_raw, tgt_lang)

    # Custom 90/10 split into training and validation subsets.
    num_train = int(0.9 * len(dataset_raw))
    num_val = len(dataset_raw) - num_train
    train_dataset_raw, val_dataset_raw = random_split(dataset_raw, [num_train, num_val])

    def wrap(subset):
        # Both subsets share tokenizers, language keys and sequence length.
        return TranslationDataset(subset, tokenizer_src, tokenizer_tgt,
                                  src_lang, tgt_lang, config['seq_len'])

    train_dataset = wrap(train_dataset_raw)
    val_dataset = wrap(val_dataset_raw)

    # Report the longest tokenised sentence on each side (informational only).
    longest_src = longest_tgt = 0
    for item in dataset_raw:
        pair = item['translation']
        src_ids = tokenizer_src.encode(pair[src_lang]).ids
        tgt_ids = tokenizer_tgt.encode(pair[tgt_lang]).ids
        longest_src = max(longest_src, len(src_ids))
        longest_tgt = max(longest_tgt, len(tgt_ids))

    print(f"Max length of source sentence: {longest_src}")
    print(f"Max length of target sentence: {longest_tgt}")

    train_dataloader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    # Batch size 1 so validation decodes one sentence at a time.
    val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

### Build model

Based on vocab_size, it builds the transformer model

In [19]:
def build_model(config, vocab_src_len, vocab_tgt_len):
    """Create a transformer sized from ``config`` and the two vocabulary sizes.

    The same 'seq_len' is used for source and target; 'd_model' sets the
    hidden width. All other hyperparameters keep ``build_transformer``'s defaults.
    """
    seq_len = config['seq_len']
    return build_transformer(
        src_vocab_size=vocab_src_len,
        tgt_vocab_size=vocab_tgt_len,
        src_seq_len=seq_len,
        tgt_seq_len=seq_len,
        d_model=config['d_model'],
    )

## Build Training Loop


### Model validation code

In [20]:
def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate a target sequence one token at a time, always taking the top-scoring token.

    Args:
        model: Object exposing ``encode``, ``decode`` and ``project``.
        source: Source token ids with a leading batch dimension of 1.
        source_mask: Mask passed to ``encode`` and ``decode`` for the source.
        tokenizer_src: Unused here; kept for interface symmetry.
        tokenizer_tgt: Provides the ids of '[SOS]' and '[EOS]'.
        max_len: Decoding stops once the output holds this many tokens.
        device: Device the decoder input tensors are moved to.

    Returns:
        1-D tensor of generated ids, starting with [SOS] and ending with [EOS]
        unless the length limit was hit first.
    """
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # The encoder runs once; its output is reused at every decoding step.
    encoder_output = model.encode(source, source_mask)

    # Start with a (1, 1) sequence holding just the start-of-sentence id.
    generated = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)

    while generated.size(1) != max_len:
        # Causal mask covering the tokens produced so far.
        step_mask = causal_mask(size=generated.size(1)).type_as(source_mask).to(device)

        decoded = model.decode(encoder_output, source_mask, generated, step_mask)

        # Score the vocabulary from the last position only and take the argmax.
        scores = model.project(decoded[:, -1])
        _, next_word = torch.max(scores, dim=1)
        next_id = next_word.item()

        # Append the chosen token so it becomes input for the next step.
        appended = torch.empty(1, 1).type_as(source).fill_(next_id).to(device)
        generated = torch.cat([generated, appended], dim=1)

        if next_id == eos_idx:
            break

    return generated.squeeze(0)  # drop the batch dimension

def run_validation(model, validation_dataset, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_step, writer, num_examples = 2):
    """Greedy-decode a few validation sentences, print them and log text metrics.

    Args:
        model: Model handed to ``greedy_decode``; switched to eval mode here.
        validation_dataset: Iterable of batches with batch size 1.
        tokenizer_src, tokenizer_tgt: Tokenizers forwarded to ``greedy_decode``;
            the target tokenizer also decodes the prediction back to text.
        max_len: Length limit for decoding.
        device: Device the batch tensors are moved to.
        print_msg: Callable used for all console output.
        global_step: Step value attached to the logged scalars.
        writer: TensorBoard-style writer; metrics are skipped when falsy.
        num_examples: Iteration stops after this many batches.
    """
    model.eval()

    source_texts, expected, predicted = [], [], []
    console_width = 80

    with torch.no_grad():
        for seen, batch in enumerate(validation_dataset, start=1):
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)

            assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch['src_text'][0]
            target_text = batch['tgt_text'][0]
            # Turn the generated ids back into text.
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)

            print_msg('-'*console_width)
            print_msg(f'Source: {source_text}')
            print_msg(f'Target: {target_text}')
            print_msg(f'Predicted: {model_out_text}')

            if seen == num_examples:
                break

    if not writer:
        return

    def log_metric(tag, metric):
        # Score predictions against references, then write and flush the scalar.
        score = metric(predicted, expected)
        writer.add_scalar(tag, score, global_step)
        writer.flush()

    log_metric('validation cer', torchmetrics.CharErrorRate())   # character error rate
    log_metric('validation wer', torchmetrics.WordErrorRate())   # word error rate
    log_metric('validation BLEU', torchmetrics.BLEUScore())      # BLEU

### Model training code

In [21]:
def train_model(config):
    """Train the translation model and write one checkpoint per epoch.

    Reads 'model_folder', 'experiment_name', 'preload', 'lr', 'num_epochs'
    and 'seq_len' from ``config`` (``get_dataset`` and ``build_model`` receive
    the whole dict as well). When ``config['preload']`` is set, the matching
    checkpoint is loaded and training resumes from the epoch after it.

    Each checkpoint stores 'epoch', 'model_state_dict',
    'optimizer_state_dict' and 'global_step'.

    Args:
        config: Configuration dictionary as produced by ``get_config``.
    """
    #device agnostic code
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_dataset(config)
    tgt_vocab_size = tokenizer_tgt.get_vocab_size()
    model = build_model(config, tokenizer_src.get_vocab_size(), tgt_vocab_size).to(device)

    #tensorboard
    writer = SummaryWriter(config['experiment_name'])

    # The optimizer has to exist before a checkpoint can be restored into it.
    optimizer = torch.optim.Adam(params=model.parameters(),
                                 lr = config['lr'],
                                 eps = 1e-9)

    #restore model in case of crash
    initial_epoch = 0
    global_step = 0
    if config['preload']:
        model_filename = get_weights_file_path(config, config['preload'])
        print(f"Preloading model: {model_filename}")
        # map_location lets a checkpoint written on GPU be resumed on CPU (and vice versa).
        state = torch.load(model_filename, map_location=device)
        # Restore the weights too; otherwise training would silently restart
        # from random parameters with a stale optimizer state.
        model.load_state_dict(state['model_state_dict'])
        optimizer.load_state_dict(state['optimizer_state_dict'])
        initial_epoch = state['epoch'] + 1
        global_step = state['global_step']

    #define loss function
    # Labels are target-vocabulary ids, so the padding id to ignore is looked
    # up in the target tokenizer. Label smoothing helps with overfitting.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1)

    try:
        for epoch in range(initial_epoch, config['num_epochs']):
            # run_validation leaves the model in eval mode, so re-enable
            # training mode at the start of every epoch.
            model.train()

            batch_iterator = tqdm(train_dataloader, desc=f'Processing epoch {epoch:02d}')

            for batch in batch_iterator:
                encoder_input = batch['encoder_input'].to(device) #(batch, seq_len)
                decoder_input = batch['decoder_input'].to(device) #(batch, seq_len)
                encoder_mask = batch['encoder_mask'].to(device) #(batch, 1, 1, seq_len)
                decoder_mask = batch['decoder_mask'].to(device) #(batch, 1, seq_len, seq_len)
                label = batch['label'].to(device) #(batch, seq_len)

                #run tensors through transformer
                encoder_output = model.encode(encoder_input, encoder_mask) #(batch, seq_len, d_model)
                decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) #(batch, seq_len, d_model)
                proj_output = model.project(decoder_output) #(batch, seq_len, tgt_vocab_size)

                #(batch, seq_len, tgt_vocab_size) -> (batch * seq_len, tgt_vocab_size) to compare with label
                loss = loss_fn(proj_output.view(-1, tgt_vocab_size), label.view(-1))

                batch_iterator.set_postfix({"Loss": f"{loss.item():6.3f}"})

                #log the loss
                writer.add_scalar('train_loss', loss.item(), global_step)
                writer.flush()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                global_step += 1

            # Print through tqdm so the progress bar is not corrupted.
            run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, batch_iterator.write, global_step, writer)

            model_filename = get_weights_file_path(config, f'{epoch:02d}')
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step
            }, model_filename)
    finally:
        # Release the event file even if training is interrupted.
        writer.close()

Run the training

In [22]:
# Entry point: build the hyperparameter dict and start training. The guard
# keeps training from starting if this code is imported as a module.
if __name__ == "__main__":
    # `config` is bound at notebook (module) level here.
    config = get_config()
    train_model(config)

Using device: cuda
Max length of source sentence: 141
Max length of target sentence: 95


Processing epoch 00: 100%|██████████| 197/197 [00:27<00:00,  7.07it/s, Loss=6.100]


--------------------------------------------------------------------------------
Source: Welcome to Baskerville Hall!"
Target: Velkommen til Baskerville herregård!”
Predicted: Jeg .
--------------------------------------------------------------------------------
Source: Our friends had already secured a first-class carriage and were waiting for us upon the platform.
Target: Våre venner hadde allerede sikret seg en førsteklasses kupé og stod nå på platformen og ventet på oss.
Predicted: Jeg .


Processing epoch 01: 100%|██████████| 197/197 [00:27<00:00,  7.12it/s, Loss=5.895]


--------------------------------------------------------------------------------
Source: His method had the additional advantage that if they were to take a cab he was all ready to follow them.
Target: Hans fremgangsmåte hadde også fordelen at hvis de tok en vogn, ville han dermed også kunne følge etter dem straks.
Predicted: Jeg er .
--------------------------------------------------------------------------------
Source: On the whole I incline to the latter view, since the matter was evidently important, and it is unlikely that the composer of such a letter would be careless.
Target: Jeg vil nærmest anta det siste. Saken har åpenbart vært av stor viktighet for ham, og det er ikke sannsynlig at man er skjødesløs når man skal sette sammen et slikt brev.
Predicted: Jeg er , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,

Processing epoch 02: 100%|██████████| 197/197 [00:27<00:00,  7.18it/s, Loss=5.680]


--------------------------------------------------------------------------------
Source: "Watson," said the baronet, "it was the cry of a hound."
Target: “Watson,” sa sir Henry, “det var en hund som ulte.”
Predicted: “ Ja , er det det det det det det det det det det .
--------------------------------------------------------------------------------
Source: "I hope your visit has cast some light upon those occurrences which have puzzled us?"
Target: “Jeg håper at De ved Deres besøk har hatt anledning til å bringe litt lys over alle de hemmelighetsfulle hendelsene som har satt oss i så stor uro.”
Predicted: “ Jeg De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De De D

Processing epoch 03: 100%|██████████| 197/197 [00:27<00:00,  7.15it/s, Loss=5.443]


--------------------------------------------------------------------------------
Source: And it was at this moment that there occurred a most strange and unexpected thing. We had risen from our rocks and were turning to go home, having abandoned the hopeless chase.
Target: Vi hadde reist oss og skulle gå hjem, da vi fant ut vi måtte oppgi den håpløse jakten.
Predicted: Det var , han han han han og han han han han han han han han han han han .
--------------------------------------------------------------------------------
Source: I whisked round and had just time to catch a glimpse of something which I took to be a large black calf passing at the head of the drive.
Target: Jeg snudde meg rundt og så i farten et glimt av noe som jeg tok for en sort kalv som sprang forbi hesten.
Predicted: Jeg var jeg jeg jeg var , og jeg og jeg og jeg og jeg og jeg og jeg og jeg og jeg og jeg og jeg og jeg og jeg og jeg og jeg og jeg og jeg og jeg og jeg jeg jeg og jeg og jeg og jeg og jeg og jeg og jeg

Processing epoch 04: 100%|██████████| 197/197 [00:27<00:00,  7.13it/s, Loss=5.264]


--------------------------------------------------------------------------------
Source: "It is dead."
Target: “Den er død.”
Predicted: “ Ja , Henry .”
--------------------------------------------------------------------------------
Source: There remain the people who will actually surround Sir Henry Baskerville upon the moor."
Target: Men vi har tilbake de personene som nå vil bli sir Henrys omgivelser der ute på moen.”
Predicted: Det er , og , og , og , og .”


Processing epoch 05: 100%|██████████| 197/197 [00:27<00:00,  7.12it/s, Loss=5.384]


--------------------------------------------------------------------------------
Source: He retained it in his hand after using it to set the hound upon the track.
Target: Han holdt den i hånden etter å ha brukt den til å sette hunden på sporet.
Predicted: Han var , og han var , og han var .
--------------------------------------------------------------------------------
Source: It was the sob of a woman, the muffled, strangling gasp of one who is torn by an uncontrollable sorrow.
Target: Det var en kvinnes gråt, en undertrykt, kvalt hulking av et menneske som pines av en sorg som det ikke lenger er herre over.
Predicted: Det er en , og , og , og , og , og .


Processing epoch 06: 100%|██████████| 197/197 [00:27<00:00,  7.14it/s, Loss=5.355]


--------------------------------------------------------------------------------
Source: "Well, it is rather obvious."
Target: “Jo, det er nokså øyensynlig.”
Predicted: “ Ja , det er det ikke .”
--------------------------------------------------------------------------------
Source: I whisked round and had just time to catch a glimpse of something which I took to be a large black calf passing at the head of the drive.
Target: Jeg snudde meg rundt og så i farten et glimt av noe som jeg tok for en sort kalv som sprang forbi hesten.
Predicted: Jeg var , og jeg var , og jeg var , og jeg var , og jeg var .


Processing epoch 07: 100%|██████████| 197/197 [00:27<00:00,  7.17it/s, Loss=5.194]


--------------------------------------------------------------------------------
Source: "It chanced that some little time later Hugo left his guests to carry food and drink--with other worse things, perchance--to his captive, and so found the cage empty and the bird escaped.
Target: Da Hugo noe senere forlot sine gjester for å bringe mat og drikke og kanskje endog det som verre var — til sin fange, hendte det seg at han fant buret tomt og fuglen fløyet.
Predicted: “ Det var en en en en en , og han var , og han var , og han var , og han var , og han var .
--------------------------------------------------------------------------------
Source: A lucky long shot of my revolver might have crippled him, but I had brought it only to defend myself if attacked, and not to shoot an unarmed man who was running away.
Target: Et heldig skudd av min revolver kunne ha stanset ham, men jeg hadde kun medbrakt den for å forsvare meg selv hvis jeg ble angrepet, og ikke for å skyte en flyktende, ubevæpn

Processing epoch 08: 100%|██████████| 197/197 [00:27<00:00,  7.13it/s, Loss=4.581]


--------------------------------------------------------------------------------
Source: Sherlock Holmes struck his hand against his knee with an impatient gesture.
Target: Sherlock Holmes strøk hånden over kneet med en utålmodig bevegelse.
Predicted: Holmes og .
--------------------------------------------------------------------------------
Source: You are not fit for further adventures to-night.
Target: De er ikke i stand til å være med på videre eventyr i natt.
Predicted: De er at De er Dem .


Processing epoch 09: 100%|██████████| 197/197 [00:27<00:00,  7.10it/s, Loss=4.536]


--------------------------------------------------------------------------------
Source: And I would have you believe, my sons, that the same Justice which punishes sin may also most graciously forgive it, and that no ban is so heavy but that by prayer and repentance it may be removed.
Target: Og jeg vil, at I, mine sønner, skal tro at den rettferdighet som straffer synden, også nådig tilgir den, og at ingen forbannelse er så tung at den ikke ved bønn og anger kan oppheves.
Predicted: Jeg har ikke ikke ikke ikke ikke ikke ikke Dem , og jeg har jeg har være Dem , og jeg har være Dem .
--------------------------------------------------------------------------------
Source: The young heir glanced round with a gloomy face.
Target: Den unge arving så seg omkring med et uttrykk av uhygge.
Predicted: og .


Processing epoch 10: 100%|██████████| 197/197 [00:27<00:00,  7.13it/s, Loss=4.840]


--------------------------------------------------------------------------------
Source: "Not for the world, my dear Watson.
Target: “Nei, ikke for alt i verden, kjære Watson.
Predicted: “ Ja , De er Dem Dem , hr .
--------------------------------------------------------------------------------
Source: And he left five years ago--the date is on the stick.
Target: Og han har forlatt sykehuset for fem år siden — årstallet står på stokken.
Predicted: Det var han , og han var .


Processing epoch 11: 100%|██████████| 197/197 [00:27<00:00,  7.12it/s, Loss=4.222]


--------------------------------------------------------------------------------
Source: We ran and ran until we were completely blown, but the space between us grew ever wider.
Target: Vi sprang og sprang, inntil vi var fullstendig utmaset, men avstanden mellom oss og ham ble stadig større og større.
Predicted: Vi var en , og vi var en i i i i .
--------------------------------------------------------------------------------
Source: He certainly seemed to be getting uncomfortably near the truth.
Target: Han begynte å nærme seg sannheten på en høyst uhyggelig måte.
Predicted: Det er ikke at han ikke ikke ikke ikke ikke ikke ikke ikke ikke ikke ikke ikke ikke .


Processing epoch 12: 100%|██████████| 197/197 [00:27<00:00,  7.17it/s, Loss=4.760]


--------------------------------------------------------------------------------
Source: "I suppose it is pretty thick, now that you mention it."
Target: “Ja, den er visst ganske tykk nå, når De nevner det.”
Predicted: “ Nei , jeg har ikke ikke ikke det .”
--------------------------------------------------------------------------------
Source: He retained it in his hand after using it to set the hound upon the track.
Target: Han holdt den i hånden etter å ha brukt den til å sette hunden på sporet.
Predicted: Vi kunne ikke ikke ikke ikke ikke ikke ikke ikke ikke se seg .


Processing epoch 13: 100%|██████████| 197/197 [00:27<00:00,  7.17it/s, Loss=4.526]


--------------------------------------------------------------------------------
Source: The use of artificial means to make the creature diabolical was a flash of genius upon his part.
Target: Bruken av kunstige midler til å gi dyret et djevelsk utseende var et genialt påfunn av ham.
Predicted: Vi er en , og det er en , og det er en .
--------------------------------------------------------------------------------
Source: It was, at least, absolutely effective.
Target: Hans innflytelse virket i alle fall.
Predicted: Det var en , og det var en .


Processing epoch 14: 100%|██████████| 197/197 [00:27<00:00,  7.13it/s, Loss=4.214]


--------------------------------------------------------------------------------
Source: I thought that you were in Baker Street working out that case of blackmailing."
Target: Jeg trodde at De var i Baker Street og var fullt opptatt av injuriesaken.”
Predicted: Jeg har hørt det i å meg i det .”
--------------------------------------------------------------------------------
Source: I act entirely from a sense of public duty.
Target: Jeg handler utelukkende av følelsen av min plikt mot samfunnet.
Predicted: Jeg har hørt en i .


Processing epoch 15: 100%|██████████| 197/197 [00:27<00:00,  7.15it/s, Loss=4.553]


--------------------------------------------------------------------------------
Source: For two hours the strange business in which we had been involved appeared to be forgotten, and he was entirely absorbed in the pictures of the modern Belgian masters.
Target: For et par timer var den merkelige saken vi var kommet opp i fullstendig glemt; han gikk helt opp i betraktningen av de moderne belgiske mesteres bilder.
Predicted: Han hadde en av , og det var av å være , og .
--------------------------------------------------------------------------------
Source: "My word, it does not seem a very cheerful place," said the detective with a shiver, glancing round him at the gloomy slopes of the hill and at the huge lake of fog which lay over the Grimpen Mire. "I see the lights of a house ahead of us."
Target: “Det er sannelig ikke noe hyggelig ventested,” sa detektiven og fór gysende sammen ved synet av de uhyggelige, stupbratte skrentene og det svære tåkehavet som lå over Grimpenmyren.
Predic

Processing epoch 16: 100%|██████████| 197/197 [00:27<00:00,  7.23it/s, Loss=4.356]


--------------------------------------------------------------------------------
Source: My wife and I will be happy, Sir Henry, to stay with you until you have made your fresh arrangements, but you will understand that under the new conditions this house will require a considerable staff."
Target: Jeg og min hustru vil med glede bli her, til De er kommet i orden, sir Henry, men De forstår at det vil trenges et større tjenerhold til et hus som dette under de nye vilkår.”
Predicted: Hvis De har vært , og jeg er det , og jeg er det , og jeg er det i i å gjøre det .”
--------------------------------------------------------------------------------
Source: "I hope that you will come also.
Target: “Godt.
Predicted: “ Jeg har fått redd at De har .


Processing epoch 17: 100%|██████████| 197/197 [00:27<00:00,  7.21it/s, Loss=4.305]


--------------------------------------------------------------------------------
Source: "You may be right."
Target: “De har muligens rett.”
Predicted: “ De kan være noe til å være mere ?”
--------------------------------------------------------------------------------
Source: She always comes to us when she is in town."
Target: Hun bor bestandig her når hun er i byen.”
Predicted: Men jeg ville ikke få et av å få et .”


Processing epoch 18: 100%|██████████| 197/197 [00:27<00:00,  7.13it/s, Loss=4.112]


--------------------------------------------------------------------------------
Source: "I can't give you the name, sir, but I can give you the initials.
Target: “Jeg kan ikke oppgi navnet til Dem, sir Henry, men jeg kan si Dem forbokstavene.
Predicted: “ Jeg har ikke ikke ikke vært for å fortelle Dem .
--------------------------------------------------------------------------------
Source: "And why were you holding a candle to the window?"
Target: “Og hvorfor gjorde De det?”
Predicted: “ Hvordan gikk De det ?”


Processing epoch 19: 100%|██████████| 197/197 [00:27<00:00,  7.15it/s, Loss=4.172]


--------------------------------------------------------------------------------
Source: "What sort of night was it?'
Target: “Hva slags aften var det?”
Predicted: “ Er det ?”
--------------------------------------------------------------------------------
Source: The rattle of our wheels died away as we drove through drifts of rotting vegetation--sad gifts, as it seemed to me, for Nature to throw before the carriage of the returning heir of the Baskervilles.
Target: Lyden av vognhjulene døde hen mens vi kjørte gjennom dynger av råtnende blader. Det forekom meg som naturen bød Baskervillernes gjenkomne arving en trist velkomsthilsen.
Predicted: Det var en av å være en av en , og vi var kommet av moen , og seg seg seg seg i moen .


Test the trained model with custom input

In [24]:
def translate_sentence(
    sentence: str,
    model,
    tokenizer_src,
    tokenizer_tgt,
    config,
    max_len=50,
    device="cuda" if torch.cuda.is_available() else "cpu"
):
    """Greedily translate a single sentence with a trained model.

    Args:
        sentence: Raw source-language text.
        model: Object exposing ``eval``, ``encode``, ``decode`` and
            ``project``; it is put into eval mode.
        tokenizer_src: Tokenizer used to turn ``sentence`` into ids.
        tokenizer_tgt: Tokenizer used for the special-token ids and to turn
            the generated ids back into text.
        config: Not used by this function; kept so existing callers keep
            working.
        max_len: Maximum number of tokens to generate.
        device: Device the input tensors are created on.

    Returns:
        The decoded translation without the leading [SOS] and without
        anything from the first [EOS] onward.
    """
    model.eval()

    # Look the special-token ids up once instead of on every decoding step.
    # NOTE(review): the source sequence is wrapped with ids from the *target*
    # tokenizer, as in the original code -- presumably both tokenizers assign
    # the same ids to the special tokens; confirm against the dataset class.
    sos_id = tokenizer_tgt.token_to_id("[SOS]")
    eos_id = tokenizer_tgt.token_to_id("[EOS]")
    pad_id = tokenizer_tgt.token_to_id("[PAD]")

    # Encode source sentence
    source_ids = [sos_id] + tokenizer_src.encode(sentence).ids + [eos_id]
    encoder_input = torch.tensor(source_ids, dtype=torch.int64).unsqueeze(0).to(device)  # (1, seq_len)
    encoder_mask = (encoder_input != pad_id).unsqueeze(0).unsqueeze(0).to(device)

    # Prepare decoder input (just SOS at first)
    decoder_input = torch.tensor([[sos_id]], dtype=torch.int64).to(device)

    with torch.no_grad():
        encoder_output = model.encode(encoder_input, encoder_mask)

        for _ in range(max_len):
            size = decoder_input.size(1)
            padding_mask = (decoder_input != pad_id).unsqueeze(0).unsqueeze(0)
            # True above the diagonal marks future positions that must stay hidden.
            causal_mask = torch.triu(torch.ones((1, size, size), device=device), diagonal=1).bool()
            decoder_mask = padding_mask & ~causal_mask

            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
            logits = model.project(decoder_output)  # (1, seq_len, vocab_size)
            next_token = torch.argmax(logits[:, -1, :], dim=-1)  # greedy pick for the last position

            decoder_input = torch.cat([decoder_input, next_token.unsqueeze(0)], dim=1)

            if next_token.item() == eos_id:
                break

    # Index the batch dimension instead of calling squeeze(): squeeze() turns a
    # (1, 1) tensor into a scalar, whose tolist() is an int that cannot be sliced.
    output_tokens = decoder_input[0].tolist()[1:]  # drop SOS
    if eos_id in output_tokens:
        output_tokens = output_tokens[:output_tokens.index(eos_id)]

    return tokenizer_tgt.decode(output_tokens)


In [25]:

config = get_config()
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizers (Tokenizer comes from the notebook's import cell)
tokenizer_src = Tokenizer.from_file(config['tokenizer_file'].format(config['lang_src']))
tokenizer_tgt = Tokenizer.from_file(config['tokenizer_file'].format(config['lang_tgt']))

# Build model
model = build_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size())
model.to(device)

# Load trained weights from the final epoch. The epoch tag is derived from the
# config so it follows `num_epochs`, using the same zero-padded format that
# train_model uses when saving checkpoints.
last_epoch = f"{config['num_epochs'] - 1:02d}"
model_filename = get_weights_file_path(config, last_epoch)
print(f"Loading model from {model_filename}")
state = torch.load(model_filename, map_location=device)
model.load_state_dict(state['model_state_dict'])
model.eval()


sentence = "The weather is nice today."
# Pass the device explicitly so the inputs land on the same device as the model.
translation = translate_sentence(sentence, model, tokenizer_src, tokenizer_tgt, config, device=device)
print("→", translation)


Loading model from weights\transformer1_19.pt
→ Det er en av .
