## Installing dependencies

In [1]:
pip install torchsummary typing --break-system-packages

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
"""
-----------------------------------------------------------------------------
Transformer using pytorch and numpy
-----------------------------------------------------------------------------
AUTHOR: Soumitra Samanta (soumitra.samanta@gm.rkmvu.ac.in)
-----------------------------------------------------------------------------
Packages required:
Numpy: https://numpy.org/
PyTorch: https://pytorch.org/
torchsummary: https://github.com/sksq96/pytorch-summary
Matplotlib: https://matplotlib.org
-----------------------------------------------------------------------------
"""

import numpy as np
import torch
from torch import Tensor
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary
import math

from typing import Tuple


In [3]:
class self_attention_layer(nn.Module):
    """
    Single-head scaled dot-product self-attention.

    Queries, keys and values are three separate linear projections of the
    same input, each mapping dims_embd -> dims_embd.
    """

    def __init__(
        self,
        dims_embd: int,
    )->None:
        """
        Create the query / key / value projections.

        Input:
            - dims_embd (int): Embedding dimension
        """

        super().__init__()
        self.dims_embd_ = dims_embd

        # Projections are created in query, key, value order so that a seeded
        # initialisation assigns the same weights to the same layers.
        self.W_q_ = nn.Linear(dims_embd, dims_embd)
        self.W_k_ = nn.Linear(dims_embd, dims_embd)
        self.W_v_ = nn.Linear(dims_embd, dims_embd)

    def forward(
        self,
        x: Tensor
    )->Tensor:
        """
        Attend every position of x to every position of x.

        Input:
            - x (torch tensor): Input data; the last dimension must be
              dims_embd and the second-to-last is treated as the sequence axis

        Output:
            - (torch tensor): Attention-weighted values, same shape as x
        """

        queries = self.W_q_(x)
        keys = self.W_k_(x)
        values = self.W_v_(x)

        # Divide the raw similarities by sqrt(d) before the softmax.
        scale = math.sqrt(self.dims_embd_)
        similarity = torch.matmul(queries, keys.transpose(-2, -1)) / scale

        # Normalise over the key axis, then mix the value vectors.
        weights = F.softmax(similarity, dim=-1)
        return torch.matmul(weights, values)
    

In [4]:
class transformer_block_encoder(nn.Module):
    """
    One encoder block: self-attention and a position-wise feed-forward
    network, each followed by dropout, a residual connection and LayerNorm.
    """

    def __init__(
        self,
        dims_embd: int,
        num_hidden_nodes_ffnn: int = 2048,
        dropout_prob: float = 0.0
    )->None:
        """
        Build the sub-layers of a single encoder block.

        Input:
            - dims_embd (int):             Embedding dimension
            - num_hidden_nodes_ffnn (int): Number of neurons in the feed-forward layer
            - dropout_prob (float):        Dropout probability applied to each sub-layer output
        """

        super().__init__()

        # Hyper-parameters are kept on the instance for later inspection.
        self.dims_embd_ = dims_embd
        self.num_hidden_nodes_ffnn_ = num_hidden_nodes_ffnn
        self.dropout_prob_ = dropout_prob

        # Sub-modules, registered in the order they are printed / initialised.
        self.attention_ = self_attention_layer(dims_embd)

        self.layer_norm1_ = nn.LayerNorm(dims_embd)
        self.layer_norm2_ = nn.LayerNorm(dims_embd)

        expand = nn.Linear(dims_embd, num_hidden_nodes_ffnn)
        contract = nn.Linear(num_hidden_nodes_ffnn, dims_embd)
        self.ffnn_ = nn.Sequential(expand, nn.ReLU(), contract)

        # A single dropout module is shared by both sub-layers.
        self.droput_ops_ = nn.Dropout(dropout_prob)

    def forward(
        self,
        x: Tensor,
    )->Tensor:
        """
        Run the encoder block.

        Input:
            - x (torch tensor): Input data whose last dimension is dims_embd

        Output:
            - (torch tensor): Block output, same shape as x
        """

        # Sub-layer 1: self-attention -> dropout -> residual add -> LayerNorm
        attended = self.droput_ops_(self.attention_(x))
        hidden = self.layer_norm1_(x + attended)

        # Sub-layer 2: feed-forward -> dropout -> residual add -> LayerNorm
        transformed = self.droput_ops_(self.ffnn_(hidden))
        return self.layer_norm2_(hidden + transformed)
        

In [5]:
class transformer_encoder(nn.Module):
    """
    Stack of identical encoder blocks applied one after another.
    """

    def __init__(
        self,
        dims_embd: int,
        num_hidden_nodes_ffnn: int = 2048,
        dropout_prob: float = 0.0,
        num_layers_encoder: int = 2
    )->None:
        """
        Build the encoder stack.

        Input:
            - dims_embd (int):             Embedding dimension
            - num_hidden_nodes_ffnn (int): Number of neurons in the feed-forward layer
            - dropout_prob (float):        Dropout probability used inside every block
            - num_layers_encoder (int):    Number of encoder blocks
        """
        super().__init__()

        self.num_layers_encoder_ = num_layers_encoder

        # Every block gets its own, independently initialised parameters.
        blocks = []
        for _ in range(num_layers_encoder):
            blocks.append(
                transformer_block_encoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob)
            )
        self.trs_endr_blocks_ = nn.ModuleList(blocks)

    def forward(
        self,
        x: Tensor,
    )->Tensor:
        """
        Feed the input through every encoder block in order.

        Input:
            - x (torch tensor): Input data

        Output:
            - (torch tensor): Output of the last encoder block
        """

        hidden = x
        for layer in self.trs_endr_blocks_:
            hidden = layer(hidden)

        return hidden
    
    

In [6]:
class cross_attention_layer(nn.Module):
    """
    Single-head scaled dot-product cross-attention: queries come from the
    decoder stream, keys and values come from the encoder stream.
    """

    def __init__(
        self,
        dims_embd: int,
    )->None:
        """
        Create the query / key / value projections.

        Input:
            - dims_embd (int): Embedding dimension
        """

        super().__init__()

        # Creation order (query, key, value) is kept for reproducible init.
        self.W_q_ = nn.Linear(dims_embd, dims_embd)
        self.W_k_ = nn.Linear(dims_embd, dims_embd)
        self.W_v_ = nn.Linear(dims_embd, dims_embd)

        self.dims_embd_ = dims_embd

    def forward(
        self,
        x: Tensor,
        y: Tensor
    )->Tensor:
        """
        Let every decoder position attend over all encoder positions.

        Input:
            - x (torch tensor): Input encoder data (source of keys and values)
            - y (torch tensor): Input decoder data (source of queries)

        Output:
            - (torch tensor): One attended vector per decoder position; the
              sequence axis follows y, the last dimension is dims_embd
        """

        queries = self.W_q_(y)   # from the decoder side
        keys = self.W_k_(x)      # from the encoder side
        values = self.W_v_(x)    # from the encoder side

        scale = self.dims_embd_**0.5
        similarity = torch.matmul(queries, keys.transpose(-2, -1)) / scale

        # Softmax over the encoder positions, then mix the encoder values.
        weights = similarity.softmax(dim=-1)
        return torch.matmul(weights, values)
    

In [7]:
class transformer_block_decoder(nn.Module):
    """
    One decoder block: self-attention over the decoder stream,
    cross-attention into the encoder stream, and a feed-forward network.
    Each sub-layer is followed by dropout, a residual add and LayerNorm.

    NOTE(review): the self-attention call below passes no mask, so every
    decoder position can attend to later positions as well. Confirm whether
    a causal mask is wanted when this block is trained with teacher forcing.
    """

    def __init__(
        self,
        dims_embd: int,
        num_hidden_nodes_ffnn: int = 2048,
        dropout_prob: float = 0.0
    )->None:
        """
        Build the sub-layers of a single decoder block.

        Input:
            - dims_embd (int):             Embedding dimension
            - num_hidden_nodes_ffnn (int): Number of neurons in the feed-forward layer
            - dropout_prob (float):        Dropout probability applied to each sub-layer output
        """

        super().__init__()

        # Hyper-parameters are kept on the instance for later inspection.
        self.dims_embd_ = dims_embd
        self.num_hidden_nodes_ffnn_ = num_hidden_nodes_ffnn
        self.dropout_prob_ = dropout_prob

        # Sub-modules, registered in the order they are printed / initialised.
        self.attention_ = self_attention_layer(dims_embd)
        self.cross_attention_ = cross_attention_layer(dims_embd)

        self.layer_norm1_ = nn.LayerNorm(dims_embd)
        self.layer_norm2_ = nn.LayerNorm(dims_embd)
        self.layer_norm3_ = nn.LayerNorm(dims_embd)

        expand = nn.Linear(dims_embd, num_hidden_nodes_ffnn)
        contract = nn.Linear(num_hidden_nodes_ffnn, dims_embd)
        self.ffnn_ = nn.Sequential(expand, nn.ReLU(), contract)

        # A single dropout module is shared by all three sub-layers.
        self.droput_ops_ = nn.Dropout(dropout_prob)

    def forward(
        self,
        x: Tensor,
        y: Tensor
    )->Tensor:
        """
        Run the decoder block.

        Input:
            - x (torch tensor): Input encoder data
            - y (torch tensor): Input decoder data

        Output:
            - (torch tensor): Updated decoder data, same shape as y
        """

        # Sub-layer 1: decoder self-attention
        state = self.layer_norm1_(y + self.droput_ops_(self.attention_(y)))

        # Sub-layer 2: decoder queries attend to the encoder data
        attended = self.cross_attention_(x, state)
        state = self.layer_norm2_(state + self.droput_ops_(attended))

        # Sub-layer 3: position-wise feed-forward network
        transformed = self.ffnn_(state)
        return self.layer_norm3_(state + self.droput_ops_(transformed))
    
    

In [8]:
class transformer_decoder(nn.Module):
    """
    Transformer decoder module: a stack of decoder blocks that all read the
    same encoder data while the decoder data is refined block by block.
    """

    def __init__(
        self,
        dims_embd: int,
        num_hidden_nodes_ffnn: int = 2048,
        dropout_prob: float = 0.0,
        num_layers_decoder: int = 2
    )->None:
        """
        Transformer decoder class initialization

        Input:
            - dims_embd (int):             Embedding dimension
            - num_hidden_nodes_ffnn (int): Number of neurons in the feed-forward layer
            - dropout_prob (float):        Dropout probability used inside every block
            - num_layers_decoder (int):    Number of decoder blocks
        """
        super().__init__()

        self.trs_dcdr_blocks_ = nn.ModuleList(
            [
                transformer_block_decoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob) for _ in range(num_layers_decoder)
            ]
        )

        self.num_layers_decoder_ = num_layers_decoder

    def forward(
        self,
        x: Tensor,
        y: Tensor
    )->Tensor:
        """
        Forward pass for the transformer decoder

        Input:
            - x (torch tensor): Input encoder data; passed unchanged to every block
            - y (torch tensor): Input decoder data; replaced by each block's output

        Output:
            - (torch tensor): Output of the last decoder block (same shape as y)
        """

        # The encoder data `x` must stay fixed across the stack; only the
        # decoder stream `y` is carried from one block to the next.
        # (Assigning the block output to `x` would hand later blocks the
        # previous block's output as "encoder data" and always restart them
        # from the raw decoder input.)
        for block in self.trs_dcdr_blocks_:
            y = block(x, y)

        return y
        
       
    

In [9]:
# Smoke test: build every module with small sizes, print its structure and
# check that the output keeps the expected (batch, sequence, embedding) shape.
dims_embd = 10
num_data_points = 100
batch_size = 5
num_hidden_nodes_ffnn = 1024
dropout_prob = 0.2
num_layers_encoder = 2
num_layers_decoder = 2  # decoder depth gets its own knob instead of borrowing the encoder's

# x plays the role of the encoder data, y the role of the decoder data.
x = torch.rand(batch_size, num_data_points, dims_embd)
y = torch.rand(batch_size, num_data_points, dims_embd)

# Test Self-attention layer and its input output size  
print('='*70)
model_self_attention_layer = self_attention_layer(dims_embd)
print('Self-attention layer models is: \n{}' .format(model_self_attention_layer))
print('-'*70)

y_bar = model_self_attention_layer(x)
print('Self-attention layer input size: {}' .format(x.shape))
print('Self-attention layer output size: {}' .format(y_bar.shape))
print('-'*70)

# Test Transformer encoder block input output size 
print('='*70)
model_transformer_block_encoder = transformer_block_encoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob)
print('Transformer block models is: \n{}' .format(model_transformer_block_encoder))
print('-'*70)
print('Transformer block models summary:')
print('-'*70)
summary(model_transformer_block_encoder, (num_data_points, dims_embd, ), device=str("cpu"))
print('-'*70)

y_bar = model_transformer_block_encoder(x)
print('Transformer block input size: {}' .format(x.shape))
print('Transformer block output size: {}' .format(y_bar.shape))  
print('-'*70)

# Test Transformer encoder input output size 
print('='*70)
model_transformer_encoder = transformer_encoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob, num_layers_encoder)
print('Transformer encoder models is: \n{}' .format(model_transformer_encoder))
print('-'*70)
print('Transformer encoder models summary:')
print('-'*70)
summary(model_transformer_encoder, (num_data_points, dims_embd, ), device=str("cpu"))
print('-'*70)

y_bar = model_transformer_encoder(x)
print('Transformer encoder input size: {}' .format(x.shape))
print('Transformer encoder output size: {}' .format(y_bar.shape))  
print('-'*70)

# Test Cross-attention layer and its input output size  
print('='*70)
model_cross_attention_layer = cross_attention_layer(dims_embd)
print('Cross-attention layer models is: \n{}' .format(model_cross_attention_layer))
print('-'*70)

y_bar = model_cross_attention_layer(x, y)
print('Cross-attention layer input size: {}' .format(x.shape))
print('Cross-attention layer output size: {}' .format(y_bar.shape))
print('-'*70)

# Test Transformer decoder block input output size 
print('='*70)
model_transformer_block_decoder = transformer_block_decoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob)
print('Transformer decoder block models is: \n{}' .format(model_transformer_block_decoder))
print('-'*70)
print('Transformer decoder block models summary:')
print('-'*70)
summary(model_transformer_block_decoder, [(num_data_points, dims_embd, ), (num_data_points, dims_embd, )], device=str("cpu"))
print('-'*70)

y_bar = model_transformer_block_decoder(x, y)
print('Transformer block input size: {}' .format(x.shape))
print('Transformer block output size: {}' .format(y_bar.shape))  
print('-'*70)

# Test Transformer decoder input output size 
print('='*70)
model_transformer_decoder = transformer_decoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob, num_layers_decoder)
print('Transformer decoder models is: \n{}' .format(model_transformer_decoder))
print('-'*70)
print('Transformer decoder models summary:')
print('-'*70)
summary(model_transformer_decoder, [(num_data_points, dims_embd, ), (num_data_points, dims_embd, )], device=str("cpu"))
print('-'*70)

y_bar = model_transformer_decoder(x, y)
print('Transformer decoder input size: {}' .format(x.shape))
print('Transformer decoder output size: {}' .format(y_bar.shape))  
print('-'*70)
        
        
        
        

Self-attention layer models is: 
self_attention_layer(
  (W_q_): Linear(in_features=10, out_features=10, bias=True)
  (W_k_): Linear(in_features=10, out_features=10, bias=True)
  (W_v_): Linear(in_features=10, out_features=10, bias=True)
)
----------------------------------------------------------------------
Self-attention layer input size: torch.Size([5, 100, 10])
Self-attention layer output size: torch.Size([5, 100, 10])
----------------------------------------------------------------------
Transformer block models is: 
transformer_block_encoder(
  (attention_): self_attention_layer(
    (W_q_): Linear(in_features=10, out_features=10, bias=True)
    (W_k_): Linear(in_features=10, out_features=10, bias=True)
    (W_v_): Linear(in_features=10, out_features=10, bias=True)
  )
  (layer_norm1_): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
  (layer_norm2_): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
  (ffnn_): Sequential(
    (0): Linear(in_features=10, out_features=102

## Training the model on the Bengali–English OpenSubtitles dataset

In [10]:
pip install datasets --break-system-packages

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install huggingface_hub --break-system-packages

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Transformer Model

In [12]:
class TransformerModel(nn.Module):
    """
    Encoder-decoder translation model: token embeddings for both sides,
    the encoder and decoder stacks with their default settings, and a
    linear projection onto the target vocabulary.

    NOTE(review): the embeddings go straight into the encoder / decoder;
    no positional information is added in this class.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, dims_embd):
        """
        Input:
            - src_vocab_size (int): Number of entries in the source vocabulary
            - tgt_vocab_size (int): Number of entries in the target vocabulary
            - dims_embd (int):      Embedding dimension shared by all sub-modules
        """
        super().__init__()

        # Token-id -> vector lookup tables, one per language.
        self.embedding_src = nn.Embedding(src_vocab_size, dims_embd)
        self.embedding_tgt = nn.Embedding(tgt_vocab_size, dims_embd)

        # Encoder / decoder stacks (default depth, hidden size and dropout).
        self.encoder = transformer_encoder(dims_embd)
        self.decoder = transformer_decoder(dims_embd)

        # Maps each decoder vector to one score per target-vocabulary entry.
        self.output_layer = nn.Linear(dims_embd, tgt_vocab_size)

    def forward(self, src, tgt):
        """
        Input:
            - src (torch tensor): Source token ids
            - tgt (torch tensor): Target token ids
              (assumes both are laid out as (seq_len, batch) — TODO confirm against the data loader)

        Output:
            - (torch tensor): Vocabulary scores; the first two axes are the
              swapped axes of the input, i.e. batch-major under the
              assumption above
        """
        # Embedding adds a trailing feature axis; swapping axes 0 and 1
        # turns (seq_len, batch, embed) into (batch, seq_len, embed).
        src_batch_major = self.embedding_src(src).transpose(0, 1)
        tgt_batch_major = self.embedding_tgt(tgt).transpose(0, 1)

        encoded = self.encoder(src_batch_major)
        decoded = self.decoder(encoded, tgt_batch_major)
        return self.output_layer(decoded)


### Step 1: Loading Bengali and English sentences

In [13]:
# Read the two sides of the parallel corpus: one sentence per line, where
# line i of the Bengali file is the translation of line i of the English file.
with open('bn-en.txt/OpenSubtitles.bn-en.bn', encoding='utf-8') as f_bn:
    bn_sentences = f_bn.read().strip().split('\n')

with open('bn-en.txt/OpenSubtitles.bn-en.en', encoding='utf-8') as f_en:
    en_sentences = f_en.read().strip().split('\n')

# Sentences are paired by position, so a length mismatch means the pairing is
# broken; fail here with a clear message instead of later with an IndexError.
if len(bn_sentences) != len(en_sentences):
    raise ValueError(
        f"Parallel corpus is misaligned: {len(bn_sentences)} Bengali vs "
        f"{len(en_sentences)} English sentences."
    )

print(f"Loaded {len(bn_sentences)} Bengali sentences and {len(en_sentences)} English sentences.")

Loaded 72022 Bengali sentences and 72022 English sentences.


### Step 2: Build Vocabularies

In [14]:
from collections import Counter

def tokenize(text):
    """Lower-case `text` and split it on runs of whitespace; returns a list of tokens."""
    lowered = text.lower()
    return lowered.split()

def build_vocab(sentences, min_freq=2):
    """
    Build a token -> integer-id mapping from an iterable of sentences.

    Ids 0-3 are reserved for "<pad>", "<sos>", "<eos>" and "<unk>". Every
    other token that occurs at least `min_freq` times over all sentences is
    given the next free id, in order of first appearance.

    Input:
        - sentences (iterable of str): Raw sentences; each is split with `tokenize`
        - min_freq (int):              Minimum corpus frequency for a token to be kept

    Output:
        - (dict): Mapping from token to a unique integer id
    """
    counter = Counter()
    for sentence in sentences:
        counter.update(tokenize(sentence))

    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
    for word, freq in counter.items():
        # Skip tokens that are already present (i.e. text that literally
        # contains a reserved token): re-assigning them would move the
        # reserved id and make the next word reuse the same id.
        if freq >= min_freq and word not in vocab:
            vocab[word] = len(vocab)

    return vocab

# Translation direction is English -> Bengali, so the English sentences feed
# the source vocabulary and the Bengali sentences feed the target vocabulary.
src_vocab = build_vocab(sentences=en_sentences)
tgt_vocab = build_vocab(sentences=bn_sentences)

### Step 3: Encode Sentences

In [15]:
def encode(sentence, vocab):
    """
    Turn a sentence into a 1-D LongTensor of token ids.

    The sentence is tokenized, wrapped in "<sos>" / "<eos>", and every token
    missing from `vocab` is replaced by the id of "<unk>".
    """
    tokens = ["<sos>", *tokenize(sentence), "<eos>"]
    unk_id = vocab["<unk>"]
    ids = [vocab.get(tok, unk_id) for tok in tokens]
    return torch.tensor(ids, dtype=torch.long)

### Step 4: Create Dataset and DataLoader

In [16]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class TranslationDataset(Dataset):
    """
    Map-style dataset over position-aligned source / target sentences.
    Sentences are stored as raw strings and encoded on access.
    """

    def __init__(self, src_sentences, tgt_sentences, src_vocab, tgt_vocab):
        """
        Input:
            - src_sentences (sequence of str): Source-language sentences
            - tgt_sentences (sequence of str): Target-language sentences, aligned by index
            - src_vocab (dict):                Token -> id mapping for the source side
            - tgt_vocab (dict):                Token -> id mapping for the target side
        """
        self.src_sentences, self.tgt_sentences = src_sentences, tgt_sentences
        self.src_vocab, self.tgt_vocab = src_vocab, tgt_vocab

    def __len__(self):
        """Number of sentence pairs, taken from the source side."""
        return len(self.src_sentences)

    def __getitem__(self, idx):
        """Return the encoded (source, target) pair stored at position `idx`."""
        source_ids = encode(self.src_sentences[idx], self.src_vocab)
        target_ids = encode(self.tgt_sentences[idx], self.tgt_vocab)
        return source_ids, target_ids

def collate_fn(batch):
    """
    Merge a list of (source, target) id tensors into two padded batches.

    Each side is padded with its own "<pad>" id (read from the module-level
    `src_vocab` / `tgt_vocab`) and laid out time-major, i.e. with shape
    (longest_sequence, batch_size).
    """
    sources, targets = zip(*batch)
    padded_src = pad_sequence(sources, batch_first=False, padding_value=src_vocab["<pad>"])
    padded_tgt = pad_sequence(targets, batch_first=False, padding_value=tgt_vocab["<pad>"])
    return padded_src, padded_tgt

# English is the source side, Bengali the target side.
dataset = TranslationDataset(
    src_sentences=en_sentences,
    tgt_sentences=bn_sentences,
    src_vocab=src_vocab,
    tgt_vocab=tgt_vocab,
)

# Batches of 32 pairs, reshuffled every epoch and padded by collate_fn.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)

### Step 5: Training Loop

In [17]:
import torch.nn.functional as F
import torch.optim as optim

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerModel(len(src_vocab), len(tgt_vocab), dims_embd=512).to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-4)

# The loss is computed over target-side labels, so the padding id to ignore
# must come from the target vocabulary (not the source one).
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab["<pad>"])

num_epochs = 1

In [None]:
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)

        # Teacher forcing: the decoder is fed positions 0..T-2 and has to
        # predict positions 1..T-1. Both tensors are time-major (T-1, batch).
        tgt_input = tgt[:-1, :]   # drop the last time step
        tgt_output = tgt[1:, :]   # drop the first time step (<sos>)

        optimizer.zero_grad()

        # The model swaps axes 0 and 1 of its inputs and does not swap them
        # back, so the logits are batch-major: (batch, T-1, vocab).
        preds = model(src, tgt_input)

        # Flatten logits and labels in the SAME order. The labels are
        # time-major, so they are transposed to batch-major first; flattening
        # them as-is would pair each prediction with the wrong target token.
        preds = preds.reshape(-1, preds.shape[-1])
        tgt_output = tgt_output.transpose(0, 1).reshape(-1)

        loss = criterion(preds, tgt_output)
        loss.backward()
        optimizer.step()

        # Running sum of the per-batch losses for this epoch.
        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")