In [3]:
"""
-----------------------------------------------------------------------------
Transformer using pytorch and numpy
-----------------------------------------------------------------------------
AUTHOR: Soumitra Samanta (soumitra.samanta@gm.rkmvu.ac.in)
-----------------------------------------------------------------------------
Packages required:
Numpy: https://numpy.org/
Matplotlib: https://matplotlib.org
PyTorch: https://pytorch.org/
torchsummary: https://github.com/sksq96/pytorch-summary
-----------------------------------------------------------------------------
"""

import numpy as np
import torch
from torch import Tensor
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary

from typing import Tuple


In [3]:
# Source corpus and the destination of its truncated copy.
input_file = "/home/shubham/Pictures/trans_eng_to_ben/bn.txt"
output_file = "/home/shubham/Pictures/trans_eng_to_ben/new_dataset_1000_lines_bn.txt"

# Number of leading lines copied into the truncated dataset.
max_lines = 1000

# Read and write explicitly as UTF-8: without `encoding=` Python falls back to
# the platform's locale encoding, which can fail or corrupt non-ASCII text.
with open(input_file, "r", encoding="utf-8") as infile, \
        open(output_file, "w", encoding="utf-8") as outfile:
    for line_number, line in enumerate(infile):
        if line_number >= max_lines:
            break  # quota reached; the rest of the corpus is not read
        outfile.write(line)

print(f"New dataset with 1000 lines created at: {output_file}")


New dataset with 1000 lines created at: /home/shubham/Pictures/trans_eng_to_ben/new_dataset_1000_lines_bn.txt


In [4]:
class self_attention_layer(nn.Module):
    """
    Single-head scaled dot-product self-attention layer.

    Queries, keys and values are three learned linear projections of the same
    input tensor.
    """

    def __init__(
        self,
        dims_embd: int,
    )->None:
        """
        Self attention class initialization

        Input:
            - dims_embd (int): Embedding dimension
        """

        super().__init__()
        self.dims_embd_ = dims_embd
        self.W_q_ = nn.Linear(dims_embd, dims_embd)
        self.W_k_ = nn.Linear(dims_embd, dims_embd)
        self.W_v_ = nn.Linear(dims_embd, dims_embd)

    def forward(
        self,
        x: Tensor
    )->Tensor:
        """
        Forward pass for the self attention layer

        Input:
            - x (torch tensor): Input data; its last dimension must be
                                dims_embd and it needs at least two dimensions
                                (the last two are treated as position x embedding)

        Output:
            - y (torch tensor): Attention output, same shape as x
        """

        queries = self.W_q_(x)
        keys = self.W_k_(x)
        values = self.W_v_(x)

        # Divide the logits by sqrt(dims_embd) before the softmax
        # (scaled dot-product attention).
        scores = queries @ keys.transpose(-2, -1) / (self.dims_embd_ ** 0.5)
        # Softmax over the key axis: every query position gets weights summing to 1.
        weights = torch.softmax(scores, dim=-1)
        y = weights @ values

        return y

class transformer_block_encoder(nn.Module):
    """
    Transformer single encoder block: self-attention and a position-wise
    feed-forward network, each wrapped in a residual connection followed by
    layer normalisation and dropout.
    """

    def __init__(
        self,
        dims_embd: int,
        num_hidden_nodes_ffnn: int = 2048,
        dropout_prob: float = 0.0
    )->None:
        """
        Transformer single block class initialization

        Input:
            - dims_embd (int):             Embedding dimension
            - num_hidden_nodes_ffnn (int): Number of neurons in the feed-forward layer
            - dropout_prob (float):        Dropout probability applied after each sub-layer
        """

        super().__init__()

        self.attention_ = self_attention_layer(dims_embd)

        self.layer_norm1_ = nn.LayerNorm(dims_embd)
        self.layer_norm2_ = nn.LayerNorm(dims_embd)

        self.ffnn_ = nn.Sequential(
            nn.Linear(dims_embd, num_hidden_nodes_ffnn),
            nn.ReLU(),
            nn.Linear(num_hidden_nodes_ffnn, dims_embd)
        )
        self.droput_ops_ = nn.Dropout(dropout_prob)

        self.dims_embd_ = dims_embd
        self.num_hidden_nodes_ffnn_ = num_hidden_nodes_ffnn
        self.dropout_prob_ = dropout_prob

    def forward(
        self,
        x: Tensor,
    )->Tensor:
        """
        Forward pass for the transformer block

        Input:
            - x (torch tensor): Input data; last dimension must be dims_embd

        Output:
            - x (torch tensor): Block output, same shape as the input
        """

        # Sub-layer 1: self-attention with residual connection, then norm + dropout.
        x = self.layer_norm1_(x + self.attention_(x))
        x = self.droput_ops_(x)
        # Sub-layer 2: feed-forward network with residual connection, then norm + dropout.
        x = self.layer_norm2_(x + self.ffnn_(x))
        x = self.droput_ops_(x)

        return x
        
class transformer_encoder(nn.Module):
    """
    Transformer encoder module: a stack of encoder blocks applied one after
    the other.
    """

    def __init__(
        self,
        dims_embd: int,
        num_hidden_nodes_ffnn: int = 2048,
        dropout_prob: float = 0.0,
        num_layers_encoder: int = 2
    )->None:
        """
        Transformer encoder class initialization

        Input:
            - dims_embd (int):             Embedding dimension
            - num_hidden_nodes_ffnn (int): Number of neurons in the feed-forward layer
            - dropout_prob (float):        Dropout probability passed to every block
            - num_layers_encoder (int):    Number of encoder blocks
        """
        super().__init__()

        self.trs_endr_blocks_ = nn.ModuleList(
            [
                transformer_block_encoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob) for _ in range(num_layers_encoder)
            ]
        )

        self.num_layers_encoder_ = num_layers_encoder

    def forward(
        self,
        x: Tensor,
    )->Tensor:
        """
        Forward pass for the transformer encoder

        Input:
            - x (torch tensor): Input data

        Output:
            - x (torch tensor): Output of the last encoder block
        """

        # Feed each block the output of the previous one.
        for block in self.trs_endr_blocks_:
            x = block(x)

        return x
    
    
class cross_attention_layer(nn.Module):
    """
    Single-head scaled dot-product cross-attention layer.

    Queries are projected from the decoder input, keys and values from the
    encoder input.
    """

    def __init__(
        self,
        dims_embd: int,
    )->None:
        """
        Cross attention class initialization

        Input:
            - dims_embd (int): Embedding dimension
        """

        super().__init__()

        self.W_q_ = nn.Linear(dims_embd, dims_embd)
        self.W_k_ = nn.Linear(dims_embd, dims_embd)
        self.W_v_ = nn.Linear(dims_embd, dims_embd)

        self.dims_embd_ = dims_embd

    def forward(
        self,
        x: Tensor,
        y: Tensor
    )->Tensor:
        """
        Forward pass for the cross-attention layer

        Input:
            - x (torch tensor): Input encoder data (source of keys and values)
            - y (torch tensor): Input decoder data (source of queries)

        Both inputs need dims_embd as their last dimension; their leading
        dimensions must be compatible for a batched matrix product.

        Output:
            - y (torch tensor): Attention output with one row per decoder position
        """

        queries = self.W_q_(y)
        keys = self.W_k_(x)
        values = self.W_v_(x)

        # Divide the logits by sqrt(dims_embd) before the softmax
        # (scaled dot-product attention).
        scores = queries @ keys.transpose(-2, -1) / (self.dims_embd_ ** 0.5)
        # Softmax over the encoder positions for every decoder position.
        weights = torch.softmax(scores, dim=-1)
        y = weights @ values

        return y
    

class transformer_block_decoder(nn.Module):
    """
    Transformer single decoder block.

    Three sub-layers are applied in order (self-attention over the decoder
    input, cross-attention against the encoder input, feed-forward network),
    each followed by a residual add, layer normalisation and dropout.
    """

    def __init__(
        self,
        dims_embd: int,
        num_hidden_nodes_ffnn: int = 2048,
        dropout_prob: float = 0.0
    )->None:
        """
        Transformer single decoder block class initialization

        Input:
            - dims_embd (int):             Embedding dimension
            - num_hidden_nodes_ffnn (int): Number of neurons in the feed-forward layer
            - dropout_prob (float):        Dropout probability applied after each sub-layer
        """

        super().__init__()

        # Plain hyper-parameter bookkeeping.
        self.dims_embd_ = dims_embd
        self.num_hidden_nodes_ffnn_ = num_hidden_nodes_ffnn
        self.dropout_prob_ = dropout_prob

        # Sub-modules are created in the same order as before so that seeded
        # initialisation and the printed module layout stay the same.
        self.attention_ = self_attention_layer(dims_embd)
        self.cross_attention_ = cross_attention_layer(dims_embd)

        self.layer_norm1_ = nn.LayerNorm(dims_embd)
        self.layer_norm2_ = nn.LayerNorm(dims_embd)
        self.layer_norm3_ = nn.LayerNorm(dims_embd)

        widen = nn.Linear(dims_embd, num_hidden_nodes_ffnn)
        activation = nn.ReLU()
        narrow = nn.Linear(num_hidden_nodes_ffnn, dims_embd)
        self.ffnn_ = nn.Sequential(widen, activation, narrow)

        self.droput_ops_ = nn.Dropout(dropout_prob)

    def forward(
        self,
        x: Tensor,
        y: Tensor
    )->Tensor:
        """
        Forward pass for the transformer decoder block

        Input:
            - x (torch tensor): Input encoder data
            - y (torch tensor): Input decoder data

        Output:
            - y (torch tensor): Decoder block output
        """

        # NOTE(review): the self-attention call receives only `y`, so no
        # causal mask is applied here -- confirm this is intended.
        self_attended = self.attention_(y)
        y = self.droput_ops_(self.layer_norm1_(y + self_attended))

        cross_attended = self.cross_attention_(x, y)
        y = self.droput_ops_(self.layer_norm2_(y + cross_attended))

        fed_forward = self.ffnn_(y)
        y = self.droput_ops_(self.layer_norm3_(y + fed_forward))

        return y
    
    
class transformer_decoder(nn.Module):
    """
    Transformer decoder module: a stack of decoder blocks that all attend to
    the same encoder input.
    """

    def __init__(
        self,
        dims_embd: int,
        num_hidden_nodes_ffnn: int = 2048,
        dropout_prob: float = 0.0,
        num_layers_decoder: int = 2
    )->None:
        """
        Transformer decoder class initialization

        Input:
            - dims_embd (int):             Embedding dimension
            - num_hidden_nodes_ffnn (int): Number of neurons in the feed-forward layer
            - dropout_prob (float):        Dropout probability passed to every block
            - num_layers_decoder (int):    Number of decoder blocks
        """
        super().__init__()

        self.trs_dcdr_blocks_ = nn.ModuleList(
            [
                transformer_block_decoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob) for _ in range(num_layers_decoder)
            ]
        )

        self.num_layers_decoder_ = num_layers_decoder

    def forward(
        self,
        x: Tensor,
        y: Tensor
    )->Tensor:
        """
        Forward pass for the transformer decoder

        Input:
            - x (torch tensor): Input encoder data (passed unchanged to every block)
            - y (torch tensor): Input decoder data

        Output:
            - y (torch tensor): Output of the last decoder block
        """

        # The decoder stream `y` is what flows from block to block; the encoder
        # data `x` must stay untouched so every block attends to the same
        # memory. (Previously the result was written into `x`, so each block
        # saw the original `y` and the encoder data was overwritten.)
        for block in self.trs_dcdr_blocks_:
            y = block(x, y)

        return y
        
       
    
# Hyper-parameters for the shape smoke tests below.
dims_embd = 10
num_data_points = 100
batch_size = 5
num_hidden_nodes_ffnn = 1024
dropout_prob = 0.2
num_layers_encoder = 2
# The decoder gets its own depth setting instead of borrowing the encoder's.
num_layers_decoder = 2

# Random stand-ins for encoder-side (x) and decoder-side (y) embeddings.
x = torch.rand(batch_size, num_data_points, dims_embd)
y = torch.rand(batch_size, num_data_points, dims_embd)

# Test Self-attention layer and its input output size
print('='*70)
model_self_attention_layer = self_attention_layer(dims_embd)
print('Self-attention layer models is: \n{}' .format(model_self_attention_layer))
print('-'*70)

y_bar = model_self_attention_layer(x)
print('Self-attention layer input size: {}' .format(x.shape))
print('Self-attention layer output size: {}' .format(y_bar.shape))
print('-'*70)

# Test Transformer encoder block input output size
print('='*70)
model_transformer_block_encoder = transformer_block_encoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob)
print('Transformer block models is: \n{}' .format(model_transformer_block_encoder))
print('-'*70)
print('Transformer block models summary:')
print('-'*70)
summary(model_transformer_block_encoder, (num_data_points, dims_embd, ), device=str("cpu"))
print('-'*70)

y_bar = model_transformer_block_encoder(x)
print('Transformer block input size: {}' .format(x.shape))
print('Transformer block output size: {}' .format(y_bar.shape))
print('-'*70)

# Test Transformer encoder input output size
print('='*70)
model_transformer_encoder = transformer_encoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob, num_layers_encoder)
print('Transformer encoder models is: \n{}' .format(model_transformer_encoder))
print('-'*70)
print('Transformer encoder models summary:')
print('-'*70)
summary(model_transformer_encoder, (num_data_points, dims_embd, ), device=str("cpu"))
print('-'*70)

y_bar = model_transformer_encoder(x)
print('Transformer encoder input size: {}' .format(x.shape))
print('Transformer encoder output size: {}' .format(y_bar.shape))
print('-'*70)

# Test Cross-attention layer and its input output size
print('='*70)
model_cross_attention_layer = cross_attention_layer(dims_embd)
print('Cross-attention layer models is: \n{}' .format(model_cross_attention_layer))
print('-'*70)

y_bar = model_cross_attention_layer(x, y)
print('Cross-attention layer input size: {}' .format(x.shape))
print('Cross-attention layer output size: {}' .format(y_bar.shape))
print('-'*70)

# Test Transformer decoder block input output size
print('='*70)
model_transformer_block_decoder = transformer_block_decoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob)
print('Transformer decoder block models is: \n{}' .format(model_transformer_block_decoder))
print('-'*70)
print('Transformer decoder block models summary:')
print('-'*70)
summary(model_transformer_block_decoder, [(num_data_points, dims_embd, ), (num_data_points, dims_embd, )], device=str("cpu"))
print('-'*70)

y_bar = model_transformer_block_decoder(x, y)
# Labels name the decoder block explicitly so they cannot be confused with
# the encoder-block report printed earlier.
print('Transformer decoder block input size: {}' .format(x.shape))
print('Transformer decoder block output size: {}' .format(y_bar.shape))
print('-'*70)

# Test Transformer decoder input output size
print('='*70)
model_transformer_decoder = transformer_decoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob, num_layers_decoder)
print('Transformer decoder models is: \n{}' .format(model_transformer_decoder))
print('-'*70)
print('Transformer decoder models summary:')
print('-'*70)
summary(model_transformer_decoder, [(num_data_points, dims_embd, ), (num_data_points, dims_embd, )], device=str("cpu"))
print('-'*70)

y_bar = model_transformer_decoder(x, y)
print('Transformer decoder input size: {}' .format(x.shape))
print('Transformer decoder output size: {}' .format(y_bar.shape))
print('-'*70)
        
        
        
        

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import re
import math
from torch.utils.data import Dataset, DataLoader

# Corpus locations: English text first, Bengali text second.
# Both are machine-specific absolute paths -- replace with the actual file paths.
ENGLISH_FILE_PATH, BENGALI_FILE_PATH = (
    '/home/shubham/Pictures/trans_eng_to_ben/en.txt',
    '/home/shubham/Pictures/trans_eng_to_ben/bn.txt',
)

# Loading data
def load_data(english_file, bengali_file):
    """
    Read the two corpus files and return their lines with surrounding
    whitespace removed.

    Input:
        - english_file: Path of the English text file (read as UTF-8)
        - bengali_file: Path of the Bengali text file (read as UTF-8)

    Output:
        - (english_sentences, bengali_sentences): two lists of stripped lines,
          one entry per line of the corresponding file. Blank lines are kept
          as empty strings; the two lists are not checked for equal length.
    """

    def _stripped_lines(path):
        # Iterating the handle yields the same lines as readlines().
        with open(path, 'r', encoding='utf-8') as handle:
            return [raw_line.strip() for raw_line in handle]

    english_sentences = _stripped_lines(english_file)
    bengali_sentences = _stripped_lines(bengali_file)
    return english_sentences, bengali_sentences

# Tokenization and encoding
def tokenize_and_pad(sentences, tokenizer, max_len, pad_id=0):
    """
    Encode every sentence and force the result to exactly `max_len` ids.

    Input:
        - sentences: Iterable of sentences to encode
        - tokenizer: Object exposing `encode(sentence)` that returns a sequence of ids
        - max_len (int): Length of every returned row
        - pad_id (int): Id appended to short rows (default 0, as before)

    Output:
        - list of lists: one row of `max_len` ids per sentence. Rows shorter
          than `max_len` are right-padded with `pad_id`; longer rows are
          truncated. Previously long rows were returned unchanged, producing
          ragged rows that cannot be stacked into a batch.
    """
    padded_rows = []
    for sentence in sentences:
        # list(...) so tokenizers returning tuples or other sequences also work.
        token_ids = list(tokenizer.encode(sentence))[:max_len]
        padded_rows.append(token_ids + [pad_id] * (max_len - len(token_ids)))
    return padded_rows

# PyTorch Dataset and Dataloader
class TranslationDataset(Dataset):
    """
    Dataset of (English, Bengali) token-id pairs.

    Both sides are tokenised and padded once at construction time with
    `tokenize_and_pad`; items are converted to tensors on access.
    """

    def __init__(self, english, bengali, english_tokenizer, bengali_tokenizer, max_len):
        """
        Input:
            - english: English sentences
            - bengali: Bengali sentences (presumably aligned with `english`
                       index by index -- not checked here)
            - english_tokenizer / bengali_tokenizer: tokenizers passed on to `tokenize_and_pad`
            - max_len: padding length passed on to `tokenize_and_pad`
        """
        encoded_english = tokenize_and_pad(english, english_tokenizer, max_len)
        encoded_bengali = tokenize_and_pad(bengali, bengali_tokenizer, max_len)
        self.english = encoded_english
        self.bengali = encoded_bengali

    def __len__(self):
        """Number of samples, taken from the English side."""
        sample_count = len(self.english)
        return sample_count

    def __getitem__(self, idx):
        """Return the `idx`-th pair as two tensors (English ids, Bengali ids)."""
        english_ids = self.english[idx]
        bengali_ids = self.bengali[idx]
        return torch.tensor(english_ids), torch.tensor(bengali_ids)

In [None]:
import torch
import torch.nn as nn
from torch import Tensor
from torchsummary import summary
import math

class self_attention_layer(nn.Module):
    """
    Single-head scaled dot-product self-attention.

    Queries, keys and values are three learned linear projections of the same
    input tensor.
    """

    def __init__(
        self,
        dims_embd: int,
    )->None:
        """
        Build the query, key and value projections.

        Input:
            - dims_embd (int): Embedding dimension (input and output width of
                               every projection)
        """

        super().__init__()
        self.dims_embd_ = dims_embd

        def square_projection() -> nn.Linear:
            return nn.Linear(dims_embd, dims_embd)

        # Created in query, key, value order so seeded initialisation is unchanged.
        self.W_q_ = square_projection()
        self.W_k_ = square_projection()
        self.W_v_ = square_projection()

    def forward(
        self,
        x: Tensor
    )->Tensor:
        """
        Apply self-attention to the input.

        Input:
            - x (torch tensor): Input data; last dimension must be dims_embd

        Output:
            - y (torch tensor): Attention output, same shape as x
        """

        queries, keys, values = self.W_q_(x), self.W_k_(x), self.W_v_(x)

        # Query-key similarity, divided by sqrt(dims_embd).
        logits = torch.matmul(queries, keys.transpose(-1, -2)) / math.sqrt(self.dims_embd_)
        # Normalise over the key axis.
        weights = logits.softmax(dim=-1)

        return torch.matmul(weights, values)
    
class transformer_block_encoder(nn.Module):
    """
    Transformer single encoder block: self-attention followed by a
    position-wise feed-forward network, each with a residual add, layer
    normalisation and dropout.
    """

    def __init__(
        self,
        dims_embd: int,
        num_hidden_nodes_ffnn: int = 2048,
        dropout_prob: float = 0.0
    )->None:
        """
        Build the sub-layers of one encoder block.

        Input:
            - dims_embd (int):             Embedding dimension
            - num_hidden_nodes_ffnn (int): Number of neurons in the feed-forward layer
            - dropout_prob (float):        Dropout probability applied after each sub-layer
        """

        super().__init__()

        self.attention_ = self_attention_layer(dims_embd)
        self.layer_norm1_ = nn.LayerNorm(dims_embd)
        self.layer_norm2_ = nn.LayerNorm(dims_embd)

        # Feed-forward network: widen, ReLU, project back to dims_embd.
        widen = nn.Linear(dims_embd, num_hidden_nodes_ffnn)
        activation = nn.ReLU()
        narrow = nn.Linear(num_hidden_nodes_ffnn, dims_embd)
        self.ffnn_ = nn.Sequential(widen, activation, narrow)

        self.dropout_ = nn.Dropout(dropout_prob)

    def forward(
        self,
        x: Tensor,
    )->Tensor:
        """
        Run the encoder block.

        Input:
            - x (torch tensor): Input data

        Output:
            - x (torch tensor): Output of transformer block encoder
        """

        # Sub-layer 1: attention + residual, then norm and dropout.
        attended = self.attention_(x)
        hidden = self.dropout_(self.layer_norm1_(x + attended))

        # Sub-layer 2: feed-forward + residual, then norm and dropout.
        expanded = self.ffnn_(hidden)
        return self.dropout_(self.layer_norm2_(hidden + expanded))
        
class transformer_encoder(nn.Module):
    """
    Transformer encoder module: encoder blocks chained one after another.
    """

    def __init__(
        self,
        dims_embd: int,
        num_hidden_nodes_ffnn: int = 2048,
        dropout_prob: float = 0.0,
        num_layers_encoder: int = 2
    )->None:
        """
        Build the stack of encoder blocks.

        Input:
            - dims_embd (int):             Embedding dimension
            - num_hidden_nodes_ffnn (int): Number of neurons in the feed-forward layer
            - dropout_prob (float):        Dropout probability passed to every block
            - num_layers_encoder (int):    Number of encoder blocks
        """
        super().__init__()

        stack = []
        for _ in range(num_layers_encoder):
            stack.append(
                transformer_block_encoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob)
            )
        self.encoder_blocks = nn.ModuleList(stack)

    def forward(
        self,
        x: Tensor,
    )->Tensor:
        """
        Pass the input through every encoder block in order.

        Input:
            - x (torch tensor): Input data

        Output:
            - x (torch tensor): Output of transformer encoder
        """

        hidden = x
        for layer in self.encoder_blocks:
            hidden = layer(hidden)

        return hidden
    
class cross_attention_layer(nn.Module):
    """
    Single-head scaled dot-product cross-attention.

    Queries come from the decoder input, keys and values from the encoder
    input.
    """

    def __init__(
        self,
        dims_embd: int,
    )->None:
        """
        Build the query, key and value projections.

        Input:
            - dims_embd (int): Embedding dimension
        """

        super().__init__()

        # Kept on the instance because forward() scales the logits with it.
        self.dims_embd_ = dims_embd

        def square_projection() -> nn.Linear:
            return nn.Linear(dims_embd, dims_embd)

        # Created in query, key, value order so seeded initialisation is unchanged.
        self.W_q_ = square_projection()
        self.W_k_ = square_projection()
        self.W_v_ = square_projection()

    def forward(
        self,
        x: Tensor,
        y: Tensor
    )->Tensor:
        """
        Let the decoder input attend to the encoder input.

        Input:
            - x (torch tensor): Input encoder data (keys and values)
            - y (torch tensor): Input decoder data (queries)

        Output:
            - y (torch tensor): Output of cross-attention
        """

        queries = self.W_q_(y)
        keys = self.W_k_(x)
        values = self.W_v_(x)

        # Query-key similarity, divided by sqrt(dims_embd).
        logits = torch.matmul(queries, keys.transpose(-1, -2)) / math.sqrt(self.dims_embd_)
        # Normalise over the encoder positions.
        weights = logits.softmax(dim=-1)

        return torch.matmul(weights, values)

    

class transformer_block_decoder(nn.Module):
    """
    Transformer single decoder block.

    Applies self-attention on the decoder input, cross-attention against the
    encoder input and a feed-forward network, each followed by a residual
    add, layer normalisation and dropout.
    """

    def __init__(
        self,
        dims_embd: int,
        num_hidden_nodes_ffnn: int = 2048,
        dropout_prob: float = 0.0
    )->None:
        """
        Build the sub-layers of one decoder block.

        Input:
            - dims_embd (int):             Embedding dimension
            - num_hidden_nodes_ffnn (int): Number of neurons in the feed-forward layer
            - dropout_prob (float):        Dropout probability applied after each sub-layer
        """

        super().__init__()

        # Sub-modules are created in the same order as before so that seeded
        # initialisation and the printed module layout stay the same.
        self.attention_ = self_attention_layer(dims_embd)
        self.cross_attention_ = cross_attention_layer(dims_embd)

        self.layer_norm1_ = nn.LayerNorm(dims_embd)
        self.layer_norm2_ = nn.LayerNorm(dims_embd)
        self.layer_norm3_ = nn.LayerNorm(dims_embd)

        widen = nn.Linear(dims_embd, num_hidden_nodes_ffnn)
        activation = nn.ReLU()
        narrow = nn.Linear(num_hidden_nodes_ffnn, dims_embd)
        self.ffnn_ = nn.Sequential(widen, activation, narrow)

        self.dropout_ = nn.Dropout(dropout_prob)

    def forward(
        self,
        x: Tensor,
        y: Tensor
    )->Tensor:
        """
        Run the decoder block.

        Input:
            - x (torch tensor): Input encoder data
            - y (torch tensor): Input decoder data

        Output:
            - y (torch tensor): Output of transformer block decoder
        """

        # NOTE(review): the self-attention call receives only `y`, so no
        # causal mask is applied here -- confirm this is intended.
        self_attended = self.attention_(y)
        y = self.dropout_(self.layer_norm1_(y + self_attended))

        cross_attended = self.cross_attention_(x, y)
        y = self.dropout_(self.layer_norm2_(y + cross_attended))

        fed_forward = self.ffnn_(y)
        y = self.dropout_(self.layer_norm3_(y + fed_forward))

        return y
    
class transformer_decoder(nn.Module):
    """
    Transformer decoder module: decoder blocks chained one after another, all
    reading the same encoder input.
    """

    def __init__(
        self,
        dims_embd: int,
        num_hidden_nodes_ffnn: int = 2048,
        dropout_prob: float = 0.0,
        num_layers_decoder: int = 2
    )->None:
        """
        Build the stack of decoder blocks.

        Input:
            - dims_embd (int):             Embedding dimension
            - num_hidden_nodes_ffnn (int): Number of neurons in the feed-forward layer
            - dropout_prob (float):        Dropout probability passed to every block
            - num_layers_decoder (int):    Number of decoder blocks
        """
        super().__init__()

        stack = []
        for _ in range(num_layers_decoder):
            stack.append(
                transformer_block_decoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob)
            )
        self.decoder_blocks = nn.ModuleList(stack)

    def forward(
        self,
        x: Tensor,
        y: Tensor
    )->Tensor:
        """
        Pass the decoder input through every block in order.

        Input:
            - x (torch tensor): Input encoder data (given unchanged to every block)
            - y (torch tensor): Input decoder data

        Output:
            - y (torch tensor): Output of transformer decoder
        """

        decoded = y
        for layer in self.decoder_blocks:
            decoded = layer(x, decoded)

        return decoded


In [None]:
# Load the parallel corpus and set up the tokenizers.
# NOTE(review): both tokenizers are still placeholders (None). The dataset
# construction below calls `tokenizer.encode(...)` through tokenize_and_pad and
# the model construction reads `bengali_tokenizer.vocab_size`, so this cell
# cannot run until real tokenizers are supplied.
english_sentences, bengali_sentences = load_data(ENGLISH_FILE_PATH, BENGALI_FILE_PATH)
english_tokenizer = None  # Implement tokenizer
bengali_tokenizer = None  # Implement tokenizer

# Dataset and Dataloader
# NOTE(review): `max_len` is not defined in any of the cells shown here --
# confirm it is set before this cell runs.
train_dataset = TranslationDataset(english_sentences, bengali_sentences, english_tokenizer, bengali_tokenizer, max_len)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model components
# NOTE(review): `embedding_dim`, `ff_dim`, `num_layers` and the `Transformer`
# wrapper class are not defined in any of the cells shown here -- confirm
# where they come from.
encoder = transformer_encoder(embedding_dim, ff_dim, dropout_prob, num_layers)
decoder = transformer_decoder(embedding_dim, ff_dim, dropout_prob, num_layers)
model = Transformer(encoder, decoder, vocab_size_tgt=bengali_tokenizer.vocab_size)

# Loss and Optimizer
# Targets equal to 0 are excluded from the loss; 0 is also the value that
# tokenize_and_pad uses for padding.
# NOTE(review): `learning_rate` is not defined in any of the cells shown here.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
# NOTE(review): `epochs` is not defined in any of the cells shown here.
model.train()
for epoch in range(epochs):
    for src, tgt in train_loader:
        optimizer.zero_grad()
        # The model is fed the target without its last token and is scored
        # against the target without its first token (shift by one position).
        output = model(src, tgt[:, :-1])
        # Flatten to (positions, classes) vs (positions,) for the criterion.
        loss = criterion(output.view(-1, output.shape[-1]), tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
    # Reports the loss of the last batch of the epoch, not an epoch average.
    print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')

# Translation function
def translate(model, sentence, max_len=50):
    """
    Greedily decode a translation of `sentence` with `model`.

    Input:
        - model: Callable module invoked as `model(src, tgt)`; its output is
          arg-maxed over the last dimension
        - sentence (str): Source sentence, encoded with the module-level
          `tokenize_and_pad` / `english_tokenizer`
        - max_len (int): Padding length passed to `tokenize_and_pad` and the
          maximum number of decoding steps

    Output:
        - Result of `bengali_tokenizer.decode` applied to the generated ids
          (including the initial start id)
    """
    model.eval()

    # nn.Module defines no `.device` attribute. Use one if the model provides
    # it, otherwise take the device of the first parameter (CPU if none).
    device = getattr(model, "device", None)
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        # Encode source sentence
        src = torch.tensor(tokenize_and_pad([sentence], english_tokenizer, max_len)).to(device)
        # Start id is `vocab_size`, created on the same device as `src`.
        # NOTE(review): confirm the model's target embedding and the
        # tokenizer's decode() both accept this id.
        tgt = torch.tensor([[bengali_tokenizer.vocab_size]], device=device)
        for _ in range(max_len):
            output = model(src, tgt)
            # Most likely token at the last generated position
            pred_token = output.argmax(dim=-1)[:, -1]
            tgt = torch.cat((tgt, pred_token.unsqueeze(0)), dim=-1)
            if pred_token.item() == 0:  # Stop on <eos>
                break
        # tgt has a single row; indexing it always yields a list, whereas
        # squeeze().tolist() collapses a one-element sequence to a bare int.
        return bengali_tokenizer.decode(tgt[0].tolist())

# Smoke test: translate one fixed English sentence and show source and result
test_sentence = "Hello, how are you?"
translated_sentence = translate(model, test_sentence)
print(f"English: {test_sentence}", f"Bengali: {translated_sentence}", sep="\n")

In [3]:
# Toy configuration for the input/output size checks below
dims_embd = 10
num_data_points = 100
batch_size = 5
num_hidden_nodes_ffnn = 1024
dropout_prob = 0.2
num_layers_encoder = 2

# Two random batches of identical shape: x is the single input of the
# self-attention/encoder modules and the first argument of the
# cross-attention/decoder modules; y is their second argument.
x = torch.rand(batch_size, num_data_points, dims_embd)
y = torch.rand(batch_size, num_data_points, dims_embd)

# Separator lines reused by every section
heavy_rule = '=' * 70
light_rule = '-' * 70

# ---- Self-attention layer: structure and input/output size ----
print(heavy_rule)
model_self_attention_layer = self_attention_layer(dims_embd)
print('Self-attention layer models is: \n{}'.format(model_self_attention_layer))
print(light_rule)

y_bar = model_self_attention_layer(x)
print('Self-attention layer input size: {}'.format(x.shape))
print('Self-attention layer output size: {}'.format(y_bar.shape))
print(light_rule)

# ---- Transformer encoder block: structure, summary, input/output size ----
print(heavy_rule)
model_transformer_block_encoder = transformer_block_encoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob)
print('Transformer block models is: \n{}'.format(model_transformer_block_encoder))
print(light_rule)
print('Transformer block models summary:')
print(light_rule)
summary(model_transformer_block_encoder, (num_data_points, dims_embd), device="cpu")
print(light_rule)

y_bar = model_transformer_block_encoder(x)
print('Transformer block input size: {}'.format(x.shape))
print('Transformer block output size: {}'.format(y_bar.shape))
print(light_rule)

# ---- Transformer encoder: structure, summary, input/output size ----
print(heavy_rule)
model_transformer_encoder = transformer_encoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob, num_layers_encoder)
print('Transformer encoder models is: \n{}'.format(model_transformer_encoder))
print(light_rule)
print('Transformer encoder models summary:')
print(light_rule)
summary(model_transformer_encoder, (num_data_points, dims_embd), device="cpu")
print(light_rule)

y_bar = model_transformer_encoder(x)
print('Transformer encoder input size: {}'.format(x.shape))
print('Transformer encoder output size: {}'.format(y_bar.shape))
print(light_rule)

# ---- Cross-attention layer: structure and input/output size ----
print(heavy_rule)
model_cross_attention_layer = cross_attention_layer(dims_embd)
print('Cross-attention layer models is: \n{}'.format(model_cross_attention_layer))
print(light_rule)

y_bar = model_cross_attention_layer(x, y)
print('Cross-attention layer input size: {}'.format(x.shape))
print('Cross-attention layer output size: {}'.format(y_bar.shape))
print(light_rule)

# ---- Transformer decoder block: structure, summary, input/output size ----
print(heavy_rule)
model_transformer_block_decoder = transformer_block_decoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob)
print('Transformer decoder block models is: \n{}'.format(model_transformer_block_decoder))
print(light_rule)
print('Transformer decoder block models summary:')
print(light_rule)
summary(model_transformer_block_decoder, [(num_data_points, dims_embd), (num_data_points, dims_embd)], device="cpu")
print(light_rule)

y_bar = model_transformer_block_decoder(x, y)
print('Transformer block input size: {}'.format(x.shape))
print('Transformer block output size: {}'.format(y_bar.shape))
print(light_rule)

# ---- Transformer decoder: structure, summary, input/output size ----
# The decoder depth reuses num_layers_encoder; this cell defines no separate
# decoder setting.
print(heavy_rule)
model_transformer_decoder = transformer_decoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob, num_layers_encoder)
print('Transformer decoder models is: \n{}'.format(model_transformer_decoder))
print(light_rule)
print('Transformer decoder models summary:')
print(light_rule)
summary(model_transformer_decoder, [(num_data_points, dims_embd), (num_data_points, dims_embd)], device="cpu")
print(light_rule)

y_bar = model_transformer_decoder(x, y)
print('Transformer decoder input size: {}'.format(x.shape))
print('Transformer decoder output size: {}'.format(y_bar.shape))
print(light_rule)
        

Self-attention layer models is: 
self_attention_layer(
  (W_q_): Linear(in_features=10, out_features=10, bias=True)
  (W_k_): Linear(in_features=10, out_features=10, bias=True)
  (W_v_): Linear(in_features=10, out_features=10, bias=True)
)
----------------------------------------------------------------------
Self-attention layer input size: torch.Size([5, 100, 10])
Self-attention layer output size: torch.Size([5, 100, 10])
----------------------------------------------------------------------
Transformer block models is: 
transformer_block_encoder(
  (attention_): self_attention_layer(
    (W_q_): Linear(in_features=10, out_features=10, bias=True)
    (W_k_): Linear(in_features=10, out_features=10, bias=True)
    (W_v_): Linear(in_features=10, out_features=10, bias=True)
  )
  (layer_norm1_): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
  (layer_norm2_): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
  (ffnn_): Sequential(
    (0): Linear(in_features=10, out_features=102

Traceback (most recent call last):
  File "/home/shubham/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_2663/1461004299.py", line 58, in <module>
    y_bar = model_cross_attention_layer(x, y)
  File "/home/shubham/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/shubham/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/ipykernel_2663/1716964706.py", line 200, in forward
    scores = Q @ K.transpose(-2, -1) / math.sqrt(self.dims_embd_)
  File "/home/shubham/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1931, in __getattr__
    raise AttributeError(
AttributeError: 'cross_attention_layer' object has no attribute 'dims_embd_'

During handling of 

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import re
import os

# Set the file paths for English and Bengali datasets.
# Each path can be overridden through an environment variable of the same
# name, so the notebook runs on another machine without editing this cell;
# the literals below remain the defaults.
ENGLISH_FILE_PATH = os.environ.get('ENGLISH_FILE_PATH', '/home/shubham/Pictures/trans_eng_to_ben/en.txt')
BENGALI_FILE_PATH = os.environ.get('BENGALI_FILE_PATH', '/home/shubham/Pictures/trans_eng_to_ben/bn.txt')

# Load and preprocess data
def load_data(english_file, bengali_file):
    """
    Read two UTF-8 text files and return their lines with surrounding
    whitespace removed.

    Input:
        - english_file: Path of the English text file (read first)
        - bengali_file: Path of the Bengali text file (read second)

    Output:
        - (list of str, list of str): Stripped lines of `english_file` and of
          `bengali_file`, in file order; blank lines are kept as empty strings
    """
    stripped_per_file = []
    for path in (english_file, bengali_file):
        with open(path, 'r', encoding='utf-8') as handle:
            # Clean and trim whitespaces
            stripped_per_file.append([raw.strip() for raw in handle.readlines()])

    english_sentences, bengali_sentences = stripped_per_file
    return english_sentences, bengali_sentences

# Tokenizer
def tokenize(lang_sentences, target_vocab_size=2**13):
    """
    Build a subword tokenizer from a corpus of sentences.

    Input:
        - lang_sentences: Corpus handed to
          `SubwordTextEncoder.build_from_corpus`
        - target_vocab_size (int): Vocabulary size the builder aims for;
          defaults to 2**13, the value that used to be hard-coded

    Output:
        - The tokenizer returned by `build_from_corpus`
    """
    return tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        lang_sentences, target_vocab_size=target_vocab_size)

# Prepare tokenized datasets with padding
def encode(lang1, lang2, tokenizer1, tokenizer2):
    """
    Tokenize two sentence collections and pad both to one shared length.

    Input:
        - lang1, lang2: Iterables of sentences
        - tokenizer1, tokenizer2: Objects whose `encode(sentence)` returns a
          sequence of ids; tokenizer1 is applied to lang1, tokenizer2 to lang2

    Output:
        - (lang1_padded, lang2_padded): Results of Keras `pad_sequences`,
          post-padded to the longest encoded sentence found in either input

    Raises:
        - ValueError: If either collection is empty (max() of no lengths)
    """
    ids_1 = [tokenizer1.encode(text) for text in lang1]
    ids_2 = [tokenizer2.encode(text) for text in lang2]

    # One common length so both arrays end up with the same second dimension
    longest_1 = max(map(len, ids_1))
    longest_2 = max(map(len, ids_2))
    max_len = max(longest_1, longest_2)

    pad = tf.keras.preprocessing.sequence.pad_sequences
    lang1_padded = pad(ids_1, maxlen=max_len, padding='post')
    lang2_padded = pad(ids_2, maxlen=max_len, padding='post')

    return lang1_padded, lang2_padded

# Load data and initialize tokenizers
# load_data returns, for each file, its lines with surrounding whitespace stripped.
english_sentences, bengali_sentences = load_data(ENGLISH_FILE_PATH, BENGALI_FILE_PATH)
# One subword tokenizer per language, each built from that language's own sentences.
english_tokenizer = tokenize(english_sentences)
bengali_tokenizer = tokenize(bengali_sentences)

# Encode and pad the sentences
# Both arrays are post-padded to one shared length: the longest encoded
# sentence found in either language.
english_padded, bengali_padded = encode(english_sentences, bengali_sentences, english_tokenizer, bengali_tokenizer)

# Transformer Architecture
class Transformer(tf.keras.Model):
    """
    Minimal encoder/decoder model built from Keras MultiHeadAttention layers.

    Token ids are embedded, the source goes through `num_layers`
    self-attention layers, the target through `num_layers` attention layers
    that attend to the encoded source, and a final Dense layer projects to
    `vocab_size_tgt` scores (no activation, i.e. raw logits).

    NOTE(review): no positional encoding, attention mask, residual
    connection or feed-forward sub-layer is applied, and `ff_dim` is
    accepted but unused - confirm this is intended for the experiment.
    """

    def __init__(self, vocab_size_src, vocab_size_tgt, embedding_dim=256, num_heads=8, ff_dim=512, num_layers=4):
        """
        Input:
            - vocab_size_src (int): Number of source token ids
            - vocab_size_tgt (int): Number of target token ids (also the
              width of the output layer)
            - embedding_dim (int): Embedding width and per-head key size
            - num_heads (int): Attention heads per layer
            - ff_dim (int): Unused; kept for interface compatibility
            - num_layers (int): Attention layers in encoder and in decoder
        """
        super(Transformer, self).__init__()
        # MultiHeadAttention projects along the last axis of float tensors, so
        # integer token ids must be embedded before reaching it.
        self.src_embedding = tf.keras.layers.Embedding(vocab_size_src, embedding_dim)
        self.tgt_embedding = tf.keras.layers.Embedding(vocab_size_tgt, embedding_dim)
        self.encoder = [
            tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
            for _ in range(num_layers)
        ]
        self.decoder = [
            tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
            for _ in range(num_layers)
        ]
        self.dense = tf.keras.layers.Dense(vocab_size_tgt)

    def call(self, src, tgt=None, training=None):
        """
        Input:
            - src: Source token ids, or a `(src, tgt)` pair when `tgt` is
              omitted (the form in which Keras passes a multi-input element)
            - tgt: Target token ids
            - training: Keras training flag; accepted for compatibility and
              not used by the layers called here

        Output:
            - Tensor of per-position scores over the target vocabulary
        """
        if tgt is None:
            # Multi-input call: inputs arrive packed as a single structure
            src, tgt = src

        x = self.src_embedding(src)
        for layer in self.encoder:
            # query = value = x: self-attention over the source
            x = layer(x, x)

        y = self.tgt_embedding(tgt)
        for layer in self.decoder:
            # query = y, value = x: the target attends to the encoded source
            y = layer(y, x)

        return self.dense(y)

# Model initialization
embedding_dim = 256
num_heads = 8
ff_dim = 512
num_layers = 4

model = Transformer(
    vocab_size_src=english_tokenizer.vocab_size, 
    vocab_size_tgt=bengali_tokenizer.vocab_size,
    embedding_dim=embedding_dim, num_heads=num_heads, ff_dim=ff_dim, num_layers=num_layers
)

# Prepare dataset for training
BUFFER_SIZE = 20000
BATCH_SIZE = 64

# NOTE(review): every element is an (english_padded, bengali_padded) pair,
# which fit() interprets as (inputs, labels); no separate decoder input is
# supplied to the model - confirm this matches the model's call signature.
dataset = tf.data.Dataset.from_tensor_slices((english_padded, bengali_padded))
dataset = dataset.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# Compile the model
# Transformer ends in a Dense layer without activation, so it emits raw
# logits. The string loss 'sparse_categorical_crossentropy' assumes
# probabilities; from_logits=True makes the loss apply the softmax itself.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Train the model
EPOCHS = 10
model.fit(dataset, epochs=EPOCHS)

# Translate a given English sentence to Bengali
def translate(sentence):
    """
    Translate one English sentence with the module-level `model`.

    The sentence is encoded with `english_tokenizer`, post-padded to the
    width of `english_padded`, passed through `model`, and the highest-scoring
    id at every output position is decoded with `bengali_tokenizer`.

    NOTE(review): the model is invoked with the source batch only - confirm
    that the model's call signature accepts this.

    Input:
        - sentence (str): English text

    Output:
        - Result of `bengali_tokenizer.decode` on the predicted ids
    """
    # Preprocess the input: ids, then a single-row padded batch
    token_ids = english_tokenizer.encode(sentence)
    model_input = tf.keras.preprocessing.sequence.pad_sequences(
        [token_ids], maxlen=english_padded.shape[1], padding='post')

    # Predict using the model; keep the arg-max id per position of row 0
    prediction = model(model_input, training=False)
    best_ids = [np.argmax(position_scores) for position_scores in prediction[0]]

    # Decode the predicted sentence
    return bengali_tokenizer.decode(best_ids)

# Example translation: run one fixed sentence through translate() and show both sides
english_text = "Hello, how are you?"
bengali_translation = translate(english_text)
print(f"English: {english_text}", f"Bengali Translation: {bengali_translation}", sep="\n")


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Define self-attention, transformer encoder, and decoder as provided earlier
# Copy the `self_attention_layer`, `transformer_block_encoder`, `transformer_encoder`, 
# `cross_attention_layer`, `transformer_block_decoder`, `transformer_decoder` classes here

# Set dimensions
# dims_embd is shared by the encoder, the decoder and (further down) the
# embedding tables and output layer of TransformerTranslationModel.
dims_embd = 128  # You might need to adjust this based on vocabulary size and embedding size
# NOTE(review): presumably the hidden width of each block's feed-forward
# network and its dropout probability - confirm against the block classes.
num_hidden_nodes_ffnn = 512
dropout_prob = 0.1
# Depth of each stack; transformer_decoder builds one decoder block per layer.
num_layers_encoder = 2
num_layers_decoder = 2

# Model instantiation
# The stacks are created once here and later handed to TransformerTranslationModel.
encoder = transformer_encoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob, num_layers_encoder)
decoder = transformer_decoder(dims_embd, num_hidden_nodes_ffnn, dropout_prob, num_layers_decoder)

class TransformerTranslationModel(nn.Module):
    """
    Sequence-to-sequence wrapper: embeds source and target token ids, runs the
    supplied encoder and decoder, and projects the decoder result onto the
    target vocabulary.
    """

    def __init__(self, encoder, decoder, vocab_size_src, vocab_size_tgt, dims_embd):
        """
        Input:
            - encoder: Module called as `encoder(src_embedded)`
            - decoder: Module called as `decoder(encoded, tgt_embedded)`
            - vocab_size_src (int): Rows of the source embedding table
            - vocab_size_tgt (int): Rows of the target embedding table and
              width of the output layer
            - dims_embd (int): Embedding dimension
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embedding = nn.Embedding(num_embeddings=vocab_size_src, embedding_dim=dims_embd)
        self.tgt_embedding = nn.Embedding(num_embeddings=vocab_size_tgt, embedding_dim=dims_embd)
        self.fc_out = nn.Linear(in_features=dims_embd, out_features=vocab_size_tgt)

    def forward(self, src, tgt):
        """
        Input:
            - src: Source token ids
            - tgt: Target token ids

        Output:
            - Scores over the target vocabulary (output of `fc_out`)
        """
        src_vectors = self.src_embedding(src)
        tgt_vectors = self.tgt_embedding(tgt)
        memory = self.encoder(src_vectors)
        # The decoder receives the encoder result first, then the target stream
        hidden = self.decoder(memory, tgt_vectors)
        return self.fc_out(hidden)

# Dataset class to load and process text
class TranslationDataset(Dataset):
    """
    Map-style dataset of (source, target) sentence pairs converted to
    vocabulary indices.

    Sentences are split on whitespace, truncated to `max_len` words, and every
    word is looked up in the matching vocabulary; unknown words map to the
    vocabulary's '<UNK>' entry. Items are NOT padded, so their lengths vary.
    """

    def __init__(self, src_texts, tgt_texts, src_vocab, tgt_vocab, max_len=100):
        """
        Input:
            - src_texts, tgt_texts: Indexable collections of sentences; item i
              of each forms one pair
            - src_vocab, tgt_vocab (dict): word -> index, each containing '<UNK>'
            - max_len (int): Maximum number of words kept per sentence
        """
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len
    
    def __len__(self):
        """Number of pairs, taken from the source side."""
        return len(self.src_texts)
    
    def _encode(self, text, vocab):
        """Convert one sentence to a 1-D int64 tensor of vocabulary indices."""
        words = text.split()[:self.max_len]
        indices = [vocab.get(word, vocab['<UNK>']) for word in words]
        # Explicit dtype: torch.tensor([]) would otherwise be float32 for an
        # empty sentence, which an embedding lookup rejects.
        return torch.tensor(indices, dtype=torch.long)
    
    def __getitem__(self, idx):
        """
        Output:
            - (src_indices, tgt_indices): Two 1-D int64 tensors for pair `idx`
        """
        src_indices = self._encode(self.src_texts[idx], self.src_vocab)
        tgt_indices = self._encode(self.tgt_texts[idx], self.tgt_vocab)
        return src_indices, tgt_indices

# Load and preprocess the dataset
def load_dataset(file_path, num_lines=1000):
    """
    Read the first `num_lines` lines of a tab-separated UTF-8 file and return
    its first two columns as parallel lists.

    Input:
        - file_path: Path of the file to read
        - num_lines (int): Upper bound of the slice applied to the file's lines

    Output:
        - (eng_texts, ben_texts): Stripped first-column and second-column
          values, one entry per kept line

    Raises:
        - IndexError: If a kept line contains no tab character
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        kept_lines = handle.readlines()[:num_lines]

    # Split every line once, then pick the two columns
    columns = [row.split('\t') for row in kept_lines]
    eng_texts = [cols[0].strip() for cols in columns]
    ben_texts = [cols[1].strip() for cols in columns]
    return eng_texts, ben_texts

# Create a vocabulary dictionary for both languages
def build_vocab(texts, max_size=5000):
    """
    Build a word -> index vocabulary from whitespace-split texts.

    '<PAD>' is always index 0 and '<UNK>' index 1; the `max_size` most
    frequent remaining words receive the consecutive indices 2, 3, ... in
    order of decreasing frequency.

    A corpus word that is literally '<PAD>' or '<UNK>' is not counted:
    re-pointing its entry to 0/1 afterwards would leave a hole in the index
    range, so that `len(vocab)` no longer bounds the largest index.

    Input:
        - texts: Iterable of strings
        - max_size (int): Maximum number of non-special words to keep

    Output:
        - dict: word -> index, with indices exactly 0 .. len(vocab) - 1
    """
    from collections import Counter

    vocab = {'<PAD>': 0, '<UNK>': 1}
    reserved = frozenset(vocab)
    word_counts = Counter(
        word for text in texts for word in text.split() if word not in reserved
    )
    for word, _ in word_counts.most_common(max_size):
        # Next free index; keeps the index range contiguous
        vocab[word] = len(vocab)
    return vocab

# Load the English and Bengali data
def _read_sentences(file_path, num_lines=1000):
    """Return the first `num_lines` lines of a UTF-8 text file, whitespace-stripped."""
    from itertools import islice
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in islice(f, num_lines)]

# load_dataset() expects ONE tab-separated file and returns an
# (english, bengali) tuple, so calling it once per file cannot produce the
# flat sentence lists that build_vocab / train_test_split need below.
# NOTE(review): assumes en.txt and bn.txt hold one sentence per line, with
# line i of each file forming a pair (the way load_data in this notebook
# reads the same files) - confirm.
eng_texts = _read_sentences('/home/shubham/Pictures/trans_eng_to_ben/en.txt', num_lines=1000)
ben_texts = _read_sentences('/home/shubham/Pictures/trans_eng_to_ben/bn.txt', num_lines=1000)

# Build vocabularies
src_vocab = build_vocab(eng_texts)
tgt_vocab = build_vocab(ben_texts)

# Train-test split
train_eng, val_eng, train_ben, val_ben = train_test_split(eng_texts, ben_texts, test_size=0.2)

# Dataset and DataLoader
train_dataset = TranslationDataset(train_eng, train_ben, src_vocab, tgt_vocab)
val_dataset = TranslationDataset(val_eng, val_ben, src_vocab, tgt_vocab)

def _pad_batch(batch):
    """Collate variable-length (src, tgt) index tensors into two padded 2-D batches."""
    src_seqs, tgt_seqs = zip(*batch)
    # .long() guards against float tensors produced from empty sentences
    src_batch = nn.utils.rnn.pad_sequence(
        [seq.long() for seq in src_seqs], batch_first=True, padding_value=src_vocab['<PAD>'])
    tgt_batch = nn.utils.rnn.pad_sequence(
        [seq.long() for seq in tgt_seqs], batch_first=True, padding_value=tgt_vocab['<PAD>'])
    return src_batch, tgt_batch

# The dataset yields tensors of different lengths, which the default collate
# function cannot stack; pad each batch to its longest sequence instead.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=_pad_batch)
val_loader = DataLoader(val_dataset, batch_size=8, collate_fn=_pad_batch)

# Initialize the model
vocab_size_src = len(src_vocab)
vocab_size_tgt = len(tgt_vocab)
model = TransformerTranslationModel(encoder, decoder, vocab_size_src, vocab_size_tgt, dims_embd)

# Loss and optimizer
# The loss is computed over TARGET ids, so the ignored index is the target
# vocabulary's padding id.
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for src, tgt in train_loader:
        optimizer.zero_grad()
        # The model is fed the target without its last position and scored
        # against the target without its first position.
        output = model(src, tgt[:, :-1])
        output = output.reshape(-1, vocab_size_tgt)
        tgt_flat = tgt[:, 1:].reshape(-1)
        loss = criterion(output, tgt_flat)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Validation step
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for src, tgt in val_loader:
            output = model(src, tgt[:, :-1])
            output = output.reshape(-1, vocab_size_tgt)
            tgt_flat = tgt[:, 1:].reshape(-1)
            loss = criterion(output, tgt_flat)
            val_loss += loss.item()
        val_loss /= len(val_loader)
        print(f"Validation Loss: {val_loss:.4f}")
