# Overview 
In this notebook, I'll be focusing on machine translation using a Bengali to English dataset. I spent 10 days gathering the data, resulting in a clean dataset containing a total of 4 million rows. It's worth noting that this dataset is not yet public. Nonetheless, I'll attempt to train a Transformer architecture model for machine translation.

# Import Dependencies

In [1]:
import os
import sys
import math
import copy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import json

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

import torchtext
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from keras.models import Model
from keras.layers import Layer
from keras.layers import (Dense,Flatten,SimpleRNN,InputLayer,Conv1D,Bidirectional,GRU,LSTM,BatchNormalization,Dropout,Input, Embedding,TextVectorization)
from keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from keras.optimizers import Adam
from keras.layers import MultiHeadAttention, LayerNormalization
from tensorboard.plugins import projector

2024-05-08 10:34:47.965860: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-08 10:34:47.965983: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-08 10:34:48.127584: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
df = pd.read_csv('/kaggle/input/no-more-low-resources-bengali-machine-translation/final_data.csv')

In [5]:
print(df.shape)
print(df.isnull().sum())
print(df.duplicated().sum())

(3288975, 2)
bn    10
en    38
dtype: int64
0


In [3]:
df.dropna(inplace = True)

In [4]:
df = df.head(1000000)

In [49]:
print(df.isnull().sum())
print(df.duplicated().sum())

bn    0
en    0
dtype: int64
0


In [6]:
for i in range(15):
    print(df['bn'][i+1])
    print(df['en'][i+1])

তুরুপ
trump
পদদলিত করা
overrides
সেগুলো হৃদয়কে উষ্ণ করে এবং রোজকার বোঝাগুলোকে হালকা করে
they warm the heart and ease the daily load
আমি ভালোবাসি তোমাকে
i love you
পোর্ট কোম্পানি লিমিটেড কেপিসিএল
port company limited kpcl
তোলপাড়
commotions
এছাড়াও ক্লেমঁসো উইলসনের ১৪ দফার ব্যাপারে সংশয়ী এবং হতাশ ছিলেন তিনি অভিযোগ করে বলেন মিস্টার উইলসনের ১৪ দফা বিরক্তিকর
clemenceau also expressed skepticism and frustration with wilsons fourteen points mr wilson bores me with his fourteen points complained clemenceau
আলজাজিরার সঙ্গে জালুদের সাক্ষাতকার এবং তার দৃষ্টিভঙ্গীর বিষয়ে অন্যান্য টুইটার ব্যবহারকারীরা মন্তব্য করেন
other twitter users went on commenting on jallouds interview for aljazeera and his attitude
ক্লাব
club
তাই আসুন এখন আমরা একবার পরীক্ষা করে দেখি যে তাদের এই দাবি সত্যি কি না
so let us investigate on trial
ওমানের শাসক সুলতান কাবুস বিন সাইদের সমালোচনা করার অপরাধে আলরাওয়াহিকে শাস্তি প্রদানের পর গ্লোবাল ভয়েসেস এডভোকেসিতে ২০১২ সালে তাকে লক্ষ্যনীয়ভাবে উপস্থাপন করা হয়
in 2012 alrawahi was featured

In [5]:
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

num_sentences = len(df)
num_train = int(train_ratio * num_sentences)
num_val = int(val_ratio * num_sentences)
num_test = num_sentences - num_train - num_val

# Shuffle the dataset
df = df.sample(frac=1).reset_index(drop=True)

In [6]:
train_df = df[:num_train]
val_df = df[num_train:num_train+num_val]
test_df = df[num_train+num_val:]

# Tokenization

In [7]:
train_en_tokens = []
train_bn_tokens = []
val_en_tokens = []
val_bn_tokens = []
test_en_tokens = []
test_bn_tokens = []
en_vocab = {'<pad>': 0}  # Initialize English vocabulary with <pad> token
bn_vocab = {'<pad>': 0}  # Initialize Bengali vocabulary with <pad> token

def tokenize_sentence(sentence, vocab):
    tokens = sentence.split()
    token_ids = []
    for token in tokens:
        if token not in vocab:
            vocab[token] = len(vocab)
        token_ids.append(vocab[token])
    return token_ids

# Tokenizing training data
print("Tokenizing training data:")
for en_sent, bn_sent in tqdm(zip(train_df['en'], train_df['bn']), total=len(train_df)):
    train_en_tokens.append(tokenize_sentence(en_sent, en_vocab))
    train_bn_tokens.append(tokenize_sentence(bn_sent, bn_vocab))

# Tokenizing validation data
print("Tokenizing validation data:")
for en_sent, bn_sent in tqdm(zip(val_df['en'], val_df['bn']), total=len(val_df)):
    val_en_tokens.append(tokenize_sentence(en_sent, en_vocab))
    val_bn_tokens.append(tokenize_sentence(bn_sent, bn_vocab))

# Tokenizing testing data
print("Tokenizing testing data:")
for en_sent, bn_sent in tqdm(zip(test_df['en'], test_df['bn']), total=len(test_df)):
    test_en_tokens.append(tokenize_sentence(en_sent, en_vocab))
    test_bn_tokens.append(tokenize_sentence(bn_sent, bn_vocab))

# Update the vocabulary sizes
src_vocab_size = len(en_vocab)
tgt_vocab_size = len(bn_vocab)


Tokenizing training data:


  0%|          | 0/800000 [00:00<?, ?it/s]

Tokenizing validation data:


  0%|          | 0/100000 [00:00<?, ?it/s]

Tokenizing testing data:


  0%|          | 0/100000 [00:00<?, ?it/s]

# Custome Dataset

In [19]:
class TranslationDataset(data.Dataset):
    def __init__(self, en_tokens, bn_tokens):
        self.en_tokens = en_tokens
        self.bn_tokens = bn_tokens
        self.max_len = max(max(len(en), len(bn)) for en, bn in zip(en_tokens, bn_tokens))
        
    def __len__(self):
        return len(self.en_tokens)
    
    def __getitem__(self, index):
        en_data = self.en_tokens[index] + [0] * (self.max_len - len(self.en_tokens[index]))  # Padding with 0
        bn_data = self.bn_tokens[index] + [0] * (self.max_len - len(self.bn_tokens[index]))  # Padding with 0
        return torch.tensor(en_data), torch.tensor(bn_data)

In [20]:
train_dataset = TranslationDataset(train_en_tokens, train_bn_tokens)
val_dataset = TranslationDataset(val_en_tokens, val_bn_tokens)
test_dataset = TranslationDataset(test_en_tokens, test_bn_tokens)

# Create data loaders
train_loader = data.DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = data.DataLoader(val_dataset, batch_size=2)
test_loader = data.DataLoader(test_dataset, batch_size=2)


In [21]:
# Print the first batch of data from train_loader
for batch_idx, (en_data, bn_data) in enumerate(train_loader):
    print("English Data (Batch 0):", en_data)
    print("Bengali Data (Batch 0):", bn_data)
    break  # Break after printing the first batch to avoid printing the entire dataset


English Data (Batch 0): tensor([[   32,   201,    42,  ...,     0,     0,     0],
        [   11, 12390,    42,  ...,     0,     0,     0]])
Bengali Data (Batch 0): tensor([[   31,   136,   374,  ...,     0,     0,     0],
        [  138, 29632,  6925,  ...,     0,     0,     0]])


# Modeling

<hr>
<h4>Model Architecture</h4>
<hr>
<img src='https://machinelearningmastery.com/wp-content/uploads/2021/08/attention_research_1.png'>

# *Step Wise Explanation:*
* **input Embedding**: The process begins with encoding the input language(e.g English sequenc) into numerical vectors. Each word or token is transformed into a high-dimentional vector. 

* **Multi-Head Self-Attention:** This is the heart of the transformer. The model looks at each word in the input sentences and assings different lavels of inportance to other words in the sentence. Multiple attention heads allow the model to focus on different aspects of the sentence simultaneously.

* **Positional Encoding:**  Since transformers don't have inherent sence of word order, positional encoding is added to the word embeddings to help the model understand the words's position in the sentence.

* **Encoder - Decoder Architecture** : In translation task, there are typically two parts: the encoder and the decoder. The encoder takes the input sentence and process it, while the decoder generates the translated output.

* **Decoder Self-Attention** : The decoder also uses multi-head self-attention, but slightly modified to prevent if from looking ahead in the output sentence, which would result in incorrect translations.

* **Attention Output** : The outputs from the attention mechanisms are uesd to calculte attention scores, which datermine how much each word in the input sentence contributes to each word in the output sentence. Position-wise Feedforward Networks: After attention, the model passes the data through feedforward neural networks ot further process and refine the information. Output Layer: The final layer in the decoder procduces probablilities for each word in the targer language vaocabulary, allowing the model to predict the next word in the tanslationl.

* **Training And Optimization** : Transformers are trained using large parralel corpora of source and targer language sentences. They learn to minimize the difference between predicted tranlations and the actual translations in the training data.

* **Repeat For Each Token** : This process is repeated for each word in the output sentence, where the previously generated words are used as context for generating the next word. Beam Search or Greedy Decoding: During inference, the model generates translations one word at a time. Beam search or greedy decoding is often used to select the most likely next word based on the model's predictions.

<h4>Inside Attention Layer</h4>
<img src='https://production-media.paperswithcode.com/methods/35184258-10f5-4cd0-8de3-bd9bc8f88dc3.png'>

# Easy to understand Explanation:

Let's break down and realte it to the components and process in a transformer model:

   *  School and Studens: Think of the school as the entire context, and the students as the individual tokens in a sequence.

   *  Vecitorization and Tokenization : the process of converting students into tokens and vectorizing them represents in the initial preprocessing steps where text data is tokenized into individual words or tokens and then converted into numerical vector representations.

   * Vocabulary: The vocabulary of the school represents the set of unique tokens (students) that the model has learned from previous schools within the same company. Thse tokens are sued to represent words in the sequence.

   * Intrs-Attention (Self-Attention) : Each student's interaction with their classmates represents the  intra-attention mechanishm, where relationships, influences, and context between tokenss (students) are captured. Each studen becomes a query (Q), and their classmates become keys (K) and values (V). Attention scores are calculated to determine how much weight each student should give to their calssmates. Softmax normalization of attention scores can be thought of as grading each student's relationships and influence each others. Concatenation of information from different teachers (heads) captures diverse insights.

   * Linear Layer: The linear layer represents the post-attention but post-attention precessing step that helps combine and refine information before producing the final output.


This is the essence of how attention mechanishm work in t transformers, where tokens (students) atten to  each other, calculate their influence, and produce context vectors (mark sheets) for each other. These context vocetors are then in cross-attention to compare tokens from different parts of the model, ultimately leading to the model's final output

Encoder's Role(intra-attention in encoder): the encoder preocess the input sequence and performs intra-attention. it produces context vectors (contextual representation) for each word in the input sequence. Thesee context vecotrs capture information about how each word related to others within input sequence

Signaling the Decoder: The decoder is signaled to start generating the output sequence. Typically , this is fone by providing the decoder with an intial input, often a special start token (e.g ,or).

Generating the first Word: For the first word in the output sequence, the decoder combines the follwing: The start token as the initail query. The encoder's context vectors, which represent the input sequence. The decoder's own context vector for the output sequence (initialzed explicitly). These componentes are used to predicte the word in the output sequence.

Subssequent Word Preditions: For genrating subsequent words in the output sequence, the following process occurs: The shifted target (previously generated word) becomes the query. The encodr's enctext vector, representing the input sequence, are used for context. The context vectors for the target word (which includes context from the encoder) are also considered. The last word's hidden state, obtained from the decoder's self-attention (intra-attention), is incorporated. These components collectively contribute to the prediction of each subsequent word in the output sequence.

Ierative toekn Generation: The decoder repeats the process of generating tokens one by one, considering context from both the encoder's input sequence and it's own generated sequence. At each step, the decoder calculates a probability distribution over the vocabulary for the next token and selects the token with the highest probability.

Ending the sequence: The process continues until the model generates an end token or  reaches a predefined maximum sequence length

 # Transformer Architecture
 
 <img src="https://www.mihaileric.com/static/feedforward_layer_and_normalization-dfdcfbd00009f7f99eca73ae29f2dfb7-4ec3a.png">

# Positional Encoding

In [11]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [12]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# Custome Attention Layer


**Multi-Head-Attention Layer**
<hr>

In [13]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, debug_str = None):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        self.debug_str = debug_str
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if (self.debug_str == 'cross'):
            print('attn_scores:',attn_scores.shape, mask.shape)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output

# Encoder Layer

In [14]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

# Decoder

In [15]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)#, debug_str="cross")
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x



As you see the combined mask does not consider the pad tokens.
Lets define each of the mask roles:
- Padding mask - removes consideration of unnecessary pad tokens.
- Causal mask - Prevents from peeking in the future and helps decoder in predicting one token at a time.
- Cross-Attention mask - In the context of cross-attention between the encoder and decoder, a mask is used to ensure that the decoder only attends to positions in the encoder that have valid information. In this case, it can be similar to a padding mask when dealing with sequences of different lengths.
- Combined mask - takes the best of both world and  It ensures that the decoder doesn't include padding tokens in its consideration (like the padding mask) and enforces the autoregressive behavior (like the causal mask), allowing the decoder to predict one token at a time while avoiding future tokens.


# Full Transformer Model

In [16]:

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, pad_token_src = 0, pad_token_tgt = 0, device = 'cpu'):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        
        self.pad_token_src = pad_token_src
        self.pad_token_tgt = pad_token_tgt
        self.device = device
        self = self.to(self.device)

    def generate_mask(self, src_mask, tgt_mask):
        src_mask = src_mask.unsqueeze(1).unsqueeze(2)
        tgt_mask = tgt_mask.unsqueeze(1).unsqueeze(3)
        seq_length = tgt_mask.size(2)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask.to(self.device)
        return src_mask, tgt_mask

    def decode(self, src, bos_token_id, eos_token_id, mask=None, max_dec_length = 25):
        """
        for inference
        Args:
            src: input to encoder 
            trg: input to decoder
        out:
            out_labels : returns final prediction of sequence
        """

        tgt = torch.tensor([[bos_token_id]]*src.shape[0]).to(self.device)
        if mask:
            src_mask, tgt_mask = self.generate_mask(mask['src_mask'], mask['tgt_mask'])
        else:
            src_mask, tgt_mask = self.generate_mask(src!=self.pad_token_src, tgt!=self.pad_token_tgt)
        
        enc_output = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        out_labels = tgt
        unfinished_seq = np.array([1]*src.shape[0])
        i=0;
        while (sum(unfinished_seq)>0 & i<max_dec_length):
            dec_output = self.dropout(self.positional_encoding(self.decoder_embedding(out_labels)))
            for dec_layer in self.decoder_layers:
                dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
            output = self.fc(dec_output)

            out_labels = torch.cat((out_labels, output[:,-1:,:].argmax(-1)),dim=1)
            
            unfinished_seq[(out_labels[:,-1] == eos_token_id).cpu().numpy()] = 0

            i += 1;
        return out_labels
    
    def forward(self, src, tgt, mask = None):
        if mask:
            src_mask, tgt_mask = self.generate_mask(mask['src_mask'], mask['tgt_mask'])
        else:
            src_mask, tgt_mask = self.generate_mask(src!=self.pad_token_src, tgt!=self.pad_token_tgt)
                
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [17]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Hyperparameters
src_vocab_size = len(en_vocab)
tgt_vocab_size = len(bn_vocab)
# Reduce Model Size
d_model = 64  # Decrease the model dimensionality
num_heads = 2  # Decrease the number of attention heads
num_layers = 2  # Decrease the number of layers
d_ff = 512  # Decrease the size of the feed-forward layers
max_seq_length = max(train_dataset.max_len, val_dataset.max_len, test_dataset.max_len)  # Maximum sequence length
dropout = 0.1  # Dropout probability

# Instantiate the Transformer model
transformer_model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, device = device)

In [63]:
print(transformer_model)

Transformer(
  (encoder_embedding): Embedding(272575, 64)
  (decoder_embedding): Embedding(466513, 64)
  (positional_encoding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-1): 2 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (W_q): Linear(in_features=64, out_features=64, bias=True)
        (W_k): Linear(in_features=64, out_features=64, bias=True)
        (W_v): Linear(in_features=64, out_features=64, bias=True)
        (W_o): Linear(in_features=64, out_features=64, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=64, out_features=512, bias=True)
        (fc2): Linear(in_features=512, out_features=64, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-1): 2 x DecoderLayer(
      (self_a

In [None]:
from tqdm import tqdm

# Training loop
for epoch in range(num_epochs):
    transformer_model.train()  # Set the model to training mode
    total_loss = 0
    
    # Create a progress bar
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    
    # Iterate through batches
    for batch_idx, (src, tgt) in progress_bar:
        src, tgt = src.to(device), tgt.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        output = transformer_model(src, tgt[:, :-1])  # Exclude the <eos> token from input
        
        # Flatten the output and target tensors to compute loss
        output_flat = output.view(-1, output.size(-1))
        tgt_flat = tgt[:, 1:].contiguous().view(-1)  # Exclude the <bos> token from target
        
        # Calculate loss
        loss = criterion(output_flat, tgt_flat)
        
        # Backward pass
        loss.backward()
        
        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(transformer_model.parameters(), max_norm=1)
        
        # Update parameters
        optimizer.step()
        
        # Add batch loss to total loss
        total_loss += loss.item()
        
        # Update progress bar description
        progress_bar.set_postfix({"Loss": loss.item()})
    
    # Calculate average loss for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
    
    # Validation
    transformer_model.eval()  # Set the model to evaluation mode
    val_loss = 0
    
    with torch.no_grad():
        # Create a progress bar for validation
        val_progress_bar = tqdm(enumerate(val_loader), total=len(val_loader), desc="Validation", unit="batch")
        
        for batch_idx, (src, tgt) in val_progress_bar:
            src, tgt = src.to(device), tgt.to(device)
            
            # Forward pass
            output = transformer_model(src, tgt[:, :-1])  # Exclude the <eos> token from input
            
            # Flatten the output and target tensors to compute loss
            output_flat = output.view(-1, output.size(-1))
            tgt_flat = tgt[:, 1:].contiguous().view(-1)  # Exclude the <bos> token from target
            
            # Calculate loss
            loss = criterion(output_flat, tgt_flat)
            
            # Add batch loss to total loss
            val_loss += loss.item()
            
            # Update progress bar description
            val_progress_bar.set_postfix({"Validation Loss": loss.item()})
    
    # Calculate average validation lfrom tqdm import tqdm

# Training loop
for epoch in range(num_epochs):
    transformer_model.train()  # Set the model to training mode
    total_loss = 0
    
    # Create a progress bar
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    
    # Iterate through batches
    for batch_idx, (src, tgt) in progress_bar:
        src, tgt = src.to(device), tgt.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        output = transformer_model(src, tgt[:, :-1])  # Exclude the <eos> token from input
        
        # Flatten the output and target tensors to compute loss
        output_flat = output.view(-1, output.size(-1))
        tgt_flat = tgt[:, 1:].contiguous().view(-1)  # Exclude the <bos> token from target
        
        # Calculate loss
        loss = criterion(output_flat, tgt_flat)
        
        # Backward pass
        loss.backward()
        
        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(transformer_model.parameters(), max_norm=1)
        
        # Update parameters
        optimizer.step()
        
        # Add batch loss to total loss
        total_loss += loss.item()
        
        # Update progress bar description
        progress_bar.set_postfix({"Loss": loss.item()})
    
    # Calculate average loss for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
    
    # Validation
    transformer_model.eval()  # Set the model to evaluation mode
    val_loss = 0
    
    with torch.no_grad():
        # Create a progress bar for validation
        val_progress_bar = tqdm(enumerate(val_loader), total=len(val_loader), desc="Validation", unit="batch")
        
        for batch_idx, (src, tgt) in val_progress_bar:
            src, tgt = src.to(device), tgt.to(device)
            
            # Forward pass
            output = transformer_model(src, tgt[:, :-1])  # Exclude the <eos> token from input
            
            # Flatten the output and target tensors to compute loss
            output_flat = output.view(-1, output.size(-1))
            tgt_flat = tgt[:, 1:].contiguous().view(-1)  # Exclude the <bos> token from target
            
            # Calculate loss
            loss = criterion(output_flat, tgt_flat)
            
            # Add batch loss to total loss
            val_loss += loss.item()
            
            # Update progress bar description
            val_progress_bar.set_postfix({"Validation Loss": loss.item()})
    
    # Calculate average validation loss
    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")

# Save the trained model
torch.save(transformer_model.state_dict(), 'transformer_model.pth')
oss
    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")

# Save the trained model
torch.save(transformer_model.state_dict(), 'transformer_model.pth')


Epoch 1/10:   4%|▎         | 14210/400000 [42:32<19:09:54,  5.59batch/s, Loss=9.21]

In [43]:
max_seq_length

1204

# save the model

In [None]:
PATH = "./transformer_overfit.pth"
# PATH = f"./transformer_epoch_{epoch}_batch_{batch}.pth"

# Inference

In [None]:
import torch
from torchtext.data.utils import get_tokenizer

def translate_english_to_bengali(model, src_text, en_vocab, bn_vocab, device='cpu', max_length=50):
    # Tokenize the input English text
    tokenizer = get_tokenizer("basic_english")
    src_tokens = tokenizer(src_text)
    
    # Convert tokens to indices using the English vocabulary
    src_indices = [en_vocab[token] for token in src_tokens]
    
    # Convert indices to tensor and add batch dimension
    src_tensor = torch.tensor(src_indices, dtype=torch.long, device=device).unsqueeze(0)
    
    # Generate mask for the source input
    src_mask = (src_tensor != model.pad_token_src).to(device)
    
    # Translate the English text to Bengali
    with torch.no_grad():
        # Generate the translation
        translation_tensor = model.decode(src_tensor, en_vocab['<bos>'], en_vocab['<eos>'], mask={'src_mask': src_mask})
    
    # Convert translation tensor to list of indices
    translation_indices = translation_tensor.squeeze(0).cpu().tolist()
    
    # Convert indices to Bengali tokens
    translation_tokens = [bn_vocab.itos[idx] for idx in translation_indices]
    
    # Remove special tokens and return the translated text
    return ' '.join(token for token in translation_tokens[1:] if token not in ['<eos>', '<pad>'])


In [None]:
# Load the trained model
model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, device=device)
model.load_state_dict(torch.load('transformer_model.pth'))
model.eval()
en_vocab = len(en_vocab)
bn_vocab = len(bn_vocab)

# Enter the English text to translate
english_text = "Enter your English text here."

# Translate the English text to Bengali
bengali_text = translate_english_to_bengali(model, english_text, en_vocab, bn_vocab, device=device)

# Print the translated text
print("Translated Bengali text:", bengali_text)
