**Step:1**

In [None]:
# Kaggle input paths of the Dakshina "bn" lexicon TSV files, one per split
# (the "dev" file is used as the validation split).
train_data_path='/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv'
valid_data_path='/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv'
test_data_path='/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.test.tsv'

**Step:2**

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
def load_and_prepare_data(path, batch_size=32, shuffle=False):
    """Read a transliteration TSV and encode it as padded index tensors.

    The file is read without a header; its three columns are named
    ``target_word``, ``input_word`` and ``dummy`` (the last one is dropped).
    Character vocabularies and the maximum word lengths are derived from the
    file itself.

    Target rows are laid out as ``<sos> chars <eos> <pad>...`` with a fixed
    width of ``max_target_len + 2``. This is the same layout that
    ``load_and_prepare_test_valid_data`` produces; previously this function
    started rows with ``<pad>`` and never emitted ``<eos>``, so the training
    targets did not match the validation/test targets.

    Args:
        path: Location of the tab-separated file.
        batch_size: Batch size of the returned ``DataLoader``.
        shuffle: Whether the ``DataLoader`` reshuffles every epoch. Defaults to
            ``False``, which keeps the previous (file-order) behaviour.

    Returns:
        Tuple ``(dataset, data_loader, input_letter_vocab, target_letter_vocab,
        max_input_len, max_target_len)``.

    Raises:
        ValueError: If no usable rows remain after dropping missing values.
    """
    df = pd.read_csv(path, delimiter="\t", header=None)
    df.columns = ['target_word', 'input_word', 'dummy']
    df = df.drop(columns=['dummy']).dropna().reset_index(drop=True)
    df['input_word'] = df['input_word'].astype(str)
    df['target_word'] = df['target_word'].astype(str)
    if df.empty:
        raise ValueError(f"no usable rows found in {path!r}")

    max_input_len = max(len(word) for word in df['input_word'])
    max_target_len = max(len(word) for word in df['target_word'])

    def build_vocab(words):
        # Ids 0-2 are reserved for the special tokens; characters follow in sorted order.
        vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
        for letter in sorted(set(''.join(words))):
            vocab.setdefault(letter, len(vocab))
        return vocab

    input_letter_vocab = build_vocab(df['input_word'])
    target_letter_vocab = build_vocab(df['target_word'])

    def encode_input_letters(word):
        token_ids = [input_letter_vocab[char] for char in word if char in input_letter_vocab]
        token_ids = token_ids[:max_input_len]
        return token_ids + [input_letter_vocab['<pad>']] * (max_input_len - len(token_ids))

    def encode_target_letters(word):
        token_ids = [target_letter_vocab[char] for char in word if char in target_letter_vocab]
        framed = ([target_letter_vocab['<sos>']]
                  + token_ids[:max_target_len]
                  + [target_letter_vocab['<eos>']])
        # +2 accounts for the <sos> and <eos> markers around the characters.
        return framed + [target_letter_vocab['<pad>']] * (max_target_len + 2 - len(framed))

    input_tensors = torch.tensor(
        [encode_input_letters(word) for word in df['input_word']], dtype=torch.long)
    target_tensors = torch.tensor(
        [encode_target_letters(word) for word in df['target_word']], dtype=torch.long)

    dataset = TensorDataset(input_tensors, target_tensors)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return dataset, data_loader, input_letter_vocab, target_letter_vocab, max_input_len, max_target_len


**Step:3**

In [None]:
# Encode the training split; its vocabularies and maximum lengths are reused
# later when the validation and test splits are encoded.
(
    training_dataset,
    train_loader_ben,
    train_input_vocab,
    train_target_vocab,
    max_train_input_len,
    max_train_target_len,
) = load_and_prepare_data(train_data_path, batch_size=64)
print(train_input_vocab, train_target_vocab, max_train_input_len, max_train_target_len)

{'<pad>': 0, '<sos>': 1, '<eos>': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'y': 27, 'z': 28} {'<pad>': 0, '<sos>': 1, '<eos>': 2, 'ঁ': 3, 'ং': 4, 'ঃ': 5, 'অ': 6, 'আ': 7, 'ই': 8, 'ঈ': 9, 'উ': 10, 'ঊ': 11, 'ঋ': 12, 'এ': 13, 'ঐ': 14, 'ও': 15, 'ঔ': 16, 'ক': 17, 'খ': 18, 'গ': 19, 'ঘ': 20, 'ঙ': 21, 'চ': 22, 'ছ': 23, 'জ': 24, 'ঝ': 25, 'ঞ': 26, 'ট': 27, 'ঠ': 28, 'ড': 29, 'ঢ': 30, 'ণ': 31, 'ত': 32, 'থ': 33, 'দ': 34, 'ধ': 35, 'ন': 36, 'প': 37, 'ফ': 38, 'ব': 39, 'ভ': 40, 'ম': 41, 'য': 42, 'র': 43, 'ল': 44, 'শ': 45, 'ষ': 46, 'স': 47, 'হ': 48, '়': 49, 'া': 50, 'ি': 51, 'ী': 52, 'ু': 53, 'ূ': 54, 'ৃ': 55, 'ে': 56, 'ৈ': 57, 'ো': 58, 'ৌ': 59, '্': 60, 'ৎ': 61, '২': 62} 22 22


**Step:4**

In [None]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader

def load_and_prepare_test_valid_data(path, batch_size=32,
                          input_letter_vocab=None,
                          target_letter_vocab=None,
                          max_input_len=None,
                          max_target_len=None):
    """Encode a validation/test TSV, optionally reusing existing vocabularies.

    The file is read without a header and its columns are named
    ``target_word``, ``input_word`` and ``dummy`` (dropped). Any vocabulary or
    maximum length passed as ``None`` is derived from this file; otherwise the
    supplied value is used as-is. Characters missing from a vocabulary are
    skipped during encoding.

    Inputs are padded/truncated to ``max_input_len``. Targets are laid out as
    ``<sos> chars <eos>`` followed by ``<pad>`` up to ``max_target_len + 2``.

    Returns:
        Tuple ``(dataset, data_loader, input_letter_vocab, target_letter_vocab,
        max_input_len, max_target_len)``.
    """
    frame = pd.read_csv(path, delimiter="\t", header=None)
    frame.columns = ['target_word', 'input_word', 'dummy']
    frame = frame.drop(columns=['dummy']).dropna().reset_index(drop=True)
    for column in ('input_word', 'target_word'):
        frame[column] = frame[column].astype(str)
    source_words = frame['input_word']
    target_words = frame['target_word']

    def vocab_from(words):
        # Special tokens take ids 0-2; characters are numbered from 3 in sorted order.
        vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
        next_id = 3
        for letter in sorted(set(''.join(words))):
            if letter not in vocab:
                vocab[letter] = next_id
                next_id += 1
        return vocab

    if input_letter_vocab is None:
        input_letter_vocab = vocab_from(source_words)
    if target_letter_vocab is None:
        target_letter_vocab = vocab_from(target_words)
    if max_input_len is None:
        max_input_len = max(len(word) for word in source_words)
    if max_target_len is None:
        max_target_len = max(len(word) for word in target_words)

    def encode_input_letters(word):
        known = [input_letter_vocab[char] for char in word if char in input_letter_vocab]
        filler = [input_letter_vocab['<pad>']] * (max_input_len - len(known))
        return known[:max_input_len] + filler

    def encode_target_letters(word):
        known = [target_letter_vocab[char] for char in word if char in target_letter_vocab]
        row = [target_letter_vocab['<sos>']]
        row.extend(known[:max_target_len])
        row.append(target_letter_vocab['<eos>'])
        # +2 for <sos> and <eos>
        row.extend([target_letter_vocab['<pad>']] * (max_target_len + 2 - len(row)))
        return row

    input_tensors = torch.stack([torch.tensor(encode_input_letters(word)) for word in source_words])
    target_tensors = torch.stack([torch.tensor(encode_target_letters(word)) for word in target_words])

    dataset = TensorDataset(input_tensors, target_tensors)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    return dataset, data_loader, input_letter_vocab, target_letter_vocab, max_input_len, max_target_len

**Step:5**

In [None]:
# Encode the validation and test splits with the training vocabularies and
# maximum lengths so that token ids and tensor widths match the training data.
(
    val_dataset,
    val_data_loader,
    val_input_letter_vocab,
    val_target_letter_vocab,
    val_max_input_len,
    val_max_target_len,
) = load_and_prepare_test_valid_data(
    valid_data_path,
    batch_size=64,
    input_letter_vocab=train_input_vocab,
    target_letter_vocab=train_target_vocab,
    max_input_len=max_train_input_len,
    max_target_len=max_train_target_len,
)
(
    test_dataset,
    test_data_loader,
    test_input_letter_vocab,
    test_target_letter_vocab,
    test_max_input_len,
    test_max_target_len,
) = load_and_prepare_test_valid_data(
    test_data_path,
    batch_size=64,
    input_letter_vocab=train_input_vocab,
    target_letter_vocab=train_target_vocab,
    max_input_len=max_train_input_len,
    max_target_len=max_train_target_len,
)

**Step:6**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import torch.optim as optim

class TextEncoder(nn.Module):
    """Embeds source token ids and runs them through an LSTM/GRU/RNN stack.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary size).
        hidden_size: Hidden size of each recurrent layer.
        embed_size: Embedding dimension.
        encoder_layers: Number of stacked recurrent layers.
        drop_prob: Dropout applied to the embeddings and, when there is more
            than one layer, between recurrent layers.
        cell_type: One of ``'lstm'``, ``'gru'`` or ``'rnn'``; any other value
            raises ``KeyError``.
        bidirectional: Whether the recurrent stack is bidirectional.
    """

    def __init__(self, input_size, hidden_size, embed_size, encoder_layers=1, drop_prob=0.5, cell_type='gru', bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embed_size)
        self.dropout = nn.Dropout(drop_prob)
        self.cell_type = cell_type
        self.bidirectional = bidirectional

        rnn_cls = {'lstm': nn.LSTM, 'gru': nn.GRU, 'rnn': nn.RNN}[cell_type]
        # PyTorch only applies recurrent dropout between layers; passing a
        # non-zero value to a single-layer stack has no effect and emits a
        # warning, so it is disabled in that case.
        inter_layer_dropout = drop_prob if encoder_layers > 1 else 0.0
        self.rnn = rnn_cls(embed_size, hidden_size, encoder_layers,
                           dropout=inter_layer_dropout, bidirectional=bidirectional, batch_first=True)

    def forward(self, x):
        """Return ``(outputs, hidden)`` from the recurrent stack for token ids ``x``.

        ``hidden`` is whatever the underlying module returns as its final state
        (an ``(h, c)`` tuple for LSTM).
        """
        embedded = self.dropout(self.embedding(x))
        outputs, hidden = self.rnn(embedded)
        return outputs, hidden

class BahdanauAttention(nn.Module):
    """Additive attention: scores each encoder position against a decoder state.

    ``forward`` returns weights of shape ``(batch, src_len)`` that sum to one
    over the source positions.
    """

    def __init__(self, enc_hidden_size, dec_hidden_size):
        super().__init__()
        self.attn = nn.Linear(enc_hidden_size + dec_hidden_size, dec_hidden_size)
        self.v = nn.Parameter(torch.rand(dec_hidden_size))

    def forward(self, hidden, encoder_outputs):
        n_batch, n_src, _ = encoder_outputs.size()

        # A 2-D state is used directly; for anything else the last entry along
        # the first axis is taken as the query.
        query = hidden if hidden.dim() == 2 else hidden[-1]
        query = query.unsqueeze(1).repeat(1, n_src, 1)

        features = torch.cat((query, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(features))

        # Project every position's energy vector onto v to get one score per position.
        projector = self.v.repeat(n_batch, 1).unsqueeze(1)
        scores = torch.bmm(projector, energy.transpose(1, 2)).squeeze(1)
        return torch.softmax(scores, dim=1)

class AttnDecoder(nn.Module):
    """Single-step decoder: embeds one token, attends over the encoder outputs
    and advances the recurrent state by one step.

    Args:
        hidden_size: Hidden size of the decoder recurrent layers.
        embed_size: Embedding dimension of target tokens.
        output_size: Target vocabulary size (embedding rows and logits width).
        decoder_layers: Number of stacked recurrent layers.
        drop_prob: Dropout on the embedding and, when there is more than one
            layer, between recurrent layers.
        cell_type: One of ``'lstm'``, ``'gru'`` or ``'rnn'``.
        enc_hidden_size: Feature size of the encoder outputs. When ``None`` it
            falls back to ``hidden_size``.
    """

    def __init__(self, hidden_size, embed_size, output_size, decoder_layers=1, drop_prob=0.5, cell_type='gru', enc_hidden_size=None):
        super().__init__()
        if enc_hidden_size is None:
            # Previously None reached the layer constructors and crashed there.
            enc_hidden_size = hidden_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, embed_size)
        self.dropout = nn.Dropout(drop_prob)
        self.attention = BahdanauAttention(enc_hidden_size, hidden_size)

        rnn_cls = {'lstm': nn.LSTM, 'gru': nn.GRU, 'rnn': nn.RNN}[cell_type]
        # Recurrent dropout only acts between layers; with a single layer it is
        # a no-op that triggers a PyTorch warning, so it is switched off.
        inter_layer_dropout = drop_prob if decoder_layers > 1 else 0.0
        self.rnn = rnn_cls(embed_size + enc_hidden_size, hidden_size, decoder_layers,
                           dropout=inter_layer_dropout, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            x: Token ids for the current step, one per batch element
                (a 1-D tensor; it is unsqueezed to a length-1 sequence here).
            hidden: Previous recurrent state (an ``(h, c)`` tuple for LSTM).
            encoder_outputs: Encoder outputs attended over.

        Returns:
            ``(logits, new_hidden, attn_weights)``.
        """
        x = x.unsqueeze(1)
        embedded = self.dropout(self.embedding(x))
        # For LSTM the state is an (h, c) tuple. Indexing the tuple with [-1]
        # would select the cell state, so pick h explicitly before taking the
        # top layer as the attention query.
        h_state = hidden[0] if isinstance(hidden, tuple) else hidden
        attn_weights = self.attention(h_state[-1], encoder_outputs)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)
        rnn_input = torch.cat((embedded, context), dim=2)
        output, hidden = self.rnn(rnn_input, hidden)
        output = self.fc(output.squeeze(1))
        return output, hidden, attn_weights

class Seq2SeqModel(nn.Module):
    """Encoder plus attention decoder with teacher-forced training and beam search.

    Args:
        input_size: Source vocabulary size.
        output_size: Target vocabulary size.
        hidden_size: Hidden size of encoder and decoder recurrent layers.
        embed_size: Embedding dimension for both sides.
        beam_width: Number of hypotheses kept by ``beam_search_decode``.
        encoder_layers: Number of encoder recurrent layers.
        decoder_layers: Number of decoder recurrent layers.
        drop_prob: Dropout probability passed to encoder and decoder.
        cell_type: ``'lstm'``, ``'gru'`` or ``'rnn'``.
        bidirectional: Whether the encoder is bidirectional.
    """

    def __init__(self, input_size, output_size, hidden_size, embed_size, beam_width,
                 encoder_layers=1, decoder_layers=1, drop_prob=0.3, cell_type='gru', bidirectional=True):
        super().__init__()
        self.encoder = TextEncoder(input_size, hidden_size, embed_size,
                                   encoder_layers, drop_prob, cell_type, bidirectional)

        self.bidirectional = bidirectional
        self.beam_width = beam_width
        # A bidirectional encoder concatenates both directions in its outputs.
        enc_hidden_size = 2 * hidden_size if bidirectional else hidden_size

        self.decoder = AttnDecoder(hidden_size, embed_size, output_size,
                                   decoder_layers, drop_prob, cell_type,
                                   enc_hidden_size=enc_hidden_size)

        self.cell_type = cell_type
        self.encoder_layers = encoder_layers
        self.decoder_layers = decoder_layers

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Decode ``target.size(1) - 1`` steps, starting from ``target[:, 0]``.

        Returns:
            ``(outputs, all_attention_weights)``. ``outputs`` has shape
            ``(batch, target_len, vocab)``; time step 0 is never written and
            stays all zeros. The attention weights of the individual steps are
            stacked along dimension 1.
        """
        batch_size = source.size(0)
        target_len = target.size(1)
        output_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, target_len, output_vocab_size).to(source.device)
        all_attention_weights = []

        encoder_outputs, encoder_hidden = self.encoder(source)
        decoder_hidden = self._init_decoder_hidden(encoder_hidden)
        decoder_input = target[:, 0]

        for t in range(1, target_len):
            decoder_output, decoder_hidden, attn_weights = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            outputs[:, t] = decoder_output
            all_attention_weights.append(attn_weights)
            # One coin flip per time step decides for the whole batch whether
            # the gold token or the model's own prediction is fed next.
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            decoder_input = target[:, t] if teacher_force else decoder_output.argmax(1)

        all_attention_weights = torch.stack(all_attention_weights, dim=1)
        return outputs, all_attention_weights

    def _init_decoder_hidden(self, encoder_hidden):
        """Convert the encoder's final state into an initial decoder state.

        Bidirectional states are merged by summing the two directions, then the
        layer dimension is zero-padded or truncated to the decoder's depth.
        """
        decoder_layers = self.decoder.rnn.num_layers
        if self.cell_type == 'lstm':
            h, c = encoder_hidden
            if h.shape[0] > 1 and self.bidirectional:
                h = self._merge_bidirectional(h)
                c = self._merge_bidirectional(c)
            h = self._pad_or_trim(h, decoder_layers)
            c = self._pad_or_trim(c, decoder_layers)
            return (h, c)
        else:
            h = encoder_hidden
            if h.shape[0] > 1 and self.bidirectional:
                h = self._merge_bidirectional(h)
            h = self._pad_or_trim(h, decoder_layers)
            return h

    def _merge_bidirectional(self, hidden):
        """Sum forward and backward states: (layers*2, B, H) -> (layers, B, H)."""
        return hidden.view(self.encoder.rnn.num_layers, 2, hidden.size(1), hidden.size(2)).sum(1)

    def _pad_or_trim(self, hidden, target_layers):
        """Match the layer count: append zero layers, or keep the first ``target_layers``."""
        if hidden.shape[0] < target_layers:
            pad = torch.zeros(target_layers - hidden.shape[0], *hidden.shape[1:], device=hidden.device)
            return torch.cat([hidden, pad], dim=0)
        return hidden[:target_layers]

    def beam_search_decode(self, source, sos_idx, eos_idx, max_len=50):
        """Beam-search decode a single source sequence.

        Args:
            source: Source token ids with batch size 1.
            sos_idx: Id of the start token that seeds every hypothesis.
            eos_idx: Id of the end token; a hypothesis is finished once it emits it.
            max_len: Maximum number of decoding steps.

        Returns:
            ``(best_seq, best_attns)``: the highest-scoring token id list
            (including ``sos_idx`` and, if it finished, ``eos_idx``) and the
            attention weights of each of its steps. Scores are summed
            log-probabilities without length normalisation.
        """
        device = source.device
        batch_size = source.size(0)
        assert batch_size == 1, "Beam search decoding supports batch size 1 for simplicity."

        # Decoding is inference-only, so the encoder pass does not need gradients either.
        with torch.no_grad():
            encoder_outputs, encoder_hidden = self.encoder(source)
            decoder_hidden = self._init_decoder_hidden(encoder_hidden)

            beams = [(0.0, [sos_idx], decoder_hidden, [])]
            completed_sequences = []

            for _ in range(max_len):
                new_beams = []
                for log_prob, seq, hidden, attns in beams:
                    # The decoder expects one token id per batch element, i.e.
                    # shape (1,), exactly like ``target[:, t]`` in ``forward``.
                    # A (1, 1) tensor would produce a 4-D embedding that cannot
                    # be concatenated with the context vector.
                    decoder_input = torch.tensor([seq[-1]], device=device)
                    decoder_output, hidden, attn_weights = self.decoder(decoder_input, hidden, encoder_outputs)
                    log_probs = F.log_softmax(decoder_output, dim=1)
                    # topk raises if asked for more candidates than the vocabulary holds.
                    num_candidates = min(self.beam_width, log_probs.size(1))
                    topk_probs, topk_indices = log_probs.topk(num_candidates)

                    for k in range(num_candidates):
                        next_token = topk_indices[0, k].item()
                        next_log_prob = log_prob + topk_probs[0, k].item()
                        new_seq = seq + [next_token]
                        new_attns = attns + [attn_weights]

                        if next_token == eos_idx:
                            completed_sequences.append((next_log_prob, new_seq, new_attns))
                        else:
                            new_beams.append((next_log_prob, new_seq, hidden, new_attns))

                beams = sorted(new_beams, key=lambda x: x[0], reverse=True)[:self.beam_width]

                # Stop once enough hypotheses finished or nothing is left to extend.
                if len(completed_sequences) >= self.beam_width or not beams:
                    break

        if not completed_sequences:
            # Nothing reached <eos> within max_len: fall back to the live hypotheses.
            completed_sequences = [(log_prob, seq, attns) for log_prob, seq, _, attns in beams]

        completed_sequences = sorted(completed_sequences, key=lambda x: x[0], reverse=True)
        best_log_prob, best_seq, best_attns = completed_sequences[0]
        return best_seq, best_attns

# Training function
def train_model(model, data_loader, loss_function, optimizer, device):
    """Run one training epoch and return ``(model, mean_batch_loss)``.

    The model is called as ``model(inputs, targets)`` and must return
    ``(predictions, extra)`` with predictions of shape
    ``(batch, target_len, vocab)``. Time step 0 holds the start token on the
    target side and is never predicted (the model leaves that slice at zero),
    so it is excluded from the loss; including it only added a constant term
    and diluted the mean.

    Args:
        model: Sequence-to-sequence module to train (put into train mode here).
        data_loader: Iterable of ``(input_batch, target_batch)`` tensors.
        loss_function: Callable taking ``(logits [N, vocab], targets [N])``.
        optimizer: Optimizer stepping the model's parameters.
        device: Device the batches are moved to.

    Returns:
        The same ``model`` object and the loss averaged over batches
        (``0.0`` when the loader yields no batches).
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for input_data, target_data in data_loader:
        input_data = input_data.to(device)
        target_data = target_data.to(device)

        optimizer.zero_grad()
        predictions, _ = model(input_data, target_data)
        output_size = predictions.shape[-1]
        # Skip position 0; reshape (not view) because the slices are non-contiguous.
        step_logits = predictions[:, 1:].reshape(-1, output_size)
        step_targets = target_data[:, 1:].reshape(-1)

        loss = loss_function(step_logits, step_targets)
        total_loss += loss.item()
        num_batches += 1

        loss.backward()
        optimizer.step()

    average_loss = total_loss / num_batches if num_batches else 0.0
    return model, average_loss

# Evaluation function
def evaluate_model(model, data_loader, loss_function, device, pad_token_id=0):
    """Evaluate without teacher forcing; return ``(mean_batch_loss, token_accuracy_percent)``.

    The model is called as ``model(inputs, targets, teacher_forcing_ratio=0.0)``.
    Time step 0 (the start token, which the model never predicts and leaves as
    zeros) is excluded from both loss and accuracy; counting it made one token
    per sequence permanently "wrong". Accuracy is computed per token over
    positions whose target is not ``pad_token_id``.

    Args:
        model: Sequence-to-sequence module (put into eval mode here).
        data_loader: Iterable of ``(input_batch, target_batch)`` tensors.
        loss_function: Callable taking ``(logits [N, vocab], targets [N])``.
        device: Device the batches are moved to.
        pad_token_id: Target id ignored when computing accuracy.

    Returns:
        Loss averaged over batches (``0.0`` for an empty loader) and the
        accuracy in percent (``0.0`` when there are no non-pad tokens).
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    total_tokens = 0
    correct_tokens = 0

    with torch.no_grad():
        for input_data, target_data in data_loader:
            input_data = input_data.to(device)
            target_data = target_data.to(device)

            predictions, _ = model(input_data, target_data, teacher_forcing_ratio=0.0)
            # Drop position 0 on both sides before scoring.
            step_logits = predictions[:, 1:]
            step_targets = target_data[:, 1:]
            output_size = step_logits.shape[-1]
            loss = loss_function(step_logits.reshape(-1, output_size), step_targets.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

            predicted_tokens = step_logits.argmax(dim=-1)
            mask = step_targets != pad_token_id
            correct = (predicted_tokens == step_targets) & mask
            correct_tokens += correct.sum().item()
            total_tokens += mask.sum().item()

    average_loss = total_loss / num_batches if num_batches else 0.0
    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0.0
    return average_loss, accuracy * 100

# Hyperparameters
# Vocabulary sizes are taken from the training vocabularies so the embedding
# and output layers always match the encoded data (29 and 63 for the
# vocabularies printed in Step 3).
input_size = len(train_input_vocab)
output_size = len(train_target_vocab)
embed_size = 128
hidden_size = 128
encoder_layers = 3
decoder_layers = 2
beam_width = 3
cell_type = 'lstm'
batch_size = 64
num_epochs = 20
drop_prob = 0.2
learning_rate = 0.001
bidirectional = True

# Model, criterion, optimizer
model = Seq2SeqModel(input_size, output_size, hidden_size, embed_size,
                     beam_width, encoder_layers, decoder_layers,
                     drop_prob, cell_type, bidirectional)

print(model)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


Seq2SeqModel(
  (encoder): TextEncoder(
    (embedding): Embedding(29, 128)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(128, 128, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): AttnDecoder(
    (embedding): Embedding(63, 128)
    (dropout): Dropout(p=0.2, inplace=False)
    (attention): BahdanauAttention(
      (attn): Linear(in_features=384, out_features=128, bias=True)
    )
    (rnn): LSTM(384, 128, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=128, out_features=63, bias=True)
  )
)


**Step:7**

In [None]:
import wandb
import numpy as np
from types import SimpleNamespace
import random

**Step:8**

In [None]:
import getpass
# The key is prompted for at run time rather than written into the notebook source.
api_key = getpass.getpass("Enter your W&B API Key: ")  # Hidden input
wandb.login(key=api_key)

Enter your W&B API Key:  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mma24m022[0m ([33mma24m022-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

**Step:9**

In [None]:
def _choices(*values):
    """Wrap discrete options in the ``{'values': [...]}`` form used below."""
    return {'values': list(values)}


# Bayesian sweep that maximises the logged 'val_accuracy' metric.
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_accuracy', 'goal': 'maximize'},
    'parameters': {
        'embedding_size': _choices(256, 192, 128, 64, 32),
        'dropout': _choices(0.0, 0.1, 0.2, 0.3, 0.4, 0.5),
        'encoder_layers': _choices(1, 2, 3),
        'decoder_layers': _choices(1, 2, 3),
        'hidden_layer_size': _choices(512, 256, 192, 128, 64),
        'cell_type': _choices('lstm', 'rnn', 'gru'),
        'bidirectional': _choices(True, False),
        'batch_size': _choices(128, 64, 32, 16),
        'num_epochs': _choices(5),
        'learning_rate': _choices(0.0001, 0.001, 0.005, 0.01),
        'beam_width': _choices(10, 5, 3, 2, 1),  # 1 = greedy decoding
    },
}
sweep_id = wandb.sweep(sweep=sweep_config, project='DL_Translation_attention')


Create sweep with ID: 7zxwuoi9
Sweep URL: https://wandb.ai/ma24m022-indian-institute-of-technology-madras/DL_Translation_attention/sweeps/7zxwuoi9


**Step:10**

In [None]:
def main():
    """Train and validate one sweep configuration and log per-epoch metrics to W&B.

    Hyperparameters are read from ``wandb.config``. The training split defines
    the vocabularies and maximum lengths; the validation split is encoded with
    those same vocabularies so that token ids mean the same thing on both
    sides. (Previously a validation loader was built with its own vocabulary
    and then ignored in favour of a global loader.)
    """
    with wandb.init() as run:
        cfg = wandb.config
        run_name = (
            f"ct-{cfg.cell_type}_el-{cfg.encoder_layers}_dl-{cfg.decoder_layers}"
            f"_drop-{cfg.dropout}_es-{cfg.embedding_size}_hs-{cfg.hidden_layer_size}"
            f"_bs-{cfg.batch_size}_ep-{cfg.num_epochs}lr{cfg.learning_rate}"
        )
        wandb.run.name = run_name

        # Load data first: the vocabulary sizes determine the model's layer sizes.
        _, train_loader_ben, input_vocab, target_vocab, max_input_len, max_target_len = load_and_prepare_data(
            train_data_path, batch_size=cfg.batch_size)
        _, val_loader_ben, *_ = load_and_prepare_test_valid_data(
            valid_data_path, cfg.batch_size,
            input_vocab, target_vocab, max_input_len, max_target_len)

        model = Seq2SeqModel(input_size=len(input_vocab), output_size=len(target_vocab),
                             hidden_size=cfg.hidden_layer_size, embed_size=cfg.embedding_size,
                             beam_width=cfg.beam_width, encoder_layers=cfg.encoder_layers,
                             decoder_layers=cfg.decoder_layers, drop_prob=cfg.dropout,
                             cell_type=cfg.cell_type, bidirectional=cfg.bidirectional)
        print(model)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=cfg.learning_rate)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        for epoch in range(cfg.num_epochs):
            model, train_loss = train_model(model, train_loader_ben, criterion, optimizer, device)
            val_loss, val_accuracy = evaluate_model(model, val_loader_ben, criterion, device)
            # The 'val_loss' key used to carry a leading space (' val_loss').
            wandb.log({'Epoch': epoch, 'train_loss': train_loss, 'val_loss': val_loss, 'val_accuracy': val_accuracy})
            print(f'Epoch {epoch+1}/{cfg.num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
# Start a sweep agent for `sweep_id` that calls main() once per run, for at most 15 runs.
wandb.agent(sweep_id, function= main,count=15)
# wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: qgpb1h6k with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 192
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	hidden_layer_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_epochs: 5
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Seq2SeqModel(
  (encoder): TextEncoder(
    (embedding): Embedding(29, 192)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(192, 64, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): AttnDecoder(
    (embedding): Embedding(63, 192)
    (dropout): Dropout(p=0.3, inplace=False)
    (attention): BahdanauAttention(
      (attn): Linear(in_features=128, out_features=64, bias=True)
    )
    (rnn): RNN(256, 64, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=64, out_features=63, bias=True)
  )
)
Epoch 1/5, Train Loss: 1.7995, Val Loss: 1.5928, Val Accuracy: 4.2100
Epoch 2/5, Train Loss: 1.3800, Val Loss: 1.5987, Val Accuracy: 7.8523
Epoch 3/5, Train Loss: 1.3103, Val Loss: 1.5592, Val Accuracy: 8.7525
Epoch 4/5, Train Loss: 1.2808, Val Loss: 1.5503, Val Accuracy: 9.6661
Epoch 5/5, Train Loss: 1.2631, Val Loss: 1.5834, Val Accuracy: 9.7935


0,1
val_loss,▇█▂▁▆
Epoch,▁▃▅▆█
train_loss,█▃▂▁▁
val_accuracy,▁▆▇██

0,1
val_loss,1.58338
Epoch,4.0
train_loss,1.26307
val_accuracy,9.7935


[34m[1mwandb[0m: Agent Starting Run: hjf4xxsu with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	hidden_layer_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_epochs: 5


Seq2SeqModel(
  (encoder): TextEncoder(
    (embedding): Embedding(29, 256)
    (dropout): Dropout(p=0, inplace=False)
    (rnn): GRU(256, 128, num_layers=3, batch_first=True)
  )
  (decoder): AttnDecoder(
    (embedding): Embedding(63, 256)
    (dropout): Dropout(p=0, inplace=False)
    (attention): BahdanauAttention(
      (attn): Linear(in_features=256, out_features=128, bias=True)
    )
    (rnn): GRU(384, 128, batch_first=True)
    (fc): Linear(in_features=128, out_features=63, bias=True)
  )
)
Epoch 1/5, Train Loss: 1.1847, Val Loss: 1.6976, Val Accuracy: 9.8481
Epoch 2/5, Train Loss: 1.0257, Val Loss: 1.6666, Val Accuracy: 12.8849
Epoch 3/5, Train Loss: 0.9196, Val Loss: 1.6691, Val Accuracy: 17.1423
Epoch 4/5, Train Loss: 0.8221, Val Loss: 1.5738, Val Accuracy: 23.3263
Epoch 5/5, Train Loss: 0.7263, Val Loss: 1.5036, Val Accuracy: 30.4081


0,1
val_loss,█▇▇▄▁
Epoch,▁▃▅▆█
train_loss,█▆▄▂▁
val_accuracy,▁▂▃▆█

0,1
val_loss,1.50355
Epoch,4.0
train_loss,0.72633
val_accuracy,30.40814


[34m[1mwandb[0m: Agent Starting Run: 59zq5e28 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beam_width: 2
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	hidden_layer_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_epochs: 5


Seq2SeqModel(
  (encoder): TextEncoder(
    (embedding): Embedding(29, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): AttnDecoder(
    (embedding): Embedding(63, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (attention): BahdanauAttention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
    )
    (rnn): LSTM(1280, 512, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=63, bias=True)
  )
)
Epoch 1/5, Train Loss: 1.1066, Val Loss: 1.7676, Val Accuracy: 10.8296
Epoch 2/5, Train Loss: 0.9089, Val Loss: 1.8390, Val Accuracy: 14.9281
Epoch 3/5, Train Loss: 0.7108, Val Loss: 1.7446, Val Accuracy: 22.6008
Epoch 4/5, Train Loss: 0.5650, Val Loss: 1.7866, Val Accuracy: 25.9009
Epoch 5/5, Train Loss: 0.4781, Val Loss: 1.8113, Val Accuracy: 27.9573


0,1
val_loss,▃█▁▄▆
Epoch,▁▃▅▆█
train_loss,█▆▄▂▁
val_accuracy,▁▃▆▇█

0,1
val_loss,1.81132
Epoch,4.0
train_loss,0.47808
val_accuracy,27.95734


[34m[1mwandb[0m: Agent Starting Run: 7tdf1mct with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	embedding_size: 128
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	hidden_layer_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.005
[34m[1mwandb[0m: 	num_epochs: 5




Seq2SeqModel(
  (encoder): TextEncoder(
    (embedding): Embedding(29, 128)
    (dropout): Dropout(p=0.4, inplace=False)
    (rnn): LSTM(128, 256, num_layers=3, batch_first=True, dropout=0.4, bidirectional=True)
  )
  (decoder): AttnDecoder(
    (embedding): Embedding(63, 128)
    (dropout): Dropout(p=0.4, inplace=False)
    (attention): BahdanauAttention(
      (attn): Linear(in_features=768, out_features=256, bias=True)
    )
    (rnn): LSTM(640, 256, batch_first=True, dropout=0.4)
    (fc): Linear(in_features=256, out_features=63, bias=True)
  )
)
Epoch 1/5, Train Loss: 1.0735, Val Loss: 1.7850, Val Accuracy: 9.2257
Epoch 2/5, Train Loss: 1.0186, Val Loss: 1.7881, Val Accuracy: 13.8567
Epoch 3/5, Train Loss: 0.9650, Val Loss: 1.8296, Val Accuracy: 15.2350


[34m[1mwandb[0m: [32m[41mERROR[0m Error while calling W&B API: context deadline exceeded (<Response [500]>)


Epoch 4/5, Train Loss: 0.9444, Val Loss: 1.9001, Val Accuracy: 15.8829


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


**Step:11**

In [None]:
# Final configuration; the values appear to mirror the GRU sweep run logged
# above (3 encoder layers, 1 decoder layer, no dropout) — confirm against W&B.
# Vocabulary sizes come from the training vocabularies (29 and 63 for the
# vocabularies printed in Step 3).
input_size = len(train_input_vocab)
output_size = len(train_target_vocab)
embed_size = 256
beam_width=3
hidden_size = 128
encoder_layers = 3
decoder_layers = 1
cell_type = 'gru'
batch_size = 64
num_epochs = 35
drop_prob = 0.0
learning_rate = 0.001
# Set explicitly: otherwise the value leaks in from the earlier hyperparameter
# cell (True), while the recorded model printout below shows a unidirectional GRU.
bidirectional = False
Best_model = Seq2SeqModel(input_size, output_size, hidden_size,embed_size,beam_width, encoder_layers,decoder_layers,drop_prob, cell_type,bidirectional)
print(Best_model)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Best_model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(Best_model.parameters(), lr=learning_rate)

Seq2SeqModel(
  (encoder): TextEncoder(
    (embedding): Embedding(29, 256)
    (dropout): Dropout(p=0.0, inplace=False)
    (rnn): GRU(256, 128, num_layers=3, batch_first=True)
  )
  (decoder): AttnDecoder(
    (embedding): Embedding(63, 256)
    (dropout): Dropout(p=0.0, inplace=False)
    (attention): BahdanauAttention(
      (attn): Linear(in_features=256, out_features=128, bias=True)
    )
    (rnn): GRU(384, 128, batch_first=True)
    (fc): Linear(in_features=128, out_features=63, bias=True)
  )
)


**Step:12**

In [None]:
# Train the final model for `num_epochs` epochs (set to 35 in the configuration
# cell above) instead of repeating the literal 35 here. After every epoch the
# model is re-scored on the training and validation loaders without teacher
# forcing, so the two sets of numbers are comparable.
final_model = Best_model.to(device)
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1} started.")
    final_model, _ = train_model(final_model, train_loader_ben, criterion, optimizer, device)
    print(f"Finished training for epoch {epoch+1}")
    train_loss, train_accuracy = evaluate_model(final_model, train_loader_ben, criterion, device)
    val_loss, val_accuracy = evaluate_model(final_model, val_data_loader, criterion, device)
    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f' - Train Loss      : {train_loss:.4f}, Train Accuracy      : {train_accuracy:.2f}%')
    print(f' - Validation Loss : {val_loss:.4f}, Validation Accuracy : {val_accuracy:.2f}%')



Epoch 1 started.
Finished training for epoch 1
Epoch 1/35
 - Train Loss      : 1.4519, Train Accuracy      : 15.06%
 - Validation Loss : 1.7502, Validation Accuracy : 11.50%

Epoch 2 started.
Finished training for epoch 2
Epoch 2/35
 - Train Loss      : 1.2743, Train Accuracy      : 21.99%
 - Validation Loss : 1.5841, Validation Accuracy : 18.33%

Epoch 3 started.
Finished training for epoch 3
Epoch 3/35
 - Train Loss      : 1.1903, Train Accuracy      : 28.74%
 - Validation Loss : 1.5479, Validation Accuracy : 24.03%

Epoch 4 started.
Finished training for epoch 4
Epoch 4/35
 - Train Loss      : 1.2174, Train Accuracy      : 31.58%
 - Validation Loss : 1.5577, Validation Accuracy : 28.36%

Epoch 5 started.
Finished training for epoch 5
Epoch 5/35
 - Train Loss      : 1.0777, Train Accuracy      : 40.14%
 - Validation Loss : 1.5336, Validation Accuracy : 33.48%

Epoch 6 started.
Finished training for epoch 6
Epoch 6/35
 - Train Loss      : 0.9540, Train Accuracy      : 44.94%
 - Valid

**Step:13**

In [None]:
# Score the trained model once on the held-out test split.
test_loss, test_accuracy = evaluate_model(
    model=final_model,
    data_loader=test_data_loader,
    loss_function=criterion,
    device=device,
)
print(f' Test Accuracy: {test_accuracy:.2f}')

 Test Accuracy: 45.13


In [None]:
# Refresh the package index quietly, then install the Noto font packages
# (presumably so Bengali glyphs render in later plots — confirm).
!apt-get update -qq
!apt-get install -y fonts-noto

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-noto-cjk fonts-noto-cjk-extra fonts-noto-color-emoji fonts-noto-core fonts-noto-extra
  fonts-noto-ui-core fonts-noto-ui-extra fonts-noto-unhinted
The following NEW packages will be installed:
  fonts-noto fonts-noto-cjk fonts-noto-cjk-extra fonts-noto-color-emoji fonts-noto-core
  fonts-noto-extra fonts-noto-ui-core fonts-noto-ui-extra fonts-noto-unhinted
0 upgraded, 9 newly installed, 0 to remove and 161 not upgraded.
Need to get 316 MB of archives.
After this operation, 788 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-noto-core all 20201225-1build1 [12.2 MB]
Get:2 http://archive.ubunt

**Step:14**

In [None]:
def run_inference(model, dataloader, device):
    """Run the model over every batch and collect source, predicted and reference token ids.

    Args:
        model: Seq2seq model. It is called as ``model(source, target, 0)`` and must
            return a pair whose first element holds the per-step scores. The third
            argument is presumably the teacher-forcing ratio -- confirm against the
            model's ``forward``.
        dataloader: Iterable yielding ``(source_batch, target_batch)`` tensors.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(predictions, actual)``: two lists of equal length. ``predictions[k]`` is
        the pair ``(source_ids, predicted_ids)`` and ``actual[k]`` the reference ids
        of the same batch, all as NumPy arrays. Batches whose output has an
        unexpected rank are reported and left out of both lists, so the lists can be
        zipped safely.
    """
    model.eval()
    predictions = []
    actual = []
    with torch.no_grad():
        for source_batch, target_batch in dataloader:
            source_batch = source_batch.to(device)
            target_batch = target_batch.to(device)
            output, _ = model(source_batch, target_batch, 0)

            # Reduce scores to token ids along the last (vocabulary) axis.
            if output.dim() == 3:
                predicted_ids = output.argmax(2)
            elif output.dim() == 2:
                predicted_ids = output.argmax(1)
            else:
                print("Unexpected output dimension:", output.dim())
                continue

            # Record the reference only once the prediction is known to be usable;
            # appending it before the rank check would leave `actual` one entry longer
            # than `predictions` and misalign every later pair.
            actual.append(target_batch.cpu().numpy())
            predictions.append((source_batch.cpu().numpy(), predicted_ids.cpu().numpy()))
    return predictions, actual
# Reverse lookup tables (token id -> character) for turning id sequences back into text.
latin_idx2token = dict(zip(test_input_letter_vocab.values(), test_input_letter_vocab.keys()))
bangla_idx2token = dict(zip(test_target_letter_vocab.values(), test_target_letter_vocab.keys()))

**Step:15**

In [None]:
def decode_sequence(indices, idx2token, target_vocab):
    """Turn a sequence of token ids into text, dropping unknown and special ids.

    Ids that are missing from ``idx2token`` and the ids of ``<pad>``, ``<sos>`` and
    ``<eos>`` in ``target_vocab`` are skipped. Decoding does not stop at ``<eos>``;
    every remaining id contributes its character.
    """
    return ''.join(
        idx2token[token_id]
        for token_id in indices
        if token_id in idx2token
        and token_id not in (target_vocab['<pad>'], target_vocab['<sos>'], target_vocab['<eos>'])
    )

**Step:16**

In [None]:
def process_output_indices(indices, idx2token, target_vocab):
    """Decode predicted token ids into text, stopping at the first ``<eos>``.

    ``<pad>`` and ``<sos>`` ids are skipped, and ids without an entry in
    ``idx2token`` contribute an empty string.
    """
    pieces = []
    for token_id in indices:
        if token_id == target_vocab.get('<eos>'):
            break
        if token_id not in (target_vocab.get('<pad>'), target_vocab.get('<sos>')):
            pieces.append(idx2token.get(token_id, ''))
    return ''.join(pieces)

**Step:17**

In [None]:
import pandas as pd
from pathlib import Path
# Run the model over the test loader: `test_predictions` holds one (source ids, predicted ids)
# pair per batch and `actual` the reference ids of the same batches.
test_predictions, actual = run_inference(final_model, test_data_loader, device)
seq2seq_results = []
for (src_indices, output_indices), act_ind in zip(test_predictions, actual):
    # Decode every example of the batch back into text.
    for i in range(src_indices.shape[0]):
        input_text = decode_sequence(src_indices[i], latin_idx2token, test_input_letter_vocab)
        actual_target_text = decode_sequence(act_ind[i], bangla_idx2token, test_target_letter_vocab)
        # Predictions are cut at the first <eos>; the two decodes above only drop special ids.
        predicted_text = process_output_indices(output_indices[i], bangla_idx2token, test_target_letter_vocab)
        seq2seq_results.append([input_text, actual_target_text, predicted_text])
# One row per decoded test example.
results_df = pd.DataFrame(seq2seq_results, columns=["Input", "Actual", "Predicted"])
def char_level_accuracy(actual, predicted):
    """Return the share of positions at which both strings hold the same character.

    Positions are compared pairwise from the start, and the match count is divided
    by the length of the longer string. Two empty strings give ``0``.
    """
    longest = max(len(actual), len(predicted))
    if longest <= 0:
        return 0
    hits = 0
    for expected_char, predicted_char in zip(actual, predicted):
        if expected_char == predicted_char:
            hits += 1
    return hits / longest

# Score every row with the position-wise character accuracy of its prediction.
results_df["Accuracy"] = results_df.apply(lambda row: char_level_accuracy(row["Actual"], row["Predicted"]), axis=1)
total = len(results_df)
# Bucket counts for the summary; the bounds mirror the thresholds in accuracy_highlighter.
count_100 = (results_df["Accuracy"] == 1.0).sum()
count_75 = ((results_df["Accuracy"] > 0.75) & (results_df["Accuracy"] < 1.0)).sum()
count_50 = ((results_df["Accuracy"] > 0.5) & (results_df["Accuracy"] <= 0.75)).sum()
count_25 = ((results_df["Accuracy"] > 0.25) & (results_df["Accuracy"] <= 0.5)).sum()
count_0 = (results_df["Accuracy"] <= 0.25).sum()
def accuracy_highlighter(row):
    """Return the per-cell CSS for one results row, coloured by its accuracy bucket.

    The first three cells share one background colour chosen from ``row.Accuracy``;
    the fourth cell is always left unstyled.
    """
    score = row.Accuracy
    # Lower bounds (exclusive) checked from the highest bucket down.
    buckets = (
        (0.75, 'background-color: #cce5ff'),  # Light blue
        (0.5, 'background-color: #fff3cd'),  # Light yellow
        (0.25, 'background-color: #f8d7da'),  # Light pink
    )
    if score == 1.0:
        css = 'background-color: #d4edda'  # Green
    else:
        # Falls back to no colour when no bucket matches.
        css = next((style for floor, style in buckets if score > floor), '')
    return [css] * 3 + ['']

# Style the DataFrame: uniform cell borders/padding, a per-row background chosen by
# accuracy_highlighter, and the Accuracy column hidden from the rendered table.
styled_table = results_df.style.set_properties(**{
    'border': '1px solid black',
    'text-align': 'left',
    'padding': '6px'
}).apply(accuracy_highlighter, axis=1).hide(axis="columns", subset=["Accuracy"])

# Render the styled table to an HTML fragment.
html_content = styled_table.to_html()
# Legend with the bucket counts; its colours match the row highlighting.
summary_html = f"""
<div class="legend">
    <h3>Prediction Accuracy Summary</h3>
    <ul>
        <li><strong>Total Predictions:</strong> {total}</li>
        <li style="background-color: #d4edda; padding: 6px;">✅ 100% Match: {count_100}</li>
        <li style="background-color: #cce5ff; padding: 6px;">✅ Above 75%: {count_75}</li>
        <li style="background-color: #fff3cd; padding: 6px;">✅ Above 50%: {count_50}</li>
        <li style="background-color: #f8d7da; padding: 6px;">✅ Above 25%: {count_25}</li>
        <li>❌ ≤ 25% Match: {count_0}</li>
    </ul>
</div>
"""
# Full standalone page; doubled braces are literal CSS braces inside the f-string.
html_full = f"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Seq2Seq Prediction Results</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #f5f5f5;
            padding: 30px;
        }}
        h2 {{
            text-align: center;
            color: #333;
        }}
        .legend {{
            max-width: 600px;
            margin: 0 auto 30px auto;
            padding: 15px;
            border: 1px solid #ccc;
            background-color: #fff;
        }}
        .legend h3 {{
            margin-top: 0;
        }}
        table {{
            margin: auto;
            border-collapse: collapse;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }}
        th {{
            background-color: #4CAF50;
            color: white;
        }}
        td, th {{
            padding: 10px 15px;
            border: 1px solid #ddd;
        }}
        tr:nth-child(even) {{
            background-color: #f9f9f9;
        }}
        tr:hover {{
            background-color: #f1f1f1;
        }}
    </style>
</head>
<body>
    <h2>Character-Level Transliteration Predictions using Seq2Seq + Attention</h2>
    {summary_html}
    {html_content}
</body>
</html>
"""
# Written as UTF-8 so the Bengali text and the emoji in the legend are stored correctly.
with open("predictions_attention.html", "w", encoding="utf-8") as f:
    f.write(html_full)

Step:18

In [None]:
import torch
import matplotlib.pyplot as plt
import matplotlib
# Make the Bengali Noto face Matplotlib's default font family for the plots below.
matplotlib.rcParams['font.family'] = 'Noto Sans Bengali'
import matplotlib.font_manager as fm
# Print the path of every system TTF file whose path contains "NotoSansBengali".
for font in fm.findSystemFonts(fontpaths=None, fontext='ttf'):
    if "NotoSansBengali" in font:
        print(font)
import seaborn as sns
import numpy as np
# Start a Weights & Biases run; the figures drawn below are logged to it.
# NOTE(review): `wandb` is not imported in this cell, so it must already be in scope from an earlier cell.
wandb.init()
def index_to_word(indices, idx2char):
    """Join the characters for ``indices``, skipping unknown ids and special tokens."""
    special_tokens = ['<pad>', '<sos>', '<eos>']
    chars = []
    for token_id in indices:
        if token_id not in idx2char:
            continue
        symbol = idx2char[token_id]
        if symbol not in special_tokens:
            chars.append(symbol)
    return ''.join(chars)
def run_inference_with_attention(model, input_word, input_vocab, target_vocab, idx2input_char, idx2target_char, device, max_input_len, max_target_len):
    """Greedily decode one word and record one attention vector per decoding step.

    Args:
        model: Object exposing ``encoder``, ``decoder``, ``decoder.attention`` and
            ``_init_decoder_hidden`` as called below.
        input_word: Source string; characters missing from ``input_vocab`` are
            encoded as ``<pad>``.
        input_vocab: Source character -> id mapping (must contain ``<pad>``).
        target_vocab: Target character -> id mapping (``<sos>`` and ``<eos>`` are read).
        idx2input_char: Not used by this function.
        idx2target_char: Target id -> character mapping used for the final decode.
        device: Device for the tensors created here.
        max_input_len: Length the encoded input is right-padded to; longer words are
            not truncated.
        max_target_len: Decoding runs for at most ``max_target_len + 2`` steps.

    Returns:
        ``(decoded_word, attention_per_step)``. The list has one NumPy array per
        executed decoding step, including a final ``<eos>`` step, so it can be longer
        than ``decoded_word``, which has special tokens removed.
    """
    model.eval()
    pad_id = input_vocab['<pad>']
    encoded = [input_vocab.get(ch, pad_id) for ch in input_word]  # unknown chars -> <pad>
    encoded.extend([pad_id] * (max_input_len - len(encoded)))  # right-pad
    source = torch.tensor(encoded, dtype=torch.long).unsqueeze(0).to(device)

    step_tokens = []
    step_attention = []
    with torch.no_grad():
        encoder_outputs, encoder_hidden = model.encoder(source)
        hidden = model._init_decoder_hidden(encoder_hidden)
        next_input = torch.tensor([target_vocab['<sos>']], device=device)  # start-of-sequence token

        for _ in range(max_target_len + 2):  # +2 for <sos> and <eos> tokens
            scores, hidden, _ = model.decoder(next_input, hidden, encoder_outputs)
            best = scores.argmax(1)
            best_id = best.item()
            step_tokens.append(best_id)
            # NOTE(review): the decoder's third return value is discarded and attention is
            # recomputed from the hidden state *after* this step -- confirm this matches the
            # weights the decoder actually used. `hidden[-1]` is presumably the last layer's state.
            weights = model.decoder.attention(hidden[-1], encoder_outputs)
            step_attention.append(weights.squeeze(0).cpu().numpy())
            if best_id == target_vocab['<eos>']:  # end of sequence
                break
            next_input = best

    return index_to_word(step_tokens, idx2target_char), step_attention
import matplotlib.pyplot as plt
import matplotlib.patches as patches
def draw_attention_overlay(input_word, predicted_word, attention_weights):
    """Draw the attention matrix as a grid of green cells and log it to Weights & Biases.

    One column is drawn per input character and one row per predicted character,
    with the first predicted character in the top row. ``attention_weights[y][x]``
    selects the cell colour from the ``Greens`` colormap. Input characters are
    written above the grid and predicted characters to its left.
    """
    from matplotlib.font_manager import FontProperties
    bengali_font = FontProperties(fname="/usr/share/fonts/truetype/noto/NotoSansBengali-Regular.ttf")
    n_in = len(input_word)
    n_out = len(predicted_word)

    fig, ax = plt.subplots(figsize=(n_in * 0.7, n_out * 0.7))
    ax.set_xlim(0, n_in)
    ax.set_ylim(0, n_out)
    ax.axis('off')

    # One unit square per (predicted char, input char) pair.
    for row in range(n_out):
        for col in range(n_in):
            cell = patches.Rectangle(
                (col, n_out - row - 1),
                1,
                1,
                linewidth=1,
                edgecolor='white',
                facecolor=plt.cm.Greens(attention_weights[row][col]),
            )
            ax.add_patch(cell)

    def is_bengali(char):
        return '\u0980' <= char <= '\u09FF'

    def label_font(char):
        # Bengali code points get the explicit Noto font; anything else uses the default.
        if is_bengali(char):
            return bengali_font
        return None

    # Column labels above the grid.
    for col, char in enumerate(input_word):
        ax.text(col + 0.5, n_out + 0.1, char, ha='center', va='bottom', fontsize=12, fontproperties=label_font(char))

    # Row labels to the left of the grid.
    for row, char in enumerate(predicted_word):
        ax.text(-0.1, n_out - row - 0.5, char, ha='right', va='center', fontsize=12, fontproperties=label_font(char))

    plt.tight_layout()
    wandb.log({"Attention Connectivity Map": wandb.Image(fig)})
    plt.close(fig)
# Romanized words whose attention maps are drawn.
input_words = [ "erao", "ezahar","ejahar","ekatabaddho","eeraneo"]

for input_word in input_words:
    # Greedy decode plus one attention vector per decoding step.
    predicted_word, attention_weights = run_inference_with_attention(
        final_model,
        input_word,
        test_input_letter_vocab,
        test_target_letter_vocab,
        latin_idx2token,
        bangla_idx2token,
        device,
        test_max_input_len,
        test_max_target_len
    )

    # Stack the per-step vectors into one matrix and keep only the rows/columns that
    # correspond to predicted and input characters.
    attention_matrix = np.stack(attention_weights, axis=0)
    attention_matrix = attention_matrix[:len(predicted_word), :len(input_word)]

    draw_attention_overlay(input_word, predicted_word, attention_matrix)

    print("Input word:", input_word)
    print("Predicted word:", predicted_word)
    print("Length of predicted word:", len(predicted_word))
    # Reports the size of the untrimmed per-step list, not of `attention_matrix`.
    print("Attention shape:", len(attention_weights), "x", len(attention_weights[0]))
    print("-" * 50)

# Close the W&B run opened earlier in this cell.
wandb.finish()

/usr/share/fonts/truetype/noto/NotoSansBengaliUI-Light.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-SemiBold.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-Medium.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-SemiBold.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-Condensed.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-Bold.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-Thin.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-SemiCondensed.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-Regular.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-Medium.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-ExtraBold.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-ExtraCondensed.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-Condensed.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-ExtraLight.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-Black.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-Regular.ttf
/usr/share/fonts/truetype/no

  plt.tight_layout()
  plt.tight_layout()


Input word: erao
Predicted word: এরাও
Length of predicted word: 4
Attention shape: 24 x 22
--------------------------------------------------
Input word: ezahar
Predicted word: এজাহার
Length of predicted word: 6
Attention shape: 24 x 22
--------------------------------------------------
Input word: ejahar
Predicted word: এজাহার
Length of predicted word: 6
Attention shape: 24 x 22
--------------------------------------------------
Input word: ekatabaddho
Predicted word: একতাবদ্ধ
Length of predicted word: 8
Attention shape: 24 x 22
--------------------------------------------------
Input word: eeraneo
Predicted word: ইরানেও
Length of predicted word: 6
Attention shape: 24 x 22
--------------------------------------------------


**Step:19**

In [None]:
import torch
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.font_manager import FontProperties
import wandb
# Make the Bengali Noto face Matplotlib's default font family for the plots below.
matplotlib.rcParams['font.family'] = 'Noto Sans Bengali'
import matplotlib.font_manager as fm
# Print the path of every system TTF file whose path contains "NotoSansBengali".
for font in fm.findSystemFonts(fontpaths=None, fontext='ttf'):
    if "NotoSansBengali" in font:
        print(font)
import seaborn as sns
import numpy as np
# Start a new Weights & Biases run; the connectivity figures drawn below are logged to it.
wandb.init()
def index_to_word(indices, idx2char):
    """Join the characters for ``indices``, skipping unknown ids and special tokens."""
    special_tokens = ['<pad>', '<sos>', '<eos>']
    chars = []
    for token_id in indices:
        if token_id not in idx2char:
            continue
        symbol = idx2char[token_id]
        if symbol not in special_tokens:
            chars.append(symbol)
    return ''.join(chars)
def run_inference_with_attention(model, input_word, input_vocab, target_vocab, idx2input_char, idx2target_char, device, max_input_len, max_target_len):
    """Greedily decode one word and record one attention vector per decoding step.

    Args:
        model: Object exposing ``encoder``, ``decoder``, ``decoder.attention`` and
            ``_init_decoder_hidden`` as called below.
        input_word: Source string; characters missing from ``input_vocab`` are
            encoded as ``<pad>``.
        input_vocab: Source character -> id mapping (must contain ``<pad>``).
        target_vocab: Target character -> id mapping (``<sos>`` and ``<eos>`` are read).
        idx2input_char: Not used by this function.
        idx2target_char: Target id -> character mapping used for the final decode.
        device: Device for the tensors created here.
        max_input_len: Length the encoded input is right-padded to; longer words are
            not truncated.
        max_target_len: Decoding runs for at most ``max_target_len + 2`` steps.

    Returns:
        ``(decoded_word, attention_per_step)``. The list has one NumPy array per
        executed decoding step, including a final ``<eos>`` step, so it can be longer
        than ``decoded_word``, which has special tokens removed.
    """
    model.eval()
    pad_id = input_vocab['<pad>']
    encoded = [input_vocab.get(ch, pad_id) for ch in input_word]  # unknown chars -> <pad>
    encoded.extend([pad_id] * (max_input_len - len(encoded)))  # right-pad
    source = torch.tensor(encoded, dtype=torch.long).unsqueeze(0).to(device)

    step_tokens = []
    step_attention = []
    with torch.no_grad():
        encoder_outputs, encoder_hidden = model.encoder(source)
        hidden = model._init_decoder_hidden(encoder_hidden)
        next_input = torch.tensor([target_vocab['<sos>']], device=device)  # start-of-sequence token

        for _ in range(max_target_len + 2):  # +2 for <sos> and <eos> tokens
            scores, hidden, _ = model.decoder(next_input, hidden, encoder_outputs)
            best = scores.argmax(1)
            best_id = best.item()
            step_tokens.append(best_id)
            # NOTE(review): the decoder's third return value is discarded and attention is
            # recomputed from the hidden state *after* this step -- confirm this matches the
            # weights the decoder actually used. `hidden[-1]` is presumably the last layer's state.
            weights = model.decoder.attention(hidden[-1], encoder_outputs)
            step_attention.append(weights.squeeze(0).cpu().numpy())
            if best_id == target_vocab['<eos>']:  # end of sequence
                break
            next_input = best

    return index_to_word(step_tokens, idx2target_char), step_attention
import matplotlib.pyplot as plt
import matplotlib.patches as patches
def draw_model_connectivity(input_word, predicted_word, attention_weights):
    """Draw input and predicted characters joined by attention-weighted lines and log the figure.

    Input characters are written in a row above the axes area and predicted
    characters in a column to its left. Every (predicted, input) pair is joined by
    a green line whose width and opacity follow ``attention_weights[out][in]``.
    The figure is logged to Weights & Biases under "Model Connectivity Map" and
    then closed.

    Args:
        input_word: Source string (one label per character).
        predicted_word: Predicted string (one label per character).
        attention_weights: Indexable as ``[predicted position][input position]``.
            Values are used directly as alpha, so they are assumed to lie in
            [0, 1] -- TODO confirm for the attention module in use.
    """
    from matplotlib.font_manager import FontProperties
    bengali_font_path = "/usr/share/fonts/truetype/noto/NotoSansBengali-Regular.ttf"
    bengali_font = FontProperties(fname=bengali_font_path)

    fig, ax = plt.subplots(figsize=(len(input_word), len(predicted_word)))
    ax.set_xlim(0, len(input_word))
    ax.set_ylim(0, len(predicted_word))
    ax.axis('off')

    # Anchor points of the labels; both sets lie just outside the axes limits
    # (inputs above the top edge, outputs left of the left edge).
    input_positions = [(i + 0.5, len(predicted_word) + 0.5) for i in range(len(input_word))]
    output_positions = [(0 - 0.5, len(predicted_word) - i - 0.5) for i in range(len(predicted_word))]

    def is_bengali(char):
        return '\u0980' <= char <= '\u09FF'

    # Draw characters
    for i, char in enumerate(input_word):
        font_prop = bengali_font if is_bengali(char) else None
        x, y = input_positions[i]
        ax.text(x, y, char, ha='center', va='bottom', fontsize=12, fontproperties=font_prop)

    for i, char in enumerate(predicted_word):
        font_prop = bengali_font if is_bengali(char) else None
        x, y = output_positions[i]
        ax.text(x, y, char, ha='right', va='center', fontsize=12, fontproperties=font_prop)

    # Draw connections
    for out_idx, out_pos in enumerate(output_positions):
        for in_idx, in_pos in enumerate(input_positions):
            weight = attention_weights[out_idx][in_idx]
            line = plt.Line2D(
                [in_pos[0], out_pos[0]],
                [in_pos[1], out_pos[1]],
                linewidth=2 * weight,  # line thickness based on attention
                color='green',
                alpha=weight,  # opacity also based on attention
                # Both endpoints are outside the axes limits. Lines added with add_line
                # are clipped to the axes area by default, which cut every connection
                # short of the (unclipped) character labels.
                clip_on=False,
            )
            ax.add_line(line)

    plt.tight_layout()
    wandb.log({"Model Connectivity Map": wandb.Image(fig)})
    plt.close(fig)
# Romanized words whose connectivity maps are drawn.
# NOTE(review): the stored cell output lists the words in a different order (and some twice),
# so it appears to come from an earlier version of this cell -- re-run to refresh it.
input_words = [ "ejahar","erao","ezahar","ekatabaddho","eeraneo"]

for input_word in input_words:
    # Greedy decode plus one attention vector per decoding step.
    predicted_word, attention_weights = run_inference_with_attention(
        final_model,
        input_word,
        test_input_letter_vocab,
        test_target_letter_vocab,
        latin_idx2token,
        bangla_idx2token,
        device,
        test_max_input_len,
        test_max_target_len
    )

    # Stack the per-step vectors into one matrix and keep only the rows/columns that
    # correspond to predicted and input characters.
    attention_matrix = np.stack(attention_weights, axis=0)
    attention_matrix = attention_matrix[:len(predicted_word), :len(input_word)]
    draw_model_connectivity(input_word, predicted_word, attention_matrix)
    print("Input word:", input_word)
    print("Predicted word:", predicted_word)
    print("Length of predicted word:", len(predicted_word))
    # Reports the size of the untrimmed per-step list, not of `attention_matrix`.
    print("Attention shape:", len(attention_weights), "x", len(attention_weights[0]))
    print("-" * 50)

# Close the W&B run opened earlier in this cell.
wandb.finish()

/usr/share/fonts/truetype/noto/NotoSansBengaliUI-Light.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-SemiBold.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-Medium.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-SemiBold.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-Condensed.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-Bold.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-Thin.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-SemiCondensed.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-Regular.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-Medium.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-ExtraBold.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-ExtraCondensed.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-Condensed.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-ExtraLight.ttf
/usr/share/fonts/truetype/noto/NotoSansBengaliUI-Black.ttf
/usr/share/fonts/truetype/noto/NotoSansBengali-Regular.ttf
/usr/share/fonts/truetype/no

  plt.tight_layout()
  plt.tight_layout()


Input word: eeraneo
Predicted word: ইরানেও
Length of predicted word: 6
Attention shape: 24 x 22
--------------------------------------------------
Input word: ekatabaddho
Predicted word: একতাবদ্ধ
Length of predicted word: 8
Attention shape: 24 x 22
--------------------------------------------------
Input word: ejahar
Predicted word: এজাহার
Length of predicted word: 6
Attention shape: 24 x 22
--------------------------------------------------
Input word: erao
Predicted word: এরাও
Length of predicted word: 4
Attention shape: 24 x 22
--------------------------------------------------
Input word: ezahar
Predicted word: এজাহার
Length of predicted word: 6
Attention shape: 24 x 22
--------------------------------------------------
Input word: ekatabaddho
Predicted word: একতাবদ্ধ
Length of predicted word: 8
Attention shape: 24 x 22
--------------------------------------------------
Input word: eeraneo
Predicted word: ইরানেও
Length of predicted word: 6
Attention shape: 24 x 22
-----------------