In [17]:
import os 

In [18]:
!pip install torch torchvision numpy pillow h5py opencv-python xmltodict





In [19]:
import os
import glob
import torch
import xml.etree.ElementTree as ET
import numpy as np
from PIL import Image
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import torch.nn.functional as F

from pathlib import Path
import numpy as np
import math
from itertools import groupby
import h5py
import numpy as np
import unicodedata
import cv2
import torch
from torch import nn
from torchvision.models import resnet50, resnet101
from torch.autograd import Variable
import torchvision

from torch.utils.data import Dataset
import time



In [20]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=128):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class OCR(nn.Module):

    def __init__(self, vocab_len, hidden_dim, nheads,
                 num_encoder_layers, num_decoder_layers):
        super().__init__()

        # create ResNet-101 backbone
        self.backbone = resnet101()
        del self.backbone.fc

        # create conversion layer
        self.conv = nn.Conv2d(2048, hidden_dim, 1)

        # create a default PyTorch transformer
        self.transformer = nn.Transformer(
            hidden_dim, nheads, num_encoder_layers, num_decoder_layers)

        # prediction heads with length of vocab
        # DETR used basic 3 layer MLP for output
        self.vocab = nn.Linear(hidden_dim,vocab_len)

        # output positional encodings (object queries)
        self.decoder = nn.Embedding(vocab_len, hidden_dim)
        self.query_pos = PositionalEncoding(hidden_dim, .2)

        # spatial positional encodings, sine positional encoding can be used.
        # Detr baseline uses sine positional encoding.
        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.trg_mask = None
  
    def generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz), 1)
        mask = mask.masked_fill(mask==1, float('-inf'))
        return mask

    def get_feature(self,x):
        x = self.backbone.conv1(x)
        x = self.backbone.bn1(x)   
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)

        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)
        return x


    def make_len_mask(self, inp):
        return (inp == 0).transpose(0, 1)


    def forward(self, inputs, trg):
        # propagate inputs through ResNet-101 up to avg-pool layer
        x = self.get_feature(inputs)

        # convert from 2048 to 256 feature planes for the transformer
        h = self.conv(x)

        # construct positional encodings
        bs,_,H, W = h.shape
        pos = torch.cat([
            self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
            self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
        ], dim=-1).flatten(0, 1).unsqueeze(1)

        # generating subsequent mask for target
        if self.trg_mask is None or self.trg_mask.size(0) != len(trg):
            self.trg_mask = self.generate_square_subsequent_mask(trg.shape[1]).to(trg.device)

        # Padding mask
        trg_pad_mask = self.make_len_mask(trg)

        # Getting postional encoding for target
        trg = self.decoder(trg)
        trg = self.query_pos(trg)
        
        output = self.transformer(pos + 0.1 * h.flatten(2).permute(2, 0, 1), trg.permute(1,0,2), tgt_mask=self.trg_mask, 
                                  tgt_key_padding_mask=trg_pad_mask.permute(1,0))

        return self.vocab(output.transpose(0,1))


def make_model(vocab_len, hidden_dim=256, nheads=4,
                 num_encoder_layers=4, num_decoder_layers=4):
    
    return OCR(vocab_len, hidden_dim, nheads,
                 num_encoder_layers, num_decoder_layers)

In [21]:
from torch.utils.data import Dataset
from PIL import Image
import torchvision.transforms as transforms

class PromitoLipiDataset(Dataset):
    def __init__(self, img_dir, annotation_file, tokenizer, transform=None):
        self.img_dir = img_dir
        self.annotations = load_annotations(annotation_file)  # Load annotations
        self.image_files = list(self.annotations.keys())  # Get image filenames
        self.tokenizer = tokenizer  # Use the Tokenizer
        self.transform = transform

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        image_name = self.image_files[idx]
        label_text = self.annotations[image_name]  # Get Bangla text label

        # Load Image
        img_path = os.path.join(self.img_dir, image_name)
        img = Image.open(img_path).convert("RGB")

        if self.transform:
            img = self.transform(img)

        # Convert text to token indices
        y_train = self.tokenizer.encode(label_text)

        # Pad sequence to max length
        y_train = np.pad(y_train, (0, self.tokenizer.maxlen - len(y_train)), mode="constant")

        return img, torch.tensor(y_train, dtype=torch.long)



# Image Transformations for ResNet-101
transform = transforms.Compose([
    transforms.Resize((128, 128)),  # Resize images
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])


In [22]:
class Tokenizer:
    """Handles text-token conversion for Bangla OCR."""

    def __init__(self, class_mapping, max_text_length=128):
        self.PAD_TK, self.UNK_TK, self.SOS_TK, self.EOS_TK = "¶", "¤", "SOS", "EOS"
        
        # Convert class_mapping to a list of characters
        self.chars = [self.PAD_TK, self.UNK_TK, self.SOS_TK, self.EOS_TK] + list(class_mapping.values())
        self.vocab_size = len(self.chars)
        self.maxlen = max_text_length

        # Create character-to-index and index-to-character mappings
        self.char_to_idx = {c: i for i, c in enumerate(self.chars)}
        self.idx_to_char = {i: c for c, i in self.char_to_idx.items()}

    def encode(self, text):
        """Encodes Bangla text into token indices."""
        text = ['SOS'] + list(text) + ['EOS']
        encoded = [self.char_to_idx.get(c, self.char_to_idx["¤"]) for c in text]
        return np.array(encoded)

    def decode(self, tokens):
        """Decodes token indices back to Bangla text."""
        text = "".join([self.idx_to_char.get(i, "") for i in tokens])
        return text.replace("SOS", "").replace("EOS", "").replace("¶", "").replace("¤", "")



# # Bangla Charset (Letters, Digits, and Common Symbols)
# bangla_charset = "অইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহড়ঢ়য়ৎঃংঁ" \
#                  "০১২৩৪৫৬৭৮৯" \
#                  "ািীেু্র্যক্ষন্তত্রঙ্গস্থস্বক্তস্তন্দচ্ছদ্ধন্ত্রত্তষ্টন্নল্পম্পূৃৈৌ।"

class_mapping = {
    0: 'blank', 1: 'অ', 2: 'ই', 3: 'ঈ', 4: 'উ', 5: 'ঊ', 6: 'ঋ', 7: 'এ', 8: 'ঐ', 9: 'ও', 10: 'ঔ',
    11: 'ক', 12: 'খ', 13: 'গ', 14: 'ঘ', 15: 'ঙ', 16: 'চ', 17: 'ছ', 18: 'জ', 19: 'ঝ', 20: 'ঞ', 21: 'ট',
    22: 'ঠ', 23: 'ড', 24: 'ঢ', 25: 'ণ', 26: 'ত', 27: 'থ', 28: 'দ', 29: 'ধ', 30: 'ন', 31: 'প', 32: 'ফ',
    33: 'ব', 34: 'ভ', 35: 'ম', 36: 'য', 37: 'র', 38: 'ল', 39: 'শ', 40: 'ষ', 41: 'স', 42: 'হ', 43: 'ড়',
    44: 'ঢ়', 45: 'য়', 46: 'ৎ', 47: 'ঃ', 48: 'ং', 49: 'ঁ', 50: '০', 51: '১', 52: '২', 53: '৩', 54: '৪',
    55: '৫', 56: '৬', 57: '৭', 58: '৮', 59: '৯', 60: 'া', 61: 'ি', 62: 'ী', 63: 'ে', 64: 'ু', 65: 'faka',
    66: '্র', 67: '্য', 68: 'ক্ষ', 69: 'ন্ত', 70: 'ত্র', 71: 'ঙ্গ', 72: 'স্থ', 73: 'স্ব', 74: 'ক্ত',
    75: 'স্ত', 76: 'ন্দ', 77: 'চ্ছ', 78: 'দ্ধ', 79: 'ন্ত্র', 80: 'ফাকা', 81: 'ত্ত', 82: 'ষ্ট', 83: 'ন্ন',
    84: 'ল্প', 85: 'ম্প', 86: 'faka', 87: 'ূ', 88: 'ৃ', 89: 'ৈ', 90: 'faka', 91: 'ৌ', 92: '।'
}

# Initialize Tokenizer with Bangla characters
tokenizer = Tokenizer(class_mapping)



In [23]:
import os

def load_annotations(annotation_file):
    """Loads annotations from a text file and converts class indices to Bangla text."""
    annotations = {}

    # Check if the file exists
    if not os.path.exists(annotation_file):
        print(f"❌ Error: Annotation file not found at {annotation_file}")
        return annotations  # Return empty dictionary

    with open(annotation_file, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(":")
            if len(parts) != 2:
                print(f"⚠️ Warning: Skipping malformed line: {line.strip()}")
                continue  # Skip corrupt lines

            image_name = parts[0].strip()  # Extract image filename

            try:
                # Convert index numbers to actual Bangla text
                label_indices = list(map(int, parts[1].strip().split()))
                label_text = "".join(class_mapping.get(idx, "") for idx in label_indices)
                annotations[image_name] = label_text  # Store result
            except ValueError:
                print(f"⚠️ Warning: Skipping line with invalid numbers: {line.strip()}")

    return annotations

# Load annotations from file
annotation_file_path = "/kaggle/input/promitilipi/imageannotationsid_train.txt"
annotations = load_annotations(annotation_file_path)


In [24]:
batch_size = 16

# Example Usage:
dataset = PromitoLipiDataset(
    img_dir="/kaggle/input/promitilipi/preprocessed_images/preprocessed_images",
    annotation_file=annotation_file_path,
    tokenizer=tokenizer,  # Pass the tokenizer
    transform=transforms.Compose([transforms.ToTensor()])
)



In [25]:
from torch.utils.data import random_split

# Define dataset
# dataset = PromitoLipiDataset(
#     img_dir="/kaggle/input/promitilipi/PromitoLipi2/PromitoLipi2/WordImages(bmp)",
#     xml_dir="/kaggle/input/promitilipi/PromitoLipi2/PromitoLipi2/WordAnnotations(xml)",
#     tokenizer=tokenizer,
#     transform=transform
# )

# Define split sizes (80% Train, 10% Validation, 10% Test)
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size  # Ensures total remains same

# Randomly split dataset
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# Create DataLoaders
batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Print dataset sizes
print(f"Training Set: {len(train_dataset)} samples")
print(f"Validation Set: {len(val_dataset)} samples")
print(f"Test Set: {len(test_dataset)} samples")


Training Set: 7864 samples
Validation Set: 983 samples
Test Set: 983 samples


In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = make_model(vocab_len=tokenizer.vocab_size).to(device)


In [27]:
class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx=0, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False)
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        
    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, Variable(true_dist, requires_grad=False))


In [28]:
criterion = LabelSmoothing(size=tokenizer.vocab_size, padding_idx=0, smoothing=0.1).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.0004)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.95)


In [29]:
!pip install jiwer




In [30]:
def train(model, criterion, optimiser, scheduler,dataloader):
 
    model.train()
    total_loss = 0
    for batch, (imgs, labels_y,) in enumerate(dataloader):
          imgs = imgs.to(device)
          labels_y = labels_y.to(device)
    
          optimiser.zero_grad()
          output = model(imgs.float(),labels_y.long()[:,:-1])
 
          norm = (labels_y != 0).sum()
          loss = criterion(output.log_softmax(-1).contiguous().view(-1, tokenizer.vocab_size), labels_y[:,1:].contiguous().view(-1).long()) / norm
 
          loss.backward()
          torch.nn.utils.clip_grad_norm_(model.parameters(), 0.2)
          optimizer.step()
          total_loss += loss.item() * norm
 
    return total_loss / len(dataloader)
 
from jiwer import cer

def evaluate(model, criterion, dataloader, tokenizer):
    model.eval()
    epoch_loss = 0
    total_cer = 0  # Track total CER
    total_samples = 0

    with torch.no_grad():
        for batch, (imgs, labels_y,) in enumerate(dataloader):
            imgs = imgs.to(device)
            labels_y = labels_y.to(device)

            # Model prediction
            output = model(imgs.float(), labels_y.long()[:, :-1])

            # Compute loss
            norm = (labels_y != 0).sum()
            loss = criterion(
                output.log_softmax(-1).contiguous().view(-1, tokenizer.vocab_size),
                labels_y[:, 1:].contiguous().view(-1).long()
            ) / norm

            epoch_loss += loss.item() * norm

            # Convert model output to text (Decoding)
            predicted_tokens = output.argmax(-1).cpu().numpy()  # Get best predictions
            predicted_texts = [tokenizer.decode(pred) for pred in predicted_tokens]  # Convert to text
            actual_texts = [tokenizer.decode(label.cpu().numpy()) for label in labels_y[:, 1:]]  # Ground truth text

            # Compute CER for this batch
            batch_cer = sum(cer(a, b) for a, b in zip(actual_texts, predicted_texts)) / len(actual_texts)
            total_cer += batch_cer
            total_samples += 1

    # Average CER across all validation samples
    avg_cer = total_cer / total_samples if total_samples > 0 else 0
    return epoch_loss / len(dataloader), avg_cer

     

In [31]:
import os

# Paths
annotation_file_path = "/kaggle/input/promitilipi/imageannotationsid_train.txt"
image_dir = "/kaggle/input/promitilipi/preprocessed_images/preprocessed_images"

# Load annotation file
if not os.path.exists(annotation_file_path):
    print(f"❌ Error: Annotation file not found at {annotation_file_path}")
    exit()

# Read image filenames from annotation file
with open(annotation_file_path, "r", encoding="utf-8") as file:
    annotation_lines = file.readlines()

# Extract image filenames (assuming they are before ':' in each line)
annotation_files = set(line.split(":")[0].strip() for line in annotation_lines)

# Get actual image filenames in the folder
image_files = set(os.listdir(image_dir))

# Find missing and extra files
missing_files = annotation_files - image_files  # Mentioned in annotations but not found
extra_files = image_files - annotation_files  # Found in the folder but not in annotations

# Print results
print(f"🔍 Total Images in Annotations: {len(annotation_files)}")
print(f"📂 Total Images in Folder: {len(image_files)}")
print(f"❌ Missing Images: {len(missing_files)}")
print(f"⚠️ Extra Images (not listed in annotations): {len(extra_files)}")

# Show some missing files if any
if missing_files:
    print("\n🚨 Missing Image Files (First 10 shown):")
    print("\n".join(list(missing_files)[:10]))

# Show some extra files if any
if extra_files:
    print("\n⚠️ Extra Image Files (First 10 shown):")
    print("\n".join(list(extra_files)[:10]))

print("\n✅ Cross-check complete!")


🔍 Total Images in Annotations: 9830
📂 Total Images in Folder: 9830
❌ Missing Images: 0
⚠️ Extra Images (not listed in annotations): 0

✅ Cross-check complete!


In [34]:
import time
import numpy as np
import torch

# Function to calculate epoch time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# Define model save paths
best_loss_model_path = "/kaggle/working/best_model_loss.pth"  # ✅ Best model based on Loss
best_cer_model_path = "/kaggle/working/best_model_cer.pth"    # ✅ Best model based on CER

# Initialize best metrics
best_valid_loss = np.inf
best_valid_cer = np.inf
c = 0

# Training loop
for epoch in range(150):
    print(f'Epoch: {epoch+1:02}, Learning Rate: {scheduler.get_last_lr()}')

    start_time = time.time()

    train_loss = train(model, criterion, optimizer, scheduler, train_loader)
    valid_loss, valid_cer = evaluate(model, criterion, val_loader, tokenizer)  # ✅ Returns CER

    epoch_mins, epoch_secs = epoch_time(start_time, time.time())

    c += 1

    # ✅ Save Best Model Based on Loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), best_loss_model_path)
        print(f"✅ Model saved (Best Loss: {valid_loss:.3f})")
        c = 0  # Reset counter

    # ✅ Save Best Model Based on CER
    if valid_cer < best_valid_cer:
        best_valid_cer = valid_cer
        torch.save(model.state_dict(), best_cer_model_path)
        print(f"✅ Model saved (Best CER: {valid_cer:.3f})")

    # ✅ Adjust Learning Rate if No Improvement
    if c > 4:
        scheduler.step()
        c = 0  # Reset counter

    print(f'Time: {epoch_mins}m {epoch_secs}s')
    print(f'Train Loss: {train_loss:.3f}')
    print(f'Val   Loss: {valid_loss:.3f}')
    print(f'CER   Score: {valid_cer:.3f}')  # ✅ Display CER Score


Epoch: 01, Learning Rate: [0.0001]
✅ Model saved (Best Loss: 94.537)
✅ Model saved (Best CER: 0.858)
Time: 1m 5s
Train Loss: 99.855
Val   Loss: 94.537
CER   Score: 0.858
Epoch: 02, Learning Rate: [0.0001]
✅ Model saved (Best Loss: 92.727)
✅ Model saved (Best CER: 0.816)
Time: 1m 4s
Train Loss: 95.740
Val   Loss: 92.727
CER   Score: 0.816
Epoch: 03, Learning Rate: [0.0001]
✅ Model saved (Best Loss: 89.883)
✅ Model saved (Best CER: 0.742)
Time: 1m 4s
Train Loss: 92.094
Val   Loss: 89.883
CER   Score: 0.742
Epoch: 04, Learning Rate: [0.0001]
✅ Model saved (Best CER: 0.543)
Time: 1m 4s
Train Loss: 88.440
Val   Loss: 90.471
CER   Score: 0.543
Epoch: 05, Learning Rate: [0.0001]
✅ Model saved (Best Loss: 85.610)
Time: 1m 4s
Train Loss: 85.869
Val   Loss: 85.610
CER   Score: 0.587
Epoch: 06, Learning Rate: [0.0001]
✅ Model saved (Best Loss: 85.035)
Time: 1m 4s
Train Loss: 83.710
Val   Loss: 85.035
CER   Score: 0.592
Epoch: 07, Learning Rate: [0.0001]
✅ Model saved (Best Loss: 83.921)
Time: 1m 

In [35]:
# Load the best model based on CER
best_cer_model_path = "/kaggle/working/best_model_cer.pth"  # ✅ Path to Best CER Model

# Initialize model
model = make_model(vocab_len=tokenizer.vocab_size).to(device)  # ✅ Recreate model architecture
model.load_state_dict(torch.load(best_cer_model_path, map_location=device))  # ✅ Load weights
model.eval()  # Set to evaluation mode

print("✅ Best CER Model Loaded Successfully!")



  model.load_state_dict(torch.load(best_cer_model_path, map_location=device))  # ✅ Load weights


✅ Best CER Model Loaded Successfully!


In [36]:
from jiwer import cer

def test_model(model, criterion, dataloader, tokenizer):
    model.eval()
    total_loss = 0
    total_cer = 0
    total_samples = 0

    with torch.no_grad():
        for batch, (imgs, labels_y) in enumerate(dataloader):
            imgs = imgs.to(device)
            labels_y = labels_y.to(device)

            # Get model predictions
            output = model(imgs.float(), labels_y.long()[:, :-1])

            # Compute loss
            norm = (labels_y != 0).sum()
            loss = criterion(
                output.log_softmax(-1).contiguous().view(-1, tokenizer.vocab_size),
                labels_y[:, 1:].contiguous().view(-1).long()
            ) / norm
            total_loss += loss.item() * norm

            # Convert predictions to text
            predicted_tokens = output.argmax(-1).cpu().numpy()  # Get best predictions
            predicted_texts = [tokenizer.decode(pred) for pred in predicted_tokens]  # Convert to text
            actual_texts = [tokenizer.decode(label.cpu().numpy()) for label in labels_y[:, 1:]]  # Ground truth

            # Compute CER for this batch
            batch_cer = sum(cer(a, b) for a, b in zip(actual_texts, predicted_texts)) / len(actual_texts)
            total_cer += batch_cer
            total_samples += 1

    # Compute average CER
    avg_cer = total_cer / total_samples if total_samples > 0 else 0
    return avg_cer

# # 🔥 Compute CER on the test dataset
test_cer = test_model(model, criterion, test_loader, tokenizer)

print(f"✅ Test CER Score: {test_cer:.3f}")


✅ Test CER Score: 0.299


In [56]:
from jiwer import cer

def test_model_with_logging(model, criterion, dataloader, tokenizer):
    model.eval()
    total_loss = 0
    total_cer = 0
    total_samples = 0
    image_results = []  # Store results for each image

    with torch.no_grad():
        for batch, (imgs, labels_y) in enumerate(dataloader):
            imgs = imgs.to(device)
            labels_y = labels_y.to(device)

            # Get model predictions
            output = model(imgs.float(), labels_y.long()[:, :-1])

            # Compute loss
            norm = (labels_y != 0).sum()
            loss = criterion(
                output.log_softmax(-1).contiguous().view(-1, tokenizer.vocab_size),
                labels_y[:, 1:].contiguous().view(-1).long()
            ) / norm
            total_loss += loss.item() * norm

            # Convert predictions to text
            predicted_tokens = output.argmax(-1).cpu().numpy()  # Get best predictions
            predicted_texts = [tokenizer.decode(pred) for pred in predicted_tokens]  # Convert to text
            actual_texts = [tokenizer.decode(label.cpu().numpy()) for label in labels_y[:, 1:]]  # Ground truth

            # Compute CER for this batch
            for actual, predicted in zip(actual_texts, predicted_texts):
                img_cer = cer(actual, predicted)  # CER for this image
                total_cer += img_cer
                total_samples += 1
                
                # Store for printing
                image_results.append({
                    "Actual Text": actual,
                    "Predicted Text": predicted,
                    "CER": img_cer
                })

    # Compute average CER
    avg_cer = total_cer / total_samples if total_samples > 0 else 0

    # Print results for each image
    print("\n🔍 **Detailed Image-wise CER Analysis**")
    for idx, result in enumerate(image_results[:10]):  # Show first 10 samples
        print(f"📌 Image {idx + 1}:")
        print(f"✅ Actual Text    : {result['Actual Text']}")
        print(f"🔠 Predicted Text : {result['Predicted Text']}")
        print(f"📉 CER for Image  : {result['CER']:.3f}\n")
    
    print(f"✅ **Final Test CER Score: {avg_cer:.3f}**")
    return avg_cer

# 🔥 Run CER calculation with logging
test_cer = test_model_with_logging(model, criterion, test_loader, tokenizer)



🔍 **Detailed Image-wise CER Analysis**
📌 Image 1:
✅ Actual Text    : ঋণ
🔠 Predicted Text : ঋণ
📉 CER for Image  : 0.000

📌 Image 2:
✅ Actual Text    : কারণ
🔠 Predicted Text : াানণ
📉 CER for Image  : 0.500

📌 Image 3:
✅ Actual Text    : িকছু
🔠 Predicted Text : ককছু
📉 CER for Image  : 0.250

📌 Image 4:
✅ Actual Text    : ১
🔠 Predicted Text : ১
📉 CER for Image  : 0.000

📌 Image 5:
✅ Actual Text    : ঔ
🔠 Predicted Text : ঔ
📉 CER for Image  : 0.000

📌 Image 6:
✅ Actual Text    : নদরই
🔠 Predicted Text : ইতইরই
📉 CER for Image  : 0.750

📌 Image 7:
✅ Actual Text    : এ
🔠 Predicted Text : অ
📉 CER for Image  : 1.000

📌 Image 8:
✅ Actual Text    : থেক
🔠 Predicted Text : পেক
📉 CER for Image  : 0.333

📌 Image 9:
✅ Actual Text    : ১
🔠 Predicted Text : ১
📉 CER for Image  : 0.000

📌 Image 10:
✅ Actual Text    : ওৎ
🔠 Predicted Text : ওৎ
📉 CER for Image  : 0.000

✅ **Final Test CER Score: 0.298**


# Skip texts with length > 2

In [70]:
from jiwer import cer

def test_model_with_logging(model, criterion, dataloader, tokenizer):
    model.eval()
    total_loss = 0
    total_cer = 0
    total_samples = 0
    image_results = []  # Store results for each image

    with torch.no_grad():
        for batch, (imgs, labels_y) in enumerate(dataloader):
            imgs = imgs.to(device)
            labels_y = labels_y.to(device)

            # Get model predictions
            output = model(imgs.float(), labels_y.long()[:, :-1])

            # Compute loss
            norm = (labels_y != 0).sum()
            loss = criterion(
                output.log_softmax(-1).contiguous().view(-1, tokenizer.vocab_size),
                labels_y[:, 1:].contiguous().view(-1).long()
            ) / norm
            total_loss += loss.item() * norm

            # Convert predictions to text
            predicted_tokens = output.argmax(-1).cpu().numpy()  # Get best predictions
            predicted_texts = [tokenizer.decode(pred) for pred in predicted_tokens]  # Convert to text
            actual_texts = [tokenizer.decode(label.cpu().numpy()) for label in labels_y[:, 1:]]  # Ground truth

            # Compute CER for this batch (Skipping short actual texts)
            for actual, predicted in zip(actual_texts, predicted_texts):
                if len(actual.strip()) > 2:  # Skip texts with length ≤ 2
                    continue

                img_cer = cer(actual, predicted)  # CER for this image
                total_cer += img_cer
                total_samples += 1
                
                # Store for printing
                image_results.append({
                    "Actual Text": actual,
                    "Predicted Text": predicted,
                    "CER": img_cer
                })
    print("len :", len(image_results))
    # Compute average CER
    avg_cer = total_cer / total_samples if total_samples > 0 else 0

    # Print results for each image
    print("\n🔍 **Detailed Image-wise CER Analysis**")
    for idx, result in enumerate(image_results[:3]):  # Show first 10 samples
        print(f"📌 Image {idx + 1}:")
        print(f"✅ Actual Text    : {result['Actual Text']}")
        print(f"🔠 Predicted Text : {result['Predicted Text']}")
        print(f"📉 CER for Image  : {result['CER']:.3f}\n")
    
    print(f"✅ **Final Test CER Score: {avg_cer:.3f}**")
    return avg_cer

# 🔥 Run CER calculation with logging
test_cer = test_model_with_logging(model, criterion, test_loader, tokenizer)


len : 547

🔍 **Detailed Image-wise CER Analysis**
📌 Image 1:
✅ Actual Text    : ঋণ
🔠 Predicted Text : ঋণ
📉 CER for Image  : 0.000

📌 Image 2:
✅ Actual Text    : ১
🔠 Predicted Text : ১
📉 CER for Image  : 0.000

📌 Image 3:
✅ Actual Text    : ঔ
🔠 Predicted Text : ঔ
📉 CER for Image  : 0.000

✅ **Final Test CER Score: 0.091**


# Skip texts with length ≤ 2

In [69]:
from jiwer import cer


def test_model_with_logging(model, criterion, dataloader, tokenizer):
    model.eval()
    total_loss = 0
    total_cer = 0
    total_samples = 0
    image_results = []  # Store results for each image

    with torch.no_grad():
        for batch, (imgs, labels_y) in enumerate(dataloader):
            imgs = imgs.to(device)
            labels_y = labels_y.to(device)

            # Get model predictions
            output = model(imgs.float(), labels_y.long()[:, :-1])

            # Compute loss
            norm = (labels_y != 0).sum()
            loss = criterion(
                output.log_softmax(-1).contiguous().view(-1, tokenizer.vocab_size),
                labels_y[:, 1:].contiguous().view(-1).long()
            ) / norm
            total_loss += loss.item() * norm

            # Convert predictions to text
            predicted_tokens = output.argmax(-1).cpu().numpy()  # Get best predictions
            predicted_texts = [tokenizer.decode(pred) for pred in predicted_tokens]  # Convert to text
            actual_texts = [tokenizer.decode(label.cpu().numpy()) for label in labels_y[:, 1:]]  # Ground truth

            # Compute CER for this batch (Skipping short actual texts)
            for actual, predicted in zip(actual_texts, predicted_texts):
                if len(actual.strip()) <=2:  # Skip texts with length ≤ 2
                    continue

                img_cer = cer(actual, predicted)  # CER for this image
                total_cer += img_cer
                total_samples += 1
                
                # Store for printing
                image_results.append({
                    "Actual Text": actual,
                    "Predicted Text": predicted,
                    "CER": img_cer
                })

    print("len :", len(image_results))

    # Compute average CER
    avg_cer = total_cer / total_samples if total_samples > 0 else 0

    # Print results for each image
    print("\n🔍 **Detailed Image-wise CER Analysis**")
    for idx, result in enumerate(image_results[:3]):  # Show first 10 samples
        print(f"📌 Image {idx + 1}:")
        print(f"✅ Actual Text    : {result['Actual Text']}")
        print(f"🔠 Predicted Text : {result['Predicted Text']}")
        print(f"📉 CER for Image  : {result['CER']:.3f}\n")
    
    print(f"✅ **Final Test CER Score: {avg_cer:.3f}**")
    return avg_cer

# 🔥 Run CER calculation with logging
test_cer = test_model_with_logging(model, criterion, test_loader, tokenizer)


len : 436

🔍 **Detailed Image-wise CER Analysis**
📌 Image 1:
✅ Actual Text    : কারণ
🔠 Predicted Text : াানণ
📉 CER for Image  : 0.500

📌 Image 2:
✅ Actual Text    : িকছু
🔠 Predicted Text : ককছু
📉 CER for Image  : 0.250

📌 Image 3:
✅ Actual Text    : নদরই
🔠 Predicted Text : ইতইরই
📉 CER for Image  : 0.750

✅ **Final Test CER Score: 0.557**
