In [1]:
import pandas

In [None]:
import os
import xml.etree.ElementTree as ET
import shutil

# Input locations inside the Kaggle dataset (adjust to the actual mount layout).
word_annotations_dir = "/kaggle/input/promitilipi/PromitoLipi2/PromitoLipi2/WordAnnotations(xml)"
word_images_dir = "/kaggle/input/promitilipi/PromitoLipi2/PromitoLipi2/WordImages(bmp)"

# Everything written by the conversion goes under this directory.
output_dir = "/kaggle/working/IAM_Format/"
iam_annotations_file = os.path.join(output_dir, "annotations.txt")
iam_images_dir = os.path.join(output_dir, "images")

# makedirs creates output_dir on the way; exist_ok lets the cell be re-run.
os.makedirs(iam_images_dir, exist_ok=True)

# Lookup table from integer class index to the Bangla grapheme (or conjunct) it
# stands for. extract_text_from_xml() uses it to decode the indices stored in the
# <name> tags of each annotation file.
# NOTE(review): indices 0, 65, 80, 86 and 90 map to the literal strings 'blank',
# 'faka' and 'ফাকা'. They look like placeholders for unused classes, but they are
# emitted verbatim into decoded text if such an index occurs — confirm.
# NOTE(review): the vowel 'আ' has no entry of its own in this table — confirm
# against the dataset's class list.
class_mapping = {
    0: 'blank', 1: 'অ', 2: 'ই', 3: 'ঈ', 4: 'উ', 5: 'ঊ', 6: 'ঋ', 7: 'এ', 8: 'ঐ', 9: 'ও', 10: 'ঔ',
    11: 'ক', 12: 'খ', 13: 'গ', 14: 'ঘ', 15: 'ঙ', 16: 'চ', 17: 'ছ', 18: 'জ', 19: 'ঝ', 20: 'ঞ', 21: 'ট',
    22: 'ঠ', 23: 'ড', 24: 'ঢ', 25: 'ণ', 26: 'ত', 27: 'থ', 28: 'দ', 29: 'ধ', 30: 'ন', 31: 'প', 32: 'ফ',
    33: 'ব', 34: 'ভ', 35: 'ম', 36: 'য', 37: 'র', 38: 'ল', 39: 'শ', 40: 'ষ', 41: 'স', 42: 'হ', 43: 'ড়',
    44: 'ঢ়', 45: 'য়', 46: 'ৎ', 47: 'ঃ', 48: 'ং', 49: 'ঁ', 50: '০', 51: '১', 52: '২', 53: '৩', 54: '৪',
    55: '৫', 56: '৬', 57: '৭', 58: '৮', 59: '৯', 60: 'া', 61: 'ি', 62: 'ী', 63: 'ে', 64: 'ু', 65: 'faka',
    66: '্র', 67: '্য', 68: 'ক্ষ', 69: 'ন্ত', 70: 'ত্র', 71: 'ঙ্গ', 72: 'স্থ', 73: 'স্ব', 74: 'ক্ত',
    75: 'স্ত', 76: 'ন্দ', 77: 'চ্ছ', 78: 'দ্ধ', 79: 'ন্ত্র', 80: 'ফাকা', 81: 'ত্ত', 82: 'ষ্ট', 83: 'ন্ন',
    84: 'ল্প', 85: 'ম্প', 86: 'faka', 87: 'ূ', 88: 'ৃ', 89: 'ৈ', 90: 'faka', 91: 'ৌ', 92: '।'
}

def extract_text_from_xml(xml_path, mapping=None):
    """Decode the class indices stored in an XML annotation into a Bangla string.

    Each ``<object>`` element directly under the document root may carry a
    ``<name>`` child whose text is an integer class index. The indices are
    translated through ``mapping`` and concatenated in document order.
    Objects without a ``<name>``, with an empty ``<name/>``, or with a
    non-numeric name are skipped; indices missing from ``mapping`` contribute
    an empty string.

    Args:
        xml_path: File name or file object accepted by ``ET.parse``.
        mapping: Optional ``{int: str}`` lookup table. Defaults to the
            module-level ``class_mapping``.

    Returns:
        The decoded text, or ``""`` when the file cannot be parsed or the
        decoded text is empty / whitespace only.
    """
    if mapping is None:
        mapping = class_mapping

    try:
        root = ET.parse(xml_path).getroot()
    except ET.ParseError:
        print(f"❌ Error parsing {xml_path}")
        return ""

    pieces = []
    for obj in root.findall("object"):
        name_node = obj.find("name")
        # <name/> parses to an element whose .text is None; calling a string
        # method on it would raise AttributeError, so treat it as "no label".
        if name_node is None or name_node.text is None:
            continue
        label = name_node.text.strip()
        # isdecimal() is true only for strings int() can parse, whereas
        # isdigit() also accepts characters such as superscript digits.
        if label.isdecimal():
            pieces.append(mapping.get(int(label), ""))

    bangla_text = "".join(pieces)
    return bangla_text if bangla_text.strip() else ""

# NOTE: the conversion loop below is commented out, so running this cell neither
# writes the annotations file nor copies any image. The three print() calls at
# the bottom still execute and report completion regardless.
# NOTE(review): the disabled loop copies each .bmp file under a .png name with
# shutil.copy; the image bytes are not re-encoded. Confirm that is intended
# before re-enabling it.
# # ✅ Process dataset and save in IAM format
# with open(iam_annotations_file, "w", encoding="utf-8") as f:
#     for xml_file in sorted(os.listdir(word_annotations_dir)):
#         if not xml_file.endswith(".xml"):
#             continue  # Skip non-XML files

#         xml_path = os.path.join(word_annotations_dir, xml_file)
#         img_name = xml_file.replace(".xml", ".bmp")
#         img_path = os.path.join(word_images_dir, img_name)

#         # Extract text annotation
#         text_label = extract_text_from_xml(xml_path)
#         if not text_label.strip():
#             continue  # Skip images with no text

#         # ✅ Convert image format to PNG for IAM dataset
#         new_img_name = img_name.replace(".bmp", ".png")
#         new_img_path = os.path.join(iam_images_dir, new_img_name)
#         shutil.copy(img_path, new_img_path)

#         # ✅ Save annotation in IAM format (Image Name + Label)
#         f.write(f"{new_img_name} {text_label}\n")

# Status messages: these only echo the configured output locations.
print("✅ Conversion to IAM format completed!")
print(f"Annotations saved in: {iam_annotations_file}")
print(f"Images saved in: {iam_images_dir}")


In [None]:

!pip install -q datasets jiwer

In [None]:
import pandas as pd

# Location of the pre-built "<file name> <text>" annotation listing.
annotation_file_path = "/kaggle/input/promitilipi/IAM_annotations.txt"

# One record per line, two space-separated columns, no header row.
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as ordinary text.
df = pd.read_csv(
    annotation_file_path,
    sep=" ",
    header=None,
    names=["file_name", "text"],
    quoting=3,
)



In [None]:
# Keep the first row of every distinct transcription, then renumber rows from 0.
df = (
    df
    .drop_duplicates(subset=["text"])
    .reset_index(drop=True)
)


In [None]:
# df = df.head(200)

In [None]:
# Show the annotation table as this cell's output.
df

In [None]:

from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split, and with it the reported validation CER,
# is the same on every "Restart & Run All".
SPLIT_SEED = 42

train_df, test_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
# Renumber both frames from zero (the dataset class addresses rows as 0..len-1).
# Reassigning instead of inplace=True keeps the cell free of hidden mutation.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class IAMDataset(Dataset):
    """Image/text pairs in IAM layout, encoded for a TrOCR-style processor.

    Args:
        root_dir: Directory that contains the image files.
        df: DataFrame with a ``file_name`` and a ``text`` column.
        processor: Callable image processor that also exposes ``.tokenizer``.
        max_target_length: Length every label sequence is padded or truncated to.
    """

    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"pixel_values": tensor, "labels": tensor}`` for row ``idx``."""
        # Positional lookup: correct whatever index labels the DataFrame carries
        # (label-based ``df[col][idx]`` fails unless the index is exactly 0..n-1).
        file_name = self.df['file_name'].iloc[idx]
        text = self.df['text'].iloc[idx]

        # prepare image (i.e. resize + normalize); the context manager closes the
        # file handle once the RGB copy has been made
        with Image.open(os.path.join(self.root_dir, file_name)) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        # Encode the text. truncation=True caps over-long texts at
        # max_target_length so every item has the same label length and the
        # default DataLoader collate can stack them.
        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          truncation=True,
                                          max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        pad_id = self.processor.tokenizer.pad_token_id
        labels = [token if token != pad_id else -100 for token in labels]

        # squeeze(0) drops only the leading batch axis added by the processor
        return {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}

In [None]:
from transformers import TrOCRProcessor

# Both splits read their images from the same directory.
images_root = '/kaggle/input/promitilipi/IAM_Format/images/'

# The processor supplies image preprocessing and the tokenizer used for labels.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

train_dataset = IAMDataset(root_dir=images_root, df=train_df, processor=processor)
eval_dataset = IAMDataset(root_dir=images_root, df=test_df, processor=processor)

In [None]:
# Report how many samples ended up in each split.
for caption, split in (
    ("Number of training examples:", train_dataset),
    ("Number of validation examples:", eval_dataset),
):
    print(caption, len(split))

In [None]:
# Encode the first training sample and list the shape of every returned tensor.
encoding = train_dataset[0]
for field, tensor in encoding.items():
    print(field, tensor.shape)

In [None]:
# Open the first training image the same way the dataset does and display it.
sample_path = train_dataset.root_dir + train_df['file_name'][0]
image = Image.open(sample_path).convert("RGB")
image

In [None]:
# Turn the -100 loss-mask markers back into PAD ids so the sequence can be decoded.
labels = encoding['labels']
masked_positions = labels == -100
labels[masked_positions] = processor.tokenizer.pad_token_id

label_str = processor.decode(labels, skip_special_tokens=True)
print(label_str)

In [None]:
from torch.utils.data import DataLoader

# Same batch size for both loaders; only the training loader shuffles.
batch_size = 4

train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=batch_size)

In [None]:
from transformers import VisionEncoderDecoderModel
import torch

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Load the pretrained checkpoint and move its weights to the chosen device.
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")
model.to(device)

In [None]:
config = model.config
tokenizer = processor.tokenizer

# Special tokens used for creating the decoder_input_ids from the labels.
config.decoder_start_token_id = tokenizer.cls_token_id
config.pad_token_id = tokenizer.pad_token_id
# Copy the decoder's vocabulary size onto the top-level config.
config.vocab_size = config.decoder.vocab_size

# Beam search parameters.
config.eos_token_id = tokenizer.sep_token_id
config.max_length = 64
config.early_stopping = True
config.no_repeat_ngram_size = 3
config.length_penalty = 2.0
config.num_beams = 4

In [None]:
!pip install evaluate


In [None]:
import evaluate

# Load the "cer" metric through the evaluate library; compute_cer() calls its
# compute() method with decoded predictions and references.
cer_metric = evaluate.load("cer")


In [None]:
def compute_cer(pred_ids, label_ids):
    """Score one batch of generated token ids against its reference labels.

    Args:
        pred_ids: Batch of token ids produced by the model.
        label_ids: Batch of reference token ids in which ignored positions are
            marked with -100. The caller's object is left unmodified.

    Returns:
        Whatever ``cer_metric.compute`` returns for the decoded batch.
    """
    # Work on a copy: writing the PAD id straight into label_ids would also
    # overwrite the -100 markers in the caller's batch. Tensors expose
    # clone(); array-likes such as numpy arrays expose copy().
    labels = label_ids.clone() if hasattr(label_ids, "clone") else label_ids.copy()
    labels[labels == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels, skip_special_tokens=True)

    return cer_metric.compute(predictions=pred_str, references=label_str)

In [None]:
import torch
from tqdm.notebook import tqdm

# transformers' own AdamW was deprecated in favour of torch.optim.AdamW and is
# not importable from recent releases. eps and weight_decay are spelled out to
# keep the values the transformers optimizer used by default (1e-6 and 0.0);
# torch's own defaults differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

NUM_EPOCHS = 20

for epoch in range(NUM_EPOCHS):  # loop over the dataset multiple times
    # train
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_dataloader):
        # move every tensor of the batch to the model's device
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward + backward + optimize
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    print(f"Loss after epoch {epoch}:", train_loss/len(train_dataloader))

    # evaluate
    model.eval()
    valid_cer = 0.0
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            # run batch generation
            outputs = model.generate(batch["pixel_values"].to(device))
            # compute metrics (labels stay on the CPU; only decoding is needed)
            cer = compute_cer(pred_ids=outputs, label_ids=batch["labels"])
            valid_cer += cer

    # mean of the per-batch CER values
    print("Validation CER:", valid_cer / len(eval_dataloader))

# Writes the fine-tuned model into the current working directory.
model.save_pretrained(".")