In [1]:
# Install required libraries
!pip install mltu
!pip install torch torchvision torchaudio
!pip install torchsummaryX


Collecting mltu
  Downloading mltu-1.2.5-py3-none-any.whl.metadata (3.4 kB)
Collecting qqdm==0.0.7 (from mltu)
  Downloading qqdm-0.0.7.tar.gz (5.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting onnxruntime>=1.15.0 (from mltu)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting addict (from qqdm==0.0.7->mltu)
  Downloading addict-2.4.0-py3-none-any.whl.metadata (1.0 kB)
Collecting jupyter (from qqdm==0.0.7->mltu)
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting coloredlogs (from onnxruntime>=1.15.0->mltu)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime>=1.15.0->mltu)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting jupyterlab (from jupyter->qqdm==0.0.7->mltu)
  Downloading jupyterlab-4.4.2-py3-none-any.whl.metadata (16 kB)
Collecting async-lru>=1.0.0 (

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import os
import tarfile
from tqdm import tqdm
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO

def download_and_unzip(url, extract_to='Datasets', chunk_size=1024*1024):
    print("Downloading dataset...")
    http_response = urlopen(url)

    data = b''
    iterations = http_response.length // chunk_size + 1
    for _ in tqdm(range(iterations)):
        data += http_response.read(chunk_size)

    zipfile = ZipFile(BytesIO(data))
    zipfile.extractall(path=extract_to)
    print(f"Dataset extracted to: {extract_to}")

# Download IAM Words dataset (includes words.tgz + words.txt)
dataset_path = os.path.join('Datasets', 'IAM_Words')
if not os.path.exists(dataset_path):
    download_and_unzip('https://git.io/J0fjL', extract_to='Datasets')

    # Extract the words.tgz archive inside it
    words_tgz = os.path.join(dataset_path, "words.tgz")
    with tarfile.open(words_tgz) as tar:
        tar.extractall(path=os.path.join(dataset_path, "words"))

print("✅ Dataset is ready!")


Downloading dataset...


100%|██████████| 784/784 [03:59<00:00,  3.27it/s]


Dataset extracted to: Datasets
✅ Dataset is ready!


In [4]:
import os

base_path = "Datasets/IAM_Words"

for root, dirs, files in os.walk(base_path):
    print(f"📁 {root}")
    for d in dirs:
        print(f"  └── {d}")
    break  # only show top-level folder structure


📁 Datasets/IAM_Words
  └── words


In [5]:
import os

for root, dirs, files in os.walk("Datasets/IAM_Words"):
    for file in files:
        if file.endswith("words.txt"):
            print("✅ Found words.txt at:", os.path.join(root, file))


✅ Found words.txt at: Datasets/IAM_Words/words.txt


In [6]:
import os

for root, dirs, files in os.walk("Datasets/IAM_Words"):
    for file in files:
        if file.endswith(".png"):
            print("✅ Found existing PNG:", os.path.join(root, file))
            break
    if files:
        break


In [7]:
import os

print("🔍 Looking for images...")

count = 0
for root, dirs, files in os.walk("Datasets/IAM_Words"):
    for file in files:
        if file.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tif", ".tiff")):
            print(os.path.join(root, file))
            count += 1
            if count >= 5:
                break
    if count >= 5:
        break

if count == 0:
    print("❌ No image files found.")


🔍 Looking for images...
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-03-02.png
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-08-01.png
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-04-06.png
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-08-04.png
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-03-00.png


In [8]:
dataset = []
vocab = set()
max_len = 0

for root, dirs, files in os.walk("Datasets/IAM_Words/words"):
    for file in files:
        if file.endswith(".png"):
            full_path = os.path.join(root, file)
            label = os.path.splitext(file)[0]  # use filename (e.g., "b05-058-03-07")
            dataset.append([full_path, label])
            vocab.update(label)
            max_len = max(max_len, len(label))

print(f"\n✅ Samples from filenames: {len(dataset)}")
print(f"✅ Vocab size: {len(vocab)}")
print(f"✅ Max label length: {max_len}")



✅ Samples from filenames: 115320
✅ Vocab size: 30
✅ Max label length: 14


In [9]:
import os
import json

class Configs:
    def __init__(self):
        self.vocab = "".join(sorted(vocab))
        self.max_text_length = max_len
        self.input_shape = (128, 32, 3)  # (width, height, channels)
        self.batch_size = 64
        self.learning_rate = 0.0001
        self.model_path = "IAM_model"

    def save(self, path="IAM_model/configs.json"):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w") as f:
            json.dump(self.__dict__, f, indent=4)
        print(f"✅ Config saved to: {path}")

# Create and save config
configs = Configs()
configs.save()


✅ Config saved to: IAM_model/configs.json


In [10]:
!pip install mltu






In [11]:
from mltu.torch.dataProvider import DataProvider
from mltu.preprocessors import ImageReader
from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding

data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=[ImageReader(image_class="PIL")],
    transformers=[
        ImageResizer(width=configs.input_shape[0], height=configs.input_shape[1]),
        LabelIndexer(configs.vocab),
        LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab))
    ],
    use_cache=True,
)


INFO:DataProvider:Skipping Dataset validation...


In [12]:
!pip install transformers datasets evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [13]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
import os
from tqdm import tqdm

dataset = []
vocab = set()
max_len = 0

# Adjust these paths if needed
words_txt_path = "Datasets/IAM_Words/words.txt"
image_root_path = "Datasets/IAM_Words/words"

# Create an index of available images
image_index = set()
for root, dirs, files in os.walk(image_root_path):
    for file in files:
        if file.endswith(".png"):
            rel_path = os.path.relpath(os.path.join(root, file), image_root_path)
            rel_path = rel_path.replace("\\", "/")  # For Windows compatibility
            image_index.add(rel_path)

# Process words.txt
with open(words_txt_path, "r") as f:
    for line in tqdm(f):
        if line.startswith("#"):
            continue

        parts = line.strip().split()
        file_id = parts[0]
        status = parts[1]
        if status != "ok":
            continue

        label = " ".join(parts[8:])
        sub_path = file_id.replace("-", "/") + ".png"

        if sub_path in image_index:
            full_path = os.path.join(image_root_path, sub_path)
            dataset.append([full_path, label])
            vocab.update(label)
            max_len = max(max_len, len(label))


115338it [00:00, 671491.52it/s]


In [15]:
import pandas as pd
from datasets import Dataset

df = pd.DataFrame(dataset, columns=["image_path", "text"])
dataset_hf = Dataset.from_pandas(df)


In [16]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Load processor and model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

In [17]:
import cv2
from PIL import Image
import numpy as np

def preprocess(example):
    try:
        # Load image and convert to RGB
        image = cv2.imread(example["image_path"], cv2.IMREAD_COLOR)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(image)

        # Encode image
        encoding = processor(images=image, return_tensors="pt")
        pixel_values = encoding.pixel_values.squeeze().numpy().tolist()

        # Tokenize text label
        labels = processor.tokenizer(
            example["text"],
            padding="max_length",
            max_length=64,
            truncation=True
        ).input_ids

        return {"pixel_values": pixel_values, "labels": labels}

    except Exception as e:
        print(f"❌ Error processing {example['image_path']}: {e}")
        return {"pixel_values": None, "labels": None}


In [18]:
processed_dataset = dataset_hf.map(
    preprocess,
    remove_columns=["image_path", "text"],
    batched=False,
    num_proc=1
)

# Drop bad samples
processed_dataset = processed_dataset.filter(lambda x: x["pixel_values"] is not None)


In [19]:
# Step 6: Split processed dataset into train and validation
split_dataset = processed_dataset.train_test_split(test_size=0.1)

train_ds = split_dataset["train"]
val_ds = split_dataset["test"]

print(f"✔️ Train size: {len(train_ds)}")
print(f"✔️ Validation size: {len(val_ds)}")


✔️ Train size: 0
✔️ Validation size: 0


In [20]:
!pip install --upgrade transformers --quiet
import importlib
importlib.reload(__import__("transformers"))


<module 'transformers' from '/usr/local/lib/python3.11/dist-packages/transformers/__init__.py'>

In [21]:
def preprocess(example):
    try:
        import cv2
        from PIL import Image
        import numpy as np

        image = cv2.imread(example["image_path"], cv2.IMREAD_COLOR)
        if image is None:
            raise ValueError("Image could not be read")

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(image)

        encoding = processor(images=image, return_tensors="pt")
        pixel_values = encoding.pixel_values.squeeze().numpy().tolist()

        labels = processor.tokenizer(
            example["text"],
            padding="max_length",
            max_length=64,
            truncation=True
        ).input_ids

        return {"pixel_values": pixel_values, "labels": labels}

    except Exception as e:
        print(f"❌ Error processing {example['image_path']}: {e}")
        return {"pixel_values": None, "labels": None}


In [22]:
# Apply preprocessing to all samples
processed_dataset = dataset_hf.map(preprocess, remove_columns=dataset_hf.column_names)

# Filter out failed samples
processed_dataset = processed_dataset.filter(lambda example: example["pixel_values"] is not None)


In [23]:
print("📁 Total parsed entries from words.txt:", len(dataset))
if len(dataset) > 0:
    print("🔎 First sample path:", dataset[0][0])
    print("📝 First label:", dataset[0][1])
else:
    print("❌ Dataset is empty. Something went wrong while parsing.")


📁 Total parsed entries from words.txt: 0
❌ Dataset is empty. Something went wrong while parsing.


In [24]:
!find Datasets/IAM_Words/words -name "*.png" | head -n 10


Datasets/IAM_Words/words/f07/f07-092a/f07-092a-03-02.png
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-08-01.png
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-04-06.png
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-08-04.png
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-03-00.png
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-03-05.png
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-02-05.png
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-01-10.png
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-08-03.png
Datasets/IAM_Words/words/f07/f07-092a/f07-092a-05-05.png


In [25]:
print("✅ Number of samples parsed so far:", len(dataset))


✅ Number of samples parsed so far: 0


In [26]:
import os
from tqdm import tqdm

dataset = []
vocab = set()
max_len = 0

words_txt_path = "Datasets/IAM_Words/words.txt"
image_root_path = "Datasets/IAM_Words/words"

# Rebuild image path index
available_images = set()
for root, _, files in os.walk(image_root_path):
    for file in files:
        if file.endswith(".png"):
            rel_path = os.path.relpath(os.path.join(root, file), image_root_path).replace("\\", "/")
            available_images.add(rel_path)

# Start parsing words.txt
for line in tqdm(open(words_txt_path, "r").readlines(), desc="🔍 Parsing words.txt"):
    if line.startswith("#"):
        continue
    parts = line.strip().split()
    file_id = parts[0]
    status = parts[1]
    if status != "ok":
        continue
    label = " ".join(parts[8:])
    filename = file_id + ".png"
    matched_path = next((path for path in available_images if path.endswith(filename)), None)
    if not matched_path:
        continue
    full_path = os.path.join(image_root_path, matched_path)
    dataset.append([full_path, label])
    vocab.update(label)
    max_len = max(max_len, len(label))

print("✅ Total matched samples:", len(dataset))


🔍 Parsing words.txt:   0%|          | 357/115338 [00:05<28:17, 67.74it/s]


KeyboardInterrupt: 

In [27]:
df = pd.DataFrame(dataset, columns=["image_path", "text"])


In [28]:
import pandas as pd
from datasets import Dataset

# Create DataFrame from the parsed list
df = pd.DataFrame(dataset, columns=["image_path", "text"])

# Optional: Save it for backup
df.to_csv("iam_dataset_subset.csv", index=False)

# Convert to HuggingFace Dataset
dataset_hf = Dataset.from_pandas(df)
print("✅ Dataset size loaded into HF Dataset:", len(dataset_hf))


✅ Dataset size loaded into HF Dataset: 280


In [29]:
from PIL import Image
import cv2
import numpy as np
from transformers import TrOCRProcessor

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

def preprocess(example):
    try:
        image = cv2.imread(example["image_path"], cv2.IMREAD_COLOR)
        if image is None:
            raise ValueError("Image could not be read")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(image)
        encoding = processor(images=image, return_tensors="pt")
        pixel_values = encoding.pixel_values.squeeze().numpy().tolist()
        labels = processor.tokenizer(
            example["text"],
            padding="max_length",
            max_length=64,
            truncation=True
        ).input_ids
        return {"pixel_values": pixel_values, "labels": labels}
    except Exception as e:
        print(f"❌ Failed: {example['image_path']} | {e}")
        return {"pixel_values": None, "labels": None}


In [30]:
# Process images one by one to reduce memory usage
processed_dataset = dataset_hf.map(preprocess, remove_columns=dataset_hf.column_names, batched=False)

# Filter out failed samples
processed_dataset = processed_dataset.filter(lambda x: x["pixel_values"] is not None)

print("✅ Final usable samples after preprocessing:", len(processed_dataset))


Map:   0%|          | 0/280 [00:00<?, ? examples/s]

Filter:   0%|          | 0/280 [00:00<?, ? examples/s]

✅ Final usable samples after preprocessing: 280


In [31]:
import torch
from torch.utils.data import DataLoader

# Custom collate function to handle padding
def collate_fn(batch):
    pixel_values = torch.tensor([item["pixel_values"] for item in batch], dtype=torch.float32)

    labels = [torch.tensor(item["labels"]) for item in batch]
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=processor.tokenizer.pad_token_id)

    return {
        "pixel_values": pixel_values,
        "labels": labels
    }

# Create DataLoader
train_dataloader = DataLoader(processed_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

print("✅ Dataloader created with", len(train_dataloader), "batches.")


✅ Dataloader created with 35 batches.


In [32]:
from transformers import VisionEncoderDecoderModel

# Load model
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

In [33]:
# ✅ Fix the error: Set decoder start and pad token IDs
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

# ✅ Now start training
model.train()
for epoch in range(2):
    ...


In [34]:
from tqdm import tqdm

model.train()
for epoch in range(2):  # Keep epochs low for quick test
    total_loss = 0
    print(f"\n🔁 Epoch {epoch + 1}")
    for batch in tqdm(train_dataloader):
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"✅ Avg Loss after Epoch {epoch + 1}: {total_loss / len(train_dataloader):.4f}")



🔁 Epoch 1


  0%|          | 0/35 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
100%|██████████| 35/35 [01:35<00:00,  2.74s/it]


✅ Avg Loss after Epoch 1: 1.2886

🔁 Epoch 2


100%|██████████| 35/35 [01:33<00:00,  2.68s/it]

✅ Avg Loss after Epoch 2: 0.1755





In [43]:
from google.colab import files

uploaded = files.upload()  # 👈 This will open a file picker


Saving r06-097.png to r06-097.png


In [45]:
model.save_pretrained("trocr-finetuned-iam")
processor.save_pretrained("trocr-finetuned-iam")


[]

In [46]:
from transformers import VisionEncoderDecoderModel, TrOCRProcessor

# ✅ Load from saved local directory
model = VisionEncoderDecoderModel.from_pretrained("trocr-finetuned-iam")
processor = TrOCRProcessor.from_pretrained("trocr-finetuned-iam")
model.to(device)


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (i

In [47]:
# STEP 1 (do this only once after training)
model.save_pretrained("trocr-finetuned-iam")
processor.save_pretrained("trocr-finetuned-iam")

# STEP 2 (for inference later)
model = VisionEncoderDecoderModel.from_pretrained("trocr-finetuned-iam")
processor = TrOCRProcessor.from_pretrained("trocr-finetuned-iam")
model.to(device)


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (i

In [48]:
from PIL import Image
import cv2

# ✅ Uploaded image file
test_image_path = "r03-096-05-05.png"

# ✅ Load and preprocess the image
image = cv2.imread(test_image_path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = Image.fromarray(image)
image = image.resize((384, 96))  # You can tweak this size if needed

# ✅ Generate prediction
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
generated_ids = model.generate(pixel_values)

# ✅ Decode prediction
predicted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("📝 Predicted Text:", predicted_text)


📝 Predicted Text: to,of
