In [None]:
%%writefile handwriting_ocr_finetuning.ipynb
# %% [markdown]
# # Handwriting OCR Fine-Tuning with TrOCR
# ## Complete Pipeline with Kaggle Hardware Optimization

In [1]:
# %% [code]
# Install dependencies
!pip install -q datasets transformers[torch] evaluate jiwer torchvision opencv-python-headless


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m:00:01[0mm00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.51.2-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.1
    Uninstalling transformers-4.51.1:
      Successfully uninstalled transformers-4.51.1
Successfully installed transformers-4.51.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
%%script echo skipping
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

skipping


In [4]:
# %% [code]
import os
import cv2
import torch
import numpy as np
from PIL import Image
from datasets import load_dataset, DatasetDict, concatenate_datasets
from transformers import (
    TrOCRProcessor,
    VisionEncoderDecoderModel,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    default_data_collator,
    EarlyStoppingCallback
)
import evaluate
from torch.nn import DataParallel

2025-04-13 16:33:12.249061: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744561992.449138      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744561992.506510      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# Hardware-aware configuration
NUM_GPUS = torch.cuda.device_count()
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
# Training parameters
MODEL_NAME = "microsoft/trocr-large-handwritten"
MAX_EPOCHS = 10
LEARNING_RATE = 5e-5
IMAGE_SIZE = (384, 384)

In [11]:
# Batch size configuration (4 for P100, 8 for T4s)
BATCH_SIZE = 8 if "T4" in torch.cuda.get_device_name(0) else 4
BATCH_SIZE *= NUM_GPUS  # Scale with multiple GPUs
GRADIENT_ACCUMULATION_STEPS = 2 if NUM_GPUS == 1 else 1

In [12]:
from datasets import load_dataset, concatenate_datasets, DatasetDict

class KaggleDataLoader:
    """Handles dataset loading from Hugging Face"""
    
    @staticmethod
    def load_iam():
        """Load IAM Handwriting from Hugging Face"""
        # Load the IAM dataset from Hugging Face
        return load_dataset("Teklia/IAM-line")
    
    @staticmethod
    def load_imgur5k():
        """Load Imgur5K from Hugging Face"""
        # Load the Imgur5K dataset from Hugging Face
        return load_dataset("staghado/IMGUR-dataset")
    
    @staticmethod
    def create_datasets():
        """Create combined dataset with validation split"""
        # Load IAM and Imgur5K datasets
        iam = KaggleDataLoader.load_iam()
        imgur = KaggleDataLoader.load_imgur5k()
        
        # Combine training data and split into train/validation sets
        combined_train = concatenate_datasets([iam["train"], imgur["train"]])
        train_val_split = combined_train.train_test_split(test_size=0.1)
        
        return DatasetDict({
            "train": train_val_split["train"],
            "validation": train_val_split["test"],
            "test": iam["test"]  # Use IAM's test set as the test set
        })

# Initialize datasets
dataset = KaggleDataLoader.create_datasets()

# Print information about the dataset splits
print(dataset)

README.md:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/167M [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/24.7M [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/73.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6482 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/976 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2915 [00:00<?, ? examples/s]

IMGUR5K-Handwriting-Dataset.zip:   0%|          | 0.00/4.74G [00:00<?, ?B/s]

chunk_0.zip:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

chunk_1.zip:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

chunk_2.zip:   0%|          | 0.00/2.49G [00:00<?, ?B/s]

chunk_3.zip:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

chunk_4.zip:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

chunk_5.zip:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

chunk_6.zip:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

chunk_7.zip:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

chunk_8.zip:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

chunk_9.zip:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

dataset_info.zip:   0%|          | 0.00/15.2M [00:00<?, ?B/s]

imgur8k-dataset.zip:   0%|          | 0.00/4.71G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'text'],
        num_rows: 737146
    })
    validation: Dataset({
        features: ['image', 'text'],
        num_rows: 81906
    })
    test: Dataset({
        features: ['image', 'text'],
        num_rows: 2915
    })
})


In [15]:
from transformers import TrOCRProcessor
from PIL import Image
import numpy as np
import cv2

class OCRPreprocessor:
    """Production-grade preprocessing with GPU-aware optimizations"""
    
    def __init__(self, model_name, image_size=(224, 224), aug_prob=0.5):
        self.processor = TrOCRProcessor.from_pretrained(model_name)
        self.image_size = image_size
        self.aug_prob = aug_prob
        
    def _process_image(self, img):
        """Full image processing pipeline"""
        # Convert to grayscale if not already
        if img.mode != "L":
            img = img.convert("L")
        img = np.array(img)
        
        # Noise reduction
        img = cv2.fastNlMeansDenoising(img, h=10)
        img = cv2.medianBlur(img, 3)
        
        # Adaptive thresholding
        img = cv2.adaptiveThreshold(
            img, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 11, 2
        )
        
        # Augmentation
        if np.random.rand() < self.aug_prob:
            img = self._perspective_augmentation(img)
            
        # Resize to the desired image size
        img = cv2.resize(img, self.image_size)
        return img
    
    def _perspective_augmentation(self, img):
        """Add perspective variation"""
        h, w = img.shape
        pts1 = np.float32([[0, 0], [w, 0], [0, h], [w, h]])  # Source points
        # Add random uniform perturbations to the destination points
        pts2 = np.float32([
            [np.random.uniform(-w * 0.05, w * 0.05), np.random.uniform(-h * 0.05, h * 0.05)],
            [w + np.random.uniform(-w * 0.05, w * 0.05), np.random.uniform(-h * 0.05, h * 0.05)],
            [np.random.uniform(-w * 0.05, w * 0.05), h + np.random.uniform(-h * 0.05, h * 0.05)],
            [w + np.random.uniform(-w * 0.05, w * 0.05), h + np.random.uniform(-h * 0.05, h * 0.05)]
        ])  # Destination points
        M = cv2.getPerspectiveTransform(pts1, pts2)  # Perspective transformation matrix
        return cv2.warpPerspective(img, M, (w, h))  # Apply the perspective warp
    
    def process_batch(self, examples):
        """Batch processing for dataset"""
        processed_images = []

        for img_data in examples["image"]:
            if isinstance(img_data, str):  # If the image is a path
                img = Image.open(img_data)
            else:  # If the image is raw data
                img = img_data
            
            # Process the image
            processed_img = Image.fromarray(self._process_image(img)).convert("RGB")
            processed_images.append(processed_img)
        
        # Ensure texts are strings
        texts = [
            str(text) if text is not None else ""  # Convert to string or use an empty string if None
            for text in examples["text"]
        ]
        
        # Processor operations
        pixel_values = self.processor(processed_images, return_tensors="pt").pixel_values
        labels = self.processor.tokenizer(
            texts, 
            padding="max_length", 
            max_length=64, 
            return_tensors="pt"
        ).input_ids
        
        return {"pixel_values": pixel_values, "labels": labels}

# Subset the 'train' split of the dataset to 500 examples
subset_size = 500
train_dataset = dataset["train"].shuffle(seed=42).select(range(subset_size))  # Take first 500 examples of the train split

# Update the subset dataset with preprocessing
MODEL_NAME = "microsoft/trocr-base-handwritten"  # Example model name
IMAGE_SIZE = (224, 224)

preprocessor = OCRPreprocessor(model_name=MODEL_NAME, image_size=IMAGE_SIZE)
processed_dataset = train_dataset.map(
    preprocessor.process_batch,
    batched=True,
    batch_size=16,  # Smaller batch size to reduce memory usage
    remove_columns=["image", "text"]  # Remove unused columns
)

preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [16]:
print(processed_dataset)

Dataset({
    features: ['pixel_values', 'labels'],
    num_rows: 500
})


In [18]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Initialize processor
MODEL_NAME = "microsoft/trocr-base-handwritten"  # Example model name
processor = TrOCRProcessor.from_pretrained(MODEL_NAME)

# Initialize model with DataParallel
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.2"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

In [19]:
if NUM_GPUS > 1:
    model = DataParallel(model)
model.to(DEVICE)


DataParallel(
  (module): VisionEncoderDecoderModel(
    (encoder): ViTModel(
      (embeddings): ViTEmbeddings(
        (patch_embeddings): ViTPatchEmbeddings(
          (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (encoder): ViTEncoder(
        (layer): ModuleList(
          (0-11): 12 x ViTLayer(
            (attention): ViTAttention(
              (attention): ViTSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=False)
                (key): Linear(in_features=768, out_features=768, bias=False)
                (value): Linear(in_features=768, out_features=768, bias=False)
              )
              (output): ViTSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (intermediate): ViTIntermediate(
              (dens

In [21]:
# Split into train and validation sets
train_test_split = processed_dataset.train_test_split(test_size=0.2, seed=42)  # 80% train, 20% validation
train_dataset = train_test_split["train"]
validation_dataset = train_test_split["test"]

In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    num_train_epochs=MAX_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE//NUM_GPUS if NUM_GPUS > 1 else BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE//NUM_GPUS if NUM_GPUS > 1 else BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    fp16=True,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    report_to="none",
    predict_with_generate=True,
    generation_max_length=64,
    metric_for_best_model="cer",
    load_best_model_at_end=True,
    greater_is_better=False,
    warmup_ratio=0.1,
    weight_decay=0.01,
    dataloader_num_workers=4*NUM_GPUS,
    remove_unused_columns=False,  # Prevent trainer from dropping columns
)

In [23]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [29]:
from transformers import Seq2SeqTrainer, EarlyStoppingCallback

# Filter out empty labels from datasets
def filter_empty_labels(example):
    return example["labels"] is not None and len(example["labels"]) > 0

train_dataset = train_dataset.filter(filter_empty_labels)
validation_dataset = validation_dataset.filter(filter_empty_labels)

# Sanity check to ensure no empty labels
train_dataset = train_dataset.filter(lambda example: len(example["labels"]) > 0)
validation_dataset = validation_dataset.filter(lambda example: len(example["labels"]) > 0)

# If the model is wrapped in DataParallel, access the underlying model
if isinstance(model, torch.nn.DataParallel):
    model = model.module

# Update compute_metrics to handle empty predictions/references
def compute_metrics(pred):
    predictions = pred.predictions
    labels = pred.label_ids

    # Decode predictions and labels
    decoded_preds = processor.tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Remove empty references and corresponding predictions
    valid_pairs = [
        (pred.strip(), label.strip())
        for pred, label in zip(decoded_preds, decoded_labels)
        if label.strip()  # Ensure reference is not empty
    ]
    decoded_preds, decoded_labels = zip(*valid_pairs) if valid_pairs else ([], [])

    if not decoded_labels:  # If all references were empty, return a default metric
        return {"cer": 1.0}

    # Compute metrics (e.g., CER)
    cer = calculate_cer(decoded_labels, decoded_preds)
    return {"cer": cer}

# Initialize trainer with early stopping
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Use the processed train split
    eval_dataset=validation_dataset,  # Use the processed validation split
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Start training
trainer.train()

Filter:   0%|          | 0/400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Cer
1,No log,0.0,1.0
2,No log,0.0,1.0
3,No log,0.0,1.0
4,0.103700,0.0,1.0


There were missing keys in the checkpoint model loaded: ['decoder.output_projection.weight'].


TrainOutput(global_step=100, training_loss=0.10371293067932129, metrics={'train_runtime': 417.8224, 'train_samples_per_second': 9.573, 'train_steps_per_second': 0.598, 'total_flos': 1.1972563055935488e+18, 'train_loss': 0.10371293067932129, 'epoch': 4.0})

In [None]:
# Access the test dataset
test_dataset = dataset["test"]

# Preprocess the test dataset
processed_test_dataset = test_dataset.map(
    preprocessor.process_batch,
    batched=True,
    batch_size=16,
    remove_columns=["image", "text"]
)

#Final evaluation
''''results = trainer.evaluate(processed_test_dataset, metric_key_prefix="test")
print(f"Final CER: {results['test_cer']*100:.2f}%")
print(f"Final WER: {results['test_wer']*100:.2f}%")''''''

In [38]:
def compute_metrics(pred):
    predictions = pred.predictions
    labels = pred.label_ids

    # Decode predictions and labels
    decoded_preds = processor.tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Remove empty references and corresponding predictions
    valid_pairs = [
        (pred.strip(), label.strip())
        for pred, label in zip(decoded_preds, decoded_labels)
        if label.strip()  # Ensure reference is not empty
    ]
    decoded_preds, decoded_labels = zip(*valid_pairs) if valid_pairs else ([], [])

    if not decoded_labels:  # If all references were empty, return default metrics
        return {"cer": 1.0, "wer": 1.0}

    # Compute metrics
    cer_value = calculate_cer(decoded_labels, decoded_preds)
    wer_value = cer_value  # For simplicity, you can use CER as a proxy for WER or use another method for WER
    return {"cer": cer_value, "wer": wer_value}

In [None]:
# Final evaluation
results = trainer.evaluate(processed_test_dataset, metric_key_prefix="test")
print(f"Final CER: {results['test_cer']*100:.2f}%")
print(f"Final WER: {results['test_wer']*100:.2f}%")

In [37]:
# Save model (handling DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model
torch.save(model_to_save.state_dict(), "trocr_finetuned.pth")
model_to_save.save_pretrained("./final_model")
processor.save_pretrained("./final_model")

[]

In [33]:
import os
import torch

# Check current working directory
print("Current working directory:", os.getcwd())

# Check if final_model directory exists
if os.path.exists("./final_model"):
    print("Directory './final_model' exists. Contents:", os.listdir("./final_model"))
else:
    print("Directory './final_model' does not exist.")

# Check if .pth file exists
if os.path.exists("trocr_finetuned.pth"):
    print("File 'trocr_finetuned.pth' exists.")
else:
    print("File 'trocr_finetuned.pth' does not exist.")

# Test write permissions
try:
    with open("test_write_permissions.txt", "w") as f:
        f.write("Testing write permissions.")
    print("Write permissions are OK.")
    os.remove("test_write_permissions.txt")
except IOError:
    print("No write permissions in the current directory.")

Current working directory: /kaggle/working
Directory './final_model' exists. Contents: ['config.json', 'generation_config.json', 'special_tokens_map.json', 'tokenizer_config.json', 'preprocessor_config.json', 'merges.txt', 'vocab.json', 'tokenizer.json', 'model.safetensors']
File 'trocr_finetuned.pth' exists.
Write permissions are OK.


In [None]:
# %% [markdown]
# ## Inference Pipeline

# %% [code]
class ProductionOCR:
    def __init__(self, model_path):
        self.processor = TrOCRProcessor.from_pretrained(model_path)
        self.model = VisionEncoderDecoderModel.from_pretrained(model_path).to(DEVICE)
        
    def predict(self, image_path):
        image = Image.open(image_path).convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values.to(DEVICE)
        outputs = self.model.generate(pixel_values)
        return self.processor.decode(outputs[0], skip_special_tokens=True)

# Usage
ocr = ProductionOCR("./final_model")
print(ocr.predict("/kaggle/input/iam-handwriting/test/a01-007-02.png"))