In [None]:
import zipfile
import json
import os
from collections import defaultdict
from tqdm import tqdm
import shutil
import random
import requests

In [None]:
train_zip = '../data/raw/train2014.zip'
annotations_zip = '../data/raw/cocotext.v2.zip'

In [None]:
def image_id_to_train_filename(image_id):
    return f'COCO_train2014_{image_id:012d}.jpg'


def load_annotations(zip_path, json_filename='cocotext.v2.json'):
    with zipfile.ZipFile(zip_path, 'r') as zipf:
        with zipf.open(json_filename) as file:
            return json.load(file)


def get_annotated_image_ids(coco_data):
    annotated_ids = set()
    for ann in coco_data['anns'].values():
        annotated_ids.add(ann['image_id'])
    return annotated_ids


def extract_annotated_images(zip_path, output_dir, target_filenames):
    os.makedirs(output_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zipf:
        zip_files = zipf.namelist()
        matched_files = [f for f in zip_files if os.path.basename(f) in target_filenames]

        if not matched_files:
            print(f'No matching files found in {zip_path}')
            return

        for file in tqdm(matched_files, desc=f'Extracting from {os.path.basename(zip_path)}'):
            zipf.extract(file, output_dir)
            
def count_images(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zipf:
        return len([f for f in zipf.namelist() if f.lower().endswith('.jpg')])

def extract_anns(zip_path, json_filename='cocotext.v2.json'):
    with zipfile.ZipFile(zip_path, 'r') as zipf:
        with zipf.open(json_filename) as json_file:
            return json.load(json_file)

def contar_labeled(coco_text_data):
    etiquetadas = set()
    for ann_id, ann in coco_text_data['anns'].items():
        img_id = ann['image_id']
        if ann['legibility'] in ['legible', 'illegible']:
            etiquetadas.add(img_id)
    return len(etiquetadas)

In [None]:
output_dir = '../data/raw/subset/train2014'

# load annotations and get annotated image IDs
coco_data = load_annotations(annotations_zip)
annotated_ids = get_annotated_image_ids(coco_data)

# convert IDs to expected filenames
train_filenames = set(image_id_to_train_filename(img_id) for img_id in annotated_ids)

# extract only annotated images from train2014.zip
extract_annotated_images(train_zip, output_dir, train_filenames)

In [None]:
def get_annotations_by_image_id(coco_text_data):
    """Groups annotations by image_id for easy lookup."""
    img_id_to_anns = defaultdict(list)
    for ann_id, ann in coco_text_data['anns'].items():
        # We only want to train on legible text
        if ann['legibility'] == 'legible':
            img_id_to_anns[ann['image_id']].append(ann)
    return img_id_to_anns

def train_filename_to_image_id(filename):
    """Converts 'COCO_train2014_000000123456.jpg' to 123456."""
    return int(filename.split('_')[-1].split('.')[0])

In [13]:
import cv2
from PIL import Image
import csv

def create_cropped_dataset(image_dir, output_dir, img_id_to_anns):
    """
    Crops text regions from images and saves them for training.
    This version uses the `csv` module to correctly handle labels
    that contain commas.
    """
    os.makedirs(output_dir, exist_ok=True)
    print(f"Processing images in '{image_dir}' and saving crops to '{output_dir}'...")

    labels_file_path = os.path.join(output_dir, 'labels.csv')

    with open(labels_file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)

        image_files = os.listdir(image_dir)

        for filename in tqdm(image_files, desc="Cropping text regions"):
            image_id = train_filename_to_image_id(filename)

            if image_id not in img_id_to_anns:
                continue

            image_path = os.path.join(image_dir, filename)
            image = cv2.imread(image_path)
            if image is None:
                continue

            annotations = img_id_to_anns[image_id]

            for i, ann in enumerate(annotations):
                bbox = [int(p) for p in ann['bbox']]
                text_label = ann['utf8_string']

                if not text_label or len(text_label) < 2:
                    continue

                x, y, w, h = bbox

                if w <= 0 or h <= 0 or x < 0 or y < 0 or (x+w) > image.shape[1] or (y+h) > image.shape[0]:
                    continue

                cropped_image = image[y:y+h, x:x+w]
                crop_filename = f"{image_id}_{i}.png"


                writer.writerow([crop_filename, text_label])

                cv2.imwrite(os.path.join(output_dir, crop_filename), cropped_image)

In [15]:
train_images_dir = '../data/raw/subset/train2014/train2014'
val_images_dir = '../data/raw/subset/val2014'

processed_train_dir = '../data/processed/train'
processed_val_dir = '../data/processed/val'

annotations_zip = '../data/raw/cocotext.v2.zip'
coco_text_data = extract_anns(annotations_zip)
img_id_to_anns = get_annotations_by_image_id(coco_text_data)

create_cropped_dataset(train_images_dir, processed_train_dir, img_id_to_anns)
create_cropped_dataset(val_images_dir, processed_val_dir, img_id_to_anns)

Processing images in '../data/raw/subset/train2014/train2014' and saving crops to '../data/processed/train'...


Cropping text regions: 100%|██████████| 15656/15656 [00:47<00:00, 328.60it/s]


Processing images in '../data/raw/subset/val2014' and saving crops to '../data/processed/val'...


Cropping text regions: 100%|██████████| 7829/7829 [00:24<00:00, 322.65it/s]


In [17]:
# In your training script
import pandas as pd
from torch.utils.data import Dataset
from PIL import Image

class OCRDataset(Dataset):
    def __init__(self, root_dir, processor):
        self.root_dir = root_dir
        df = pd.read_csv(os.path.join(root_dir, 'labels.csv'), header=None, names=['file_name', 'text'])

        df.dropna(inplace=True) # Remove rows with any NaN values
        df = df[df['text'].str.len() > 0] # Be extra sure and remove empty strings
        df.reset_index(drop=True, inplace=True) # Reset index after dropping rows
        self.df = df

        self.processor = processor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_name = self.df['file_name'][idx]
        text = self.df['text'][idx]

        text = str(text)

        image = Image.open(os.path.join(self.root_dir, file_name)).convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          max_length=64,
                                          truncation=True).input_ids

        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]
        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
        return encoding

In [18]:
import torch
from transformers import (
    VisionEncoderDecoderModel,
    TrOCRProcessor,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    default_data_collator
)
import evaluate

model_name = "microsoft/trocr-base-printed"
processor = TrOCRProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

train_dataset = OCRDataset(root_dir='../data/processed/train', processor=processor)
eval_dataset = OCRDataset(root_dir='../data/processed/val', processor=processor)

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,
    output_dir="../models/finetuned_trocr_weights",
    logging_steps=50,
    save_steps=1000,
    eval_steps=1000,
    num_train_epochs=3,
    report_to="none",
)

cer_metric = evaluate.load("cer")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer}

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream t

In [50]:
trainer.train()

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Seq2SeqTrainer(


Step,Training Loss
50,3.0511
100,2.5508
150,2.4874
200,2.3744
250,2.4859
300,2.4831
350,2.3327
400,2.1752
450,2.1962
500,2.24


The channel dimension is ambiguous. Got image shape (3, 14, 3). Assuming channels are the first dimension. Use the [input_data_format](https://huggingface.co/docs/transformers/main/internal/image_processing_utils#transformers.image_transforms.rescale.input_data_format) parameter to assign the channel dimension.
The channel dimension is ambiguous. Got image shape (3, 12, 3). Assuming channels are the first dimension. Use the [input_data_format](https://huggingface.co/docs/transformers/main/internal/image_processing_utils#transformers.image_transforms.rescale.input_data_format) parameter to assign the channel dimension.
The channel dimension is ambiguous. Got image shape (3, 7, 3). Assuming channels are the first dimension. Use the [input_data_format](https://huggingface.co/docs/transformers/main/internal/image_processing_utils#transformers.image_transforms.rescale.input_data_format) parameter to assign the channel dimension.
The channel dimension is ambiguous. Got image shape (3, 35, 3)

TrainOutput(global_step=15510, training_loss=1.3168030462750768, metrics={'train_runtime': 3827.0107, 'train_samples_per_second': 32.422, 'train_steps_per_second': 4.053, 'total_flos': 9.284722649877971e+19, 'train_loss': 1.3168030462750768, 'epoch': 3.0})

In [30]:
from PIL import Image
import torch

image_path = "../imgs/date.png"
try:
    image = Image.open(image_path).convert("RGB")
except FileNotFoundError:
    print(f"Error: Image not found at {image_path}")
    exit()

pixel_values = processor(images=image, return_tensors="pt").pixel_values

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pixel_values = pixel_values.to(device)
model.to(device)

generated_ids = model.generate(pixel_values, max_length=128)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(f"Predicted text: {generated_text}")

Predicted text: VIERNES 28 DE NOVLEMBRE DE 1986
