# PDF Detection Dataset Builder

This notebook converts the provided PDF annotations into a YOLO-style dataset, rasterizes the source pages, and shows how to fine-tune a detector (e.g., YOLOv8/YOLO11) to find QR codes, signatures, and stamps.


In [1]:
from __future__ import annotations

import json
import math
import hashlib
import shutil
from dataclasses import dataclass
from collections import defaultdict, Counter
from pathlib import Path
from typing import Dict, List, Tuple

from PIL import Image

try:
    from pdf2image import convert_from_path
except ImportError as exc:  # pragma: no cover
    raise ImportError(
        "pdf2image is required. Install via `pip install pdf2image` and make sure Poppler is available"
    ) from exc

PROJECT_ROOT = Path("/Users/prenl/Desktop/hackathon").resolve()

PDF_DIR = PROJECT_ROOT / "pdfs"
ANNOTATION_JSON = PROJECT_ROOT / "selected_annotations.json"
DATASET_DIR = PROJECT_ROOT / "dataset_yolo"
CACHE_DIR = PROJECT_ROOT / "_pdf_cache"
TEST_PDF_DIR = PROJECT_ROOT / "test"
POPPLER_PATH = "/opt/homebrew/bin"

DATASET_DIR.mkdir(exist_ok=True)
CACHE_DIR.mkdir(exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"PDF count: {len(list(PDF_DIR.glob('*.pdf')))}")


Project root: /Users/prenl/Desktop/hackathon
PDF count: 45


In [2]:
@dataclass(frozen=True)
class AnnotationRecord:
    pdf_name: str
    page_index: int
    category: str
    bbox: Dict[str, float]
    page_size: Dict[str, float]


def parse_annotations(json_path: Path) -> List[AnnotationRecord]:
    data = json.loads(json_path.read_text())
    records: List[AnnotationRecord] = []
    for pdf_name, pages in data.items():
        for page_key, page_payload in pages.items():
            page_index = int(page_key.split("_")[-1])
            page_size = page_payload["page_size"]
            for ann_wrapper in page_payload["annotations"]:
                (annotation_id, annotation_data), = ann_wrapper.items()
                records.append(
                    AnnotationRecord(
                        pdf_name=pdf_name,
                        page_index=page_index,
                        category=annotation_data["category"],
                        bbox=annotation_data["bbox"],
                        page_size=page_size,
                    )
                )
    return records


records = parse_annotations(ANNOTATION_JSON)
category_counts = Counter(record.category for record in records)
print(f"Loaded {len(records)} annotations across {len(category_counts)} categories")
category_counts


Loaded 258 annotations across 3 categories


Counter({'signature': 103, 'qr': 95, 'stamp': 60})

In [3]:
class PDFDatasetBuilder:
    def __init__(
        self,
        pdf_dir: Path,
        cache_dir: Path,
        output_dir: Path,
        records: List[AnnotationRecord],
        dpi: int = 300,
        val_ratio: float = 0.15,
        test_ratio: float = 0.10,
    ) -> None:
        self.pdf_dir = pdf_dir
        self.cache_dir = cache_dir
        self.output_dir = output_dir
        self.records = records
        self.dpi = dpi
        self.val_ratio = val_ratio
        self.test_ratio = test_ratio
        self.category_to_id = self._build_category_mapping()
        self.page_groups = self._group_records()

    def _build_category_mapping(self) -> Dict[str, int]:
        categories = sorted({record.category for record in self.records})
        return {category: idx for idx, category in enumerate(categories)}

    def _group_records(self) -> Dict[str, Dict[int, List[AnnotationRecord]]]:
        grouped: Dict[str, Dict[int, List[AnnotationRecord]]] = defaultdict(lambda: defaultdict(list))
        for record in self.records:
            grouped[record.pdf_name][record.page_index].append(record)
        return grouped

    def _assign_split(self, pdf_name: str) -> str:
        digest = hashlib.blake2b(pdf_name.encode("utf-8"), digest_size=4).digest()
        value = int.from_bytes(digest, byteorder="big") / 2**32
        if value < self.test_ratio:
            return "test"
        if value < self.test_ratio + self.val_ratio:
            return "val"
        return "train"

    def _rasterize_pdf(self, pdf_path: Path) -> Dict[int, Path]:
        pdf_cache = self.cache_dir / pdf_path.stem
        pdf_cache.mkdir(parents=True, exist_ok=True)
        existing = sorted(pdf_cache.glob("*.png"))
        if existing:
            return {int(img.stem.split("_page_")[-1]): img for img in existing}

        pages = convert_from_path(str(pdf_path), dpi=self.dpi, fmt="png", poppler_path=POPPLER_PATH)
        page_map: Dict[int, Path] = {}
        for idx, page in enumerate(pages, start=1):
            image_path = pdf_cache / f"{pdf_path.stem}_page_{idx:03d}.png"
            page.save(image_path)
            page_map[idx] = image_path
        return page_map

    @staticmethod
    def _bbox_to_yolo(record: AnnotationRecord, image_size: Tuple[int, int]) -> Tuple[float, float, float, float]:
        img_w, img_h = image_size
        page_w = record.page_size["width"]
        page_h = record.page_size["height"]
        scale_x = img_w / page_w
        scale_y = img_h / page_h
        x = record.bbox["x"] * scale_x
        y = record.bbox["y"] * scale_y
        w = record.bbox["width"] * scale_x
        h = record.bbox["height"] * scale_y
        x_center = (x + w / 2) / img_w
        y_center = (y + h / 2) / img_h
        return x_center, y_center, w / img_w, h / img_h

    def build(self, overwrite: bool = False) -> Path:
        for subdir in ("images/train", "images/val", "images/test", "labels/train", "labels/val", "labels/test"):
            (self.output_dir / subdir).mkdir(parents=True, exist_ok=True)

        for pdf_name, pages in self.page_groups.items():
            split = self._assign_split(pdf_name)
            pdf_path = self.pdf_dir / pdf_name
            if not pdf_path.exists():
                print(f"Missing PDF for {pdf_name}, skipping")
                continue
            rasterized_pages = self._rasterize_pdf(pdf_path)
            for page_index, annotations in pages.items():
                image_path = rasterized_pages.get(page_index)
                if image_path is None:
                    print(f"Page {page_index} missing for {pdf_name}, skipping")
                    continue
                split_image_dir = self.output_dir / "images" / split
                split_label_dir = self.output_dir / "labels" / split
                target_name = f"{pdf_path.stem}_page_{page_index:03d}.png"
                target_image_path = split_image_dir / target_name
                if not target_image_path.exists() or overwrite:
                    shutil.copy2(image_path, target_image_path)
                img_w, img_h = Image.open(image_path).size
                label_lines = []
                for ann in annotations:
                    class_id = self.category_to_id[ann.category]
                    x_c, y_c, w, h = self._bbox_to_yolo(ann, (img_w, img_h))
                    label_lines.append(f"{class_id} {x_c:.6f} {y_c:.6f} {w:.6f} {h:.6f}")
                label_path = split_label_dir / f"{target_image_path.stem}.txt"
                label_path.write_text("\n".join(label_lines))
        self._write_data_yaml()
        return self.output_dir

    def _write_data_yaml(self) -> None:
        names = [category for category, _ in sorted(self.category_to_id.items(), key=lambda kv: kv[1])]
        yaml_path = self.output_dir / "data.yaml"
        yaml_content = (
            f"path: {self.output_dir.resolve()}\n"
            "train: images/train\n"
            "val: images/val\n"
            "test: images/test\n"
            "names:\n"
        )
        yaml_content += "\n".join([f"  {idx}: {name}" for idx, name in enumerate(names)]) + "\n"
        yaml_path.write_text(yaml_content)
        print(f"YOLO data file written to {yaml_path}")



In [4]:
builder = PDFDatasetBuilder(
    pdf_dir=PDF_DIR,
    cache_dir=CACHE_DIR,
    output_dir=DATASET_DIR,
    records=records,
    dpi=300,
    val_ratio=0.15,
    test_ratio=0.10,
)
dataset_path = builder.build(overwrite=False)
print(f"Dataset ready at {dataset_path}")
print("Class mapping:", builder.category_to_id)


YOLO data file written to /Users/prenl/Desktop/hackathon/dataset_yolo/data.yaml
Dataset ready at /Users/prenl/Desktop/hackathon/dataset_yolo
Class mapping: {'qr': 0, 'signature': 1, 'stamp': 2}


In [5]:
def summarize_split(split: str) -> Dict[str, int]:
    label_dir = DATASET_DIR / "labels" / split
    stats = Counter()
    for label_file in label_dir.glob("*.txt"):
        for line in label_file.read_text().splitlines():
            class_id = line.split()[0]
            stats[class_id] += 1
    return stats

for split in ("train", "val", "test"):
    split_stats = summarize_split(split)
    print(f"[{split}] images: {len(list((DATASET_DIR / 'images' / split).glob('*.png')))}")
    for class_id, count in sorted(split_stats.items(), key=lambda kv: int(kv[0])):
        class_name = [name for name, idx in builder.category_to_id.items() if idx == int(class_id)][0]
        print(f"  class {class_id} ({class_name}): {count}")


[train] images: 78
  class 0 (qr): 95
  class 1 (signature): 77
  class 2 (stamp): 43
[val] images: 9
  class 1 (signature): 14
  class 2 (stamp): 10
[test] images: 4
  class 1 (signature): 12
  class 2 (stamp): 7


## Fine-tune YOLO

Install Ultralytics and fine-tune a lightweight checkpoint (YOLOv8n/YOLO11n) on the exported dataset. The `data.yaml` created above already points to the train/val/test folders. The callback below prints a concise loss/metric summary after every epoch.


In [7]:
# !pip install ultralytics --quiet

import torch
from ultralytics import YOLO

MODEL_NAME = "yolov10n.pt"
EPOCHS = 77
BATCH = 8

if not torch.backends.mps.is_available():
    raise RuntimeError("PyTorch MPS backend is unavailable on this machine")

DEVICE = "mps" # cpu, mps, cuda
print(f"Using device: {DEVICE}")


def log_epoch_summary(trainer):
    loss_dict = {name: value for name, value in zip(trainer.loss_names, trainer.loss_items)}
    metrics = trainer.metrics or {}
    loss_str = " ".join([f"{k}:{v:.4f}" for k, v in loss_dict.items()])
    metric_str = " ".join([f"{k}:{v:.4f}" for k, v in metrics.items()])
    print(
        f"[epoch {trainer.epoch + 1}/{trainer.epochs}] "
        f"losses[{loss_str}] metrics[{metric_str if metric_str else 'n/a'}]"
    )

model = YOLO(MODEL_NAME).to(DEVICE)
model.add_callback("on_fit_epoch_end", log_epoch_summary)

results = model.train(
    data=str(DATASET_DIR / "data.yaml"),
    epochs=EPOCHS,
    imgsz=1024,
    batch=BATCH,
    lr0=0.001,
    workers=4,
    device=DEVICE,
)
results


Using device: mps
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov10n.pt to 'yolov10n.pt': 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 5.6MB 8.5MB/s 0.7s0.6s<0.0s7s9s
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov10n.pt to 'yolov10n.pt': 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 5.6MB 8.5MB/s 0.7s
New https://pypi.org/project/ultralytics/8.3.229 available üòÉ Update with 'pip install -U ultralytics'
Ultralytics 8.3.228 üöÄ Python-3.10.18 torch-2.9.1 MPS (arm64)
New https://pypi.org/project/ultralytics/8.3.229 available üòÉ Update with 'pip install -U ultralytics'
Ultralytics 8.3.228 üöÄ Python-3.10.18 torch-2.9.1 MPS (arm64)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=8, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/Users/p

ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([1, 2])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x13458a530>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.042042,    0.043043,    0.044044,    0.045045,    0.046046,    0.047047,
          0.048048, 