# 04 - Traditional Baselines (CPU)

Run the traditional OCR baselines (PaddleOCR, EasyOCR, and docTR) on the `stratified_100` sample set.
All three models run on CPU; their results are the baselines for this benchmark.

In [None]:
# !pip install paddlepaddle paddleocr easyocr python-doctr[torch]

In [None]:
import sys
from pathlib import Path

# The repository root is taken to be the parent of the current working
# directory, i.e. the notebook is expected to run from a direct subfolder.
PROJECT_ROOT = Path(".").resolve().parent
# Put the root on sys.path (once) so project packages can be imported below.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

import yaml

# Open with an explicit encoding so parsing does not depend on the platform's
# locale default (which is not UTF-8 on every system).
with open(
    PROJECT_ROOT / "config" / "benchmark_config.yaml", encoding="utf-8"
) as f:
    config = yaml.safe_load(f)

# Inputs and outputs for the benchmark run; the image and output directories
# are read from the "paths" section of the config, relative to the root.
SAMPLE_SET = PROJECT_ROOT / "data" / "sample_sets" / "stratified_100.json"
IMAGE_DIR = PROJECT_ROOT / config["paths"]["image_dir"]
OUTPUT_DIR = PROJECT_ROOT / config["paths"]["raw_outputs_dir"]

# Model keys to run in this notebook, in execution order.
CPU_MODELS = ["paddleocr", "easyocr", "doctr"]

In [None]:
from src.pipeline.runner import run_model_on_sample_set, load_model_registry

import traceback

# Load the model registry once; the same object is passed to every run.
registry = load_model_registry(str(PROJECT_ROOT / "config" / "model_registry.yaml"))

# Keyword arguments that are identical for every model in this notebook.
shared_kwargs = {
    "sample_set_path": SAMPLE_SET,
    "image_dir": IMAGE_DIR,
    "output_dir": OUTPUT_DIR,
    "registry": registry,
    "device": "cpu",
    "skip_existing": True,
}
banner = "=" * 60

# model key -> profile returned by a successful run; failed models are absent.
profiles = {}

for model_key in CPU_MODELS:
    print(f"\n{banner}")
    print(f"Running: {model_key}")
    print(banner)
    try:
        profile = run_model_on_sample_set(model_key=model_key, **shared_kwargs)
    except Exception as exc:
        # Best effort: report the failure with its traceback and carry on
        # with the remaining models instead of aborting the whole cell.
        print(f"FAILED: {exc}")
        traceback.print_exc()
    else:
        profiles[model_key] = profile

In [None]:
import pandas as pd

# One summary row per entry in `profiles`, in the dict's insertion order.
rows = [
    {
        "model": name,
        "pages": stats.total_pages,
        "avg_sec/page": round(stats.avg_time_per_page, 2),
        "pages/min": round(stats.pages_per_minute, 1),
    }
    for name, stats in profiles.items()
]

# Print as a plain-text table without the index column.
df = pd.DataFrame(rows)
print(df.to_string(index=False))