# 04 - Traditional Baselines (CPU)

Run traditional OCR baselines: PaddleOCR, EasyOCR, DocTR, Tesseract.
These models run on CPU, so this notebook can be run either locally or on Colab.

## Setup (Colab)
1. Upload `colab_data.zip` to your Google Drive root
2. Any runtime is fine (CPU sufficient)
3. Run all cells in order

In [None]:
# === Setup: clone repo, install deps, unpack data ===
# Uncomment for Colab:
# !git clone https://github.com/srepho/OCR_Opensource.git 2>/dev/null || echo "Already cloned"
# %cd OCR_Opensource
# !pip install -q -e ".[traditional]"
# !apt-get install -q -y tesseract-ocr
# from google.colab import drive
# drive.mount('/content/drive')
# import zipfile, os
# DATA_ZIP = '/content/drive/MyDrive/colab_data.zip'
# if os.path.exists(DATA_ZIP):
#     with zipfile.ZipFile(DATA_ZIP, 'r') as zf:
#         zf.extractall('.')

# For local (macOS): pip install -e ".[traditional]" && brew install tesseract
# On Debian/Ubuntu, install Tesseract with: sudo apt-get install tesseract-ocr

In [None]:
import sys
from pathlib import Path

import yaml

# Put the current working directory (expected to be the repository root) on
# sys.path so project-local packages can be imported from the notebook.
PROJECT_ROOT = Path(".").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Read the benchmark configuration; the encoding is explicit so the result
# does not depend on the platform's default locale.
with open("config/benchmark_config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Inputs and outputs shared by every model run in this notebook.
SAMPLE_SET = "data/sample_sets/stratified_100.json"
IMAGE_DIR = config["paths"]["image_dir"]
OUTPUT_DIR = config["paths"]["raw_outputs_dir"]

# Registry keys of the baselines executed by this notebook.
CPU_MODELS = ["tesseract", "paddleocr", "easyocr", "doctr"]

# Fail early, and check the same path that is actually used below, rather
# than a second hard-coded copy that could drift out of sync with SAMPLE_SET.
if not Path(SAMPLE_SET).exists():
    raise FileNotFoundError(f"Sample set missing: {SAMPLE_SET}")
print(f"Will run {len(CPU_MODELS)} models on {SAMPLE_SET}")

In [None]:
import traceback

from src.pipeline.runner import load_model_registry, run_model_on_sample_set

# Model registry, loaded once and passed to every run.
registry = load_model_registry("config/model_registry.yaml")

# Arguments that are identical for every model in this notebook.
shared_run_kwargs = dict(
    sample_set_path=SAMPLE_SET,
    image_dir=IMAGE_DIR,
    output_dir=OUTPUT_DIR,
    registry=registry,
    device="cpu",
    skip_existing=True,
)

# Maps model key -> the profile object returned by a successful run.
profiles = {}

for model_key in CPU_MODELS:
    print(f"\n{'='*60}")
    print(f"Running: {model_key}")
    print(f"{'='*60}")
    try:
        profile = run_model_on_sample_set(model_key=model_key, **shared_run_kwargs)
    except Exception as e:
        # Best effort: report the failure and continue with the next model,
        # so one broken backend does not abort the remaining runs.
        print(f"FAILED: {e}")
        traceback.print_exc()
    else:
        profiles[model_key] = profile

In [None]:
import pandas as pd

# Column labels of the timing summary, in display order.
summary_columns = ("model", "pages", "avg_sec/page", "pages/min")

# One row per model that produced a profile; models absent from `profiles`
# do not appear in the table.
rows = []
for model_key, profile in profiles.items():
    values = (
        model_key,
        profile.total_pages,
        round(profile.avg_time_per_page, 2),
        round(profile.pages_per_minute, 1),
    )
    rows.append(dict(zip(summary_columns, values)))

df = pd.DataFrame(rows)
summary_text = df.to_string(index=False)
print(summary_text)