# 04 - Traditional Baselines (CPU)

Run traditional OCR baselines: PaddleOCR, EasyOCR, DocTR, Tesseract.
These models run on CPU, so this notebook can be run either locally or on Colab.

## Setup (Colab)
1. Upload `colab_data.zip` to your Google Drive root
2. Any runtime is fine (a CPU runtime is sufficient)
3. Run all cells in order

In [None]:
# === Setup: clone repo, install deps, unpack data ===
# Uncomment for Colab:
# !git clone https://github.com/srepho/OCR_Opensource.git 2>/dev/null || echo "Already cloned"
# %cd OCR_Opensource
# !pip install -q -e ".[traditional]"
# !apt-get install -q -y tesseract-ocr
# from google.colab import drive
# drive.mount('/content/drive')
# import zipfile, os
# DATA_ZIP = '/content/drive/MyDrive/colab_data.zip'
# if os.path.exists(DATA_ZIP):
#     with zipfile.ZipFile(DATA_ZIP, 'r') as zf:
#         zf.extractall('.')

# For local: pip install -e ".[traditional]" && brew install tesseract
# (brew is macOS-only; on Debian/Ubuntu use: sudo apt-get install -y tesseract-ocr)

In [None]:
import sys
from pathlib import Path

# Put the current working directory (expected to be the repository root) on
# sys.path so project-local packages can be imported from this notebook.
PROJECT_ROOT = Path(".").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

import yaml

# Explicit encoding: do not depend on the platform's default locale encoding.
with open("config/benchmark_config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Sample set to benchmark, plus input/output locations taken from the config.
SAMPLE_SET = "data/sample_sets/stratified_100.json"
IMAGE_DIR = config["paths"]["image_dir"]
OUTPUT_DIR = config["paths"]["raw_outputs_dir"]

# Registry keys of the CPU baselines this notebook runs.
CPU_MODELS = ["tesseract", "paddleocr", "easyocr", "doctr"]

# Check the path that is actually used, so changing SAMPLE_SET cannot leave
# this guard silently validating a different file.
assert Path(SAMPLE_SET).exists(), "Sample sets missing"
print(f"Will run {len(CPU_MODELS)} models on {SAMPLE_SET}")

## Quick Validation (1 page per model)

In [None]:
import gc
import json
import time
import traceback

from PIL import Image
from src.pipeline.runner import instantiate_adapter, load_model_registry

registry = load_model_registry("config/model_registry.yaml")

# Load one test image: the first page listed in the quick_dev sample set.
sample = json.loads(
    Path("data/sample_sets/quick_dev.json").read_text(encoding="utf-8")
)
test_page = sample["pages"][0]
test_image_path = Path(IMAGE_DIR) / test_page["pdf_stem"] / f"page_{test_page['page_num']:03d}.png"
# convert() yields an independent in-memory image, so the underlying file
# handle can be closed immediately instead of lingering until garbage collection.
with Image.open(str(test_image_path)) as raw_image:
    test_image = raw_image.convert("RGB")
print(f"Test image: {test_image_path.name} ({test_image.size})\n")

# model_key -> "PASS" or "FAIL: <message>"
validation_results = {}

for model_key in CPU_MODELS:
    print(f"--- {model_key} ---")
    adapter = None
    try:
        adapter = instantiate_adapter(model_key, registry, device="cpu")
        # perf_counter() is monotonic and high-resolution, so the measured
        # intervals are not distorted by system clock adjustments.
        t0 = time.perf_counter()
        adapter.load_model()
        load_time = time.perf_counter() - t0

        t0 = time.perf_counter()
        result = adapter.ocr_page(test_image)
        infer_time = time.perf_counter() - t0

        text_preview = result.text[:150].replace('\n', ' ')
        print(f"  PASS  load={load_time:.1f}s  infer={infer_time:.1f}s  chars={len(result.text)}  preview: {text_preview}")
        validation_results[model_key] = "PASS"
    except Exception as e:
        # A broken model must not stop the remaining models from being checked.
        print(f"  FAIL  {type(e).__name__}: {e}")
        traceback.print_exc(limit=2)
        validation_results[model_key] = f"FAIL: {e}"
    finally:
        if adapter is not None:
            try:
                adapter.unload_model()
            except Exception as unload_err:
                # Cleanup stays best-effort, but surface the problem rather
                # than hiding it: a failed unload may leave memory allocated.
                print(f"  WARN  unload failed: {type(unload_err).__name__}: {unload_err}")
            del adapter
        # Release the model's memory before the next one is loaded.
        gc.collect()
    print()

print("=" * 60)
print("VALIDATION SUMMARY")
print("=" * 60)
passed = [k for k, v in validation_results.items() if v == "PASS"]
for k, v in validation_results.items():
    status = "PASS" if v == "PASS" else "FAIL"
    print(f"  [{status}] {k}")
print(f"\n{len(passed)}/{len(validation_results)} models passed")

# Only models that passed the smoke test go on to the full benchmark.
CPU_VALIDATED = passed
print(f"\nModels for full benchmark: {CPU_VALIDATED}")

## Full Benchmark (validated models only)

In [None]:
import traceback

from src.pipeline.runner import run_model_on_sample_set

# model_key -> profile object returned by run_model_on_sample_set
profiles = {}
# model_key -> short error description, for models whose run raised
failed_models = {}

for model_key in CPU_VALIDATED:
    print(f"\n{'='*60}")
    print(f"Running: {model_key}")
    print(f"{'='*60}")
    try:
        profiles[model_key] = run_model_on_sample_set(
            model_key=model_key,
            sample_set_path=SAMPLE_SET,
            image_dir=IMAGE_DIR,
            output_dir=OUTPUT_DIR,
            registry=registry,
            device="cpu",
            skip_existing=True,
        )
    except Exception as e:
        # One failing model must not abort the remaining runs; record it so
        # the failure is still visible once the loop has finished.
        print(f"FAILED: {type(e).__name__}: {e}")
        traceback.print_exc()
        failed_models[model_key] = f"{type(e).__name__}: {e}"

# Closing summary so a partially failed run is not mistaken for a complete one.
print(f"\nCompleted {len(profiles)}/{len(CPU_VALIDATED)} models")
for failed_key, reason in failed_models.items():
    print(f"  [FAIL] {failed_key}: {reason}")

In [None]:
import pandas as pd

# Build one summary row per benchmarked model from its timing profile.
rows = [
    {
        "model": name,
        "pages": stats.total_pages,
        "avg_sec/page": round(stats.avg_time_per_page, 2),
        "pages/min": round(stats.pages_per_minute, 1),
    }
    for name, stats in profiles.items()
]

# Render as a plain-text table without the numeric index column.
df = pd.DataFrame(rows)
print(df.to_string(index=False))