# 02 - Tier 1 Benchmark (T4 GPU)

Run all Tier 1 models on the stratified sample set.
Designed for Colab Pro with a T4 GPU runtime.

## Setup
1. Upload `colab_data.zip` to your Google Drive root
2. Select **T4 GPU** runtime in Colab
3. Run all cells in order

**Runtime**: ~2 hours for all Tier 1 models on 100 pages

In [None]:
# === Colab Setup: clone repo, install deps, unpack data ===
!git clone https://github.com/srepho/OCR_Opensource.git 2>/dev/null || echo "Already cloned"
%cd OCR_Opensource
!pip install -q -e .
!pip install -q -r requirements/colab_tier1.txt

# Mount Google Drive and unpack data
from google.colab import drive
drive.mount('/content/drive')

import zipfile, os
DATA_ZIP = '/content/drive/MyDrive/colab_data.zip'
if os.path.exists(DATA_ZIP):
    with zipfile.ZipFile(DATA_ZIP, 'r') as zf:
        zf.extractall('.')
    print(f"Unpacked data from {DATA_ZIP}")
else:
    print(f"WARNING: {DATA_ZIP} not found - upload colab_data.zip to Drive root")

In [None]:
import sys
from pathlib import Path

import torch

# Put the current working directory (the cloned repo) on sys.path so that
# project packages such as `src` can be imported by later cells.
PROJECT_ROOT = Path(".").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Report the accelerator this runtime provides.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # The device-properties field is `total_memory` (in bytes); `total_mem`
    # does not exist and raised AttributeError on every GPU runtime.
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("GPU: None")

# Verify data is present
assert Path("data/sample_sets/stratified_100.json").exists(), "Sample sets missing - check data setup"
assert Path("data/images").exists(), "Images missing - check data setup"
print("Data check OK")

In [None]:
import yaml

# Sample set that every model in this notebook is evaluated on.
SAMPLE_SET = "data/sample_sets/stratified_100.json"

# Image and raw-output locations are read from the benchmark config file.
with open("config/benchmark_config.yaml") as config_file:
    config = yaml.safe_load(config_file)

IMAGE_DIR = config["paths"]["image_dir"]
OUTPUT_DIR = config["paths"]["raw_outputs_dir"]

# Registry keys of the Tier 1 models (T4-compatible, <=8GB VRAM), in run order.
TIER1_MODELS = [
    "doctr",
    "lighton_ocr",
    "got_ocr2",
    "florence2",
    "dots_ocr",
    "deepseek_ocr",
    "nanonets_ocr",
    "ocrflux",
    "granite_vision",
    "monkey_ocr",
    "paddleocr_vl15",
    "qwen25_vl_3b",
    "granite_docling_258m",
]

print(f"Will run {len(TIER1_MODELS)} models on {SAMPLE_SET}")

In [None]:
import gc
import traceback

from src.pipeline.runner import run_model_on_sample_set, load_model_registry

# Run every Tier 1 model over the sample set, collecting one profile per
# model that completes. A failing model is reported and skipped so the
# remaining models still run.
registry = load_model_registry("config/model_registry.yaml")
profiles = {}

for model_key in TIER1_MODELS:
    print(f"\n{'='*60}")
    print(f"Running: {model_key}")
    print(f"{'='*60}")
    try:
        profile = run_model_on_sample_set(
            model_key=model_key,
            sample_set_path=SAMPLE_SET,
            image_dir=IMAGE_DIR,
            output_dir=OUTPUT_DIR,
            registry=registry,
            device="cuda",
            skip_existing=True,
        )
    except Exception as e:
        # Notebook-level boundary: show the full traceback, then move on to
        # the next model instead of aborting the whole benchmark.
        print(f"FAILED: {e}")
        traceback.print_exc()
    else:
        profiles[model_key] = profile

    # Clear GPU memory between models. Collect Python garbage first:
    # empty_cache() can only release blocks that no live tensor still
    # occupies, so objects kept alive by reference cycles must be collected
    # before the cache is emptied.
    if torch.cuda.is_available():
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
# Summary of all runs
import shutil
from pathlib import Path

import pandas as pd

# One row per model that produced a profile in the benchmark loop.
rows = [
    {
        "model": model_key,
        "pages": profile.total_pages,
        "avg_sec/page": round(profile.avg_time_per_page, 2),
        "pages/min": round(profile.pages_per_minute, 1),
        "peak_GPU_MB": round(profile.peak_gpu_memory_mb, 0),
    }
    for model_key, profile in profiles.items()
]

df = pd.DataFrame(rows)
if df.empty:
    # Every model failed (or none were run): say so instead of printing an
    # empty frame.
    print("No models completed - nothing to summarise")
else:
    print(df.to_string(index=False))

# Save results back to Drive for persistence.
# Copy from OUTPUT_DIR, the directory the runner was told to write to, rather
# than a hard-coded path that may not match the benchmark config.
DRIVE_RESULTS = '/content/drive/MyDrive/ocr_results'
raw_outputs = Path(OUTPUT_DIR)
if raw_outputs.is_dir():
    shutil.copytree(raw_outputs, f'{DRIVE_RESULTS}/raw_outputs', dirs_exist_ok=True)
    print(f"\nResults saved to {DRIVE_RESULTS}")
else:
    print(f"\nWARNING: {raw_outputs} not found - nothing copied to {DRIVE_RESULTS}")