# 00 - Data Preparation

This notebook handles:
1. Extracting PDFs from zip batches
2. Rendering PDF pages to PNG images
3. Extracting embedded text as ground truth

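Run this notebook locally before the Colab benchmarks. Start it from a directory directly under the project root (for example the notebooks folder), because the first cell takes the parent of the current working directory as the project root.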

In [None]:
import sys
from pathlib import Path

# Resolve the project root (the parent of the current working directory) and
# put it at the front of sys.path so the project's packages can be imported.
PROJECT_ROOT = Path(".").resolve().parent
project_root_entry = str(PROJECT_ROOT)
if project_root_entry not in sys.path:
    sys.path.insert(0, project_root_entry)

print(f"Project root: {PROJECT_ROOT}")

In [None]:
import yaml

# Load the benchmark configuration used by the later steps of this notebook.
# The file is decoded explicitly as UTF-8 so that non-ASCII values parse the
# same way on every platform instead of depending on the locale default.
config_path = PROJECT_ROOT / "config" / "benchmark_config.yaml"
with open(config_path, encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Show key paths
for key, val in config["paths"].items():
    print(f"{key}: {val}")

## Step 1: Extract PDFs from Zip Batches

In [None]:
from src.data_prep.extract_pdfs import extract_all_batches

# Resolve every configured zip batch, and the PDF output folder, against the
# project root.
zip_paths = []
for batch_rel_path in config["zip_batches"]:
    zip_paths.append(PROJECT_ROOT / batch_rel_path)
pdf_dir = PROJECT_ROOT / config["paths"]["pdf_dir"]

print(f"Looking for zip batches: {list(map(str, zip_paths))}")
print(f"Output directory: {pdf_dir}")

# Hand the resolved archive paths and destination to the project extractor.
pdfs = extract_all_batches(zip_paths, pdf_dir)

In [None]:
# Verify extraction: report how many PDFs landed in pdf_dir and how large they are.
import os

pdf_files = sorted(pdf_dir.glob("*.pdf"))
pdf_count = len(pdf_files)
print(f"Total PDFs: {pdf_count}")

# Accumulate the on-disk size of every extracted PDF (bytes).
total_size = 0
for pdf_path in pdf_files:
    total_size += pdf_path.stat().st_size
print(f"Total size: {total_size / (1024*1024):.1f} MB")
if pdf_files:
    print(f"Average size: {total_size / pdf_count / 1024:.0f} KB")
else:
    print("No PDFs")

# List only the first few files and summarise the remainder.
preview_limit = 10
for pdf_path in pdf_files[:preview_limit]:
    print(f"  {pdf_path.name} ({pdf_path.stat().st_size / 1024:.0f} KB)")
remaining = pdf_count - preview_limit
if remaining > 0:
    print(f"  ... and {remaining} more")

## Step 2: Render PDF Pages to PNG

In [None]:
from src.data_prep.render_pages import render_all_pdfs

# Output folder and resolution both come from the benchmark config.
image_dir = PROJECT_ROOT / config["paths"]["image_dir"]
rendering_cfg = config["rendering"]
dpi = rendering_cfg["dpi"]

print(f"Rendering at {dpi} DPI to {image_dir}")

# "max_pages_per_pdf" is optional in the config; a missing key is passed on as None.
max_pages_per_pdf = rendering_cfg.get("max_pages_per_pdf")
rendered = render_all_pdfs(
    pdf_dir=pdf_dir,
    image_dir=image_dir,
    dpi=dpi,
    max_pages=max_pages_per_pdf,
)

In [None]:
# Verify rendering: count the page images produced per PDF and preview one page.
total_images = sum(len(pages) for pages in rendered.values())
print(f"Rendered {total_images} page images from {len(rendered)} PDFs")

# Show a sample image
from IPython.display import display
from PIL import Image

PREVIEW_WIDTH = 400  # preview width in pixels; height follows the page's aspect ratio

# Prefer the first PDF that actually produced page images. If none did, fall
# back to the first PDF key; sample_stem is None only when `rendered` is empty.
sample_stem = next((stem for stem, pages in rendered.items() if pages), None)
if sample_stem is None:
    sample_stem = next(iter(rendered), None)
sample_images = rendered[sample_stem] if sample_stem is not None else []

if sample_images:
    # Open via a context manager so the underlying file handle is released.
    with Image.open(str(sample_images[0])) as img:
        width, height = img.size
        print(f"Sample: {sample_images[0].name} ({width}x{height})")
        # Clamp to 1px so an extremely wide page cannot produce a zero height.
        preview_height = max(1, int(PREVIEW_WIDTH * height / width))
        display(img.resize((PREVIEW_WIDTH, preview_height)))
else:
    print("No rendered page images available to preview")

## Step 3: Extract Embedded Text (Ground Truth)

In [None]:
from src.data_prep.extract_embedded_text import extract_all_pdfs

# Destination folder and discrepancy threshold both come from the benchmark config.
gt_text_dir = PROJECT_ROOT / config["paths"]["embedded_text_dir"]
ground_truth_cfg = config["ground_truth"]
threshold = ground_truth_cfg["discrepancy_threshold"]

print(f"Extracting embedded text to {gt_text_dir}")
print(f"Discrepancy threshold: {threshold}")

# Run the project extractor over every PDF in pdf_dir.
extraction_kwargs = {
    "pdf_dir": pdf_dir,
    "output_dir": gt_text_dir,
    "discrepancy_threshold": threshold,
}
all_meta = extract_all_pdfs(**extraction_kwargs)

In [None]:
# Summary statistics read from the metadata.json file in gt_text_dir.
import json

meta_path = gt_text_dir / "metadata.json"
global_meta = json.loads(meta_path.read_text())

# (label shown to the reader, key in metadata.json), printed in this order.
summary_fields = (
    ("Total PDFs", "total_pdfs"),
    ("Total pages", "total_pages"),
    ("Pages with discrepancy", "pages_with_discrepancy"),
    ("Empty pages", "empty_pages"),
)
for label, meta_key in summary_fields:
    print(f"{label}: {global_meta[meta_key]}")

In [None]:
# Show sample extracted text for the PDF previewed in the rendering check.
# `sample_stem` is defined by the rendering verification cell; a value of None
# (no sample available) is treated the same as "no text pages found".
PREVIEW_CHARS = 500  # number of characters of ground-truth text to print

sample_text_dir = gt_text_dir / sample_stem if sample_stem is not None else None
sample_text_files = (
    sorted(sample_text_dir.glob("page_*.txt")) if sample_text_dir is not None else []
)
if sample_text_files:
    first_page_file = sample_text_files[0]
    # NOTE(review): read_text() uses the platform default encoding — confirm it
    # matches the encoding extract_all_pdfs writes the page files with.
    text = first_page_file.read_text()
    # Label the output with the file actually read rather than assuming page_001.txt.
    print(f"Sample GT from {sample_stem}/{first_page_file.name}:")
    print("=" * 60)
    print(text[:PREVIEW_CHARS])
    if len(text) > PREVIEW_CHARS:
        print(f"\n... ({len(text)} total characters)")
else:
    # Say so explicitly instead of leaving the cell output empty.
    print(f"No extracted text pages found for sample PDF: {sample_stem}")

## Summary

Data preparation complete. Next steps:
- Run `01_build_sample_sets.ipynb` to create stratified sample sets
- Then run Colab notebooks for model benchmarking