# 01 - Build Sample Sets

Create stratified sample sets from the corpus for benchmarking.
Run locally after `00_data_prep.ipynb`.

In [None]:
import sys
from pathlib import Path

# Project root is taken to be the parent of the current working directory
# (assumes the notebook is launched from a folder directly under the root).
PROJECT_ROOT = Path(".").resolve().parent

# Put the root at the front of the import search path, but only if it is not
# already listed, so re-running this cell does not add duplicates.
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))

import yaml
# Load the benchmark configuration. The encoding is pinned to UTF-8 so that
# parsing does not depend on the platform's default locale encoding.
with open(PROJECT_ROOT / "config" / "benchmark_config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [None]:
from src.data_prep.build_sample_sets import build_page_index, build_all_sample_sets

# Directories named in the config, resolved against the project root.
gt_text_dir, image_dir, sample_dir = (
    PROJECT_ROOT / config["paths"]["embedded_text_dir"],
    PROJECT_ROOT / config["paths"]["image_dir"],
    PROJECT_ROOT / config["paths"]["sample_sets_dir"],
)

# Index the pages up front so the content mix can be inspected before sampling.
pages = build_page_index(gt_text_dir, image_dir)
total_pages = len(pages)
print(f"Total indexed pages: {total_pages}")

# Tally pages per content type and report each type's share, most common first.
from collections import Counter

type_counts = Counter(page["content_type"] for page in pages)
print("\nContent type distribution:")
for content_type, count in type_counts.most_common():
    share = 100 * count / total_pages
    print(f"  {content_type}: {count} ({share:.1f}%)")

In [None]:
# Build every sample set, passing the directories resolved earlier and the
# seed from the config (presumably so sampling is reproducible; confirm in
# src.data_prep.build_sample_sets).
sampling_seed = config["sampling"]["random_seed"]
build_all_sample_sets(
    embedded_text_dir=gt_text_dir,
    image_dir=image_dir,
    output_dir=sample_dir,
    seed=sampling_seed,
)

In [None]:
# Verify sample sets: print a short summary of every JSON file in sample_dir.
import json

# List the files first so an empty directory is reported explicitly instead
# of the cell silently printing nothing.
sample_set_files = sorted(sample_dir.glob("*.json"))
if not sample_set_files:
    print(f"No sample set files found in {sample_dir}")

for ss_file in sample_set_files:
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(ss_file, encoding="utf-8") as f:
        ss = json.load(f)
    # Each file is expected to provide the keys read below.
    print(f"{ss['name']}:")
    print(f"  Pages: {ss['total_pages']}, Docs: {ss['unique_documents']}")
    print(f"  Content types: {ss['content_type_distribution']}")
    print()