# Offline EDA Report (Jury Evaluation)

This notebook loads `outputs/eda/eda_report.json`, generates plots, and exports them to a multi-page `outputs/eda/eda_report.pdf`.

**Expected workflow**:
1. Run `./run_eda.sh` (or `python eda.py --images data/images --out outputs/eda`) to generate EDA artifacts.
2. Open and run this notebook (top to bottom) if you want interactive plots; the last cell also writes the plots to `outputs/eda/eda_report.pdf`.

Notes:
- This notebook is **offline** and must **not** be used during extraction.
- PDF export is done via matplotlib `PdfPages` so it works without `nbconvert`.

In [None]:
from pathlib import Path
import json

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

# Locate the JSON report and parse it; fail early with a hint if it is missing.
report_path = Path('outputs/eda/eda_report.json')
assert report_path.exists(), f'Missing {report_path}. Run EDA first.'
with report_path.open(encoding='utf-8') as report_file:
    report = json.load(report_file)

# The PDF is written into the same directory as the JSON report.
out_pdf = report_path.with_name('eda_report.pdf')
print('Loaded:', report_path)
print('Will write:', out_pdf)

In [None]:
# Per-sample image dimensions. Entries that are not dicts are skipped, and a
# missing or falsy width/height is recorded as 0.
samples = report.get('samples') or []
sample_dicts = [entry for entry in samples if isinstance(entry, dict)]
widths = [int(entry.get('width') or 0) for entry in sample_dicts]
heights = [int(entry.get('height') or 0) for entry in sample_dicts]

# Aggregate sections; a missing or falsy section falls back to an empty dict.
kws = report.get('keyword_frequency') or {}
langs = report.get('language_distribution') or {}

# Heatmap payload: `heat` may be None when absent, `grid` is 0 when absent.
layout_heatmap = report.get('layout_heatmap') or {}
heat = layout_heatmap.get('counts')
grid = int(layout_heatmap.get('grid_size') or 0)

print('Sample count:', len(samples))
print('Heatmap grid:', grid)

In [None]:
# Image size histograms: one figure per dimension, only sizes > 0 are plotted.
size_panels = [
    ('Image width distribution', 'Width (px)', widths),
    ('Image height distribution', 'Height (px)', heights),
]

for panel_title, x_label, sizes in size_panels:
    fig, ax = plt.subplots(figsize=(8.5, 4.5))
    ax.set_title(panel_title)
    ax.hist([size for size in sizes if size > 0], bins=20)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')
    ax.grid(True, alpha=0.2)
    plt.show()

In [None]:
# Keyword frequency bar chart
# `items` holds the 15 most frequent keywords as (keyword, count) pairs, highest
# count first. The PDF export cell reads `items`, so its name and shape are kept.
items = [(str(k), int(v)) for k, v in (kws.items() if isinstance(kws, dict) else [])]
items.sort(key=lambda kv: kv[1], reverse=True)
items = items[:15]

fig, ax = plt.subplots(figsize=(10, 5.5))
ax.set_title('Keyword frequency (top 15)')
if items:
    ax.bar([k for k, _ in items], [v for _, v in items])
    # Rotate the keyword labels so long keywords do not overlap.
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)
        tick_label.set_ha('right')
    ax.set_ylabel('Count')
else:
    # Same convention as the language and heatmap cells: say explicitly that
    # there is nothing to plot instead of rendering empty axes.
    ax.text(0.5, 0.5, 'No keyword data', ha='center', va='center')
fig.tight_layout()
plt.show()

In [None]:
# Language pie chart
# `lang_items` is a list of (language, count) pairs ordered by descending count.
lang_counts = langs if isinstance(langs, dict) else {}
lang_items = sorted(
    ((str(name), int(count)) for name, count in lang_counts.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

fig, ax = plt.subplots(figsize=(7, 7))
ax.set_title('Language distribution')
if not lang_items:
    # Nothing to plot: show a centered placeholder message instead.
    ax.text(0.5, 0.5, 'No OCR language data', ha='center', va='center')
else:
    slice_labels = [name for name, _ in lang_items]
    slice_sizes = [count for _, count in lang_items]
    ax.pie(slice_sizes, labels=slice_labels, autopct='%1.1f%%')
plt.show()

In [None]:
# Bounding box heatmap
# `heat` is drawn only when it converts to a non-empty 2-D numeric array and
# `grid` is positive. Anything else (missing data, an empty or flat list, rows
# that cannot be converted to a float array) falls back to the placeholder
# message instead of raising inside `np.array` / `imshow`.
fig, ax = plt.subplots(figsize=(7, 7))
ax.set_title('Text location heatmap')

arr = None
if isinstance(heat, list) and grid > 0:
    try:
        candidate = np.array(heat, dtype=float)
    except (TypeError, ValueError):
        candidate = None
    if candidate is not None and candidate.ndim == 2 and candidate.size > 0:
        arr = candidate

if arr is not None:
    image = ax.imshow(arr, cmap='hot', origin='upper')
    fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
    ax.axis('off')
else:
    ax.text(0.5, 0.5, 'No heatmap data', ha='center', va='center')
plt.show()

In [None]:
# Export all plots to PDF (multi-page)
#
# Every page is rebuilt here from the loaded report data (`widths`, `heights`,
# `kws`, `langs`, `heat`, `grid`) instead of from variables that only exist
# after the interactive plotting cells have run, so the export still works when
# those cells are skipped. The data-loading cells must have been run.


def _sorted_counts(mapping, limit=None):
    """Return (label, count) pairs from `mapping`, highest count first.

    A non-dict `mapping` yields an empty list. When `limit` is given, only the
    first `limit` pairs are returned.
    """
    pairs = [(str(k), int(v)) for k, v in (mapping.items() if isinstance(mapping, dict) else [])]
    pairs.sort(key=lambda kv: kv[1], reverse=True)
    return pairs if limit is None else pairs[:limit]


def _histogram_page(values, title, xlabel):
    """Return a figure with a 20-bin histogram of the entries of `values` that are > 0."""
    fig, ax = plt.subplots(figsize=(8.5, 5))
    ax.set_title(title)
    ax.hist([v for v in values if v > 0], bins=20)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.grid(True, alpha=0.2)
    return fig


def _keyword_page(pairs):
    """Return a bar-chart figure for (keyword, count) `pairs`, or a placeholder when empty."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title('Keyword frequency (top 15)')
    if pairs:
        ax.bar([k for k, _ in pairs], [v for _, v in pairs])
        # Rotate the keyword labels so long keywords do not overlap.
        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(45)
            tick_label.set_ha('right')
        ax.set_ylabel('Count')
    else:
        # Same convention as the language and heatmap pages.
        ax.text(0.5, 0.5, 'No keyword data', ha='center', va='center')
    fig.tight_layout()
    return fig


def _language_page(pairs):
    """Return a pie-chart figure for (language, count) `pairs`, or a placeholder when empty."""
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_title('Language distribution')
    if pairs:
        ax.pie([v for _, v in pairs], labels=[k for k, _ in pairs], autopct='%1.1f%%')
    else:
        ax.text(0.5, 0.5, 'No OCR language data', ha='center', va='center')
    return fig


def _heatmap_page(counts, grid_size):
    """Return a heatmap figure for `counts`, or a placeholder when it is not drawable.

    `counts` is drawn only when it is a list that converts to a non-empty 2-D
    float array and `grid_size` is positive.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_title('Text location heatmap')

    cells = None
    if isinstance(counts, list) and grid_size > 0:
        try:
            candidate = np.array(counts, dtype=float)
        except (TypeError, ValueError):
            candidate = None
        if candidate is not None and candidate.ndim == 2 and candidate.size > 0:
            cells = candidate

    if cells is not None:
        image = ax.imshow(cells, cmap='hot', origin='upper')
        fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
        ax.axis('off')
    else:
        ax.text(0.5, 0.5, 'No heatmap data', ha='center', va='center')
    return fig


# Pages in output order; each entry builds its figure lazily so that only one
# figure is open at a time.
page_builders = [
    lambda: _histogram_page(widths, 'Image width distribution', 'Width (px)'),
    lambda: _histogram_page(heights, 'Image height distribution', 'Height (px)'),
    lambda: _keyword_page(_sorted_counts(kws, limit=15)),
    lambda: _language_page(_sorted_counts(langs)),
    lambda: _heatmap_page(heat, grid),
]

with PdfPages(str(out_pdf)) as pdf:
    for build_page in page_builders:
        fig = build_page()
        try:
            pdf.savefig(fig)
        finally:
            # Close even if saving fails so figures do not pile up in the kernel.
            plt.close(fig)

print('Wrote:', out_pdf)