# Explore Dataset

This notebook helps you explore the labelled dataset, run the full pipeline (OCR → feature extraction → classification), and inspect individual examples.

## Setup

Make sure you've:
1. Labelled some images using `streamlit run ui/label_app.py`
2. Installed all dependencies from `requirements.txt`


In [None]:
import sys
from pathlib import Path

# Add the project root to sys.path so the `from src...` imports below resolve.
# The root is the nearest directory (the working directory or one of its
# ancestors) that contains a `src` folder, so the notebook works whether the
# kernel starts in the project root or in a sub-folder of it. If no such
# directory is found, fall back to the parent of the working directory.
_cwd = Path().resolve()
project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "src").is_dir()),
    _cwd.parent,
)
# Guarded insert: re-running this cell must not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import pandas as pd
from src.ocr import run_ocr
from src.features import compute_features
from src.heuristics import classify


## Load Labels


In [None]:
# Read the labels CSV from <project_root>/data/labelled into a DataFrame.
labels_file = project_root.joinpath("data", "labelled", "labels.csv")
df = pd.read_csv(labels_file)

# Report the number of rows, then how many rows carry each `true_label` value.
print("Total labelled images:", len(df))
print()
print("Label distribution:")
print(df.loc[:, "true_label"].value_counts())


## Inspect Examples

Run OCR, compute features, and classify the first labelled example:


In [None]:
# Example: run the first labelled image through OCR -> features -> classify
# and compare the prediction with its stored label.

# Tunables for this cell, named so they are easy to find and change.
SCORE_THRESHOLD = 5.0    # passed to classify(threshold=...); meaning is defined in src.heuristics - TODO confirm
OCR_PREVIEW_CHARS = 200  # maximum number of OCR characters echoed below

if len(df) == 0:
    # Say so explicitly instead of finishing with no output at all.
    print("No labelled images found - label some images first (see Setup).")
else:
    first_row = df.iloc[0]
    # `image_path` in the CSV is joined onto the project root.
    image_path = project_root / first_row["image_path"]
    true_label = first_row["true_label"]

    print(f"Image: {image_path}")
    print(f"True label: {true_label}")

    # Run OCR
    ocr_result = run_ocr(image_path)
    full_text = ocr_result.full_text
    # Only append an ellipsis when the text was actually cut off.
    suffix = "..." if len(full_text) > OCR_PREVIEW_CHARS else ""
    print(f"\nOCR text (first {OCR_PREVIEW_CHARS} chars): {full_text[:OCR_PREVIEW_CHARS]}{suffix}")

    # Compute features
    features = compute_features(ocr_result)
    print("\nFeatures:")
    print(f"  - Line count: {features.layout.line_count}")
    print(f"  - Cooking verbs: {features.num_cooking_verbs}")
    print(f"  - Workout terms: {features.num_workout_terms}")

    # Classify
    result = classify(features, threshold=SCORE_THRESHOLD)
    print("\nClassification:")
    print(f"  - Predicted: {result.item_type}")
    print(f"  - Scores: {result.scores}")
    # Correct when the predicted item type equals the stored label.
    print(f"  - Correct: {result.item_type == true_label}")
