# Build Real Dataset Index

This notebook builds a JSONL index for the real wave dataset by combining the annotations in `labels.json` with the image files found on disk.

## Purpose
- Load wave parameter labels from JSON file
- Verify image files exist on disk
- Create JSONL index for downstream processing
- Analyze dataset statistics

In [None]:
import os
import json
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Import utility functions from previous notebook.
# NOTE(review): build_real_index() below calls ensure_dir() and write_jsonl(),
# neither of which is defined in this notebook — presumably both are provided by
# 02_data_loading.ipynb; confirm. This cell must therefore run before any cell
# that builds the index.
%run 02_data_loading.ipynb

## Configuration

In [None]:
# Configuration - adjust these locations to match your local data layout
IMAGES_DIR = "data/real/images"                 # directory holding the image files
LABELS_JSON = "data/real/labels.json"           # annotation file keyed by image filename
OUT_INDEX = "data/processed/real_index.jsonl"   # destination of the generated index

# Echo the active settings, one per line, so the run is self-describing
print(
    f"Images directory: {IMAGES_DIR}",
    f"Labels file: {LABELS_JSON}",
    f"Output index: {OUT_INDEX}",
    sep="\n",
)

## Index Building Function

In [None]:
def build_real_index(images_dir: str, labels_json: str, out_jsonl: str) -> list:
    """Build an index of the real dataset from its labels file and image files.

    Every annotation whose image is present on disk becomes one record; the
    records are handed to ``write_jsonl`` and also returned.

    Note: ``ensure_dir`` and ``write_jsonl`` are not defined in this function;
    they must already be in scope when it is called.

    Args:
        images_dir: Directory containing the image files named by the label keys.
        labels_json: Path to a JSON file whose top-level value is a dict of
            ``{filename: annotation_dict}``.
        out_jsonl: Path of the JSONL index to write.

    Returns:
        List of record dicts with the keys ``image_path``, ``height_meters``,
        ``wave_type``, ``direction``, ``confidence``, ``notes``, ``data_key``
        and ``source``.

    Raises:
        ValueError: If the top-level JSON value is not a dict.
        KeyError: If an annotation lacks ``height_meters``, ``wave_type`` or
            ``direction``.
    """

    # Load labels
    with open(labels_json, "r", encoding="utf-8") as f:
        labels = json.load(f)

    if not isinstance(labels, dict):
        raise ValueError("labels.json must be a dict: {filename: {...}}")

    # A bare file name has an empty dirname; there is no directory to create then.
    out_dir = os.path.dirname(out_jsonl)
    if out_dir:
        ensure_dir(out_dir)

    records = []
    missing = 0

    for filename, ann in labels.items():
        img_path = os.path.join(images_dir, filename)
        # isfile rather than exists: a directory that happens to match a label
        # key (or an empty key, which resolves to images_dir itself) is not an image.
        if not os.path.isfile(img_path):
            missing += 1
            print(f"Warning: Missing image {img_path}")
            continue

        rec = {
            "image_path": img_path,
            # Required fields: a missing key raises KeyError
            "height_meters": float(ann["height_meters"]),
            "wave_type": str(ann["wave_type"]),
            "direction": str(ann["direction"]),
            # Optional fields fall back to defaults when absent
            "confidence": str(ann.get("confidence", "medium")),
            "notes": str(ann.get("notes", "")),
            "data_key": int(ann.get("data_key", -1)),
            "source": "real",
        }
        records.append(rec)

    # Write JSONL
    write_jsonl(records, out_jsonl)

    print(f"Saved {len(records)} records to {out_jsonl}")
    if missing:
        print(f"Warning: {missing} images missing on disk")

    return records

## Build Index

In [None]:
# Report whether the labels file is available
if not os.path.exists(LABELS_JSON):
    print(f"✗ Labels file not found: {LABELS_JSON}")
    print("Please ensure the labels.json file exists in the correct location.")
else:
    print(f"✓ Labels file found: {LABELS_JSON}")

# Report whether the images directory is available and, if so, how many
# entries in it carry a .jpg / .jpeg / .png extension (case-insensitive)
if not os.path.exists(IMAGES_DIR):
    print(f"✗ Images directory not found: {IMAGES_DIR}")
    print("Please ensure the images directory exists.")
else:
    print(f"✓ Images directory found: {IMAGES_DIR}")
    image_count = sum(
        1
        for entry in os.listdir(IMAGES_DIR)
        if entry.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    print(f"  Found {image_count} image files")

In [None]:
# Build the index if files exist
if os.path.exists(LABELS_JSON) and os.path.exists(IMAGES_DIR):
    records = build_real_index(IMAGES_DIR, LABELS_JSON, OUT_INDEX)
    print(f"\n✓ Index built successfully with {len(records)} records")
else:
    print("\n⚠️ Cannot build index - missing input files")
    # Create dummy data for demonstration
    print("Creating dummy data for demonstration...")
    # Seeded generator: the demo records, and every statistic and plot derived
    # from them, are identical on each Restart & Run All.
    rng = np.random.default_rng(42)
    records = [
        {
            "image_path": f"data/real/images/img_{i:03d}.jpg",
            "height_meters": float(rng.uniform(0.5, 2.5)),
            # str(...) converts numpy string scalars to plain Python strings so
            # dummy records have the same field types as real ones
            "wave_type": str(rng.choice(["beach_break", "reef_break", "point_break", "closeout", "a_frame"])),
            "direction": str(rng.choice(["left", "right", "both"])),
            "confidence": str(rng.choice(["high", "medium", "low"])),
            "notes": f"Example note {i}",
            "data_key": i,
            "source": "real"
        }
        for i in range(100)
    ]

## Dataset Analysis

In [None]:
# Load the index records into a DataFrame for analysis
df = pd.DataFrame(records)

print("Dataset Summary:")
print(f"Total samples: {len(df)}")
print("\nHeight statistics:")
print(df['height_meters'].describe())

# Section heading -> categorical column whose value counts are printed under it
distribution_sections = {
    "\nWave type distribution:": 'wave_type',
    "\nDirection distribution:": 'direction',
    "\nConfidence distribution:": 'confidence',
}
for section_heading, column_name in distribution_sections.items():
    print(section_heading)
    print(df[column_name].value_counts())

In [None]:
# Visualize dataset statistics in a 2x2 grid:
# heights (histogram), wave types (bar), directions (pie), confidence (bar)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Height distribution
axes[0, 0].hist(df['height_meters'], bins=20, alpha=0.7, edgecolor='black')
axes[0, 0].set_title('Wave Height Distribution')
axes[0, 0].set_xlabel('Height (meters)')
axes[0, 0].set_ylabel('Frequency')

# Wave type distribution
wave_type_counts = df['wave_type'].value_counts()
axes[0, 1].bar(wave_type_counts.index, wave_type_counts.values)
axes[0, 1].set_title('Wave Type Distribution')
axes[0, 1].set_xlabel('Wave Type')
axes[0, 1].set_ylabel('Count')
axes[0, 1].tick_params(axis='x', rotation=45)

# Direction distribution
direction_counts = df['direction'].value_counts()
axes[1, 0].pie(direction_counts.values, labels=direction_counts.index, autopct='%1.1f%%')
axes[1, 0].set_title('Direction Distribution')

# Confidence distribution
confidence_counts = df['confidence'].value_counts()
# value_counts() orders levels by frequency, so a positional colour list could
# paint "low" green. Look colours up by level name instead, draw the known
# levels in high -> medium -> low order, and show any unexpected level in gray.
confidence_colors = {'high': 'green', 'medium': 'orange', 'low': 'red'}
confidence_levels = (
    [level for level in confidence_colors if level in confidence_counts.index]
    + [level for level in confidence_counts.index if level not in confidence_colors]
)
axes[1, 1].bar(
    confidence_levels,
    [confidence_counts[level] for level in confidence_levels],
    color=[confidence_colors.get(level, 'gray') for level in confidence_levels],
)
axes[1, 1].set_title('Confidence Distribution')
axes[1, 1].set_xlabel('Confidence Level')
axes[1, 1].set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Cross-tabulation analysis: count samples for every (wave type, direction) pair
print("Wave Type vs Direction Cross-tabulation:")
crosstab = pd.crosstab(index=df['wave_type'], columns=df['direction'])
print(crosstab)

# Render the same table as an annotated heatmap on an explicit figure/axes pair
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(crosstab, annot=True, fmt='d', cmap='Blues', ax=heatmap_ax)
heatmap_ax.set_title('Wave Type vs Direction Cross-tabulation')
heatmap_fig.tight_layout()
plt.show()

In [None]:
# Height distribution by wave type, drawn on an explicit figure/axes pair
box_fig, box_ax = plt.subplots(figsize=(12, 6))
df.boxplot(column='height_meters', by='wave_type', ax=box_ax)
box_ax.set_title('Wave Height Distribution by Wave Type')
box_fig.suptitle('')  # Remove the figure-level title that the grouped boxplot adds
box_ax.set_xlabel('Wave Type')
box_ax.set_ylabel('Height (meters)')
box_ax.tick_params(axis='x', rotation=45)
box_fig.tight_layout()
plt.show()

## Validation and Quality Checks

In [None]:
# Data quality checks: null values, height range, and category vocabularies
print("Data Quality Checks:")
print(f"✓ Total records: {len(df)}")

# Check for missing values
missing_values = df.isnull().sum()
if missing_values.sum() == 0:
    print("✓ No missing values found")
else:
    print("⚠️ Missing values found:")
    print(missing_values[missing_values > 0])

# Check height range
min_height, max_height = df['height_meters'].min(), df['height_meters'].max()
print(f"✓ Height range: {min_height:.2f}m to {max_height:.2f}m")

if min_height < 0:
    print("⚠️ Warning: Negative height values found")
if max_height > 10:
    print("⚠️ Warning: Very large height values found (>10m)")

# Check for valid wave types and directions
valid_wave_types = {"beach_break", "reef_break", "point_break", "closeout", "a_frame"}
valid_directions = {"left", "right", "both"}

invalid_wave_types = set(df['wave_type']) - valid_wave_types
invalid_directions = set(df['direction']) - valid_directions

if not invalid_wave_types:
    print("✓ All wave types are valid")
else:
    print(f"⚠️ Invalid wave types found: {invalid_wave_types}")

if not invalid_directions:
    print("✓ All directions are valid")
else:
    print(f"⚠️ Invalid directions found: {invalid_directions}")

# Only claim the index was saved when the file is actually on disk: the
# dummy-data fallback fills `records` without writing any index file.
if os.path.exists(OUT_INDEX):
    print(f"\n✓ Index file saved to: {OUT_INDEX}")
else:
    print(f"\n⚠️ Index file not found (no index was written): {OUT_INDEX}")

## Sample Records

In [None]:
# Show the first ten rows as plain text, without the DataFrame index column
print("Sample records from the dataset:")
sample_rows = df.head(10)
print(sample_rows.to_string(index=False))

In [None]:
# Save summary statistics next to the index file
summary_stats = {
    "total_samples": len(df),
    "height_stats": df['height_meters'].describe().to_dict(),
    "wave_type_counts": df['wave_type'].value_counts().to_dict(),
    "direction_counts": df['direction'].value_counts().to_dict(),
    "confidence_counts": df['confidence'].value_counts().to_dict()
}

# Derive the summary path from the index path by swapping the extension.
# splitext (instead of str.replace) guarantees the result differs from
# OUT_INDEX even when it does not end in '.jsonl', so the index itself can
# never be overwritten by the summary.
summary_path = os.path.splitext(OUT_INDEX)[0] + '_summary.json'

# The output directory is only created while building a real index; make sure
# it exists here too so the dummy-data path can still save its summary.
summary_dir = os.path.dirname(summary_path)
if summary_dir:
    os.makedirs(summary_dir, exist_ok=True)

with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(summary_stats, f, indent=2)

print(f"Summary statistics saved to: {summary_path}")