# AIMO 3 - Exploratory Data Analysis

This notebook explores the AIMO 3 competition data format and problem characteristics.


In [None]:
import sys
from pathlib import Path

# Put <project_root>/src at the front of the import search path so the
# imports from the `data` package below resolve. The project root is taken
# to be the parent of the current working directory.
# NOTE(review): this presumes the notebook is run from a directory one level
# below the project root — confirm.
project_root = Path.cwd().resolve().parent
sys.path.insert(0, str(project_root.joinpath("src")))

from data.loader import load_problems
from data.preprocessing import extract_math_expressions, normalize_latex_text


In [None]:
# Point at <project_root>/data/raw and hand that directory to load_problems.
data_dir = project_root.joinpath("data", "raw")
problems = load_problems(data_dir)

# Report how many entries the loader returned.
print(f"Loaded {len(problems)} problems")


In [None]:
# Preview the first problem: its ID, then at most the first 500 characters
# of its statement. Nothing is printed when `problems` is empty.
if problems:
    first_problem = problems[0]

    problem_id = first_problem.get("problem_id")
    print("Problem ID:", problem_id)

    print("\nStatement:")
    statement_preview = first_problem.get("statement", "")[:500]
    print(statement_preview)


In [None]:
# Analyze problem characteristics
#
# For every loaded problem, measure the statement length in characters and
# the number of items returned by extract_math_expressions, then plot both
# distributions side by side and print their means.
import matplotlib.pyplot as plt
import numpy as np

# Fetch each statement once so both metrics are computed from the same text.
statements = [p.get("statement", "") for p in problems]
statement_lengths = [len(text) for text in statements]
math_counts = [len(extract_math_expressions(text)) for text in statements]

if statements:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: distribution of statement lengths.
    axes[0].hist(statement_lengths, bins=50)
    axes[0].set_xlabel("Statement Length")
    axes[0].set_ylabel("Frequency")
    axes[0].set_title("Distribution of Problem Statement Lengths")

    # Right panel: distribution of math-expression counts.
    axes[1].hist(math_counts, bins=20)
    axes[1].set_xlabel("Number of Math Expressions")
    axes[1].set_ylabel("Frequency")
    axes[1].set_title("Distribution of Math Expressions per Problem")

    plt.tight_layout()
    plt.show()

    print(f"Average statement length: {np.mean(statement_lengths):.1f}")
    print(f"Average math expressions: {np.mean(math_counts):.1f}")
else:
    # With no problems, np.mean([]) would emit a RuntimeWarning and print
    # "nan", and the histograms would be empty, so skip the analysis.
    print("No problems loaded; skipping analysis.")
