# 1. Exploratory Data Analysis (EDA)

This notebook is for analyzing the input datasets (e.g., synthetic, ChartQA). We'll explore:
- Dataset statistics (number of samples, question types)
- Image properties (dimensions, color distributions)
- OCR quality on a sample of images
- Visualization of text and image embedding spaces

In [None]:
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import numpy as np

# Put the parent of the current working directory at the front of sys.path so that
# the `src` package imported below can be resolved.
# NOTE(review): this presumes the kernel's working directory is the notebook's own
# folder, one level below the project root — confirm.
sys.path.insert(0, str(Path.cwd().parent))
from src.ingestion.dataset_loader import DatasetLoader
from src.utils.config import load_config

## Load Data

In [None]:
# Load the project configuration. load_config is called with no arguments, so where
# the settings come from is decided inside src.utils.config (not shown here).
config = load_config()
# Build a dataset loader from the configured data directory.
loader = DatasetLoader(config.paths.data_dir)
# Fetch the dataset registered under the name 'synthetic'.
synthetic_data = loader.load_dataset('synthetic')
# NOTE(review): presumably a list of per-sample records — confirm. The image
# analysis cell further down relies on a 'chart_path' column being present.
df = pd.DataFrame(synthetic_data)

print(f"Loaded {len(df)} samples from the synthetic dataset.")
# Bare last expression so the notebook renders the first rows as a rich table.
df.head()

## Analyze Image Properties

In [None]:
# Collect the pixel dimensions of every distinct chart image referenced by the
# dataset, then plot the joint distribution of width and height.
image_dims = []
unreadable_charts = []  # (path, error) pairs for images that could not be opened
chart_dir = Path(config.paths.data_dir) / 'synthetic' / 'charts'

# dropna(): a missing chart_path would otherwise raise TypeError in `chart_dir / img_path`.
for img_path in df['chart_path'].dropna().unique():
    try:
        # The context manager closes the underlying file handle as soon as the
        # size has been read.
        with Image.open(chart_dir / img_path) as img:
            image_dims.append(img.size)  # PIL reports size as (width, height)
    except OSError as err:
        # Covers missing files as well as files PIL cannot identify
        # (PIL.UnidentifiedImageError is an OSError subclass). One bad image
        # should not abort the whole survey, so record it and keep going.
        unreadable_charts.append((img_path, err))

if unreadable_charts:
    # Surface the skipped files explicitly instead of dropping them silently.
    print(f"Skipped {len(unreadable_charts)} unreadable chart image(s):")
    for bad_path, err in unreadable_charts[:5]:
        print(f"  {bad_path}: {err}")

df_dims = pd.DataFrame(image_dims, columns=['width', 'height'])

sns.jointplot(data=df_dims, x='width', y='height')
# jointplot creates its own figure, which is the current one at this point;
# y=1.02 lifts the title above the marginal plot so they do not overlap.
plt.suptitle('Distribution of Image Dimensions', y=1.02)
plt.show()