# Data Exploration - iFood 2019 Dataset

This notebook explores the iFood 2019 dataset, analyzes class distribution, and visualizes sample images.


In [None]:
import sys
import os
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
from collections import Counter

from src.data.utils import load_class_names
from scripts.analyze_class_imbalance import analyze_split, plot_class_distribution

# Notebook-wide plotting defaults: seaborn's "whitegrid" theme and a wide
# default canvas for every figure that does not request its own size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (15, 6)})


## 1. Load Dataset Information


In [None]:
# Configuration - use project root for consistent paths
from src.utils import get_project_root, resolve_path

# Every location below is derived from the project root so the notebook does
# not depend on the current working directory.
project_root = get_project_root()
# The dataset (annotations and image folders) is looked up in the parent
# directory of the project root.
data_dir = project_root.parent
annotations_dir = data_dir / "annotations"

# Load class names
class_list_file = annotations_dir / "class_list.txt"
class_names = load_class_names(str(class_list_file))

print(f"Project root: {project_root}")
print(f"Data directory: {data_dir}")
print(f"Total number of classes: {len(class_names)}")
print("\nFirst 10 classes:")
# Cap the preview at the number of classes actually loaded; a fixed
# range(10) raises IndexError when the class list has fewer than ten entries.
for i in range(min(10, len(class_names))):
    print(f"  {i}: {class_names[i]}")

## 2. Analyze Class Distribution


In [None]:
# Label files for the two splits, and the folder that collects generated plots.
results_plots_dir = project_root / "results" / "plots"
train_csv, val_csv = (
    annotations_dir / f"{split}_labels.csv" for split in ("train", "val")
)

# Training split: compute the per-class counts and plot their distribution.
if train_csv.exists():
    train_counts, train_total = analyze_split(str(train_csv), "train", class_names)
    plot_class_distribution(train_counts, "train", str(results_plots_dir))

# Validation split: counts only, no plot is produced.
if val_csv.exists():
    val_counts, val_total = analyze_split(str(val_csv), "val", class_names)

## 3. Visualize Sample Images


In [None]:
def visualize_samples(csv_file, image_dir, class_names, num_classes=8, output_path=None):
    """Show up to two sample images for each of the first ``num_classes`` classes.

    The figure is a grid with ``num_classes`` rows and two columns. Row ``i``
    holds the first two images (in file order) of the ``i``-th distinct label
    encountered in ``csv_file``. Cells for which no image is available stay
    blank.

    Args:
        csv_file: Path to a header-less CSV read as two columns,
            ``image_name`` and ``label``.
        image_dir: Directory that ``image_name`` entries are relative to.
        class_names: Container indexed by label value; used for plot titles.
        num_classes: Number of grid rows, i.e. the maximum number of distinct
            classes displayed.
        output_path: Optional file path. When given, its parent directory is
            created if needed and the figure is saved there at 300 dpi.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    df = pd.read_csv(csv_file, header=None, names=['image_name', 'label'])
    unique_classes = df['label'].unique()[:num_classes]

    # squeeze=False keeps `axes` two-dimensional even when num_classes == 1;
    # otherwise the axes[i, j] indexing below fails on a 1-D array.
    fig, axes = plt.subplots(
        num_classes, 2, figsize=(10, 4*num_classes), squeeze=False
    )

    # Hide every axis up front so cells that never receive an image (missing
    # file, class with a single sample, fewer classes than rows) are rendered
    # as blank space instead of empty framed plots.
    for ax in axes.flat:
        ax.axis('off')

    image_root = Path(image_dir)
    for i, class_id in enumerate(unique_classes):
        class_samples = df[df['label'] == class_id].head(2)
        for j, image_name in enumerate(class_samples['image_name']):
            img_path = image_root / image_name
            if not img_path.exists():
                continue
            # Image.open is lazy and keeps the file handle open; imshow reads
            # the pixel data inside the `with`, after which the file is closed.
            with Image.open(img_path) as img:
                axes[i, j].imshow(img)
            axes[i, j].set_title(f"{class_names[class_id]} (Class {class_id})")

    plt.tight_layout()
    if output_path:
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.show()

# Render the sample grid for the training split, but only when both the image
# folder and the labels file are present on disk.
train_images_dir = data_dir / "train_set"
if train_images_dir.exists() and train_csv.exists():
    output_path = results_plots_dir / "sample_images.png"
    visualize_samples(
        str(train_csv),
        str(train_images_dir),
        class_names,
        output_path=str(output_path),
    )


## 4. Class Imbalance Analysis


In [None]:
# Summary statistics and a histogram of the number of training images per class.
if train_csv.exists():
    df_train = pd.read_csv(train_csv, header=None, names=['image_name', 'label'])
    # Only labels that occur in the file are counted, so a class with no
    # training images does not contribute a zero to the statistics below.
    class_counts = Counter(df_train['label'].values)
    counts_list = list(class_counts.values())

    if not counts_list:
        # min()/max() raise ValueError on an empty sequence; report the empty
        # labels file explicitly instead of failing with that error.
        print(f"No labels found in {train_csv}; skipping class imbalance analysis.")
    else:
        smallest = min(counts_list)
        largest = max(counts_list)

        print("Class Distribution Statistics:")
        print(f"  Min samples per class: {smallest}")
        print(f"  Max samples per class: {largest}")
        print(f"  Mean samples per class: {np.mean(counts_list):.2f}")
        print(f"  Median samples per class: {np.median(counts_list):.2f}")
        print(f"  Std samples per class: {np.std(counts_list):.2f}")
        print(f"  Imbalance ratio (max/min): {largest / smallest:.2f}")

        # Histogram over class sizes: x is images per class, y is how many
        # classes have that size.
        plt.figure(figsize=(12, 5))
        plt.hist(counts_list, bins=50, edgecolor='black')
        plt.xlabel('Number of Images per Class')
        plt.ylabel('Number of Classes')
        plt.title('Distribution of Class Sizes in Training Set')
        plt.grid(True, alpha=0.3)
        output_path = results_plots_dir / "class_size_distribution.png"
        output_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.show()


## 5. Summary

This exploration provides insights into:
- Dataset size and structure
- Class distribution and imbalance
- Visual characteristics of the data

These insights will guide data preprocessing and model training strategies.
