# Dataset Exploration and Analysis

This notebook explores the dataset structure, visualizes samples, and performs statistical analysis on the training data.

## 1. Environment Setup and Imports

In [None]:
# Standard library imports
import os
import json
from pathlib import Path
from collections import defaultdict, Counter

# Data processing and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Rectangle
import cv2
from PIL import Image

# Deep learning
import torch
from torchvision import transforms
from ultralytics import YOLO

# Configure plotting
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## 2. Dataset Loading and Structure

In [None]:
# Dataset locations: images live under <root>/images/<split>/*.jpg and
# labels under <root>/labels/<split>/*.txt
DATASET_ROOT = Path('/path/to/dataset')  # Update with actual dataset path
IMAGES_DIR = DATASET_ROOT / 'images'
LABELS_DIR = DATASET_ROOT / 'labels'

# Report how many image and label files each split contains
print("Dataset Structure:")
for split in ('train', 'val', 'test'):
    image_dir = IMAGES_DIR / split
    label_dir = LABELS_DIR / split

    # A split directory that is absent on disk is reported as empty
    split_images = []
    if image_dir.exists():
        split_images = list(image_dir.glob('*.jpg'))

    split_labels = []
    if label_dir.exists():
        split_labels = list(label_dir.glob('*.txt'))

    print(f"  {split}: {len(split_images)} images, {len(split_labels)} labels")

## 3. Class Distribution Analysis

In [None]:
def analyze_class_distribution(labels_dir, split='train'):
    """Count annotations per class across the label files of one split.

    Every ``*.txt`` file directly under ``labels_dir / split`` is read line by
    line; the first whitespace-separated token of each non-blank line is
    parsed as the integer class id.

    Args:
        labels_dir: ``pathlib.Path`` of the labels root (one sub-directory
            per split).
        split: Name of the split sub-directory to scan.

    Returns:
        Tuple ``(class_counts, total_annotations)``: a ``Counter`` mapping
        class id -> number of annotations, and the total number of
        annotation lines counted.

    Raises:
        ValueError: If a non-blank line does not start with an integer.
    """
    class_counts = Counter()

    for label_file in (labels_dir / split).glob('*.txt'):
        with open(label_file, 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank / whitespace-only lines (e.g. a stray newline at
                    # the end of a label file) are not annotations; indexing
                    # fields[0] on them would raise IndexError.
                    continue
                class_counts[int(fields[0])] += 1

    # Each counted line added exactly one to some class, so the total is the sum
    return class_counts, sum(class_counts.values())

# Run the per-class count on the default ('train') split
class_counts, total_annots = analyze_class_distribution(LABELS_DIR)

print(f"Total annotations: {total_annots}")
print("\nClass distribution:")
# Iterate in ascending class-id order; each row shows the count and its share
for class_id, count in sorted(class_counts.items()):
    percentage = (count / total_annots) * 100
    print(f"  Class {class_id}: {count:5d} ({percentage:6.2f}%)")

## 4. Visualize Class Distribution

In [None]:
# Side-by-side view of the class distribution: absolute counts and shares
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes
title_kw = dict(fontsize=14, fontweight='bold')

# Shared data, ordered by ascending class id
classes = sorted(class_counts)
counts = [class_counts[c] for c in classes]

# Left panel: bar chart of annotation counts
bar_ax.bar(classes, counts, color='steelblue')
bar_ax.set_xlabel('Class ID', fontsize=12)
bar_ax.set_ylabel('Number of Annotations', fontsize=12)
bar_ax.set_title('Class Distribution (Count)', **title_kw)
bar_ax.grid(axis='y', alpha=0.3)

# Right panel: pie chart, one Set3 colour sampled per class
colors = plt.cm.Set3(np.linspace(0, 1, len(classes)))
pie_labels = [f'Class {c}' for c in classes]
pie_ax.pie(counts, labels=pie_labels, autopct='%1.1f%%',
           colors=colors, startangle=90)
pie_ax.set_title('Class Distribution (Percentage)', **title_kw)

plt.tight_layout()
plt.show()

## 5. Bounding Box Statistics

In [None]:
def analyze_bbox_statistics(labels_dir, images_dir, split='train'):
    """Collect pixel-space bounding box sizes for one split.

    For every ``*.txt`` label file under ``labels_dir / split`` the image with
    the same stem and a ``.jpg`` extension is looked up under
    ``images_dir / split``; label files without such an image are skipped.
    Box width and height (label fields 4 and 5, YOLO format:
    class_id, x_center, y_center, width, height) are scaled by the image
    width and height to obtain pixels.

    Args:
        labels_dir: ``pathlib.Path`` of the labels root.
        images_dir: ``pathlib.Path`` of the images root.
        split: Name of the split sub-directory to scan.

    Returns:
        Dict with list values under the keys ``'widths'``, ``'heights'``,
        ``'areas'`` (all in pixels / pixels²) and ``'aspect_ratios'``
        (width / height). Boxes with zero height contribute to the first
        three lists but not to ``'aspect_ratios'``.

    Raises:
        ValueError: If a label line contains a token that is not a number.
    """
    bbox_widths, bbox_heights, bbox_areas, aspect_ratios = [], [], [], []

    for label_file in (labels_dir / split).glob('*.txt'):
        img_path = images_dir / split / (label_file.stem + '.jpg')
        if not img_path.exists():
            continue

        # Image.open keeps the underlying file open until the image is closed.
        # Only the size is needed here, so close the handle straight away
        # instead of leaking one descriptor per image.
        with Image.open(img_path) as img:
            img_w, img_h = img.size

        # Parse bounding boxes (YOLO format: class_id, x_center, y_center, width, height)
        with open(label_file, 'r') as f:
            for line in f:
                parts = list(map(float, line.strip().split()))
                if len(parts) < 5:
                    # Blank or truncated lines carry no complete box
                    continue

                bbox_w = parts[3] * img_w
                bbox_h = parts[4] * img_h
                bbox_widths.append(bbox_w)
                bbox_heights.append(bbox_h)
                bbox_areas.append(bbox_w * bbox_h)
                if bbox_h > 0:  # avoid dividing by zero for degenerate boxes
                    aspect_ratios.append(bbox_w / bbox_h)

    return {
        'widths': bbox_widths,
        'heights': bbox_heights,
        'areas': bbox_areas,
        'aspect_ratios': aspect_ratios
    }

bbox_stats = analyze_bbox_statistics(LABELS_DIR, IMAGES_DIR)

print("Bounding Box Statistics:")

# Width and height share the same four-line summary (mean / std / min / max)
for heading, key in (('Width', 'widths'), ('Height', 'heights')):
    values = bbox_stats[key]
    print(f"\n  {heading}:")
    print(f"    Mean: {np.mean(values):.2f} px")
    print(f"    Std: {np.std(values):.2f} px")
    print(f"    Min: {np.min(values):.2f} px")
    print(f"    Max: {np.max(values):.2f} px")

# Area and aspect ratio are summarised by mean and median only
box_areas = bbox_stats['areas']
print("\n  Area:")
print(f"    Mean: {np.mean(box_areas):.2f} px²")
print(f"    Median: {np.median(box_areas):.2f} px²")

box_ratios = bbox_stats['aspect_ratios']
print("\n  Aspect Ratio (W/H):")
print(f"    Mean: {np.mean(box_ratios):.2f}")
print(f"    Median: {np.median(box_ratios):.2f}")

## 6. Visualize Bounding Box Statistics

In [None]:
# 2x2 grid of histograms, one per bounding box statistic
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (stats key, bar colour, x-axis label, panel title), listed in row-major
# order so they line up with axes.flat: [0,0], [0,1], [1,0], [1,1]
hist_panels = [
    ('widths', 'skyblue', 'Width (pixels)', 'Bounding Box Width Distribution'),
    ('heights', 'lightcoral', 'Height (pixels)', 'Bounding Box Height Distribution'),
    ('areas', 'lightgreen', 'Area (pixels²)', 'Bounding Box Area Distribution'),
    ('aspect_ratios', 'plum', 'Aspect Ratio (W/H)', 'Bounding Box Aspect Ratio Distribution'),
]

for ax, (stat_key, bar_color, x_label, panel_title) in zip(axes.flat, hist_panels):
    ax.hist(bbox_stats[stat_key], bins=50, color=bar_color, edgecolor='black', alpha=0.7)
    ax.set_xlabel(x_label, fontsize=11)
    ax.set_ylabel('Frequency', fontsize=11)
    ax.set_title(panel_title, fontsize=12, fontweight='bold')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Image Quality Checks

In [None]:
def analyze_image_quality(images_dir, split='train'):
    """Collect dimensions and on-disk size for every JPEG of one split.

    Each ``*.jpg`` file directly under ``images_dir / split`` is opened to
    read its pixel dimensions, and its file size is converted to megabytes.
    A file that cannot be processed is reported with ``print`` and left out
    of the results; it does not abort the scan.

    Args:
        images_dir: ``pathlib.Path`` of the images root.
        split: Name of the split sub-directory to scan.

    Returns:
        Dict with parallel lists ``'widths'``, ``'heights'`` (pixels) and
        ``'file_sizes'`` (MB), plus ``'image_info'``: a list of per-image
        dicts with keys ``'filename'``, ``'width'``, ``'height'``,
        ``'size_mb'``.
    """
    widths, heights, file_sizes, image_info = [], [], [], []

    for img_path in (images_dir / split).glob('*.jpg'):
        try:
            # Image.open keeps the file open until the image is closed; the
            # context manager releases the handle even if reading fails, so a
            # large split does not exhaust file descriptors.
            with Image.open(img_path) as img:
                w, h = img.size
            file_size = os.path.getsize(img_path) / (1024 * 1024)  # Convert to MB
        except Exception as e:
            # Deliberately best-effort: report the bad file and keep scanning
            print(f"Error processing {img_path}: {e}")
            continue

        widths.append(w)
        heights.append(h)
        file_sizes.append(file_size)
        image_info.append({
            'filename': img_path.name,
            'width': w,
            'height': h,
            'size_mb': file_size
        })

    return {
        'widths': widths,
        'heights': heights,
        'file_sizes': file_sizes,
        'image_info': image_info
    }

quality_stats = analyze_image_quality(IMAGES_DIR)

img_widths = quality_stats['widths']
img_heights = quality_stats['heights']
img_sizes_mb = quality_stats['file_sizes']

print(f"Image Quality Statistics ({len(img_widths)} images):")

# Each resolution row applies the same reduction to widths and heights independently
print("\n  Resolution:")
for row_label, reduce_fn in (('Mean', np.mean), ('Min', np.min), ('Max', np.max)):
    print(f"    {row_label}: {reduce_fn(img_widths):.0f}x{reduce_fn(img_heights):.0f}")

print("\n  File Size:")
print(f"    Mean: {np.mean(img_sizes_mb):.2f} MB")
print(f"    Total: {np.sum(img_sizes_mb):.2f} MB")

# Per-image width/height ratio; a large spread indicates mixed image shapes
aspect_ratios = np.array(img_widths) / np.array(img_heights)
print("\n  Aspect Ratio (Image):")
print(f"    Mean: {np.mean(aspect_ratios):.2f}")
print(f"    Std: {np.std(aspect_ratios):.2f}")

## 8. Visualize Image Quality

In [None]:
# Two panels: where images sit in (width, height) space, and how large the files are
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
resolution_ax, size_ax = axes
label_kw = dict(fontsize=11)
title_kw = dict(fontsize=12, fontweight='bold')

# Left panel: one point per image, width against height
resolution_ax.scatter(
    quality_stats['widths'],
    quality_stats['heights'],
    alpha=0.5,
    s=50,
    color='steelblue',
)
resolution_ax.set_xlabel('Image Width (pixels)', **label_kw)
resolution_ax.set_ylabel('Image Height (pixels)', **label_kw)
resolution_ax.set_title('Image Resolution Distribution', **title_kw)
resolution_ax.grid(alpha=0.3)

# Right panel: histogram of file sizes in MB
size_ax.hist(quality_stats['file_sizes'], bins=30, color='coral', edgecolor='black', alpha=0.7)
size_ax.set_xlabel('File Size (MB)', **label_kw)
size_ax.set_ylabel('Frequency', **label_kw)
size_ax.set_title('Image File Size Distribution', **title_kw)
size_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Sample Visualization with Bounding Boxes

In [None]:
def plot_sample_with_bboxes(image_path, label_path, class_names=None):
    """Draw an image with its YOLO-format bounding boxes overlaid.

    Args:
        image_path: Path of the image file to display.
        label_path: Path of the label text file. Each line is read as
            ``class_id x_center y_center width height``; the four box values
            are multiplied by the image width/height to obtain pixels. Lines
            with fewer than five numeric fields are ignored.
        class_names: Optional mapping (anything offering ``.get``) from
            integer class id to a display name. When it is ``None``, or an id
            is missing from it, the label falls back to ``"Class <id>"``.

    Returns:
        The matplotlib ``Figure`` that was created; the caller decides when
        to show or close it.
    """
    # Image.open keeps the file open until the image is closed. Copy the
    # pixels into an array while the file is open, then release the handle so
    # plotting many samples does not leak file descriptors.
    with Image.open(image_path) as img:
        img_array = np.array(img)
        img_w, img_h = img.size

    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    ax.imshow(img_array)

    # Parse and draw bounding boxes
    with open(label_path, 'r') as f:
        for line in f:
            parts = list(map(float, line.strip().split()))
            if len(parts) < 5:
                # Blank or truncated lines carry no complete box
                continue

            class_id = int(parts[0])
            x_center, y_center, bbox_w, bbox_h = parts[1:5]

            # Convert from normalized center coordinates to pixel coordinates
            x_center *= img_w
            y_center *= img_h
            bbox_w *= img_w
            bbox_h *= img_h

            # Rectangle is anchored at the box's top-left corner (image
            # coordinates: y grows downwards), not at its centre
            x_min = x_center - bbox_w / 2
            y_min = y_center - bbox_h / 2

            rect = Rectangle((x_min, y_min), bbox_w, bbox_h,
                             linewidth=2, edgecolor='red', facecolor='none')
            ax.add_patch(rect)

            # Caption sits just above the box's top edge
            default_label = f"Class {class_id}"
            if class_names is None:
                label_text = default_label
            else:
                label_text = class_names.get(class_id, default_label)
            ax.text(x_min, y_min - 5, label_text, fontsize=10,
                    color='white', bbox=dict(facecolor='red', alpha=0.7))

    ax.set_title(f"Sample: {Path(image_path).name}", fontsize=12, fontweight='bold')
    ax.axis('off')
    plt.tight_layout()
    return fig

# Show up to five training images that have a label file of the same stem
sample_images = list((IMAGES_DIR / 'train').glob('*.jpg'))[:5]  # First 5 images

for img_path in sample_images:
    label_path = LABELS_DIR / 'train' / f"{img_path.stem}.txt"
    if not label_path.exists():
        # Images without annotations are skipped rather than drawn bare
        continue
    plot_sample_with_bboxes(img_path, label_path)
    plt.show()

## 10. Data Augmentation Examples

In [None]:
# NOTE: the Gaussian blur transform in albumentations is named `GaussianBlur`;
# there is no `GaussBlur`, so importing that name raises ImportError.
from albumentations import (
    Compose, HorizontalFlip, VerticalFlip, Rotate, GaussNoise,
    ColorJitter, RandomBrightnessContrast, GaussianBlur, Normalize
)

# Define augmentation pipeline
# Each transform is applied independently with its own probability `p`.
# bbox_params=None means only the image is transformed: bounding boxes are
# NOT flipped or rotated along with it.
augment_transform = Compose([
    HorizontalFlip(p=0.5),
    VerticalFlip(p=0.3),
    Rotate(limit=15, p=0.5),
    RandomBrightnessContrast(p=0.5),
    GaussNoise(p=0.3),
    GaussianBlur(blur_limit=3, p=0.3),
    ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=0.5),
], bbox_params=None)  # Update with appropriate bbox_params for your use case

print("Augmentation pipeline defined with:")
print("  - Horizontal Flip (50%)")
print("  - Vertical Flip (30%)")
print("  - Rotation (15°, 50%)")
print("  - Brightness/Contrast (50%)")
print("  - Gaussian Noise (30%)")
print("  - Gaussian Blur (30%)")
print("  - Color Jitter (50%)")

## 11. Visualize Augmentation Examples

In [None]:
# Use the first training image returned by the glob as the demo input.
# OpenCV loads images as BGR, so convert to RGB for matplotlib.
sample_img_path = list((IMAGES_DIR / 'train').glob('*.jpg'))[0]
sample_img = cv2.cvtColor(cv2.imread(str(sample_img_path)), cv2.COLOR_BGR2RGB)

# 2x3 grid flattened to six panels: the original followed by five random draws
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for i, ax in enumerate(axes):
    if i == 0:
        panel_img = sample_img
        panel_title = 'Original Image'
    else:
        # Every call re-samples the random pipeline, so each panel differs
        augmented = augment_transform(image=sample_img)['image']
        panel_img = augmented
        panel_title = f'Augmented Version {i}'

    ax.imshow(panel_img)
    ax.set_title(panel_title, fontsize=12, fontweight='bold')
    ax.axis('off')

plt.tight_layout()
plt.show()

## 12. Summary Statistics

In [None]:
# Final report combining the image and annotation statistics computed in the earlier cells
rule = "=" * 60
n_images = len(quality_stats['widths'])

print(rule)
print("DATASET SUMMARY".center(60))
print(rule)
print(f"\nTotal images: {n_images}")
print(f"Total annotations: {total_annots}")
print(f"Number of classes: {len(class_counts)}")
print(f"Average annotations per image: {total_annots / n_images:.2f}")
print(f"\nTotal dataset size: {np.sum(quality_stats['file_sizes']):.2f} MB")

# "Good" means the most frequent class has less than twice the annotations
# of the rarest one
per_class_counts = class_counts.values()
if max(per_class_counts) / min(per_class_counts) < 2:
    balance_verdict = 'Good'
else:
    balance_verdict = 'Imbalanced'
print(f"\nClass balance: {balance_verdict}")

print("\nRecommendations:")
print(f"  - Consider class weighting: {class_counts}")
print("  - Use data augmentation for underrepresented classes")
print("  - Monitor small objects (< 1000 px²)")
print(rule)