## 1. Import Required Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
from PIL import Image
import json

# Notebook-wide plotting defaults: seaborn's 'whitegrid' theme plus a larger
# default figure size for every matplotlib figure created afterwards.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## 2. Load Dataset Information

In [None]:
# Define data paths (relative to the notebook's working directory)
DATA_DIR = Path('data/detection')
IMAGES_DIR = DATA_DIR / 'images'
LABELS_DIR = DATA_DIR / 'labels'

# File suffixes treated as images; compared case-insensitively.
IMAGE_EXTENSIONS = frozenset(
    {'.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.webp'}
)


def list_image_files(directory, extensions=IMAGE_EXTENSIONS):
    """Return the image files directly inside *directory*, sorted by path.

    A bare ``glob('*')`` also yields sub-directories and stray files such as
    ``.DS_Store``, and its order is unspecified. Filtering and sorting keeps
    the counts honest and makes "the first N images" reproducible.

    Args:
        directory: Folder to scan (not recursive).
        extensions: Lower-case suffixes (with leading dot) to keep. Pass
            ``None`` to keep every regular file regardless of suffix.

    Returns:
        A sorted list of ``Path`` objects; empty if nothing matches.
    """
    return sorted(
        path
        for path in Path(directory).glob('*')
        if path.is_file()
        and (extensions is None or path.suffix.lower() in extensions)
    )


# Count images
train_images = list_image_files(IMAGES_DIR / 'train')
val_images = list_image_files(IMAGES_DIR / 'val')

print("Dataset Statistics:")
print(f"  Training images: {len(train_images)}")
print(f"  Validation images: {len(val_images)}")
print(f"  Total: {len(train_images) + len(val_images)}")

## 3. Image Statistics

In [None]:
def get_image_stats(image_path):
    """Return basic metadata for a single image file.

    Args:
        image_path: Path of the image to inspect.

    Returns:
        A dict with the keys ``'width'`` and ``'height'`` (pixels),
        ``'format'`` (the format name reported by PIL) and ``'size_mb'``
        (file size on disk in mebibytes).

    Raises:
        OSError: If the file cannot be opened or is not a recognised image.
    """
    # The context manager releases the file handle that Image.open keeps
    # open; without it every inspected image leaks a descriptor.
    with Image.open(image_path) as img:
        width, height = img.size
        image_format = img.format

    return {
        'width': width,
        'height': height,
        'format': image_format,
        'size_mb': os.path.getsize(image_path) / (1024 * 1024),
    }

# Collect statistics
stats = []
all_images = train_images + val_images
skipped_images = 0

# Only the first 100 files are inspected to keep this cell fast.
for img_path in all_images[:100]:
    try:
        stats.append(get_image_stats(img_path))
    except Exception as exc:
        # Best effort: one unreadable file must not abort the analysis, but
        # it is reported instead of being silently dropped. `Exception`
        # (not a bare except) lets KeyboardInterrupt through.
        skipped_images += 1
        print(f"  Skipping unreadable image {img_path}: {exc}")

# Explicit columns keep the frame's schema stable even when no image could
# be read, so later column lookups do not raise KeyError.
df_stats = pd.DataFrame(stats, columns=['width', 'height', 'format', 'size_mb'])

if skipped_images:
    print(f"\nSkipped {skipped_images} unreadable image(s)")

if df_stats.empty:
    print("\nNo readable images found; image statistics unavailable.")
else:
    print("\nImage Dimensions:")
    print(df_stats[['width', 'height']].describe())
    print("\nFile Sizes (MB):")
    print(df_stats['size_mb'].describe())

## 4. Annotation Analysis (YOLO Format)

In [None]:
def parse_yolo_annotation(label_path):
    """Read a YOLO label file into a list of bounding-box dicts.

    Each line with at least five whitespace-separated fields becomes one dict
    with the keys ``class_id`` (int) and ``x_center``, ``y_center``,
    ``width``, ``height`` (floats), taken from the first five fields in that
    order. Shorter lines (including blank ones) are skipped. A path that does
    not exist yields an empty list.

    Raises:
        ValueError: If one of the five fields on a kept line is not numeric.
    """
    if not label_path.exists():
        return []

    float_fields = ('x_center', 'y_center', 'width', 'height')
    boxes = []
    with open(label_path, 'r') as handle:
        for raw_line in handle:
            tokens = raw_line.strip().split()
            if len(tokens) < 5:
                continue
            record = {'class_id': int(tokens[0])}
            for key, token in zip(float_fields, tokens[1:5]):
                record[key] = float(token)
            boxes.append(record)
    return boxes

# Count annotations
annotation_count = 0
annotation_list = []

# Only the first 100 training images are sampled to keep this cell fast.
annotation_sample = train_images[:100]

for img_path in annotation_sample:
    # Append '.txt' to the full stem. `with_suffix` would first strip
    # whatever follows the last dot of the stem, so 'img.v1.jpg' would
    # wrongly resolve to 'img.txt' instead of 'img.v1.txt'.
    label_path = LABELS_DIR / 'train' / f"{img_path.stem}.txt"
    try:
        annotations = parse_yolo_annotation(label_path)
    except (OSError, ValueError) as exc:
        # A single malformed label file should not abort the whole analysis.
        print(f"  Skipping unreadable label {label_path}: {exc}")
        continue
    annotation_count += len(annotations)
    annotation_list.extend(annotations)

print(f"Total annotations in sample: {annotation_count}")
if annotation_sample:
    print(f"Average annotations per image: {annotation_count / len(annotation_sample):.2f}")
else:
    # Avoid ZeroDivisionError when the training split is empty or missing.
    print("Average annotations per image: n/a (no training images found)")

## 5. Annotation Size Distribution

In [None]:
if annotation_list:
    df_annotations = pd.DataFrame(annotation_list)

    # Box area as the product of the stored width and height values
    df_annotations['box_area'] = df_annotations['width'] * df_annotations['height']

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # One entry per panel, in row-major grid order:
    # (column to plot, fill colour, panel title, x-axis label)
    histogram_panels = [
        ('width', 'skyblue', 'Bounding Box Width Distribution', 'Normalized Width'),
        ('height', 'lightcoral', 'Bounding Box Height Distribution', 'Normalized Height'),
        ('box_area', 'lightgreen', 'Bounding Box Area Distribution', 'Normalized Area'),
        ('x_center', 'lightyellow', 'Horizontal Position Distribution', 'Normalized X Center'),
    ]

    for panel, (column, colour, title, x_label) in zip(axes.flat, histogram_panels):
        panel.hist(df_annotations[column], bins=30, color=colour, edgecolor='black')
        panel.set_title(title)
        panel.set_xlabel(x_label)
        panel.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

    print("\nBounding Box Statistics:")
    print(df_annotations[['width', 'height', 'box_area']].describe())

## 6. Visualize Sample Annotations

In [None]:
def visualize_annotation(image_path, label_path, ax):
    """Draw an image and its YOLO bounding boxes on a matplotlib axis.

    Args:
        image_path: Path of the image to display; its file name becomes the
            panel title.
        label_path: Path of the matching YOLO label file. Its boxes are
            parsed with ``parse_yolo_annotation``.
        ax: Matplotlib axis to draw on.

    If the image cannot be decoded, the panel shows a short placeholder
    message instead of raising.
    """
    ax.set_title(image_path.name)

    img = cv2.imread(str(image_path))
    if img is None:
        # cv2.imread reports failure by returning None rather than raising;
        # passing that on to cvtColor would fail with an opaque OpenCV error.
        ax.text(0.5, 0.5, 'unreadable image', ha='center', va='center',
                transform=ax.transAxes)
        ax.axis('off')
        return

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # type: ignore
    height, width = img.shape[:2]
    ax.imshow(img)

    # Draw bounding boxes
    for ann in parse_yolo_annotation(label_path):
        # Scale the stored centre/size values by the image dimensions to get
        # pixel units.
        box_w = ann['width'] * width
        box_h = ann['height'] * height
        # Rectangle expects the top-left corner, not the centre. Float
        # coordinates are passed as-is; truncating to int only shifts the box.
        x_left = ann['x_center'] * width - box_w / 2
        y_top = ann['y_center'] * height - box_h / 2

        rect = plt.Rectangle((x_left, y_top), box_w, box_h, fill=False,
                             edgecolor='red', linewidth=2)  # type: ignore
        ax.add_patch(rect)

    ax.axis('off')

# Visualize samples
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Show at most one training image per panel.
sample_images = train_images[:len(axes)]

for ax, img_path in zip(axes, sample_images):
    # Append '.txt' to the full stem; `with_suffix` would truncate stems that
    # contain a dot ('img.v1.jpg' -> 'img.txt') and pick the wrong label file.
    label_path = LABELS_DIR / 'train' / f"{img_path.stem}.txt"
    visualize_annotation(img_path, label_path, ax)

# With fewer images than panels, hide the leftovers instead of leaving empty
# framed axes in the figure.
for ax in axes[len(sample_images):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

print("Sample annotations visualized")

## 7. Data Quality Checks

In [None]:
def check_data_quality():
    """Scan the train and val splits and count common dataset problems.

    Every regular file in ``IMAGES_DIR/<split>`` is checked; sub-directories
    are ignored.

    Returns:
        A dict of counters:
          * ``'missing_labels'``: images without a ``<stem>.txt`` label file.
          * ``'corrupted_images'``: files PIL cannot open or verify.
          * ``'empty_images'``: images reporting a zero width or height.
          * ``'invalid_annotations'``: boxes whose centre is outside [0, 1]
            or whose width/height is outside (0, 1], plus one per label file
            that cannot be parsed at all.
    """
    issues = {
        'missing_labels': 0,
        'corrupted_images': 0,
        'empty_images': 0,
        'invalid_annotations': 0
    }

    for split in ('train', 'val'):
        image_dir = IMAGES_DIR / split
        label_dir = LABELS_DIR / split

        for img_path in image_dir.glob('*'):
            if not img_path.is_file():
                continue  # a sub-directory is not a sample

            # Check image validity. This runs before the label lookup so that
            # images without labels are still integrity-checked.
            try:
                with Image.open(img_path) as img:
                    img_width, img_height = img.size
                    # Image.open only reads the header; verify() is what
                    # actually detects broken file contents.
                    img.verify()
            except Exception:
                # PIL raises a variety of exception types for broken files,
                # so a broad catch is deliberate here.
                issues['corrupted_images'] += 1
            else:
                if img_width == 0 or img_height == 0:
                    issues['empty_images'] += 1

            # Check if label exists. '.txt' is appended to the full stem;
            # `with_suffix` would truncate stems that contain a dot
            # ('img.v1.jpg' -> 'img.txt').
            label_path = label_dir / f"{img_path.stem}.txt"
            if not label_path.exists():
                issues['missing_labels'] += 1
                continue

            # Check annotation validity
            try:
                annotations = parse_yolo_annotation(label_path)
            except (OSError, ValueError):
                # Unreadable file or non-numeric field: count the file once.
                issues['invalid_annotations'] += 1
                continue

            for ann in annotations:
                centre_ok = 0 <= ann['x_center'] <= 1 and 0 <= ann['y_center'] <= 1
                size_ok = 0 < ann['width'] <= 1 and 0 < ann['height'] <= 1
                if not (centre_ok and size_ok):
                    issues['invalid_annotations'] += 1

    return issues

# Run the checks once and print one line per counter, flagging non-zero ones.
quality_issues = check_data_quality()

print("\nData Quality Report:")
print("=" * 40)
for issue_name, occurrences in quality_issues.items():
    marker = "⚠" if occurrences != 0 else "✓"
    print(f"{marker} {issue_name}: {occurrences}")

## 8. Summary and Recommendations

In [None]:
print("\n" + "=" * 50)
print("DATASET ANALYSIS SUMMARY")
print("=" * 50)

print("\nDataset Size:")
print(f"  Train: {len(train_images)} images")
print(f"  Val: {len(val_images)} images")
print(f"  Total: {len(train_images) + len(val_images)} images")

# `df_stats` is always a DataFrame, so test for emptiness rather than None;
# an empty frame would otherwise fail on the column lookups below.
if df_stats is not None and not df_stats.empty:
    print("\nImage Properties:")
    print(f"  Avg Width: {df_stats['width'].mean():.0f} px")
    print(f"  Avg Height: {df_stats['height'].mean():.0f} px")
    print(f"  Avg Size: {df_stats['size_mb'].mean():.2f} MB")

# The annotation sample covers at most the first 100 training images.
sampled_image_count = min(100, len(train_images))

print("\nAnnotations:")
print(f"  Total in sample: {annotation_count}")
if sampled_image_count:
    print(f"  Avg per image: {annotation_count / sampled_image_count:.2f}")
else:
    # Avoid ZeroDivisionError when no training images were found.
    print("  Avg per image: n/a (no training images found)")

print("\nRecommendations:")
# The YOLOv8 default input size is square (imgsz=640), not 640x480.
print("  • Image size for training: 640x640 (YOLO standard)")
print("  • Batch size: 32-64 (adjust based on GPU memory)")
print("  • Model: YOLOv8n or YOLOv8s for edge deployment")
print("  • Epochs: 100-150 with early stopping")