# Notebook Explanation

## 1. Load Dataset:
- The code lists the image files (`.jpg` / `.png`) in the image directory and loads the annotations JSON file.
- Ensures the dataset is ready for splitting.

## 2. Splitting Dataset:
- Uses train_test_split from sklearn to split the dataset:
    - 80% for training and validation combined, further split into 64% training and 16% validation (percentages of the full dataset).
    - 20% for testing.

## 3. Filter Annotations:
Filters annotations to ensure only those relevant to each split are included.

## 4. Save Splits:
Saves the training, validation, and test annotations to separate JSON files.

## Summary:
- Logs the number of images and annotations in each split.

## Expected Output:
- Logs:
    - Total number of images and annotations found.
    - Number of images and annotations in each split.
- Generated Files:
    - ../data/train_annotations.json
    - ../data/val_annotations.json
    - ../data/test_annotations.json

## Next Steps:
- Verify the splits by checking the generated JSON files.
- Use train_annotations.json and val_annotations.json in model_training.ipynb.
- Use test_annotations.json for evaluating the model in model_evaluation.ipynb.

In [None]:
# Import necessary libraries
import os
import json
from sklearn.model_selection import train_test_split

# Define dataset paths
image_dir = "../data/images"  # Directory containing image files
annotation_file = "../data/annotations.json"  # Path to the annotations file

# Output paths for splits
train_annotation_file = "../data/train_annotations.json"
val_annotation_file = "../data/val_annotations.json"
test_annotation_file = "../data/test_annotations.json"

# Load all image file names.
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted before splitting; otherwise random_state alone would not
# make the splits reproducible across machines or runs.
image_files = sorted(f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.png')))
print(f"Total images found: {len(image_files)}")

# Load annotations. The encoding is given explicitly so the result does not
# depend on the platform's default text encoding (JSON files are normally UTF-8).
with open(annotation_file, 'r', encoding='utf-8') as f:
    annotations = json.load(f)
print(f"Total annotations found: {len(annotations)}")

# Split dataset into training, validation, and testing sets.
# Step 1: hold out 20% of all images for testing (80% remain).
train_images, test_images = train_test_split(image_files, test_size=0.2, random_state=42)
# Step 2: take 20% of the remaining 80% for validation, i.e. 16% of the full
# dataset, leaving 64% of the full dataset for training.
train_images, val_images = train_test_split(train_images, test_size=0.2, random_state=42)

# Function to filter annotations based on image list
def filter_annotations(image_list, annotations_dict):
    """
    Keep only the annotation entries whose key appears in the given image list.
    Args:
    - image_list: Iterable of image filenames to keep.
    - annotations_dict: Mapping from image filename to its annotation data.
    Returns:
    - New dictionary with the matching entries, in image_list order.
      Filenames that have no entry in annotations_dict are skipped.
    """
    selected = {}
    for name in image_list:
        # Images without an annotation entry are silently left out.
        if name not in annotations_dict:
            continue
        selected[name] = annotations_dict[name]
    return selected

# Build one annotations dictionary per split (train, validation, test),
# each restricted to the images assigned to that split.
train_annotations, val_annotations, test_annotations = (
    filter_annotations(split_images, annotations)
    for split_images in (train_images, val_images, test_images)
)

# Save the split annotations to JSON files
def save_annotations_to_file(annotations, file_path):
    """
    Write annotations to disk as indented JSON and report the destination.
    Args:
    - annotations: JSON-serializable annotations object (a dictionary here).
    - file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, mode='w') as out_file:
        # indent=4 keeps the generated file human-readable.
        json.dump(annotations, out_file, indent=4)
    print(f"Saved annotations to {file_path}")

# Write each split's annotations to its own JSON file (train, val, test order).
split_outputs = zip(
    (train_annotations, val_annotations, test_annotations),
    (train_annotation_file, val_annotation_file, test_annotation_file),
)
for split_annotations, destination in split_outputs:
    save_annotations_to_file(split_annotations, destination)

# Report how many images were assigned to each split and how many of them
# have an annotation entry.
print("Dataset Splits Summary:")
print(f"Training set: {len(train_images)} images, {len(train_annotations)} annotations")
print(f"Validation set: {len(val_images)} images, {len(val_annotations)} annotations")
print(f"Testing set: {len(test_images)} images, {len(test_annotations)} annotations")
