# Dataset split

## Functions

In [5]:
import os
from pathlib import Path
from collections import defaultdict

def count_images_per_subfolder(folder_path):
    """
    Count image files in each immediate subfolder of the given directory.

    Only files located directly inside a subfolder are counted (there is no
    recursion into nested folders). A file counts as an image when its
    lowercased suffix is one of the extensions listed below.

    Args:
        folder_path (str): Path to the main folder to analyze

    Returns:
        dict: Dictionary with subfolder names as keys and image counts as
            values. Subfolders that contain no image files are omitted.
            An empty dict is returned (after printing an error) when the
            path does not exist or is not a directory.
    """
    # Define common image file extensions
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif',
                        '.webp', '.svg', '.ico', '.raw', '.cr2', '.nef', '.arw'}

    folder_path = Path(folder_path)

    # Check if the provided path exists
    if not folder_path.exists():
        print(f"Error: The path '{folder_path}' does not exist.")
        return {}

    # iterdir() raises NotADirectoryError on a regular file, so reject it
    # the same way a missing path is rejected.
    if not folder_path.is_dir():
        print(f"Error: The path '{folder_path}' is not a directory.")
        return {}

    image_counts = {}

    # Sorted so the resulting dict has a stable order regardless of the
    # order the filesystem happens to list entries in.
    for item in sorted(folder_path.iterdir()):
        if not item.is_dir():  # Only process directories (subfolders)
            continue

        count = sum(
            1
            for file in item.iterdir()
            if file.is_file() and file.suffix.lower() in image_extensions
        )

        # Subfolders without images are left out of the result.
        if count:
            image_counts[item.name] = count

    return image_counts

In [6]:
import os
import shutil
import random
from pathlib import Path
from collections import defaultdict
import math

def split_dataset(source_folder, output_folder, train_ratio=0.7, test_ratio=0.15, val_ratio=0.15, seed=42):
    """
    Split a dataset into train/test/validation sets while maintaining folder structure.

    Every immediate subfolder of ``source_folder`` is treated as one class.
    Its image files are shuffled and copied (the originals are left in
    place) into three sibling trees created inside ``output_folder``:
    ``<source name>_samples``, ``<source name>_test`` and
    ``<source name>_validation``, each containing one subfolder per class.

    Per class, the train and test counts are ``floor(n * ratio)`` and the
    validation set receives all remaining files, so rounding leftovers end
    up in validation.

    Existing destination folders are reused and are not cleared first;
    files with the same name are overwritten.

    Args:
        source_folder (str): Path to the source folder containing subfolders with images
        output_folder (str): Path to the output directory where split folders will be created
        train_ratio (float): Proportion for training set (default 0.7)
        test_ratio (float): Proportion for test set (default 0.15)
        val_ratio (float): Proportion for validation set (default 0.15)
        seed (int): Random seed for reproducible splits

    Returns:
        dict: Summary of the split operation, keyed by class name. Each value
            maps 'original', 'samples', 'test', 'validation' and
            'total_copied' to file counts. Classes without images are not
            included.

    Raises:
        ValueError: If any ratio is negative, the ratios do not sum to 1.0,
            or the source folder is missing or not a directory.
    """
    # Validate ratios. A negative ratio can still sum to 1.0 with the others
    # but would yield meaningless slice boundaries below.
    if min(train_ratio, test_ratio, val_ratio) < 0:
        raise ValueError("Ratios must be non-negative")
    if abs(train_ratio + test_ratio + val_ratio - 1.0) > 1e-6:
        raise ValueError("Ratios must sum to 1.0")

    # Define image extensions
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif',
                        '.webp', '.svg', '.ico', '.raw', '.cr2', '.nef', '.arw'}

    source_path = Path(source_folder)
    output_path = Path(output_folder)

    if not source_path.exists():
        raise ValueError(f"Source folder '{source_folder}' does not exist")
    # Fail before anything is created on disk rather than midway in iterdir().
    if not source_path.is_dir():
        raise ValueError(f"Source folder '{source_folder}' is not a directory")

    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)

    # Create destination folder names inside the output folder
    base_name = source_path.name
    samples_folder = output_path / f"{base_name}_samples"
    test_folder = output_path / f"{base_name}_test"
    validation_folder = output_path / f"{base_name}_validation"

    # Create main destination folders
    for folder in [samples_folder, test_folder, validation_folder]:
        folder.mkdir(exist_ok=True)

    # Dedicated generator: reproducible for a given seed without reseeding
    # the process-wide ``random`` module state.
    rng = random.Random(seed)

    # Track statistics
    split_summary = defaultdict(lambda: defaultdict(int))
    total_images_processed = 0
    total_images_original = 0

    print(f"Creating dataset split from: {source_folder}")
    print(f"Output directory: {output_folder}")
    print(f"Destination folders:")
    # Report the ratios actually in use, not fixed percentages.
    print(f"  - Samples ({train_ratio * 100:g}%): {samples_folder}")
    print(f"  - Test ({test_ratio * 100:g}%): {test_folder}")
    print(f"  - Validation ({val_ratio * 100:g}%): {validation_folder}")
    print("\nProcessing subfolders...")

    # Process each subfolder (class). iterdir() yields entries in arbitrary
    # order, so sort to make the sequence of shuffles seed-reproducible.
    subfolders_processed = 0
    class_folders = sorted(p for p in source_path.iterdir() if p.is_dir())
    for subfolder in class_folders:
        subfolder_name = subfolder.name
        print(f"\nProcessing class: {subfolder_name}")

        # Create corresponding subfolders in each destination
        samples_subfolder = samples_folder / subfolder_name
        test_subfolder = test_folder / subfolder_name
        validation_subfolder = validation_folder / subfolder_name

        for dest_subfolder in [samples_subfolder, test_subfolder, validation_subfolder]:
            dest_subfolder.mkdir(exist_ok=True)

        # Get all image files in this subfolder, sorted so the shuffle
        # starts from the same ordering on every filesystem.
        image_files = sorted(
            f for f in subfolder.iterdir()
            if f.is_file() and f.suffix.lower() in image_extensions
        )

        if not image_files:
            print(f"  No images found in {subfolder_name}")
            continue

        # Shuffle files for random split
        rng.shuffle(image_files)

        # Calculate split indices
        total_files = len(image_files)
        train_count = math.floor(total_files * train_ratio)
        test_count = math.floor(total_files * test_ratio)
        test_end = train_count + test_count

        # Split files; everything after the test slice goes to validation
        train_files = image_files[:train_count]
        test_files = image_files[train_count:test_end]
        val_files = image_files[test_end:]

        # Copy files to respective folders (copy2 also preserves metadata)
        for file_list, destination in (
            (train_files, samples_subfolder),
            (test_files, test_subfolder),
            (val_files, validation_subfolder),
        ):
            for file in file_list:
                shutil.copy2(file, destination / file.name)

        samples_copied = len(train_files)
        test_copied = len(test_files)
        val_copied = len(val_files)
        copied_total = samples_copied + test_copied + val_copied

        # Update statistics
        split_summary[subfolder_name]['original'] = total_files
        split_summary[subfolder_name]['samples'] = samples_copied
        split_summary[subfolder_name]['test'] = test_copied
        split_summary[subfolder_name]['validation'] = val_copied
        split_summary[subfolder_name]['total_copied'] = copied_total

        total_images_original += total_files
        total_images_processed += copied_total

        # Print subfolder summary
        print(f"  Original: {total_files} images")
        print(f"  Samples: {samples_copied} ({samples_copied/total_files*100:.1f}%)")
        print(f"  Test: {test_copied} ({test_copied/total_files*100:.1f}%)")
        print(f"  Validation: {val_copied} ({val_copied/total_files*100:.1f}%)")

        subfolders_processed += 1

    # Validation check
    print("\n" + "="*60)
    print("VALIDATION SUMMARY")
    print("="*60)

    validation_passed = True
    for class_name, stats in split_summary.items():
        original = stats['original']
        total_copied = stats['total_copied']
        if original != total_copied:
            print(f"❌ ERROR in {class_name}: {original} original ≠ {total_copied} copied")
            validation_passed = False
        else:
            print(f"✅ {class_name}: {original} images correctly split")

    print(f"\nOverall Summary:")
    print(f"  Subfolders processed: {subfolders_processed}")
    print(f"  Total original images: {total_images_original}")
    print(f"  Total images copied: {total_images_processed}")
    print(f"  Validation: {'✅ PASSED' if validation_passed else '❌ FAILED'}")

    if validation_passed:
        print(f"\n🎉 Dataset split completed successfully!")
        print(f"   All {total_images_original} images have been correctly distributed.")
        print(f"   Split folders created in: {output_folder}")

    return dict(split_summary)

In [None]:
# Example usage: report how many images each class subfolder holds.
FOLDER_PATH = "/Users/ivan/Workspace/agentai-document-data-extractor/smartdoc/docs-sm"  # Replace with your actual folder path

# Gather the per-subfolder counts
results = count_images_per_subfolder(FOLDER_PATH)

# Report header
print(f"Image count per subfolder in '{FOLDER_PATH}':")
print("-" * 50)

if not results:
    # An empty result means the folder had no subfolders with images
    # (or the path could not be read).
    print("No subfolders found or no images in any subfolders.")
else:
    # One line per subfolder, in alphabetical order
    for subfolder in sorted(results):
        count = results[subfolder]
        print(f"{subfolder}: {count} images")

    # Aggregate figures across all subfolders
    total_subfolders = len(results)
    total_images = sum(results.values())
    if total_subfolders > 0:
        avg_images = total_images / total_subfolders
    else:
        avg_images = 0

    print("\n" + "=" * 50)
    print("Summary:")
    print(f"Total subfolders: {total_subfolders}")
    print(f"Total images: {total_images}")
    print(f"Average images per subfolder: {avg_images:.2f}")

## Replace paths here

In [None]:
# Input and output locations for the split; adjust both before running.
source_folder = "../docs-sm"  # Replace with your actual source folder path
output_folder = "../output"  # Replace with your desired output folder path

# Run the split and print a per-class breakdown. Any failure is reported
# as a single error line instead of a traceback.
try:
    results = split_dataset(
        source_folder,
        output_folder,
        train_ratio=0.7,   # share copied to the samples (training) tree
        test_ratio=0.15,   # share copied to the test tree
        val_ratio=0.15,    # share copied to the validation tree
        seed=42,           # fixed seed so repeated runs agree
    )

    # Per-class counts taken from the returned summary
    print("\nDetailed breakdown by class:")
    for class_name, stats in results.items():
        print(f"\n{class_name}:")
        print(f"  Original: {stats['original']}")
        print(f"  Samples: {stats['samples']}")
        print(f"  Test: {stats['test']}")
        print(f"  Validation: {stats['validation']}")
except Exception as exc:
    print(f"Error: {exc}")