In [1]:
import os
import yaml
import random
from pathlib import Path
from typing import List, Tuple

print("Train/Val Split Creation Script Starting...")

Train/Val Split Creation Script Starting...


In [2]:
def split_images(images_dir: str, seed: int = 42, val_ratio: float = 0.1) -> Tuple[List[str], List[str]]:
    """
    Split image files into train and validation sets.
    
    Args:
        images_dir: Directory containing image files
        seed: Random seed for reproducible splits
        val_ratio: Fraction of data to use for validation
        
    Returns:
        Tuple of (train_files, val_files)
    """
    # Get all image files
    image_dir = Path(images_dir)
    image_extensions = ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']
    
    image_files = []
    for ext in image_extensions:
        image_files.extend(image_dir.glob(f"*{ext}"))
    
    # Convert to relative paths (images/filename.jpg)
    image_paths = [f"images/{img.name}" for img in image_files]
    
    # Shuffle deterministically
    random.seed(seed)
    random.shuffle(image_paths)
    
    # Split into train/val
    total_count = len(image_paths)
    val_count = int(total_count * val_ratio)
    train_count = total_count - val_count
    
    val_files = image_paths[:val_count]
    train_files = image_paths[val_count:]
    
    print(f"Total images: {total_count}")
    print(f"Training images: {train_count} ({(train_count/total_count)*100:.1f}%)")
    print(f"Validation images: {val_count} ({(val_count/total_count)*100:.1f}%)")
    
    return train_files, val_files

In [3]:
# Set up paths
images_dir = "../../dataset/images"
train_file = "../../dataset/train.txt"
val_file = "../../dataset/val.txt"
data_yaml_file = "../../dataset/data.yaml"

# Configuration
SEED = 42
VAL_RATIO = 0.1  # 10% for validation

print(f"Images directory: {images_dir}")
print(f"Validation ratio: {VAL_RATIO*100:.1f}%")
print(f"Random seed: {SEED}")

Images directory: ../../dataset/images
Validation ratio: 10.0%
Random seed: 42


In [4]:
# Check if images directory exists
if not Path(images_dir).exists():
    print(f"Error: Images directory {images_dir} does not exist!")
    print("Please run the previous notebook (5_write_yolo_labels.ipynb) first.")
else:
    print(f"Found images directory: {images_dir}")

Found images directory: ../../dataset/images


In [5]:
# Create train/val split
print("Creating train/validation split...")
train_images, val_images = split_images(images_dir, seed=SEED, val_ratio=VAL_RATIO)

Creating train/validation split...
Total images: 1252
Training images: 1127 (90.0%)
Validation images: 125 (10.0%)


In [6]:
# Write train.txt file
with open(train_file, 'w', encoding='utf-8') as f:
    for img_path in train_images:
        f.write(f"{img_path}\n")

print(f"Created train file: {train_file} ({len(train_images)} images)")

Created train file: ../../dataset/train.txt (1127 images)


In [7]:
# Write val.txt file
with open(val_file, 'w', encoding='utf-8') as f:
    for img_path in val_images:
        f.write(f"{img_path}\n")

print(f"Created validation file: {val_file} ({len(val_images)} images)")

Created validation file: ../../dataset/val.txt (125 images)


In [8]:
# Update data.yaml file to point to split files
if Path(data_yaml_file).exists():
    # Load existing data.yaml
    with open(data_yaml_file, 'r', encoding='utf-8') as f:
        data_config = yaml.safe_load(f)
    
    # Update train and val paths
    data_config['train'] = 'train.txt'
    data_config['val'] = 'val.txt'
    
    # Save updated data.yaml
    with open(data_yaml_file, 'w', encoding='utf-8') as f:
        yaml.dump(data_config, f, default_flow_style=False, allow_unicode=True)
    
    print(f"Updated data.yaml file: {data_yaml_file}")
else:
    print(f"Warning: {data_yaml_file} not found. Please run notebook 5 first.")

Updated data.yaml file: ../../dataset/data.yaml


In [9]:
# Verify that corresponding label files exist
labels_dir = "../../dataset/labels"
missing_labels = []

print("\nVerifying label files exist for all images...")

all_images = train_images + val_images
for img_path in all_images[:10]:  # Check first 10 as sample
    # Convert images/filename.jpg to labels/filename.txt
    img_name = Path(img_path).stem  # Get filename without extension
    label_file = Path(labels_dir) / f"{img_name}.txt"
    
    if not label_file.exists():
        missing_labels.append(img_path)

if missing_labels:
    print(f"Warning: {len(missing_labels)} images have no corresponding label files")
    print(f"Sample missing: {missing_labels[:5]}")
else:
    print("✓ All sample images have corresponding label files")


Verifying label files exist for all images...
✓ All sample images have corresponding label files


In [10]:
# Print summary
print("\n" + "="*50)
print("TRAIN/VAL SPLIT SUMMARY")
print("="*50)
print(f"Total images: {len(train_images) + len(val_images)}")
print(f"Training images: {len(train_images)} ({len(train_images)/(len(train_images)+len(val_images))*100:.1f}%)")
print(f"Validation images: {len(val_images)} ({len(val_images)/(len(train_images)+len(val_images))*100:.1f}%)")
print(f"Random seed: {SEED}")

print(f"\nFiles created:")
print(f"  Train split: {train_file}")
print(f"  Val split: {val_file}")
print(f"  Updated config: {data_yaml_file}")

print("\nSample train images:")
for i, img in enumerate(train_images[:5]):
    print(f"  {i+1}. {img}")

print("\nSample validation images:")
for i, img in enumerate(val_images[:5]):
    print(f"  {i+1}. {img}")

print("\n" + "="*50)
print("Ready for YOLO training!")


TRAIN/VAL SPLIT SUMMARY
Total images: 1252
Training images: 1127 (90.0%)
Validation images: 125 (10.0%)
Random seed: 42

Files created:
  Train split: ../../dataset/train.txt
  Val split: ../../dataset/val.txt
  Updated config: ../../dataset/data.yaml

Sample train images:
  1. images/X51007846412.jpg
  2. images/X51005711447.jpg
  3. images/X51008114266.jpg
  4. images/X51005433514.jpg
  5. images/X51007103692.jpg

Sample validation images:
  1. images/X51006311780.jpg
  2. images/X51005433494.jpg
  3. images/X51006311758.jpg
  4. images/X51006619343.jpg
  5. images/X51005724629.jpg

Ready for YOLO training!


In [11]:
# Show final data.yaml content
if Path(data_yaml_file).exists():
    print(f"\nFinal {data_yaml_file} content:")
    with open(data_yaml_file, 'r', encoding='utf-8') as f:
        content = f.read()
        print(content)


Final ../../dataset/data.yaml content:
names:
- COMPANY
- ADDRESS
- DATE
- TOTAL
- TAX
- ITEM
- QTY
- UNIT_PRICE
- LINE_TOTAL
- DOCUMENT_NO
- CASHIER
- OTHER
nc: 12
path: dataset
train: train.txt
val: val.txt

