In [2]:
import os
import shutil
import random
from pathlib import Path

# === Config ===
# Source folder: one subfolder per PLU class.
input_root = Path("../../../data/out_data")
# Destination folder: train/ and val/ are created underneath it.
output_root = Path("../../../data/out_data_split")
split_ratio = 0.8   # fraction of each class copied to train; the remainder goes to val
seed = 42           # fixed seed so the shuffle can be repeated

# Validate BEFORE the destructive step below: with a mistyped input path the old
# split would otherwise be deleted first and the run would only fail afterwards.
if not input_root.is_dir():
    raise FileNotFoundError(f"input_root is not an existing directory: {input_root}")

# Never let the clean-up below delete the source data: refuse an output folder
# that is the input folder itself or one of its ancestors.
_resolved_input = input_root.resolve()
_resolved_output = output_root.resolve()
if _resolved_output == _resolved_input or _resolved_output in _resolved_input.parents:
    raise ValueError(
        f"output_root ({output_root}) must not be or contain input_root ({input_root})"
    )

# Seeds the module-level RNG that the split uses for shuffling.
random.seed(seed)

# Start from an empty output tree so files from an earlier run cannot linger
# and end up in the wrong split.
if output_root.exists():
    shutil.rmtree(output_root)
output_root.mkdir(parents=True, exist_ok=True)

# Split every PLU class folder into a train/ and a val/ subset.
# iterdir() yields entries in arbitrary, filesystem-dependent order. Sorting the
# class folders fixes the order in which the seeded RNG is consumed, so the same
# seed produces the same split on every machine.
for class_folder in sorted(input_root.iterdir()):
    if not class_folder.is_dir():
        continue  # ignore stray files next to the class folders

    # Sort first so the shuffle always starts from the same known order.
    images = sorted(class_folder.glob("*.png"))
    random.shuffle(images)

    # int() floors, so very small classes can end up with an empty train split.
    split_point = int(len(images) * split_ratio)
    train_images = images[:split_point]
    val_images = images[split_point:]

    # Surface degenerate classes instead of silently creating an empty folder.
    if not train_images or not val_images:
        print(
            f"⚠️ Class '{class_folder.name}': {len(train_images)} train / "
            f"{len(val_images)} val images - one split is empty."
        )

    for mode, image_list in [("train", train_images), ("val", val_images)]:
        output_class_dir = output_root / mode / class_folder.name
        output_class_dir.mkdir(parents=True, exist_ok=True)

        for image_path in image_list:
            dest_path = output_class_dir / image_path.name
            # copy2 copies the file contents together with its metadata.
            shutil.copy2(image_path, dest_path)

# Report the configured destination rather than a hard-coded path.
print(f"✅ Done. Dataset has been split into 'train/' and 'val/' inside {output_root}.")

✅ Done. Dataset has been split into 'train/' and 'val/' inside ../out_data_split.



✅ Done. Cropped and saved 3244 images. Skipped 0.


In [4]:
# ✅ Sanity check: ensure no file appears in both train and val
# Files are compared by "<class folder>/<file name>" only; identical image content
# stored under two different names is not detected by this check.


def _collect_split_files(split_dir):
    """Return the identifiers of all .png files below one split directory.

    Args:
        split_dir: Path of a split folder (train or val) containing one
            subfolder per class.

    Returns:
        A set of "<class folder name>/<file name>" strings, one per .png file
        found directly inside a class subfolder. Entries of split_dir that are
        not directories are skipped.

    Raises:
        FileNotFoundError: if split_dir does not exist.
    """
    return {
        f"{class_folder.name}/{img.name}"
        for class_folder in split_dir.iterdir()
        if class_folder.is_dir()
        for img in class_folder.glob("*.png")
    }


train_dir = output_root / "train"
val_dir = output_root / "val"

# Collect file identifiers (PLU + filename)
train_files = _collect_split_files(train_dir)
val_files = _collect_split_files(val_dir)

# Find duplicates
duplicates = train_files.intersection(val_files)

if duplicates:
    print(f"❌ Found {len(duplicates)} duplicates between train and val:")
    # Sets have no stable order; sort so the same 10 entries are shown on every run.
    for dup in sorted(duplicates)[:10]:
        print(" -", dup)
else:
    print("✅ No duplicates found between train and val sets.")

❌ Found 1 duplicates between train and val:
 - 4011/4011-1419.png
