In [2]:
import os
import shutil
import random
from pathlib import Path

# === Config ===
input_root = Path("../out_data")             # Your current folder with PLU subfolders
output_root = Path("../out_data_split")      # Where train/ and val/ folders will be created
split_ratio = 0.8                             # 80% train, 20% val
seed = 42

# Seed the module-level RNG once; every class below draws from this single stream.
random.seed(seed)

# Start from an empty output tree so files from a previous split cannot linger
# next to the new ones.
if output_root.exists():
    shutil.rmtree(output_root)
output_root.mkdir(parents=True, exist_ok=True)

# iterdir() yields entries in arbitrary, filesystem-dependent order. Because all
# classes share one RNG stream, the visiting order decides which random numbers
# each class consumes - so sort the folders to make the split repeatable for a
# given seed.
class_folders = sorted(entry for entry in input_root.iterdir() if entry.is_dir())

# Loop through each PLU folder
for class_folder in class_folders:
    # Sort first so the shuffle starts from the same sequence on every run.
    images = sorted(class_folder.glob("*.png"))
    random.shuffle(images)

    # int() truncates: a class with a single image ends up entirely in val.
    split_point = int(len(images) * split_ratio)
    train_images = images[:split_point]
    val_images = images[split_point:]

    for mode, image_list in [("train", train_images), ("val", val_images)]:
        # Mirror the class folder name under train/ and val/.
        output_class_dir = output_root / mode / class_folder.name
        output_class_dir.mkdir(parents=True, exist_ok=True)

        for image_path in image_list:
            dest_path = output_class_dir / image_path.name
            # copy2 keeps file metadata (timestamps) along with the content.
            shutil.copy2(image_path, dest_path)

print("✅ Done. Dataset has been split into 'train/' and 'val/' inside ../out_data_split.")

✅ Done. Dataset has been split into 'train/' and 'val/' inside ../out_data_split.


In [None]:
import os
from pathlib import Path
from PIL import Image

# === CONFIG ===
# Source tree: one sub-folder per PLU (4011/, 4015/, etc.).
original_root = Path("dataset/NGD_HACK")
# Destination tree for the cropped images.
output_root = Path("out_data/")

# Make sure the destination exists before anything is written into it.
os.makedirs(output_root, exist_ok=True)

def crop_and_save_image(image_path, bbox, save_path):
    """Crop an image to a relative bounding box and write the result to disk.

    Args:
        image_path: Path of the source image. It is passed to ``Image.open`` and
            its ``.name`` attribute is used in the warning message.
        bbox: Mapping with the keys ``topX``, ``topY``, ``bottomX`` and
            ``bottomY``. Each value is multiplied by the image width or height,
            i.e. it is treated as a fraction of the image size.
        save_path: Destination file for the cropped image.

    Returns:
        bool: ``True`` when a crop was written, ``False`` when the box was empty
        or inverted and nothing was saved.
    """
    with Image.open(image_path) as img:
        width, height = img.size

        # Scale the relative coordinates to pixels, then clamp them to the image
        # so a box that slightly overshoots the frame never crops outside it.
        left = min(max(int(bbox['topX'] * width), 0), width)
        top = min(max(int(bbox['topY'] * height), 0), height)
        right = min(max(int(bbox['bottomX'] * width), 0), width)
        bottom = min(max(int(bbox['bottomY'] * height), 0), height)

        # Ensure valid box
        if left >= right or top >= bottom:
            print(f"⚠️ Skipping invalid bbox in {image_path.name}")
            return False

        cropped = img.crop((left, top, right, bottom))
        cropped.save(save_path)
        return True

import json

def parse_bbox_file(txt_file: Path):
    """Read the first bounding box from a JSON annotation file.

    The file is expected to hold a JSON object whose ``"label"`` entry is a
    list; only the first element of that list is used.

    Args:
        txt_file: Path of the annotation file.

    Returns:
        dict | None: ``{"topX", "topY", "bottomX", "bottomY"}`` with float
        values, or ``None`` when the file cannot be read or parsed, has no
        non-empty label list, or the first label lacks a usable coordinate.
        A warning is printed in every ``None`` case.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(txt_file, encoding="utf-8") as f:
            data = json.load(f)

        labels = data.get("label", []) if isinstance(data, dict) else None
        if isinstance(labels, list) and len(labels) > 0:
            bbox = labels[0]
            return {
                "topX": float(bbox["topX"]),
                "topY": float(bbox["topY"]),
                "bottomX": float(bbox["bottomX"]),
                "bottomY": float(bbox["bottomY"]),
            }

        print(f"⚠️ No label list found in {txt_file}")
        return None

    # OSError: unreadable file. ValueError: malformed JSON, undecodable bytes or
    # a non-numeric coordinate. KeyError / TypeError: label entry of the wrong
    # shape. Anything else is unexpected and is allowed to surface.
    except (OSError, ValueError, KeyError, TypeError) as e:
        print(f"⚠️ Failed to parse {txt_file}: {e}")
        return None

# === PROCESS ===
count = 0      # crops actually written during this run
skipped = 0    # annotations that produced no crop

# Sorted traversal keeps the log output in a stable order between runs.
for plu_folder in sorted(original_root.iterdir()):
    if not plu_folder.is_dir():
        continue

    # Annotations drive the loop: each <stem>.txt is paired with <stem>.png.
    for txt_file in sorted(plu_folder.glob("*.txt")):
        image_stem = txt_file.stem
        image_file = plu_folder / f"{image_stem}.png"

        if not image_file.exists():
            print(f"❌ Image not found for {image_file}")
            skipped += 1
            continue

        bbox = parse_bbox_file(txt_file)
        if bbox is None:
            print(f"❌ Invalid bbox for {txt_file}")
            skipped += 1
            continue

        # Output path (same subfolder)
        output_folder = output_root / plu_folder.name
        output_folder.mkdir(parents=True, exist_ok=True)

        save_path = output_folder / f"{image_stem}.png"

        # Remove any crop left by an earlier run so the existence check below
        # reflects this run only.
        if save_path.exists():
            save_path.unlink()

        crop_and_save_image(image_file, bbox, save_path)

        # crop_and_save_image returns without writing when the box is empty or
        # inverted; count such cases as skipped instead of saved.
        if save_path.exists():
            count += 1
        else:
            skipped += 1

print(f"\n✅ Done. Cropped and saved {count} images. Skipped {skipped}.")


✅ Done. Cropped and saved 3244 images. Skipped 0.


In [4]:
# ✅ Sanity check: ensure no file appears in both train and val

# Imported here so the cell does not depend on another cell's imports.
from pathlib import Path

# Name the split folder explicitly instead of reading `output_root` from the
# kernel: the cropping cell rebinds `output_root` to its own folder, so the
# variable's value depends on which cell ran last.
# Keep this path in sync with the split cell's output folder.
split_root = Path("../out_data_split")
train_dir = split_root / "train"
val_dir = split_root / "val"


def _collect_split_files(split_dir):
    """Return the set of "<class folder>/<file name>" ids for every PNG under split_dir."""
    return {
        f"{class_folder.name}/{img.name}"
        for class_folder in split_dir.iterdir()
        if class_folder.is_dir()  # ignore stray files next to the class folders
        for img in class_folder.glob("*.png")
    }


# Collect file identifiers (PLU + filename)
train_files = _collect_split_files(train_dir)
val_files = _collect_split_files(val_dir)

# Find duplicates
duplicates = train_files.intersection(val_files)

if duplicates:
    print(f"❌ Found {len(duplicates)} duplicates between train and val:")
    # Sorted so the same ten examples are shown on every run.
    for dup in sorted(duplicates)[:10]:  # Show first 10 duplicates
        print(" -", dup)
else:
    print("✅ No duplicates found between train and val sets.")

❌ Found 1 duplicates between train and val:
 - 4011/4011-1419.png
