In [14]:
import os
import json
from PIL import Image

In [15]:
# === CONFIGURATION ===
# Upper bound on how many dataset sub-folders get processed.
NUM_FOLDERS = 26
# Upper bound on how many _bb.png/.txt pairs get processed inside each folder.
NUM_FILES_PER_FOLDER = 10_000

In [16]:
# === PATHS ===
# Both values are relative paths, so they resolve against the kernel's working directory.
# Dataset root; same literal that the visible call to crop_and_save_bounding_boxes passes.
DATASET_PATH = 'dataset/NGD_HACK'
# Output root for cropped images; same literal as that call's output_path.
OUTPUT_PATH = 'out_data/'

In [17]:
def crop_and_save_bounding_boxes(dataset_path='data/', output_path='out_data/',
                                 num_folders=None, num_files_per_folder=None):
    """Crop every ``*_bb.png`` image to the first bounding box in its label file.

    Each sub-folder of ``dataset_path`` is mirrored under ``output_path``. For an
    image ``<name>_bb.png`` the label file ``<name>.txt`` is read as JSON; its
    ``label[0]`` entry supplies ``topX``/``topY``/``bottomX``/``bottomY``, which
    are multiplied by the image width/height to get pixel coordinates. The crop
    is written as ``<name>_cropped.png``.

    Images with a missing label file, an unreadable/empty label, or a box that
    is empty after the inset are reported and skipped instead of aborting the
    whole run.

    Args:
        dataset_path: Directory containing one sub-folder per class.
        output_path: Directory that receives the mirrored sub-folders.
        num_folders: If not None, only the first N sub-folders (sorted by name).
        num_files_per_folder: If not None, only the first N ``*_bb.png`` files
            (sorted by name) of each sub-folder.
    """
    suffix = '_bb.png'
    # The crop is pulled in by 2 px at the top-left and 1 px at the bottom-right.
    # NOTE(review): presumably this trims the box outline drawn on the _bb images - confirm.
    inset_start, inset_end = 2, 1

    os.makedirs(output_path, exist_ok=True)

    folders = [
        name for name in sorted(os.listdir(dataset_path))
        if os.path.isdir(os.path.join(dataset_path, name))
    ]

    # Limit folders if specified
    if num_folders is not None:
        folders = folders[:num_folders]

    for folder in folders:
        folder_path = os.path.join(dataset_path, folder)
        output_folder_path = os.path.join(output_path, folder)
        os.makedirs(output_folder_path, exist_ok=True)

        bb_files = [f for f in sorted(os.listdir(folder_path)) if f.endswith(suffix)]

        # Limit number of files if specified
        if num_files_per_folder is not None:
            bb_files = bb_files[:num_files_per_folder]

        for bb_file in bb_files:
            # Strip only the trailing suffix; str.replace would also remove an
            # earlier occurrence of '_bb.png' inside the file name.
            base_name = bb_file[:-len(suffix)]
            txt_path = os.path.join(folder_path, f"{base_name}.txt")
            image_path = os.path.join(folder_path, bb_file)

            if not os.path.exists(txt_path):
                print(f"⚠️ Missing .txt file for: {bb_file}")
                continue

            # Read the label before touching the image so a bad label is cheap to skip.
            try:
                with open(txt_path, 'r') as f:
                    bbox = json.load(f)['label'][0]
                top_x, top_y, bottom_x, bottom_y = (
                    float(bbox[key]) for key in ('topX', 'topY', 'bottomX', 'bottomY')
                )
            except (OSError, ValueError, KeyError, IndexError, TypeError) as exc:
                # json.JSONDecodeError is a ValueError; the remaining types cover a
                # missing/empty 'label' list and missing or non-numeric corners.
                print(f"⚠️ Skipping {bb_file}: unusable bounding box in {txt_path} ({exc!r})")
                continue

            # The context manager closes the underlying file handle for every image.
            with Image.open(image_path) as img:
                width, height = img.size

                # Convert normalized coords to absolute
                left = int(top_x * width) + inset_start
                top = int(top_y * height) + inset_start
                right = int(bottom_x * width) - inset_end
                bottom = int(bottom_y * height) - inset_end

                if left >= right or top >= bottom:
                    print(f"⚠️ Skipping {bb_file}: empty bounding box "
                          f"({left}, {top}, {right}, {bottom})")
                    continue

                save_path = os.path.join(output_folder_path, f"{base_name}_cropped.png")
                img.crop((left, top, right, bottom)).save(save_path)

            print(f"✅ Saved: {save_path}")


In [20]:
import os
from pathlib import Path
from PIL import Image

# === CONFIG ===
# Both paths are relative, so they resolve against the kernel's current working directory.
original_root = Path("dataset/NGD_HACK")  # Folder with all PLU folders (4011/, 4015/, etc.)
output_root = Path("out_data/")        # Folder to save cropped images

# Create output root if it doesn't exist
# (parents=True also creates missing ancestors; exist_ok=True makes re-running the cell safe)
output_root.mkdir(parents=True, exist_ok=True)

def crop_and_save_image(image_path, bbox, save_path):
    """Crop ``image_path`` to a normalized bounding box and write it to ``save_path``.

    Args:
        image_path: Path (``str`` or ``Path``) of the image to open.
        bbox: Mapping with ``topX``, ``topY``, ``bottomX`` and ``bottomY``; each
            value is multiplied by the image width/height and truncated to an
            integer pixel coordinate.
        save_path: Destination passed to ``Image.save``.

    Returns:
        True if the crop was written, False if the box was empty or inverted
        and the image was skipped. Callers that ignore the result keep working.
    """
    with Image.open(image_path) as img:
        width, height = img.size
        left = int(bbox['topX'] * width)
        top = int(bbox['topY'] * height)
        right = int(bbox['bottomX'] * width)
        bottom = int(bbox['bottomY'] * height)

        # Ensure valid box
        if left >= right or top >= bottom:
            # Path(...) so that a plain string path is accepted here as well.
            print(f"⚠️ Skipping invalid bbox in {Path(image_path).name}")
            return False

        # Save while the source image is still open.
        img.crop((left, top, right, bottom)).save(save_path)
        return True

import json

def parse_bbox_file(txt_file: Path):
    """Read the first bounding box from a JSON label file.

    The file is expected to hold a JSON object whose ``"label"`` entry is a
    non-empty list; the first element's ``topX``, ``topY``, ``bottomX`` and
    ``bottomY`` values are converted to ``float``.

    Returns:
        A dict with exactly those four keys, or None (after printing a
        warning) when the label list is missing/empty or anything goes wrong
        while reading or converting.
    """
    corner_keys = ("topX", "topY", "bottomX", "bottomY")

    try:
        with open(txt_file) as handle:
            payload = json.load(handle)

        entries = payload.get("label", [])
        # Guard clause: bail out unless there is a list with at least one entry.
        if not isinstance(entries, list) or len(entries) == 0:
            print(f"⚠️ No label list found in {txt_file}")
            return None

        first = entries[0]
        return {key: float(first[key]) for key in corner_keys}

    except Exception as e:
        # Deliberately broad: any failure is reported and the file is skipped.
        print(f"⚠️ Failed to parse {txt_file}: {e}")
        return None

# === PROCESS ===
# Walk every PLU folder, pair each label file with the PNG of the same stem,
# and write the cropped image to the mirrored folder under output_root.
count = 0    # images counted as cropped and saved
skipped = 0  # label files that produced no output

# sorted() makes the traversal (and therefore the log) reproducible;
# Path.iterdir() and Path.glob() yield entries in arbitrary order.
for plu_folder in sorted(original_root.iterdir()):
    if not plu_folder.is_dir():
        continue

    for txt_file in sorted(plu_folder.glob("*.txt")):
        image_stem = txt_file.stem
        image_file = plu_folder / f"{image_stem}.png"

        if not image_file.exists():
            print(f"❌ Image not found for {image_file}")
            skipped += 1
            continue

        bbox = parse_bbox_file(txt_file)
        if bbox is None:
            print(f"❌ Invalid bbox for {txt_file}")
            skipped += 1
            continue

        # Output path (same subfolder)
        output_folder = output_root / plu_folder.name
        output_folder.mkdir(parents=True, exist_ok=True)

        save_path = output_folder / f"{image_stem}.png"
        outcome = crop_and_save_image(image_file, bbox, save_path)
        # Only an explicit False is treated as "nothing was written"; a helper
        # that returns None (no status) is still counted as saved.
        if outcome is False:
            skipped += 1
        else:
            count += 1

print(f"\n✅ Done. Cropped and saved {count} images. Skipped {skipped}.")


✅ Done. Cropped and saved 3244 images. Skipped 0.


In [18]:
# Smoke run on a small sample (first 4 folders, 1 image each). The paths come from
# the DATASET_PATH / OUTPUT_PATH constants instead of repeating their literals.
crop_and_save_bounding_boxes(dataset_path=DATASET_PATH, output_path=OUTPUT_PATH,
                             num_folders=4, num_files_per_folder=1)

✅ Saved: out_data/4011/4011-1000_cropped.png
✅ Saved: out_data/4015/4015-1027_cropped.png
✅ Saved: out_data/4088/4088-1002_cropped.png
✅ Saved: out_data/4196/4196-1010_cropped.png
