In [1]:
# datasets
# rugby-balls: https://universe.roboflow.com/hotchickenpie/ruby-ball-detection/browse?queryText=&pageSize=50&startingIndex=0&browseQuery=true
# mini-rugby balls: https://universe.roboflow.com/muhie-kk0uz/rugby-balls-zcnjx
# ping-pong balls: https://universe.roboflow.com/pingpong-ojuhj/ping-pong-detection-0guzq/images/8SLEQwQoyRiRvY70azBS

In [3]:
from roboflow import Roboflow

In [6]:
# Read the Roboflow API key from the environment so the secret is never stored
# in the notebook (a missing variable raises KeyError before any network call).
import os

rf = Roboflow(api_key=os.environ["ROBOFLOW_API_KEY"])
# Rugby-ball dataset: workspace "hotchickenpie", project "ruby-ball-detection", version 3.
project = rf.workspace("hotchickenpie").project("ruby-ball-detection")
version = project.version(3)
# Fetch the dataset in the "yolov11" export format.
dataset = version.download("yolov11")

loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in Ruby-Ball-Detection-3 to yolov11:: 100%|██████████| 31077/31077 [00:02<00:00, 13302.07it/s]





Extracting Dataset Version Zip to Ruby-Ball-Detection-3 in yolov11:: 100%|██████████| 1304/1304 [00:00<00:00, 3064.93it/s]


In [5]:
# Read the Roboflow API key from the environment so the secret is never stored
# in the notebook (a missing variable raises KeyError before any network call).
import os

rf = Roboflow(api_key=os.environ["ROBOFLOW_API_KEY"])
# Ping-pong dataset: workspace "pingpong-ojuhj", project "ping-pong-detection-0guzq", version 3.
project = rf.workspace("pingpong-ojuhj").project("ping-pong-detection-0guzq")
version = project.version(3)
# Fetch the dataset in the "yolov11" export format.
dataset = version.download("yolov11")

loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in Ping-Pong-Detection-3 to yolov11:: 100%|██████████| 1461722/1461722 [01:07<00:00, 21693.96it/s]





Extracting Dataset Version Zip to Ping-Pong-Detection-3 in yolov11:: 100%|██████████| 49688/49688 [00:19<00:00, 2581.22it/s]


# Merge Datasets

In [2]:
!pip install roboflow

Collecting roboflow
  Downloading roboflow-1.1.50-py3-none-any.whl.metadata (9.7 kB)
Collecting idna==3.7 (from roboflow)
  Using cached idna-3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting opencv-python-headless==4.10.0.84 (from roboflow)
  Using cached opencv_python_headless-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting python-dotenv (from roboflow)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting requests-toolbelt (from roboflow)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting filetype (from roboflow)
  Using cached filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading roboflow-1.1.50-py3-none-any.whl (81 kB)
Using cached idna-3.7-py3-none-any.whl (66 kB)
Using cached opencv_python_headless-4.10.0.84-cp37-abi3-win_amd64.whl (38.8 MB)
Using cached filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Downloading requests_toolbelt-1.0.0-py2.

In [1]:
# Import necessary libraries
import os
import shutil
import glob
import random
import logging
import yaml
from roboflow import Roboflow

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# Parameters
# The Roboflow API key is a secret: read it from the environment instead of
# hard-coding it in the notebook, where it would leak through sharing or version control.
API_KEY = os.environ.get("ROBOFLOW_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the ROBOFLOW_API_KEY environment variable to your Roboflow API key "
        "before running this cell."
    )

# Folder that holds the downloaded datasets and the merged output.
# Override with the DATASET_BASE_DIR environment variable to run on another machine;
# the default is the location this notebook was originally run from.
BASE_DIR = os.environ.get(
    "DATASET_BASE_DIR",
    r"C:\Users\Marlo\code\QMUL-Societies\Electronics\Unibots\Computer_Vision\dataset-creation",
)

# Define datasets with unique variable names
datasets_info = [
    {
        "workspace": "hotchickenpie",
        "project": "ruby-ball-detection",
        "version": 3,
        "download_format": "yolov11",
        "local_path": os.path.join(BASE_DIR, "Ruby-Ball-Detection-3")
    },
    {
        "workspace": "pingpong-ojuhj",
        "project": "ping-pong-detection-0guzq",
        "version": 3,
        "download_format": "yolov11",
        "local_path": os.path.join(BASE_DIR, "Ping-Pong-Detection-3")
    }
]

# Initialize Roboflow
rf = Roboflow(api_key=API_KEY)

# Download each dataset into its configured local folder
for dataset in datasets_info:
    project = rf.workspace(dataset["workspace"]).project(dataset["project"])
    version = project.version(dataset["version"])
    logging.info(f"Downloading dataset '{dataset['project']}' version {dataset['version']}...")
    version.download(dataset["download_format"], location=dataset["local_path"])
    logging.info(f"Downloaded to '{dataset['local_path']}'")

# Define source datasets paths
SOURCE_DATASETS = [dataset["local_path"] for dataset in datasets_info]

# Maximum number of images taken from EACH source dataset for every split
# (550 train + 50 valid + 34 test = up to 634 images per dataset).
SPLITS = {
    "train": 550,
    "valid": 50,
    "test": 34
}

OUTPUT_DIR = os.path.join(BASE_DIR, "full-data")
OUTPUT_IMAGES_DIR = os.path.join(OUTPUT_DIR, "images")
OUTPUT_LABELS_DIR = os.path.join(OUTPUT_DIR, "labels")
OUTPUT_DATA_YAML = os.path.join(OUTPUT_DIR, "data.yaml")

# Create output directories: <OUTPUT_DIR>/images/<split> and <OUTPUT_DIR>/labels/<split>
for split in SPLITS.keys():
    os.makedirs(os.path.join(OUTPUT_IMAGES_DIR, split), exist_ok=True)
    os.makedirs(os.path.join(OUTPUT_LABELS_DIR, split), exist_ok=True)

# Function to read data.yaml
def read_data_yaml(dataset_path):
    """Load the ``data.yaml`` located directly inside ``dataset_path``.

    Returns the parsed YAML content, or ``None`` (after logging an error)
    when the file is not present.
    """
    yaml_file = os.path.join(dataset_path, "data.yaml")
    if os.path.exists(yaml_file):
        with open(yaml_file, 'r') as handle:
            return yaml.safe_load(handle)

    # Missing file: report it and let the caller decide how to proceed.
    logging.error(f"data.yaml not found in {dataset_path}")
    return None

# Function to merge class names
def merge_classes(datasets_data):
    """Build one unified class list from several parsed ``data.yaml`` contents.

    Args:
        datasets_data: Parsed data.yaml contents, one entry per dataset, in
            dataset order. ``None`` entries (dataset without a data.yaml) are
            skipped. Each entry's ``'names'`` may be a list of class names or
            a ``{class_index: class_name}`` mapping.

    Returns:
        A tuple ``(merged_names, class_map)`` where ``merged_names`` lists every
        distinct class name in first-seen order and ``class_map`` maps
        ``dataset_idx -> {class_name: index into merged_names}``. Skipped
        datasets have no entry in ``class_map``.

    Raises:
        KeyError: if a non-None entry has no ``'names'`` key.
    """
    merged_names = []
    name_to_index = {}  # class name -> position in merged_names (avoids repeated list scans)
    class_map = {}  # {dataset_idx: {original_class: merged_class_index}}

    for idx, data in enumerate(datasets_data):
        if data is None:
            continue  # Skip datasets without data.yaml
        names = data['names']
        if isinstance(names, dict):
            # Mapping-style names: iterate the class names in class-index order,
            # not the integer keys.
            names = [names[key] for key in sorted(names)]
        per_dataset = class_map[idx] = {}
        for cls in names:
            if cls not in name_to_index:
                name_to_index[cls] = len(merged_names)
                merged_names.append(cls)
            per_dataset[cls] = name_to_index[cls]

    return merged_names, class_map

# Parse each source dataset's data.yaml (None when missing) and unify the class lists
datasets_data = list(map(read_data_yaml, SOURCE_DATASETS))
merged_class_names, class_maps = merge_classes(datasets_data)
merged_nc = len(merged_class_names)

# Describe the merged dataset: per-split image folders, class count and class names
merged_data_yaml = dict(
    train=os.path.join(OUTPUT_IMAGES_DIR, 'train'),
    val=os.path.join(OUTPUT_IMAGES_DIR, 'valid'),
    test=os.path.join(OUTPUT_IMAGES_DIR, 'test'),
    nc=merged_nc,
    names=merged_class_names,
)

# Persist the merged description next to the merged images/labels
with open(OUTPUT_DATA_YAML, 'w') as f:
    yaml.dump(merged_data_yaml, f)

logging.info(f"Merged data.yaml created at {OUTPUT_DATA_YAML}")

def _remap_label_line(line, names, class_map, src_label):
    """Return one 5-field label line with its class index remapped, or None to drop it."""
    parts = line.strip().split()
    if len(parts) != 5:
        logging.warning(f"Unexpected label format in '{src_label}'. Skipping line.")
        return None
    original_cls, x_center, y_center, width, height = parts
    try:
        original_cls = int(original_cls)
    except ValueError:
        logging.warning(f"Invalid class index in '{src_label}': '{original_cls}'. Skipping line.")
        return None

    if names is None:
        # No class names available for this dataset: keep the index untouched.
        remapped_cls = original_cls
    else:
        try:
            if original_cls < 0:
                # A negative index would silently wrap around to the last class.
                raise IndexError(original_cls)
            cls_name = names[original_cls]
        except (IndexError, KeyError):
            logging.warning(f"Class index {original_cls} in '{src_label}' is not defined in data.yaml. Skipping line.")
            return None
        remapped_cls = class_map.get(cls_name, original_cls)
    return f"{remapped_cls} {x_center} {y_center} {width} {height}\n"


def select_and_copy(dataset_idx, dataset_path, split, num_images, split_type):
    """Copy up to ``num_images`` images and remapped labels from one dataset split.

    Images (``*.jpg`` / ``*.png``) are read from ``<dataset_path>/<split_type>/images``,
    sorted by path, and the first ``num_images`` are selected. Each selected image
    that has a matching ``.txt`` label is copied to ``OUTPUT_IMAGES_DIR/<split>``;
    its label is rewritten to ``OUTPUT_LABELS_DIR/<split>`` with class indices
    translated through ``class_maps``. Files with the same name already in the
    output are overwritten.

    Args:
        dataset_idx: Position of the dataset in ``datasets_data`` / ``class_maps``.
        dataset_path: Root folder of the source dataset.
        split: Name of the destination split folder.
        num_images: Maximum number of images to select.
        split_type: Name of the source split folder inside ``dataset_path``.

    Returns:
        None. Missing folders, missing labels and malformed label lines are
        logged as warnings and skipped.
    """
    split_path = os.path.join(dataset_path, split_type)

    if not os.path.exists(split_path):
        logging.warning(f"Split folder '{split_type}' does not exist in '{dataset_path}'. Skipping.")
        return

    images_path = os.path.join(split_path, "images")
    labels_path = os.path.join(split_path, "labels")

    if not os.path.exists(images_path):
        logging.warning(f"'images' folder does not exist in '{split_path}'. Skipping.")
        return
    if not os.path.exists(labels_path):
        logging.warning(f"'labels' folder does not exist in '{split_path}'. Skipping.")
        return

    # Get all .jpg and .png images
    image_files = glob.glob(os.path.join(images_path, "*.jpg")) + glob.glob(os.path.join(images_path, "*.png"))

    if not image_files:
        logging.warning(f"No image files found in '{images_path}'.")
        return

    # Sort image files to take the first N (deterministic selection)
    image_files.sort()
    selected_image_files = image_files[:num_images]

    # Resolve the class-name lookup once per call instead of once per label line.
    dataset_meta = datasets_data[dataset_idx]
    if dataset_meta is None:
        logging.warning(f"No data.yaml found for dataset index {dataset_idx}. Cannot remap classes.")
        names, class_map = None, {}
    else:
        names = dataset_meta['names']
        class_map = class_maps[dataset_idx]

    copied_count = 0
    for img_file in selected_image_files:
        basename = os.path.splitext(os.path.basename(img_file))[0]
        src_label = os.path.join(labels_path, f"{basename}.txt")

        if not os.path.exists(src_label):
            logging.warning(f"Label file '{src_label}' does not exist. Skipping image '{basename}'.")
            continue

        # Destination paths
        dest_image = os.path.join(OUTPUT_IMAGES_DIR, split, os.path.basename(img_file))
        dest_label = os.path.join(OUTPUT_LABELS_DIR, split, os.path.basename(src_label))

        # Copy image
        shutil.copy2(img_file, dest_image)

        # Read label, remap class if necessary; dropped lines come back as None
        with open(src_label, 'r') as f:
            remapped_lines = [
                remapped
                for remapped in (_remap_label_line(line, names, class_map, src_label) for line in f)
                if remapped is not None
            ]

        # Write remapped label
        with open(dest_label, 'w') as f:
            f.writelines(remapped_lines)

        copied_count += 1

    # Report what was actually copied, not merely how many images were selected.
    logging.info(f"Copied {copied_count} images and labels from '{dataset_path}' to split '{split_type}'")

# Seed the global `random` generator for reproducibility.
# NOTE(review): select_and_copy takes the first N files of a sorted listing, so the
# selection itself is already deterministic and does not draw on this seed.
random.seed(42)

# Combine datasets with the new split limits: for every split, copy up to
# `num_images` images (with their labels) from each source dataset.
# The split name is passed twice: once as the destination split and once as the
# source split folder inside the dataset.
for split, num_images in SPLITS.items():
    for idx, dataset in enumerate(SOURCE_DATASETS):
        select_and_copy(idx, dataset, split, num_images, split)

logging.info("Dataset combination complete.")

loading Roboflow workspace...
loading Roboflow project...


INFO: Downloading dataset 'ruby-ball-detection' version 3...
INFO: Downloaded to 'C:\Users\Marlo\code\QMUL-Societies\Electronics\Unibots\Computer_Vision\dataset-creation\Ruby-Ball-Detection-3'


loading Roboflow workspace...
loading Roboflow project...


INFO: Downloading dataset 'ping-pong-detection-0guzq' version 3...
INFO: Downloaded to 'C:\Users\Marlo\code\QMUL-Societies\Electronics\Unibots\Computer_Vision\dataset-creation\Ping-Pong-Detection-3'
INFO: Merged data.yaml created at C:\Users\Marlo\code\QMUL-Societies\Electronics\Unibots\Computer_Vision\dataset-creation\full-data\data.yaml
INFO: Copied 550 images and labels from 'C:\Users\Marlo\code\QMUL-Societies\Electronics\Unibots\Computer_Vision\dataset-creation\Ruby-Ball-Detection-3' to split 'train'
INFO: Copied 550 images and labels from 'C:\Users\Marlo\code\QMUL-Societies\Electronics\Unibots\Computer_Vision\dataset-creation\Ping-Pong-Detection-3' to split 'train'
INFO: Copied 50 images and labels from 'C:\Users\Marlo\code\QMUL-Societies\Electronics\Unibots\Computer_Vision\dataset-creation\Ruby-Ball-Detection-3' to split 'valid'
INFO: Copied 50 images and labels from 'C:\Users\Marlo\code\QMUL-Societies\Electronics\Unibots\Computer_Vision\dataset-creation\Ping-Pong-Detection-3' to