This split pools all images from every folder, shuffles them, and divides them 70/15/15 into train/val/test without regard to class (not a good split).

In [None]:
import os
import shutil
import random

# ==========================
# CONFIG
# ==========================
base_dataset_path = r"Dataset_Repository"  # Original dataset
output_path = r"YOLO_split"       # Split dataset
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15
image_extensions = [".jpg", ".jpeg", ".png"]


# ==========================
# COLLECT ALL DATA
# ==========================
def _collect_pairs(dataset_root, extensions, skip_dir=None):
    """Return a sorted list of (image_path, label_path_or_None) under dataset_root.

    The label of an image is the ``.txt`` file with the same stem in the same
    folder; ``None`` is stored when no such file exists.

    Args:
        dataset_root: Folder that is walked recursively.
        extensions: Image extensions (with leading dot), matched
            case-insensitively.
        skip_dir: Optional folder that is not descended into. Used to keep a
            split folder nested inside the dataset from being re-collected.
    """
    wanted = {ext.lower() for ext in extensions}
    skip_abs = os.path.abspath(skip_dir) if skip_dir else None
    pairs = []
    for root, dirs, files in os.walk(dataset_root):
        if skip_abs is not None:
            # Pruning ``dirs`` in place stops os.walk from entering them.
            dirs[:] = [
                d for d in dirs
                if os.path.abspath(os.path.join(root, d)) != skip_abs
            ]
        for name in files:
            stem, ext = os.path.splitext(name)
            if ext.lower() not in wanted:
                continue
            label_path = os.path.join(root, stem + ".txt")
            pairs.append((
                os.path.join(root, name),
                label_path if os.path.isfile(label_path) else None,
            ))
    # os.walk order depends on the filesystem; sorting makes a seeded shuffle
    # reproducible across machines.
    pairs.sort(key=lambda pair: pair[0])
    return pairs


def _unique_stem(stem, used):
    """Return ``stem``, or ``stem_<n>`` if ``stem`` is already in ``used``.

    The returned name is added to ``used``.
    """
    candidate = stem
    suffix = 1
    while candidate in used:
        candidate = "{}_{}".format(stem, suffix)
        suffix += 1
    used.add(candidate)
    return candidate


# ==========================
# SHUFFLE, SPLIT AND COPY
# ==========================
def split_dataset(dataset_root, destination, train_fraction, val_fraction,
                  test_fraction, extensions, seed=None):
    """Pool every image under dataset_root and copy it into train/val/test.

    Files are copied to ``destination/images/<split>/`` and, when a label
    exists, ``destination/labels/<split>/``. All images are pooled regardless
    of the folder they come from, so the split is not stratified by class.

    Images from different folders that share a file name would overwrite each
    other in the flat output folders; such collisions get a ``_<n>`` suffix,
    applied identically to the image and its label so the pair stays matched.

    Existing files in ``destination`` are not removed, so re-running into the
    same folder with a different shuffle can leave stale copies behind.

    Args:
        dataset_root: Folder searched recursively for images.
        destination: Folder that receives the split (created if missing).
        train_fraction: Share of images copied to ``train``.
        val_fraction: Share of images copied to ``val``.
        test_fraction: Share of images copied to ``test``; it receives
            whatever is left after the train and val cut-offs.
        extensions: Image extensions to collect, e.g. ``[".jpg", ".png"]``.
        seed: Optional seed for a reproducible shuffle. With ``None`` the
            module-level ``random`` generator is used.

    Returns:
        Dict mapping ``"train"``, ``"val"`` and ``"test"`` to the number of
        images copied into that split (all zeros if no image was found).

    Raises:
        ValueError: If a fraction is negative or the three do not sum to 1.
    """
    fractions = (train_fraction, val_fraction, test_fraction)
    if any(f < 0 for f in fractions) or abs(sum(fractions) - 1.0) > 1e-9:
        raise ValueError(
            "train/val/test ratios must be non-negative and sum to 1, "
            "got {!r}".format(fractions)
        )

    # Create output folders
    for folder_type in ("images", "labels"):
        for split in ("train", "val", "test"):
            os.makedirs(os.path.join(destination, folder_type, split), exist_ok=True)

    pairs = _collect_pairs(dataset_root, extensions, skip_dir=destination)

    # Output names are assigned in sorted order (before shuffling) so they do
    # not depend on the shuffle.
    used_stems = set()
    samples = []
    for img_src, lbl_src in pairs:
        stem = os.path.splitext(os.path.basename(img_src))[0]
        samples.append((img_src, lbl_src, _unique_stem(stem, used_stems)))

    if seed is None:
        random.shuffle(samples)
    else:
        random.Random(seed).shuffle(samples)

    total = len(samples)
    train_end = int(total * train_fraction)
    val_end = int(total * (train_fraction + val_fraction))
    partitions = {
        "train": samples[:train_end],
        "val": samples[train_end:val_end],
        "test": samples[val_end:],
    }

    for split_name, items in partitions.items():
        img_dir = os.path.join(destination, "images", split_name)
        lbl_dir = os.path.join(destination, "labels", split_name)
        for img_src, lbl_src, out_stem in items:
            ext = os.path.splitext(img_src)[1]
            shutil.copy2(img_src, os.path.join(img_dir, out_stem + ext))
            if lbl_src is not None:
                shutil.copy2(lbl_src, os.path.join(lbl_dir, out_stem + ".txt"))

    return {name: len(items) for name, items in partitions.items()}


# A notebook cell (or a script run directly) has __name__ == "__main__", so
# the split still runs when this cell is executed.
if __name__ == "__main__":
    split_counts = split_dataset(
        base_dataset_path,
        output_path,
        train_ratio,
        val_ratio,
        test_ratio,
        image_extensions,
    )
    if sum(split_counts.values()) == 0:
        print("No images found under {!r}; nothing was copied.".format(base_dataset_path))
    else:
        print("Dataset split into YOLOv8 format successfully!")

Class-wise (stratified) split: each class folder is divided 70/15/15 into train/val/test on its own, so every class contributes proportionally to each split.

In [None]:
import os
import shutil
import random

# ==========================
# CONFIG
# ==========================
base_dataset_path = r"YOLO_Dataset_Fahad2"  # Original dataset
output_path = r"YOLO_Dataset_Split2"       # Split dataset
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15
image_extensions = [".jpg", ".jpeg", ".png"]

# Optional: set random seed for reproducibility
random.seed(42)


def _list_class_samples(class_dir, extensions):
    """Return (image_path, label_path_or_None) pairs found directly in class_dir.

    The listing is sorted because os.listdir returns entries in arbitrary
    order; without sorting, a seeded shuffle is not reproducible across
    machines. The label of an image is the ``.txt`` file with the same stem
    in the same folder.
    """
    wanted = {ext.lower() for ext in extensions}
    samples = []
    for name in sorted(os.listdir(class_dir)):
        stem, ext = os.path.splitext(name)
        if ext.lower() not in wanted:
            continue
        label_path = os.path.join(class_dir, stem + ".txt")
        samples.append((
            os.path.join(class_dir, name),
            label_path if os.path.isfile(label_path) else None,
        ))
    return samples


# ==========================
# STRATIFIED SPLIT BY CLASS
# ==========================
def split_dataset_by_class(dataset_root, destination, train_fraction,
                           val_fraction, test_fraction, extensions, seed=None):
    """Split every class folder of dataset_root into train/val/test on its own.

    Each direct subfolder of ``dataset_root`` is treated as one class. Its
    images are shuffled and copied to
    ``destination/images/<split>/<class>/``; labels, when present, go to
    ``destination/labels/<split>/<class>/``.

    Counts are truncated with ``int()``, so the remainder goes to ``test``;
    very small classes can therefore end up with an empty train or val split.

    Args:
        dataset_root: Folder whose direct subfolders are the classes.
        destination: Folder that receives the split (created if missing).
        train_fraction: Share of each class copied to ``train``.
        val_fraction: Share of each class copied to ``val``.
        test_fraction: Share of each class copied to ``test``; it receives
            whatever is left after train and val.
        extensions: Image extensions to collect, e.g. ``[".jpg", ".png"]``.
        seed: Optional seed for a reproducible shuffle. With ``None`` the
            module-level ``random`` generator is used.

    Returns:
        Dict mapping each class name that had images to a dict of
        ``{"train": n, "val": n, "test": n}`` image counts.

    Raises:
        ValueError: If a fraction is negative or the three do not sum to 1.
        FileNotFoundError: If ``dataset_root`` does not exist (from os.listdir).
    """
    fractions = (train_fraction, val_fraction, test_fraction)
    if any(f < 0 for f in fractions) or abs(sum(fractions) - 1.0) > 1e-9:
        raise ValueError(
            "train/val/test ratios must be non-negative and sum to 1, "
            "got {!r}".format(fractions)
        )

    rng = random.Random(seed) if seed is not None else random

    # Create output folders
    for folder_type in ("images", "labels"):
        for split in ("train", "val", "test"):
            os.makedirs(os.path.join(destination, folder_type, split), exist_ok=True)

    counts = {}
    # Sorted so classes consume the random stream in a fixed order.
    for class_name in sorted(os.listdir(dataset_root)):
        class_dir = os.path.join(dataset_root, class_name)
        if not os.path.isdir(class_dir):
            continue

        samples = _list_class_samples(class_dir, extensions)
        # Skip empty folders
        if not samples:
            continue

        rng.shuffle(samples)
        train_count = int(len(samples) * train_fraction)
        val_count = int(len(samples) * val_fraction)
        partitions = {
            "train": samples[:train_count],
            "val": samples[train_count:train_count + val_count],
            "test": samples[train_count + val_count:],  # absorbs rounding
        }

        for split_name, items in partitions.items():
            if not items:
                continue
            # Keep class subfolder structure; folders are created once per
            # split instead of once per copied file.
            img_dst_dir = os.path.join(destination, "images", split_name, class_name)
            lbl_dst_dir = os.path.join(destination, "labels", split_name, class_name)
            os.makedirs(img_dst_dir, exist_ok=True)
            os.makedirs(lbl_dst_dir, exist_ok=True)

            for img_src, lbl_src in items:
                shutil.copy2(img_src, os.path.join(img_dst_dir, os.path.basename(img_src)))
                if lbl_src is not None:
                    shutil.copy2(lbl_src, os.path.join(lbl_dst_dir, os.path.basename(lbl_src)))

        counts[class_name] = {name: len(items) for name, items in partitions.items()}

    return counts


# A notebook cell (or a script run directly) has __name__ == "__main__", so
# the split still runs when this cell is executed.
if __name__ == "__main__":
    class_counts = split_dataset_by_class(
        base_dataset_path,
        output_path,
        train_ratio,
        val_ratio,
        test_ratio,
        image_extensions,
    )
    if not class_counts:
        print("No class folders with images found under {!r}; nothing was copied.".format(base_dataset_path))
    else:
        print("Dataset split into YOLOv8 format successfully with stratified randomness per class!")
