In [1]:
import math
import os
import shutil

from sklearn.model_selection import train_test_split

# Root folders of the three dataset splits
train_dir = 'dataset/train'
val_dir = 'dataset/val'
test_dir = 'dataset/test'

# Each sub-directory of the training folder counts as one class
classes = [entry.name for entry in filter(os.DirEntry.is_dir, os.scandir(train_dir))]
num_classes = len(classes)
print(f"Number of classes: {num_classes}")

Number of classes: 9


In [2]:
def split_data(train_dir, val_dir, test_dir, val_split_size=0.2, test_split_size=0.05,
               random_state=42, extensions=('.jpg', '.jpeg')):
    """Move a random share of each class's training images into val/test folders.

    Every sub-directory of ``train_dir`` is treated as one class. For each
    class, a hold-out share of its images is selected at random and *moved*
    (not copied) into ``val_dir/<class>`` and ``test_dir/<class>``; the
    remaining images stay in ``train_dir/<class>``.

    WARNING: the function mutates the dataset on disk and is not idempotent.
    Calling it again on the same folders splits off another share of whatever
    is still left in ``train_dir``.

    Parameters
    ----------
    train_dir : str
        Folder holding one sub-directory per class with the source images.
    val_dir, test_dir : str
        Destination roots; a ``<class>`` sub-directory is created in each.
    val_split_size : float
        Fraction of each class removed from training in total, i.e. the
        validation *and* test images together.
    test_split_size : float
        Fraction of each class that ends up in the test set. It is carved out
        of the hold-out share, so the validation set receives roughly
        ``val_split_size - test_split_size`` of the class.
    random_state : int, RandomState instance or None
        Forwarded to ``train_test_split``. The default fixed seed makes the
        split reproducible; pass ``None`` for a different split on every call.
    extensions : str or iterable of str
        File-name suffixes (matched case-insensitively) that count as images.

    Raises
    ------
    ValueError
        If the fractions do not satisfy
        ``0 <= test_split_size < val_split_size < 1``.
    """
    # Fail before touching any file instead of erroring half-way through the classes.
    if not 0 <= test_split_size < val_split_size < 1:
        raise ValueError(
            "Expected 0 <= test_split_size < val_split_size < 1, got "
            f"val_split_size={val_split_size}, test_split_size={test_split_size}"
        )

    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    # Discover the classes from the train_dir argument itself rather than from a
    # notebook-level variable, so the function honours the directory it is given.
    # Sorting makes the processing order independent of the filesystem.
    with os.scandir(train_dir) as entries:
        class_names = sorted(entry.name for entry in entries if entry.is_dir())

    for cls in class_names:
        cls_dir = os.path.join(train_dir, cls)
        # os.listdir order is arbitrary; sort so a fixed seed selects the same files.
        images = sorted(
            os.path.join(cls_dir, name)
            for name in os.listdir(cls_dir)
            if name.lower().endswith(suffixes)
        )

        # Check if there are images in the class directory
        if not images:
            print(f"No images found in class {cls}. Skipping this class.")
            continue

        # Work with explicit counts. ceil() mirrors how train_test_split sizes a
        # fractional test set; the clamps keep at least one image in training
        # and at least one in validation so tiny classes cannot raise.
        n_images = len(images)
        n_holdout = min(math.ceil(val_split_size * n_images), n_images - 1)
        if n_holdout < 1:
            print(f"Only {n_images} image(s) in class {cls}; too few to split. Skipping this class.")
            continue
        n_test = min(math.ceil(test_split_size / val_split_size * n_holdout), n_holdout - 1)

        # First split: separate the hold-out images from those that stay in training.
        _, holdout_imgs = train_test_split(images, test_size=n_holdout, random_state=random_state)

        # Second split: divide the hold-out images into validation and test.
        if n_test > 0:
            val_imgs, test_imgs = train_test_split(
                holdout_imgs, test_size=n_test, random_state=random_state
            )
        else:
            val_imgs, test_imgs = holdout_imgs, []

        # Create corresponding directories in val_dir and test_dir
        val_cls_dir = os.path.join(val_dir, cls)
        test_cls_dir = os.path.join(test_dir, cls)
        os.makedirs(val_cls_dir, exist_ok=True)
        os.makedirs(test_cls_dir, exist_ok=True)

        # Move the selected images out of the training folder
        for img in val_imgs:
            shutil.move(img, val_cls_dir)  # Use shutil.copy if you want to keep the original
        for img in test_imgs:
            shutil.move(img, test_cls_dir)

In [3]:
# Caution: this moves files out of train_dir, so running the cell again
# splits off a further share of the images that remain there.
split_data(train_dir=train_dir, val_dir=val_dir, test_dir=test_dir)