# In [1]:
import os
import shutil
import random
from math import ceil
from tqdm import tqdm

def _copy_files(file_names, src_dir, dst_dir):
    """Copy each named file from *src_dir* into *dst_dir*, keeping its name."""
    for file_name in file_names:
        shutil.copy(os.path.join(src_dir, file_name), os.path.join(dst_dir, file_name))


def create_train_test_datasets(original_dataset_dir, train_dataset_dir, test_dataset_dir, test_ratio=0.1, seed=None):
    """Split a class-per-folder dataset into train and test directory trees.

    Every immediate sub-directory of ``original_dataset_dir`` is treated as
    one class. For each class, a random subset of its files is copied to
    ``test_dataset_dir/<class>`` and all remaining files are copied to
    ``train_dataset_dir/<class>``. The source files are left untouched.

    Args:
        original_dataset_dir: Directory containing one sub-directory per class.
        train_dataset_dir: Destination root for the training split; created
            if missing.
        test_dataset_dir: Destination root for the test split; created if
            missing.
        test_ratio: Fraction of each class copied to the test split, between
            0 and 1 inclusive. The per-class count is rounded up, so any
            non-empty class contributes at least one test file when the
            ratio is greater than 0.
        seed: Optional seed. When given, the split is reproducible across
            runs; when ``None``, the module-level ``random`` generator is
            used, as before.

    Raises:
        ValueError: If ``test_ratio`` is outside the range [0, 1].
        FileNotFoundError: If ``original_dataset_dir`` does not exist.

    Note:
        Files already present in the destination directories are not removed.
        Re-running with a different random selection into the same
        destinations can therefore leave the same file in both splits.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    # A dedicated generator keeps a seeded split independent of (and harmless
    # to) the global random state; unseeded calls keep the original behavior.
    rng = random if seed is None else random.Random(seed)

    # List the source before creating anything so that a bad source path does
    # not leave empty output directories behind. Sorting makes the processing
    # order (and thus a seeded split) independent of the filesystem's order.
    class_folders = sorted(
        folder
        for folder in os.listdir(original_dataset_dir)
        if os.path.isdir(os.path.join(original_dataset_dir, folder))
    )

    # Ensure the training and test dataset directories exist
    os.makedirs(train_dataset_dir, exist_ok=True)
    os.makedirs(test_dataset_dir, exist_ok=True)

    # Iterate through each class folder with a progress bar
    for class_name in tqdm(class_folders, desc="Processing classes"):
        class_dir = os.path.join(original_dataset_dir, class_name)

        # Create corresponding class folders in the training and test dataset directories
        train_class_dir = os.path.join(train_dataset_dir, class_name)
        test_class_dir = os.path.join(test_dataset_dir, class_name)
        os.makedirs(train_class_dir, exist_ok=True)
        os.makedirs(test_class_dir, exist_ok=True)

        # List all regular files in the class directory (sorted for reproducibility)
        images = sorted(
            img for img in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, img))
        )

        # round() absorbs binary floating-point noise before ceil(): for
        # example 30 * 0.1 evaluates to 3.0000000000000004, which would
        # otherwise be rounded up to 4 instead of 3.
        num_test_images = min(len(images), ceil(round(len(images) * test_ratio, 9)))
        test_images = rng.sample(images, num_test_images)

        # Keep the original listing order for the training files
        test_image_set = set(test_images)
        train_images = [img for img in images if img not in test_image_set]

        _copy_files(test_images, class_dir, test_class_dir)
        _copy_files(train_images, class_dir, train_class_dir)

# Example usage: copy the augmented dataset into a train/validation tree and a
# held-out test tree, sending a test_ratio of 0.1 of each class to the test tree.
original_dataset_dir = 'datasets/data_aug_added'
train_dataset_dir = 'datasets/train_val_data'
test_dataset_dir = 'datasets/test_data'
create_train_test_datasets(
    original_dataset_dir=original_dataset_dir,
    train_dataset_dir=train_dataset_dir,
    test_dataset_dir=test_dataset_dir,
    test_ratio=0.1,
)

# Output:
# Processing classes: 100%|██████████| 38/38 [01:16<00:00,  2.01s/it]
