In [None]:
# Load the required packages
import os
import shutil
import random
import yaml
import cv2
from tqdm import tqdm

In [None]:
# Define "create_yolo_dataset", which takes the input path, output path, class names, the desired split fractions and a seed for reproducibility
def create_yolo_dataset(
    labelstudio_export_path,
    output_path,
    class_names,
    train_split=0.8,
    val_split=0.1,
    seed=42
):
    """Split a Label Studio YOLO export into train/val/test folders.

    Reads the images from ``<labelstudio_export_path>/images`` and the label
    files from ``<labelstudio_export_path>/labels``, shuffles the images with
    ``seed`` and copies every image (plus its ``.txt`` label file, when one
    exists) into ``<output_path>/images/<split>`` and
    ``<output_path>/labels/<split>``. Finally a ``dataset.yaml`` listing the
    split folders and the class names is written to ``output_path``.

    Args:
        labelstudio_export_path: Folder containing the ``images`` and
            ``labels`` sub-folders.
        output_path: Folder the split dataset is written to; created if
            it does not exist.
        class_names: Sequence of class names; the position in the sequence
            becomes the class index in ``dataset.yaml``.
        train_split: Fraction of the images used for training (0..1).
        val_split: Fraction of the images used for validation (0..1).
            Whatever is left after train and val becomes the test split.
        seed: Seed for the shuffle, so the same input gives the same split.

    Raises:
        ValueError: If a split fraction is outside [0, 1] or the two
            fractions add up to more than 1.
        FileNotFoundError: If the ``images`` folder does not exist.
    """
    # Validate the fractions before touching the disk: a negative fraction
    # would slice from the end of the list and a sum above 1 would silently
    # leave the later splits empty.
    if not (0 <= train_split <= 1 and 0 <= val_split <= 1):
        raise ValueError(
            "train_split and val_split must each be between 0 and 1, "
            f"got train_split={train_split}, val_split={val_split}"
        )
    # Small tolerance so that e.g. 0.7 + 0.3 is not rejected by float rounding
    if train_split + val_split > 1 + 1e-9:
        raise ValueError(
            "train_split + val_split must not exceed 1, "
            f"got {train_split} + {val_split}"
        )

    # Seed the module-level random generator so the shuffle is repeatable
    random.seed(seed)

    # The Label Studio export only contains these two folders
    images_path = os.path.join(labelstudio_export_path, 'images')
    labels_path = os.path.join(labelstudio_export_path, 'labels')

    # os.listdir returns entries in arbitrary order, so sort first; otherwise
    # the same seed can give a different split on another machine.
    # Sub-directories are skipped because they cannot be copied as files.
    image_files = sorted(
        entry for entry in os.listdir(images_path)
        if os.path.isfile(os.path.join(images_path, entry))
    )
    # Shuffle the images so they are in a random (but seeded) order
    random.shuffle(image_files)

    # Turn the fractions into list indices marking the end of each split
    total = len(image_files)
    train_end = int(total * train_split)
    val_end = train_end + int(total * val_split)

    # Train is everything up to train_end, val is train_end..val_end and
    # test is whatever remains
    splits = {
        'train': image_files[:train_end],
        'val': image_files[train_end:val_end],
        'test': image_files[val_end:],
    }

    # Make sure the images/<split> and labels/<split> folders exist
    for split in splits:
        os.makedirs(os.path.join(output_path, 'images', split), exist_ok=True)
        os.makedirs(os.path.join(output_path, 'labels', split), exist_ok=True)

    def copy_files(file_list, split):
        """Copy the given images and their label files into one split."""
        for img_file in file_list:
            # Copy the image to <output_path>/images/<split>/<file name>
            src_img = os.path.join(images_path, img_file)
            dst_img = os.path.join(output_path, 'images', split, img_file)
            shutil.copy2(src_img, dst_img)

            # The label file has the image's base name with a ".txt" extension
            label_file = os.path.splitext(img_file)[0] + '.txt'
            src_label = os.path.join(labels_path, label_file)
            dst_label = os.path.join(output_path, 'labels', split, label_file)
            # Images without a label file are still copied; just warn about it
            if os.path.exists(src_label):
                shutil.copy2(src_label, dst_label)
            else:
                print(f"Warning: Label not found for {img_file}")

    # Copy the images and labels to their new folders based on the split
    for split, files in splits.items():
        copy_files(files, split)

    # dataset.yaml holds the dataset root, the image folder of each split
    # (relative to the root) and the class index -> class name mapping
    dataset_yaml = {
        'path': os.path.abspath(output_path),
        'train': 'images/train',
        'val': 'images/val',
        'test': 'images/test',
        'names': {i: name for i, name in enumerate(class_names)}
    }

    with open(os.path.join(output_path, 'dataset.yaml'), 'w', encoding='utf-8') as f:
        yaml.dump(dataset_yaml, f, default_flow_style=False)

    # Print that the new dataset is done
    print("Dataset completed")


# Script entry point: point the splitter at the Label Studio export folder, name the
# output folder and the dataset classes, then build the dataset with an 80:10:10 split
if __name__ == "__main__":
    class_names = ["Carrot", "Cross"]
    labelstudio_export_path = "yolo_dataset"
    output_path = "training_data"

    create_yolo_dataset(
        labelstudio_export_path=labelstudio_export_path,
        output_path=output_path,
        class_names=class_names,
        train_split=0.8,
        val_split=0.1,
    )
