In [None]:
# Here I load the packages needed to run the code

import os
import shutil
import random
from pathlib import Path

In [None]:
# Locations inside the full (tiled) dataset: one images folder and one labels folder
# for each of the train, val and test splits.
source_dir = Path("tiled_dataset")

train_img_dir = source_dir.joinpath("images", "train")
train_lbl_dir = source_dir.joinpath("labels", "train")

val_img_dir = source_dir.joinpath("images", "val")
val_lbl_dir = source_dir.joinpath("labels", "val")

test_img_dir = source_dir.joinpath("images", "test")
test_lbl_dir = source_dir.joinpath("labels", "test")

# Fractions of the training set to keep, one subset is produced per entry.
# We used 11% because that brought the training data down to roughly the number of
# images we had before cropping. Add more fractions to the list to build several
# subsets in one run.
subsets = [0.11]

# Class name for each label value (label value -> name, numbered from 0 in this order)
class_names = dict(enumerate(("Carrot", "Cross")))

# Every training image in the full dataset. Only .jpg files are matched, so adjust
# the pattern if your images use a different file format.
all_train_images = [*train_img_dir.glob("*.jpg")]

# Build one reduced copy of the dataset per requested fraction (in our case only the 11% subset).
# Only the training split is subsampled; the validation and test splits are copied in full.

# Fixed seed for the sampling. Together with the sorted file list below, this makes the
# selection reproducible: re-running this cell on an unchanged source folder picks the same
# images again instead of adding a second, different random sample to an existing subset folder.
sample_seed = 42

# glob() gives no ordering guarantee, so sort before sampling to keep the seed meaningful
train_image_pool = sorted(all_train_images)

for pct in subsets:
    # Percentage used in the folder name, e.g. 0.11 -> "tiled_data11".
    # round() rather than int(): int() truncates, and a value such as 0.29 * 100
    # evaluates to 28.999999999999996, which would name the folder "tiled_data28".
    pct_int = round(pct * 100)
    subset_dir = Path(f"tiled_data{pct_int}")

    # Create a folder structure which matches the one from the original dataset
    # (images/<split> and labels/<split>), which is the layout the dataset.yaml below points to
    for split in ("train", "val", "test"):
        (subset_dir / "images" / split).mkdir(parents=True, exist_ok=True)
        (subset_dir / "labels" / split).mkdir(parents=True, exist_ok=True)

    # Number of training images to keep (rounded down)
    num_samples = int(len(train_image_pool) * pct)

    # Draw the sample from a private generator so the global random state is left untouched
    selected_images = random.Random(sample_seed).sample(train_image_pool, num_samples)

    for img in selected_images:
        # Copy the image into the new images/train folder
        shutil.copy(img, subset_dir / "images" / "train" / img.name)

        # The matching label file has the same stem as the image, with a ".txt" extension.
        # Images without a label file are copied on their own.
        label = train_lbl_dir / f"{img.stem}.txt"
        if label.exists():
            shutil.copy(label, subset_dir / "labels" / "train" / label.name)

    # Making a subset of the validation and test sets is irrelevant, so copy them in full
    full_copies = (
        (val_img_dir, "images", "val"),
        (val_lbl_dir, "labels", "val"),
        (test_img_dir, "images", "test"),
        (test_lbl_dir, "labels", "test"),
    )
    for src_dir, kind, split in full_copies:
        for file in src_dir.glob("*"):
            # glob("*") also returns sub-folders (e.g. ".ipynb_checkpoints"), which
            # shutil.copy cannot handle, so only regular files are copied
            if file.is_file():
                shutil.copy(file, subset_dir / kind / split / file.name)

    # Create dataset.yaml, which points to the subset folder and its split folders
    yaml_path = subset_dir / "dataset.yaml"
    with open(yaml_path, "w", encoding="utf-8") as f:
        f.write(f"path: {subset_dir.resolve()}\n")
        f.write("train: images/train\n")
        f.write("val: images/val\n")
        f.write("test: images/test\n")
        f.write("names:\n")

        # Add one "<label value>: <class name>" entry per class
        for idx, name in class_names.items():
            f.write(f"  {idx}: {name}\n")

# Prints when the code is done
print("Dataset created")