# Configuration

The following parameters are configurable:

| Parameter | Description |
| --------- | ----------- |
| `train_samples_per_class` | Maximum number of samples drawn per class from the training set |
| `val_samples_per_class` | Maximum number of samples drawn per class from the validation set |
| `dir_in` | Root directory of the full (large) input dataset |
| `dir_out` | Root directory where the sampled subset is written |

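Note: This script assumes the conventional folder structure shown below, where every label file is a `.txt` file with a matching `.jpg` image of the same base name: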
```
dataset (e.g. coco)
├── train
│   ├── labels
│   └── images
└── val
    ├── labels
    └── images
```

Note 2: Both the `train` and the `val` folders are sampled and copied to the output directory. The input dataset itself is only read, never modified.

In [None]:
# Root of the full input dataset and root of the sampled output dataset.
dir_in = "./dataset/coco"
dir_out = "./dataset/coco_sampling"

# Number of label files to draw per class for each split.
train_samples_per_class = 500
val_samples_per_class = 200

In [None]:
import os

def _list_label_files(label_folder):
    """Return the sorted names of the regular ``.txt`` files in *label_folder*.

    A plain ``os.listdir`` also returns sub-directories and stray files
    (for example a ``.ipynb_checkpoints`` directory or ``.DS_Store``). The
    later cells open every listed entry as a text label file and derive the
    image name by replacing the ``.txt`` suffix, so such entries would make
    them fail. Sorting gives a stable processing order across runs.

    Raises:
        FileNotFoundError: if *label_folder* does not exist.
    """
    with os.scandir(label_folder) as entries:
        return sorted(
            entry.name
            for entry in entries
            if entry.is_file() and entry.name.endswith(".txt")
        )


# Input side of the training split.
train_folder_in = os.path.join(dir_in, "train")
train_image_folder_in = os.path.join(train_folder_in, "images")
train_label_folder_in = os.path.join(train_folder_in, "labels")
train_label_files = _list_label_files(train_label_folder_in)

# Output side of the training split.
train_folder_out = os.path.join(dir_out, "train")
train_image_folder_out = os.path.join(train_folder_out, "images")
train_label_folder_out = os.path.join(train_folder_out, "labels")


# Input side of the validation split.
val_folder_in = os.path.join(dir_in, "val")
val_image_folder_in = os.path.join(val_folder_in, "images")
val_label_folder_in = os.path.join(val_folder_in, "labels")
val_label_files = _list_label_files(val_label_folder_in)

# Output side of the validation split.
val_folder_out = os.path.join(dir_out, "val")
val_image_folder_out = os.path.join(val_folder_out, "images")
val_label_folder_out = os.path.join(val_folder_out, "labels")

# Build Files Dictionary

**Initialize dictionary variable**

This code attempts to load the dictionary variable from the file `sampling_dict.pckl`. If the file is not found (or is empty), it initializes a blank dictionary, meaning you have to generate a new one using the code block below. The dictionary variable acts as a cache and is used in the random sampling process.

In [None]:
import pickle

# Tells the generation cell whether a cached dictionary was loaded.
dict_found = False

# NOTE(review): unpickling can execute arbitrary code. Only load a
# `sampling_dict.pckl` that was written by this notebook itself.
try:
    # `with` closes the handle even when unpickling fails (e.g. EOFError on
    # an empty or truncated cache file).
    with open("sampling_dict.pckl", "rb") as f:
        class_files = pickle.load(f)
except (FileNotFoundError, EOFError):
    # No usable cache: start from an empty per-split mapping of
    # class id -> set of label file names.
    class_files = {"train": {}, "val": {}}
    print("Dictionary variable has not been loaded from disk. Manual generation is required.")
else:
    print("Dictionary variable has been loaded from disk.")
    dict_found = True

**Generate files dictionary and save dictionary variable into a file**

Run this if the stored dictionary variable is not available for use. Note that generation is skipped whenever a stored dictionary was loaded by the previous cell; if you made changes to the dataset files, or simply want to generate a fresh dictionary, delete `sampling_dict.pckl` and re-run the previous cell first. This process might take a while depending on the number of files. When everything is done, the data file is created on disk. As mentioned, this acts as a cache so that you don't have to re-read every label file when you re-generate random samples.

In [None]:
from tqdm import tqdm

def _index_label_files(label_folder, label_filenames, index):
    """Record, for every class id, which label files mention that class.

    The first whitespace-separated token of each non-blank line is parsed as
    the integer class id (presumably YOLO-style label lines — confirm against
    the dataset).

    Args:
        label_folder: Directory that contains the label files.
        label_filenames: Names of the label files inside ``label_folder``.
        index: Dict mapping class id to a set of file names; updated in place.

    Raises:
        ValueError: if the first token of a non-blank line is not an integer.
    """
    for filename in tqdm(label_filenames):
        filepath = os.path.join(label_folder, filename)
        # `with` closes the file even if a malformed line raises.
        with open(filepath, "r") as label_file:
            for line in label_file:
                # split() without arguments tolerates tabs, repeated or
                # leading spaces, and yields [] for blank lines.
                fields = line.split()
                if not fields:
                    continue
                index.setdefault(int(fields[0]), set()).add(filename)


if dict_found:
    print("Dictionary variable generation is not required.")

else:
    print("Generating dictionary variable.")
    _index_label_files(train_label_folder_in, train_label_files, class_files["train"])
    _index_label_files(val_label_folder_in, val_label_files, class_files["val"])

    # Persist the result so later runs can skip re-reading the label files.
    with open("sampling_dict.pckl", "wb") as f:
        pickle.dump(class_files, f)

    print("Dictionary variable has been saved to disk.")

# Random Sampling

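Note: The number of output samples might be less than `samples_per_class * number_of_classes`, because one label file can contain more than one class (it is then picked only once), and because a class with fewer files than requested contributes all of its files.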

In [None]:
import random

def _draw_label_files(files_by_class, per_class_quota):
    """Return the union of the label files drawn for every class.

    For each class, ``per_class_quota`` files are drawn at random; when the
    quota is outside ``0..len(files)`` (the cases in which ``random.sample``
    raises ``ValueError``), all files of that class are taken instead.
    """
    chosen = set()
    for candidates in files_by_class.values():
        pool = list(candidates)
        if 0 <= per_class_quota <= len(pool):
            picked = random.sample(pool, per_class_quota)
        else:
            picked = pool
        # A file listed under several classes ends up in the set only once.
        chosen.update(picked)
    return chosen


train_sample_labels = _draw_label_files(class_files["train"], train_samples_per_class)
print(f"Successfully picked {len(train_sample_labels)} train random samples.")

val_sample_labels = _draw_label_files(class_files["val"], val_samples_per_class)
print(f"Successfully picked {len(val_sample_labels)} val random samples.")

# Copy Files to the Destination Directory

In [None]:
import shutil
from tqdm import tqdm
import datetime

def _copy_split(label_names, label_dir_in, label_dir_out, image_dir_in, image_dir_out):
    """Copy each sampled label file and its same-named ``.jpg`` image.

    The image name is the label name with a trailing ``.txt`` replaced by
    ``.jpg``.
    """
    for label_name in tqdm(label_names):
        shutil.copyfile(
            os.path.join(label_dir_in, label_name),
            os.path.join(label_dir_out, label_name),
        )

        image_name = label_name.removesuffix(".txt") + ".jpg"
        shutil.copyfile(
            os.path.join(image_dir_in, image_name),
            os.path.join(image_dir_out, image_name),
        )


# Make sure every destination directory exists before copying.
for out_dir in (
    dir_out,
    train_label_folder_out,
    train_image_folder_out,
    val_image_folder_out,
    val_label_folder_out,
):
    os.makedirs(out_dir, exist_ok=True)

_copy_split(
    train_sample_labels,
    train_label_folder_in,
    train_label_folder_out,
    train_image_folder_in,
    train_image_folder_out,
)
timestamp = f"{datetime.datetime.now():%Y-%m-%d %H:%M:%S}"
print(f"[{timestamp}] Done copying train random samples from {train_folder_in} to {train_folder_out}.")

_copy_split(
    val_sample_labels,
    val_label_folder_in,
    val_label_folder_out,
    val_image_folder_in,
    val_image_folder_out,
)
timestamp = f"{datetime.datetime.now():%Y-%m-%d %H:%M:%S}"
print(f"[{timestamp}] Done copying val random samples from {val_folder_in} to {val_folder_out}.")