# Data cleaning


* Date:       March 26, 2023
1. Download the 3 small datasets (resized version) and combine them into 1 folder.
2. Split the dataset into an 80:10:10 ratio (train:valid:test).

----

In [2]:
import os
import random
import shutil

## Split the dataset to 80:10:10 ratio.
The `train` split contains 80% of the dataset, while the `valid` and `test` splits contain 10% each.

In [3]:
# Split the combined dataset into train/valid/test folders (80:10:10).
# Every image in `<data_path>/images` is copied, together with its label file
# from `<data_path>/labels`, into `<data_path>/<split>/images` and
# `<data_path>/<split>/labels`. The source folders are left untouched.

# Root folder of the combined dataset. The default is the original,
# machine-specific location; set the DATASET_PATH environment variable to run
# this notebook against a copy stored elsewhere.
data_path = os.environ.get(
    'DATASET_PATH',
    r"C:\Users\sheil\OneDrive\Documents\Craus\resized\resized_set",
)
images_path = os.path.join(data_path, 'images')
labels_path = os.path.join(data_path, 'labels')

# Destination roots for the three splits
train_path = os.path.join(data_path, 'train')
valid_path = os.path.join(data_path, 'valid')
test_path = os.path.join(data_path, 'test')

# Names of the sub-folders created inside every split
images_folder = 'images'
labels_folder = 'labels'

# Split ratios; the test split receives whatever is left after train and valid
train_ratio = 0.8
valid_ratio = 0.1
test_ratio = 0.1

# Fixed seed so that "Restart & Run All" reproduces exactly the same split.
# Without it, every re-run copies a *different* assignment on top of the
# previous one and the same image ends up in several splits.
split_seed = 42

if abs(train_ratio + valid_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError('train_ratio + valid_ratio + test_ratio must sum to 1')

# Fail before creating any folder if the source data is not where we expect it
for required_dir in (images_path, labels_path):
    if not os.path.isdir(required_dir):
        raise FileNotFoundError(f'Source folder not found: {required_dir}')

# Create <split>/images and <split>/labels (makedirs also creates <split>)
for split_path in (train_path, valid_path, test_path):
    for sub_folder in (images_folder, labels_folder):
        os.makedirs(os.path.join(split_path, sub_folder), exist_ok=True)

# Collect the image filenames. The extension check is case-insensitive so
# files such as `IMG_001.JPG` are not silently dropped, and the list is sorted
# because os.listdir() returns entries in arbitrary order.
image_filenames = sorted(
    f for f in os.listdir(images_path)
    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
)

# Every image must have a label file. Check up front so a missing label
# cannot abort the copy half-way and leave a partially written split.
missing_labels = [
    f for f in image_filenames
    if not os.path.isfile(os.path.join(labels_path, os.path.splitext(f)[0] + '.txt'))
]
if missing_labels:
    raise FileNotFoundError(
        f'{len(missing_labels)} image(s) have no label file in {labels_path}, '
        f'e.g. {missing_labels[:5]}'
    )

# Shuffle deterministically with a private generator (global `random` state
# is left alone)
random.Random(split_seed).shuffle(image_filenames)

# Index boundaries: [0, train_split) -> train, [train_split, valid_split) -> valid,
# [valid_split, end) -> test
train_split = int(len(image_filenames) * train_ratio)
valid_split = int(len(image_filenames) * (train_ratio + valid_ratio))

# Copy (not move) each image and its label into the split chosen by position.
# NOTE: files left in the split folders by an earlier run that used a
# different shuffle are not removed here; clear those folders first if needed.
for i, image_filename in enumerate(image_filenames):
    # Swap only the final extension; str.replace() would also rewrite any
    # earlier occurrence of the extension text inside the filename.
    label_filename = os.path.splitext(image_filename)[0] + '.txt'

    if i < train_split:
        split_path = train_path
    elif i < valid_split:
        split_path = valid_path
    else:
        split_path = test_path

    src_image_path = os.path.join(images_path, image_filename)
    src_label_path = os.path.join(labels_path, label_filename)
    dst_image_path = os.path.join(split_path, images_folder, image_filename)
    dst_label_path = os.path.join(split_path, labels_folder, label_filename)

    shutil.copy(src_image_path, dst_image_path)
    shutil.copy(src_label_path, dst_label_path)


In [4]:
# Report how many entries ended up in the `images` sub-folder of each split.

# Resolve the images sub-folder of every split (train, valid, test order)
train_images_path, valid_images_path, test_images_path = (
    os.path.join(split_root, 'images')
    for split_root in (train_path, valid_path, test_path)
)

# Count the directory entries of each images folder
train_size, valid_size, test_size = (
    len(os.listdir(folder))
    for folder in (train_images_path, valid_images_path, test_images_path)
)

# Summary table: one line per split, a separator, then the grand total
print(f'Training dataset   = {train_size} images')
print(f'Validation dataset = {valid_size} images')
print(f'Testing dataset    = {test_size} images')
print('—————————————————————————————————')
print(f'Total images       = {train_size + valid_size + test_size} images')

Training dataset   = 17541 images
Validation dataset = 2193 images
Testing dataset    = 2193 images
—————————————————————————————————
Total images       = 21927 images


## Class distribution
Show how many annotations each class has.
* Class 0 = `not_covered`
* Class 1 = `fully_covered`
* Class 2 = `partially_covered`


## Class balance report 
Show a graph describing how each class is represented in the dataset.