# **Dataset Cleaning, Augmentation, and Combination for the Chosen Roboflow Datasets**

## First Section: Environment Setup

In the below two modules of code, the GDrive is mounted and the dataset file in my GDrive is set to the current directory.

In [None]:
# NOTE: Unified class ids used throughout this notebook:
#       0 = headset, 1 = mouse, 2 = keyboard, 3 = PC
# Step 1: Mount Google Drive under /content/drive so the datasets stored
# there can be reached from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Step 2: Make the Drive folder that holds the datasets the current working
# directory, so the relative 'dataN/...' paths used in later cells resolve
# against it.
import os
os.chdir('/content/drive/MyDrive/create_data')

# **Dataset # 6**

## Updating label files, deleting corrupt label files, and deleting corresponding images without label files.

The initial paths point to the label directories for train, val, and test datasets for Dataset 6. These label files are processed to standardise class labels.

The update labels function processes each label file in the dataset directories, reads the labels, and updates them based on the predefined mappings:

Class 4 is changed to 0.
Class 6 is changed to 1.
Class 0 is changed to 2.
Classes 2, 3, 5, 7, 8 are discarded (ignored).

After updating these labels, in the immediately following block of code, image files without valid labels are deleted, ensuring that only correctly labeled images are included in the dataset.

The update_labels() function is applied to the train, val, and test datasets to ensure all three are updated and consistent.

In [None]:
#Dataset 6 label update, deletion, and corresponding image management.
import os
import shutil

# Label directories of Dataset 6, one per split (train / valid / test)
train_labels_path, val_labels_path, test_labels_path = (
    'data6/train/labels',
    'data6/valid/labels',
    'data6/test/labels',
)

# Function to update labels in a given path
def update_labels(path):
    """Remap Dataset 6 class ids to the unified scheme, in place.

    Every ``.txt`` file directly inside ``path`` is read line by line. The
    first whitespace-separated field of a line is the class id; the remaining
    fields are copied through unchanged (re-joined with single spaces).

    Mapping applied to the class id:
        4 -> 0, 6 -> 1, 0 -> 2
        2, 3, 5, 7, 8 -> line dropped
        anything else -> kept as is

    A file with no lines left after filtering is deleted. Blank lines are
    skipped rather than raising while unpacking the fields.

    NOTE: the mapping is not idempotent (4 becomes 0, and 0 becomes 2), so
    running this twice on the same directory corrupts the labels.

    Args:
        path: Directory containing the label files.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    remapped = {4: 0, 6: 1, 0: 2}
    discarded = {2, 3, 5, 7, 8}

    for label_file in os.listdir(path):
        if not label_file.endswith('.txt'):
            continue
        file_path = os.path.join(path, label_file)
        updated_lines = []
        with open(file_path, 'r') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank / whitespace-only line: nothing to remap.
                    continue
                label = int(fields[0])
                if label in discarded:
                    continue
                label = remapped.get(label, label)
                updated_lines.append(f"{label} {' '.join(fields[1:])}\n")

        if updated_lines:
            with open(file_path, 'w') as file:
                file.writelines(updated_lines)
        else:
            # No usable annotations remain, so drop the label file entirely.
            os.remove(file_path)

# Remap the Dataset 6 labels split by split: train, then validation, then test
dataset6_label_folders = (train_labels_path, val_labels_path, test_labels_path)
for split_labels_folder in dataset6_label_folders:
    update_labels(split_labels_folder)
print("Labels updated successfully.")
########################################################

In [None]:
# Paths to image and label directories.
# This cell belongs to the Dataset 6 section and runs right after Dataset 6
# label files without usable annotations were removed, so it must point at
# data6 (it previously pointed at data4 and left the data6 images orphaned).
# NOTE(review): Dataset 4 also deletes empty label files in its own cell; if
# its images need the same cleanup, rerun this step with data4 paths after
# that cell.
train_images_path = 'data6/train/images'
val_images_path = 'data6/valid/images'
test_images_path = 'data6/test/images'

train_labels_path = 'data6/train/labels'
val_labels_path = 'data6/valid/labels'
test_labels_path = 'data6/test/labels'

# Function to delete images without corresponding label files
def delete_images_without_labels(images_path, labels_path):
    """Delete every image in ``images_path`` that has no matching label file.

    An entry counts as an image when its name ends in .jpg, .jpeg, .png or
    .webp (case-sensitive). Its label is expected in ``labels_path`` under
    the same base name with a ``.txt`` extension. Each deletion is reported
    on stdout.

    Args:
        images_path: Directory holding the image files.
        labels_path: Directory holding the label files.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png', '.webp')
    for entry in os.listdir(images_path):
        if not entry.endswith(image_suffixes):
            continue
        stem, _ = os.path.splitext(entry)
        expected_label = os.path.join(labels_path, stem + '.txt')
        if os.path.exists(expected_label):
            continue
        os.remove(os.path.join(images_path, entry))
        print(f"Deleted {entry} as it has no corresponding label file.")

# Clean the train, validation, and test splits in turn
split_directory_pairs = (
    (train_images_path, train_labels_path),
    (val_images_path, val_labels_path),
    (test_images_path, test_labels_path),
)
for split_images, split_labels in split_directory_pairs:
    delete_images_without_labels(split_images, split_labels)

print("Completed deleting images without corresponding label files.")


# **Dataset # 05**

## Updating and standardising label files.

This step follows the same structure as with dataset 6, however this dataset did not need as much management of corrupt label files. As such, images without label files did not occur and did not need to be deleted.

The update_labels function reads and updates the labels in the specified directories, remapping class 0 to class 1. It then rewrites each file with the new labels.

In [None]:
import os
# Label directories of Dataset 5, one per split (train / valid / test)
train_labels_path, val_labels_path, test_labels_path = (
    f'data5/{split}/labels' for split in ('train', 'valid', 'test')
)

# Function to update labels in a given path
def update_labels(path):
    """Remap Dataset 5 class ids in place: class 0 becomes class 1.

    Every ``.txt`` file directly inside ``path`` is rewritten. The first
    whitespace-separated field of each line is the class id; all other ids
    and all remaining fields are kept (fields are re-joined with single
    spaces). Blank lines are skipped rather than raising while unpacking.

    Args:
        path: Directory containing the label files.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    for label_file in os.listdir(path):
        if not label_file.endswith('.txt'):
            continue
        file_path = os.path.join(path, label_file)
        updated_lines = []
        with open(file_path, 'r') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank / whitespace-only line: nothing to remap.
                    continue
                label = int(fields[0])
                if label == 0:
                    label = 1
                updated_lines.append(f"{label} {' '.join(fields[1:])}\n")

        with open(file_path, 'w') as file:
            file.writelines(updated_lines)

# Remap the Dataset 5 labels split by split: train, then validation, then test
dataset5_label_folders = (train_labels_path, val_labels_path, test_labels_path)
for split_labels_folder in dataset5_label_folders:
    update_labels(split_labels_folder)

print("Labels updated successfully.")
##################################################################

# **Dataset # 04**

## Updating and standardising label files, deleting image files without labels.

This step follows the same structure as with dataset 6, however this dataset did not need as much management of corrupt label files.

The update_labels function reads and updates the labels in the specified directories, remapping class 0 to class 3, while class 1 is ignored (removed). It then rewrites each file with the new labels. Files without labels are also deleted.

In [None]:
# Label directories of Dataset 4, one per split (train / valid / test)
train_labels_path, val_labels_path, test_labels_path = (
    'data4/train/labels',
    'data4/valid/labels',
    'data4/test/labels',
)

# Function to update labels in a given path
def update_labels(path):
    """Remap Dataset 4 class ids in place and drop unwanted annotations.

    Every ``.txt`` file directly inside ``path`` is read line by line. The
    first whitespace-separated field of a line is the class id:
        0 -> 3
        1 -> line dropped
        anything else -> kept as is
    The remaining fields are copied through (re-joined with single spaces).
    A file with no lines left afterwards is deleted. Blank lines are skipped
    rather than raising while unpacking the fields.

    Args:
        path: Directory containing the label files.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    for label_file in os.listdir(path):
        if not label_file.endswith('.txt'):
            continue
        file_path = os.path.join(path, label_file)
        updated_lines = []
        with open(file_path, 'r') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank / whitespace-only line: nothing to remap.
                    continue
                label = int(fields[0])
                if label == 1:
                    continue
                if label == 0:
                    label = 3
                updated_lines.append(f"{label} {' '.join(fields[1:])}\n")

        if updated_lines:
            with open(file_path, 'w') as file:
                file.writelines(updated_lines)
        else:
            # No usable annotations remain, so drop the label file entirely.
            os.remove(file_path)

# Remap the Dataset 4 labels split by split: train, then validation, then test
dataset4_label_folders = (train_labels_path, val_labels_path, test_labels_path)
for split_labels_folder in dataset4_label_folders:
    update_labels(split_labels_folder)
print("Labels updated successfully.")
######################################################################

# **Dataset # 03**

## Updating and standardising label files, deleting image files without labels.

This step follows the same structure as with dataset 6, however this dataset did not need as much management of corrupt label files.

Class 0 is kept unchanged in this dataset; the update_labels() function simply iterates over the label files and rewrites them, ensuring that all files contain valid labels.



In [None]:
import os
# Label directories of Dataset 3, one per split (train / valid / test)
train_labels_path, val_labels_path, test_labels_path = (
    f'data3/{split}/labels' for split in ('train', 'valid', 'test')
)

# Function to update labels in a given path
def update_labels(path):
    """Rewrite Dataset 3 label files in place without changing class ids.

    Every ``.txt`` file directly inside ``path`` is read and written back
    with each line normalised to ``<int class id> <fields...>`` separated by
    single spaces and terminated by a newline. No class id is remapped.
    Blank lines are skipped rather than raising while unpacking the fields.

    Args:
        path: Directory containing the label files.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    for label_file in os.listdir(path):
        if not label_file.endswith('.txt'):
            continue
        file_path = os.path.join(path, label_file)
        updated_lines = []
        with open(file_path, 'r') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank / whitespace-only line: nothing to keep.
                    continue
                # int() validates that the class id really is an integer.
                label = int(fields[0])
                updated_lines.append(f"{label} {' '.join(fields[1:])}\n")

        with open(file_path, 'w') as file:
            file.writelines(updated_lines)

# Update labels in train, validation, and test sets.
# All three splits are processed so they are normalised consistently
# (previously only the train split was passed in, contrary to this comment).
for path in [train_labels_path, val_labels_path, test_labels_path]:
    update_labels(path)

print("Labels updated successfully.")


# **Dataset # 02**

## Updating and standardising label files, deleting image files without labels.

This step follows the same structure as with dataset 6, however this dataset did not need as much management of corrupt label files.

The update_labels function reads and updates the labels in the specified directories, remapping class labels 0, 1, and 2 to class 3.

In [None]:
# Label directories of Dataset 2, one per split (train / valid / test)
train_labels_path, val_labels_path, test_labels_path = (
    'data2/train/labels',
    'data2/valid/labels',
    'data2/test/labels',
)

# Function to update labels in a given path
def update_labels(path):
    """Remap Dataset 2 class ids in place: classes 0, 1 and 2 become class 3.

    Every ``.txt`` file directly inside ``path`` is rewritten. The first
    whitespace-separated field of each line is the class id; any other id
    and all remaining fields are kept (fields are re-joined with single
    spaces). Blank lines are skipped rather than raising while unpacking.

    Args:
        path: Directory containing the label files.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    merged_into_three = (0, 1, 2)

    for label_file in os.listdir(path):
        if not label_file.endswith('.txt'):
            continue
        file_path = os.path.join(path, label_file)
        updated_lines = []
        with open(file_path, 'r') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank / whitespace-only line: nothing to remap.
                    continue
                label = int(fields[0])
                if label in merged_into_three:
                    label = 3
                updated_lines.append(f"{label} {' '.join(fields[1:])}\n")

        with open(file_path, 'w') as file:
            file.writelines(updated_lines)

# Update labels in train, validation, and test sets.
# All three splits must be remapped: leaving valid/test untouched would keep
# their original class ids 0/1/2, which mean something different in the
# unified scheme (previously only the train split was passed in).
for path in [train_labels_path, val_labels_path, test_labels_path]:
    update_labels(path)

print("Labels updated successfully.")


# **Dataset # 01**

## Updating and standardising label files, deleting image files without labels.

This step follows the same structure as with dataset 6, however this dataset did not need as much management of corrupt label files.

The update_labels function reads and updates the labels in the specified directories, remapping class 0 to class 2. It then rewrites each file with the new labels. Files without labels are also deleted.

In [None]:
import os
# Label directories of Dataset 1, one per split (train / valid / test)
train_labels_path, val_labels_path, test_labels_path = (
    f'data1/{split}/labels' for split in ('train', 'valid', 'test')
)

# Function to update labels in a given path
def update_labels(path):
    """Remap Dataset 1 class ids in place: class 0 becomes class 2.

    Every ``.txt`` file directly inside ``path`` is rewritten. The first
    whitespace-separated field of each line is the class id; all other ids
    and all remaining fields are kept (fields are re-joined with single
    spaces). Blank lines are skipped rather than raising while unpacking.

    Args:
        path: Directory containing the label files.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    for label_file in os.listdir(path):
        if not label_file.endswith('.txt'):
            continue
        file_path = os.path.join(path, label_file)
        updated_lines = []
        with open(file_path, 'r') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank / whitespace-only line: nothing to remap.
                    continue
                label = int(fields[0])
                if label == 0:
                    label = 2
                updated_lines.append(f"{label} {' '.join(fields[1:])}\n")

        with open(file_path, 'w') as file:
            file.writelines(updated_lines)

# Update labels of Dataset 1; only its train split is processed here
for path in (train_labels_path,):
    update_labels(path)

print("Labels updated successfully.")


# **Counting the File Quantity for Each Label**

labels_path defines the path to the directory containing all of the label files for the combined datasets, and label_counts initialises a dictionary to store the label counts.

The count_labels(path) function then counts the occurrences of each class label across all label files in the directory. These results are stored in the label_counts dictionary.

The count_labels(labels_path) section then prints the number of instances for each class label, allowing me to check that the dataset is balanced.

In [None]:
# Path to the directory containing label files
labels_path = 'complete_data/labels'

# Dictionary to store the count of each class (class id -> number of lines)
label_counts = {}
# Function to count labels in a given path
def count_labels(path):
    """Add the per-class annotation counts found in ``path`` to ``label_counts``.

    Every ``.txt`` file directly inside ``path`` is scanned; each non-blank
    line counts once towards the class id given by its first field. Counts
    accumulate in the module-level ``label_counts`` dict, so calling this
    more than once adds to the existing totals.

    Args:
        path: Directory containing the label files.

    Raises:
        ValueError: If the first field of a non-blank line is not numeric.
    """
    for label_file in os.listdir(path):
        if not label_file.endswith('.txt'):
            continue
        with open(os.path.join(path, label_file), 'r') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank / whitespace-only line: not an annotation.
                    continue
                # Accept ids written as floats (e.g. "0.0") as well as
                # plain integers.
                label = int(float(fields[0]))
                label_counts[label] = label_counts.get(label, 0) + 1

# Tally the labels of the combined dataset
count_labels(labels_path)

# Report how many annotations each class has
for class_id in label_counts:
    print(f"Class {class_id}: {label_counts[class_id]} instances")

print("Completed counting labels.")

# **Check Label File Format Consistency**

The check_label_files function checks each label file in the specified directories to ensure that each line contains exactly 5 values (class label and bbox coordinates). If any line has a format error, it is flagged up using the print function for later debugging.

In [None]:
# Function to check label files for format consistency
def check_label_files(label_dirs):
    """Print every label line that does not consist of exactly five fields.

    For each directory in ``label_dirs``, every ``.txt`` file directly inside
    it is read line by line. A line is split on whitespace; if it does not
    yield exactly five values, the file path and the stripped line are
    printed. Nothing is modified and nothing is returned.

    Args:
        label_dirs: Iterable of directories containing label files.
    """
    expected_field_count = 5
    for directory in label_dirs:
        text_files = [name for name in os.listdir(directory) if name.endswith('.txt')]
        for name in text_files:
            file_path = os.path.join(directory, name)
            with open(file_path, 'r') as handle:
                for raw_line in handle:
                    if len(raw_line.split()) == expected_field_count:
                        continue
                    print(f"Incorrect format in file {file_path}: {raw_line.strip()}")

# List of label directories to check.
# Covers every directory that the augmentation step later parses with a
# strict five-field assumption; data6 and data5 were previously missing.
label_dirs = [
    'data6/train/labels', 'data6/valid/labels', 'data6/test/labels',
    'data5/train/labels', 'data5/valid/labels', 'data5/test/labels',
    'data4/train/labels', 'data4/valid/labels', 'data4/test/labels',
    'data3/train/labels', 'data3/valid/labels', 'data3/test/labels',
    'data2/train/labels', 'data2/valid/labels', 'data2/test/labels',
    'data1/train/labels',
]

# Check the label files for format issues (read-only; problems are printed)
check_label_files(label_dirs)


# **Defining the Augmentation Pipeline**

First, the necessary libraries are imported for image augmentation and handling. E.g. albumentations is used for image augmentations, and ToTensorV2 converts the images to PyTorch tensors.

The augmentations section then defines the augmentation pipeline with the following transformations, which will be outlined in more detail in the project report:


*   Horizontal and vertical flips
*   Random brightness and contrast adjustments
*   Rotations, shifts, and scaling
*   Converts the images to tensors for PyTorch compatibility

This is followed by bbox_params, which ensures that the bboxes are correctly updated during transformations.



In [None]:
#Augmentation steps
import os
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
import numpy as np

# Define augmentation pipeline
# Bounding boxes are passed in YOLO format; their class ids travel in the
# separate 'class_labels' field.
_bbox_handling = A.BboxParams(format='yolo', label_fields=['class_labels'])

# Transform steps in application order: flips, brightness/contrast,
# rotation, combined shift/scale/rotate, then conversion to a tensor.
_transform_steps = [
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.2),
    A.Rotate(limit=30, p=0.5),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=30, p=0.5),
    ToTensorV2(),
]

augmentations = A.Compose(_transform_steps, bbox_params=_bbox_handling)

# Function to load image and corresponding labels
def load_image_and_labels(image_path, labels_path):
    """Load an image with OpenCV together with its label rows.

    Each non-blank line of the label file must hold five values: a class id
    followed by x_center, y_center, width and height. The class id is
    returned as an int (so it is later written back as e.g. ``0`` rather
    than ``0.0``); the four box values are returned as floats.

    Args:
        image_path: Path of the image file to read.
        labels_path: Path of the matching label file.

    Returns:
        Tuple ``(image, labels, height, width)`` where ``image`` is the array
        returned by ``cv2.imread``, ``labels`` is a list of
        ``[class_id, x_center, y_center, bbox_width, bbox_height]`` rows and
        ``height`` / ``width`` are the first two entries of ``image.shape``.

    Raises:
        OSError: If OpenCV cannot read the image.
        ValueError: If a non-blank label line does not have five numeric
            fields.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise OSError(f"Could not read image: {image_path}")
    height, width = image.shape[:2]

    labels = []
    with open(labels_path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank / whitespace-only line: not an annotation.
                continue
            # int(float(...)) accepts both "0" and "0.0" as the class id.
            class_label = int(float(fields[0]))
            x_center, y_center, bbox_width, bbox_height = map(float, fields[1:])
            labels.append([class_label, x_center, y_center, bbox_width, bbox_height])
    return image, labels, height, width

# Function to save augmented image and labels
def save_augmented_image_and_labels(image, labels, image_path, labels_path):
    """Write an augmented image and its label rows to disk.

    Args:
        image: Augmented image. An object with a ``permute`` method is
            treated as a channel-first tensor and converted to a
            height x width x channel NumPy array; anything else is passed
            to ``np.asarray`` unchanged.
        labels: Iterable of rows ``[class_id, x, y, w, h]``.
        image_path: Destination path of the image file.
        labels_path: Destination path of the label file.

    Raises:
        OSError: If OpenCV reports that the image could not be written.
    """
    if hasattr(image, 'permute'):
        # Convert tensor (channel-first) to a channel-last numpy array
        image = image.permute(1, 2, 0).cpu().numpy()
    image = np.asarray(image)

    if np.issubdtype(image.dtype, np.floating):
        # Float images are assumed to be scaled to [0, 1] -- TODO confirm.
        image = np.clip(image * 255.0, 0, 255).astype(np.uint8)
    else:
        # Integer images are already in the 0-255 range. Multiplying a uint8
        # array by 255 wraps around modulo 256 and corrupts the pixels.
        image = image.astype(np.uint8)

    # The permuted view is not C-contiguous; hand OpenCV a contiguous copy.
    if not cv2.imwrite(image_path, np.ascontiguousarray(image)):
        raise OSError(f"Could not write image: {image_path}")

    with open(labels_path, 'w') as file:
        for label in labels:
            fields = [str(value) for value in label]
            if fields:
                # Class ids are written as integers ("0", not "0.0").
                fields[0] = str(int(label[0]))
            file.write(' '.join(fields) + '\n')

# Function to augment dataset
def augment_dataset(image_dir, label_dir, output_image_dir, output_label_dir, num_augmentations=1):
    """Create augmented copies of every labelled image in ``image_dir``.

    Both output directories are created if missing. For each file in
    ``image_dir`` ending in .jpg, .jpeg, .png or .webp whose ``.txt`` label
    exists in ``label_dir``, the module-level ``augmentations`` pipeline is
    run ``num_augmentations`` times. Result ``i`` is saved as
    ``<stem>_aug_<i>.jpg`` and ``<stem>_aug_<i>.txt`` in the output
    directories. Images without a label file are skipped.

    Args:
        image_dir: Directory with the source images.
        label_dir: Directory with the source label files.
        output_image_dir: Directory receiving the augmented images.
        output_label_dir: Directory receiving the augmented label files.
        num_augmentations: Number of augmented variants per image.
    """
    for target_dir in (output_image_dir, output_label_dir):
        os.makedirs(target_dir, exist_ok=True)

    image_suffixes = ('.jpg', '.jpeg', '.png', '.webp')
    for image_file in os.listdir(image_dir):
        if not image_file.endswith(image_suffixes):
            continue
        stem = os.path.splitext(image_file)[0]
        label_path = os.path.join(label_dir, stem + '.txt')
        if not os.path.exists(label_path):
            continue

        image, labels, _height, _width = load_image_and_labels(
            os.path.join(image_dir, image_file), label_path
        )

        for round_index in range(num_augmentations):
            augmented = augmentations(
                image=image,
                bboxes=[row[1:] for row in labels],
                class_labels=[row[0] for row in labels],
            )
            # Re-attach each surviving box to its class id
            aug_labels = [
                [augmented['class_labels'][box_index]] + list(box)
                for box_index, box in enumerate(augmented['bboxes'])
            ]

            output_stem = f"{stem}_aug_{round_index}"
            save_augmented_image_and_labels(
                augmented['image'],
                aug_labels,
                os.path.join(output_image_dir, output_stem + '.jpg'),
                os.path.join(output_label_dir, output_stem + '.txt'),
            )

    print(f"Completed augmenting dataset in {image_dir}.")

# Apply augmentations to train, validation, and test sets
# Split folders in processing order: datasets 6 down to 2 with all three
# splits each, followed by the train split of dataset 1.
_split_roots = [
    f'data{number}/{split}'
    for number in (6, 5, 4, 3, 2)
    for split in ('train', 'valid', 'test')
]
_split_roots.append('data1/train')

# One (images, labels, aug_images, aug_labels) tuple per split folder
datasets = [
    (f'{root}/images', f'{root}/labels', f'{root}/aug_images', f'{root}/aug_labels')
    for root in _split_roots
]

for source_images, source_labels, target_images, target_labels in datasets:
    augment_dataset(source_images, source_labels, target_images, target_labels)



# **Saving the cleaned and augmented datasets to the GDrive**

GDrive is mounted to save the augmented datasets. The copy_to_drive function then copies the augmented images and labels from the local environment to the GDrive, ensuring that directories are created if they don't already exist. The script then iterates through the datasets and applies the copy_to_drive function, copying all of the data to GDrive directories.

**NOTE: I then *manually* saved the unified augmented dataset to my local machine as a zip file, before splitting the dataset into train, val, and test sets once again in the main script. As such, GDrive is not used in the main project file, but it was a necessary step in this section due to the amount of data that was being handled.**



In [None]:
# Save datasets

from google.colab import drive
import shutil
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define source and destination paths
# Each entry is (source images, source labels, Drive images, Drive labels).
# Datasets 6 down to 2 contribute all three splits; dataset 1 only train.
_drive_root = '/content/drive/MyDrive'
_splits_per_dataset = [(number, ('train', 'valid', 'test')) for number in (6, 5, 4, 3, 2)]
_splits_per_dataset.append((1, ('train',)))

datasets = []
for _number, _splits in _splits_per_dataset:
    for _split in _splits:
        _local = f'data{_number}/{_split}'
        _remote = f'{_drive_root}/augmented_data{_number}/{_split}'
        datasets.append((
            f'{_local}/aug_images',
            f'{_local}/aug_labels',
            f'{_remote}/aug_images',
            f'{_remote}/aug_labels',
        ))

# Function to copy directories to Google Drive
def copy_to_drive(src_image_dir, src_label_dir, dest_image_dir, dest_label_dir):
    """Copy the regular files of two source directories to two destinations.

    Both destination directories are created first if they do not exist.
    Then every regular file directly inside ``src_image_dir`` is copied to
    ``dest_image_dir``, followed by every regular file directly inside
    ``src_label_dir`` to ``dest_label_dir``. Sub-directories are not copied.

    Args:
        src_image_dir: Directory holding the images to copy.
        src_label_dir: Directory holding the label files to copy.
        dest_image_dir: Destination directory for the images.
        dest_label_dir: Destination directory for the label files.
    """
    transfers = (
        (src_image_dir, dest_image_dir),
        (src_label_dir, dest_label_dir),
    )

    # Create both targets up front, before anything is copied
    for _, target_dir in transfers:
        os.makedirs(target_dir, exist_ok=True)

    for source_dir, target_dir in transfers:
        for entry in os.listdir(source_dir):
            candidate = os.path.join(source_dir, entry)
            if not os.path.isfile(candidate):
                continue
            shutil.copy(candidate, target_dir)

# Push every augmented split to its Google Drive destination
for manifest_entry in datasets:
    local_images, local_labels, drive_images, drive_labels = manifest_entry
    copy_to_drive(local_images, local_labels, drive_images, drive_labels)

print("Augmented datasets have been copied to Google Drive.")
