In [None]:
# Reset files: delete the `datasets` directory (and everything in it) so a
# later run starts from a clean download.
# NOTE(review): the path is relative to the current working directory, and the
# main cell changes that directory with `%cd {HOME}/datasets`. After the main
# cell has run once, this command therefore looks for `datasets` inside
# `datasets` -- restart the runtime (or cd back) before re-running.
!rm -rf datasets

# Data skew fix preprocess and augmentation script
This script attempts to rebalance a skewed dataset in your Roboflow workspace, i.e. a dataset in which some classes are underrepresented. It downloads the dataset, counts the labels of each class, and works out how many augmented copies per class are needed to balance the dataset. For every labeled object of an underrepresented class it creates a new image that shows only that object (the bounding box plus padding; the rest of the image is masked out) and then augments it. After these are created, the new images and their labels are uploaded back to your original dataset.

### To use this script:
This was tested on Google Colab. Install the libraries, then run the main cell, which will ask for your:
- Roboflow API key
- Roboflow workspace
- Roboflow project
- Roboflow project version
- Roboflow download format (the script reads YOLO-style `.txt` labels, e.g. `yolov8`)
- desired padding around the bounding box (in pixels)

In [None]:
!pip install --upgrade pip -q
!pip install fastapi -q
!pip install kaleido -q
!pip install uvicorn -q
!pip install python-multipart -q
!pip install roboflow -q
!pip install ultralytics==8.0.196 -q
!pip install inference -q
!pip install albumentations -q
!apt-get install tree


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tree is already the newest version (2.0.2-1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [None]:
import os
import ast
import glob
import albumentations as A
from tqdm import tqdm
from roboflow import Roboflow
import getpass
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.image as mpimg
import tempfile
import cv2
import json
import math
from google.colab.patches import cv2_imshow
from ipywidgets import Video
import supervision as sv

from IPython import display
display.clear_output()

import ultralytics
ultralytics.checks()

from ultralytics import YOLO
from IPython.display import display, Image

HOME = os.getcwd()
print(HOME)

!mkdir {HOME}/datasets
%cd {HOME}/datasets

roboflow_apikey = getpass.getpass('enter roboflow apikey here')
roboflow_workspace = getpass.getpass('enter roboflow workspace here')
roboflow_project = getpass.getpass('enter roboflow project here')
roboflow_project_version = getpass.getpass('enter roboflow project version here')
roboflow_download_format = getpass.getpass('enter roboflow download format here')
masked_image_padding = int(getpass.getpass('enter desired padding around the bounding box here. 50 is a good start'))

# download images and bboxes
rf = Roboflow(api_key=roboflow_apikey)
project = rf.workspace(roboflow_workspace).project(roboflow_project)
dataset = project.version(roboflow_project_version).download(roboflow_download_format)

project_path= f'{HOME}/datasets/{roboflow_project}-{roboflow_project_version}'
original_image_path = f'{project_path}/train/images/'
original_labels_path = f'{project_path}/train/labels/'
masked_image_output_path = f'{project_path}/train/output/'
upload_path = f'{project_path}/train/upload'
yaml_file_path = f'{project_path}/data.yaml'
file_extension_type = ".jpg"


def _parse_class_names(yaml_lines):
    """Extract the class names listed under the ``names`` key of a data.yaml file.

    Two layouts are understood:

    * block list -- ``names:`` followed by one ``- <name>`` line per class;
    * inline list -- ``names: ['a', 'b']`` on a single line.

    Args:
        yaml_lines: iterable of text lines of the YAML file.

    Returns:
        list[str]: class names in file order (empty if no ``names`` key is found).
        Surrounding quotes are removed so the label map holds the bare name.
    """
    names = []
    in_names = False
    for raw_line in yaml_lines:
        stripped = raw_line.strip()
        if not in_names:
            if stripped == "names:":
                in_names = True
            elif stripped.startswith("names:"):
                # Inline form: names: ['a', 'b']
                inline = stripped[len("names:"):].strip()
                try:
                    parsed = ast.literal_eval(inline)
                except (ValueError, SyntaxError):
                    parsed = None
                if not isinstance(parsed, (list, tuple)):
                    # Unquoted entries such as [a, b] are not Python literals.
                    parsed = inline.strip("[]").split(",")
                cleaned = (str(item).strip().strip("'\"") for item in parsed)
                return [item for item in cleaned if item]
            continue
        if not stripped or stripped.startswith("#"):
            continue
        if not stripped.startswith("- "):
            # Any other key (nc:, train:, ...) ends the list.
            break
        # Drop only the leading list marker; "- " inside a name is kept.
        names.append(stripped[2:].strip().strip("'\""))
    return names


# Read the class names from the dataset's data.yaml
with open(yaml_file_path, 'r') as file:
    class_names = _parse_class_names(file.readlines())

# Writing to label_maps.txt in the same directory (one class name per line,
# line index == class id); it is passed to the upload step as the label map.
label_maps_path = os.path.join(os.path.dirname(yaml_file_path), 'label_maps.txt')
with open(label_maps_path, 'w') as file:
    for name in class_names:
        file.write(name + '\n')

# Outputting the location of label_maps.txt
print(f"\n\nClass names written to {label_maps_path}\n\n")

# Reading and displaying the contents of label_maps.txt
with open(label_maps_path, 'r') as file:
    contents = file.read()
    print("Contents of label_maps.txt:\n")
    print(contents)


# Stage 1 of the masking step: count how many labels each class has and derive,
# for every underrepresented class, how many augmented copies to make per label.

# Ensure the output base path exists
os.makedirs(masked_image_output_path, exist_ok=True)

# First, count labels for each class (class id -> number of bounding boxes)
class_counter = {}
label_files = [
    f for f in os.listdir(original_labels_path)
    if f.endswith('.txt') and os.path.isfile(os.path.join(original_labels_path, f))
]

for label_file in label_files:
    label_file_path = os.path.join(original_labels_path, label_file)
    # An image without annotations has an empty label file; skip it instead of
    # letting genfromtxt hand back an empty array that cannot be indexed.
    if os.path.getsize(label_file_path) == 0:
        continue
    load_bboxes = np.genfromtxt(label_file_path, dtype='float')
    if load_bboxes.size == 0:  # file held only whitespace
        continue
    # A single-line file is returned as a 1-D row; normalise to rows of boxes.
    for bbox in np.atleast_2d(load_bboxes):
        category = int(bbox[0])
        class_counter[category] = class_counter.get(category, 0) + 1

if not class_counter:
    raise ValueError(f"No labels found in {original_labels_path}")

# Find the maximum count
max_count = max(class_counter.values())

# Calculate the factor for each underrepresented class:
# factor = floor(max_count / class_count) augmented copies per existing label.
underrepresented_factors = {}
for category, count in class_counter.items():
    if count < max_count:
        underrepresented_factors[category] = math.floor(max_count / count)


# Process and save images and labels for underrepresented classes.
# For every bounding box of an underrepresented class, a copy of the image is
# written in which everything outside the (padded) box is blacked out, together
# with a label file that holds only that one box.
image_files = [f for f in os.listdir(original_image_path) if os.path.isfile(os.path.join(original_image_path, f))]

for image_file in image_files:
    image_path = os.path.join(original_image_path, image_file)
    # Swap only the final extension; str.replace would touch every ".jpg" in
    # the name and do nothing at all for non-jpg images.
    base_filename = os.path.splitext(image_file)[0]
    label_file = base_filename + '.txt'
    label_file_path = os.path.join(original_labels_path, label_file)

    # Images without a label file, or with an empty one, have nothing to export.
    if not os.path.isfile(label_file_path) or os.path.getsize(label_file_path) == 0:
        continue

    image = cv2.imread(image_path)
    if image is None:  # cv2.imread returns None instead of raising
        print(f"Skipping unreadable image {image_path}")
        continue

    load_bboxes = np.genfromtxt(label_file_path, dtype='float')
    if load_bboxes.size == 0:  # file held only whitespace
        continue
    # A single-line file is returned as a 1-D row; normalise to a list of rows.
    load_bboxes = list(np.atleast_2d(load_bboxes))

    img_ht, img_wd = image.shape[:2]
    bbox_count = {}  # Keep track of the number of bboxes for each class in this image

    for bbox in load_bboxes:
        category = int(bbox[0])

        # Only process if this category's count is less than max_count
        if class_counter[category] >= max_count:
            continue

        bbox_count[category] = bbox_count.get(category, 0) + 1

        # YOLO rows are normalised (class, x_center, y_center, width, height);
        # convert to pixel corners.
        bb_width = int(round(bbox[3] * img_wd))
        bb_height = int(round(bbox[4] * img_ht))

        x_center = bbox[1] * img_wd
        y_center = bbox[2] * img_ht
        x_min = int(x_center - (bb_width / 2))
        x_max = int(x_center + (bb_width / 2))
        y_min = int(y_center - (bb_height / 2))
        y_max = int(y_center + (bb_height / 2))

        # Grow the box by the requested padding, clipped to the image.
        padded_x_min = max(x_min - masked_image_padding, 0)
        padded_x_max = min(x_max + masked_image_padding, img_wd)
        padded_y_min = max(y_min - masked_image_padding, 0)
        padded_y_max = min(y_max + masked_image_padding, img_ht)

        # Keep the pixels inside the padded box, black out everything else.
        mask = np.zeros(image.shape[:2], dtype=np.uint8)
        cv2.rectangle(mask, (padded_x_min, padded_y_min), (padded_x_max, padded_y_max), 255, -1)
        masked_image = cv2.bitwise_and(image, image, mask=mask)

        class_images_dir = os.path.join(masked_image_output_path, f'class_{category}', 'images')
        class_labels_dir = os.path.join(masked_image_output_path, f'class_{category}', 'labels')
        os.makedirs(class_images_dir, exist_ok=True)
        os.makedirs(class_labels_dir, exist_ok=True)

        # <image>_class_<id>_<n>: n numbers the boxes of this class within the image.
        unique_identifier = f"{base_filename}_class_{category}_{bbox_count[category]}"
        new_image_name = f'{unique_identifier}_masked.jpg'
        new_label_name = f'{unique_identifier}_masked.txt'

        cv2.imwrite(os.path.join(class_images_dir, new_image_name), masked_image)
        # The image size is unchanged, so the original normalised box is still valid.
        with open(os.path.join(class_labels_dir, new_label_name), 'w') as label_out:
            label_out.write(f'{category} {bbox[1]} {bbox[2]} {bbox[3]} {bbox[4]}\n')

# Render both dictionaries as indented JSON text (integer keys become strings)
formatted_class_counts, formatted_underrepresented_class_factors = (
    json.dumps(summary, indent=4)
    for summary in (class_counter, underrepresented_factors)
)

# Show the per-class label counts, then the augmentation factor per class
print("Class Counts:\n", formatted_class_counts)
print("\nUnderrepresented Class Factors:\n", formatted_underrepresented_class_factors)


# Augmentation stage: every masked single-object image of an underrepresented
# class is augmented `factor` times and the results are stored in the upload folder.
# The pipeline below is configured for Pascal VOC boxes (pixel x_min, y_min,
# x_max, y_max), so the YOLO labels are converted before and after augmenting.

# Albumentations pipeline shared by all augmentations
aug = A.Compose(
    [
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=0.5),
        # NOTE(review): rotate_limit is presumably in degrees, so 0.15 is almost
        # no rotation -- confirm this is intended (15 may have been meant).
        A.ShiftScaleRotate(shift_limit=0.15, scale_limit=0.15, rotate_limit=0.15, p=1),
        # Pads images smaller than 1024x1024 with a constant border.
        A.PadIfNeeded(min_height=1024, min_width=1024, border_mode=cv2.BORDER_CONSTANT, p=1.0),
    ],
    bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_labels']),
)

# Function to convert YOLO format bbox to Pascal VOC format
def yolo_to_pascal_voc(bbox, img_width, img_height):
    """Convert a normalised YOLO box to Pascal VOC pixel corners.

    Args:
        bbox: sequence ``(x_center, y_center, width, height)``, each value
            expressed as a fraction of the image size.
        img_width: image width in pixels.
        img_height: image height in pixels.

    Returns:
        list[int]: ``[x_min, y_min, x_max, y_max]`` in pixels, clamped to the
        image so that boxes touching (or slightly exceeding) the border do not
        produce out-of-range coordinates, which the bbox-aware augmentation
        pipeline rejects.
    """
    x_center, y_center, w, h = bbox
    x_min = max(0, int((x_center - w / 2) * img_width))
    y_min = max(0, int((y_center - h / 2) * img_height))
    x_max = min(img_width, int((x_center + w / 2) * img_width))
    y_max = min(img_height, int((y_center + h / 2) * img_height))
    return [x_min, y_min, x_max, y_max]

# Function to convert Pascal VOC format bbox to YOLO format
def pascal_voc_to_yolo(bbox, img_width, img_height):
    """Convert Pascal VOC pixel corners to a normalised YOLO box.

    Args:
        bbox: sequence ``(x_min, y_min, x_max, y_max)`` in pixels.
        img_width: width in pixels of the image the box belongs to.
        img_height: height in pixels of the image the box belongs to.

    Returns:
        list[float]: ``[x_center, y_center, width, height]`` as fractions of
        the image size.
    """
    left, top, right, bottom = bbox
    mid_x = (left + right) / 2
    mid_y = (top + bottom) / 2
    return [
        mid_x / img_width,
        mid_y / img_height,
        (right - left) / img_width,
        (bottom - top) / img_height,
    ]

# Define the output and upload paths. The output folder holds the padded,
# masked single-label images; the upload folder receives their augmentations.
upload_images_path = os.path.join(upload_path, 'images')
upload_labels_path = os.path.join(upload_path, 'labels')

os.makedirs(upload_images_path, exist_ok=True)
os.makedirs(upload_labels_path, exist_ok=True)

# Iterate over each class folder in the output directory
for class_folder in os.listdir(masked_image_output_path):
    # Only folders named class_<id> belong to this pipeline; anything else
    # (e.g. .ipynb_checkpoints) is ignored instead of crashing int().
    folder_parts = class_folder.split('_')
    if len(folder_parts) != 2 or folder_parts[0] != 'class' or not folder_parts[1].isdigit():
        continue
    class_id = int(folder_parts[1])
    if class_id not in underrepresented_factors:
        continue
    # Number of augmented copies to create per masked image of this class.
    factor = underrepresented_factors[class_id]

    class_dir = os.path.join(masked_image_output_path, class_folder)
    images_dir = os.path.join(class_dir, 'images')
    labels_dir = os.path.join(class_dir, 'labels')
    if not os.path.isdir(images_dir):
        continue

    for image_file in os.listdir(images_dir):
        image_path = os.path.join(images_dir, image_file)
        # Swap only the final extension (str.replace would touch every ".jpg").
        image_stem = os.path.splitext(image_file)[0]
        label_file = image_stem + '.txt'
        label_path = os.path.join(labels_dir, label_file)

        # Check if label file exists and is not empty
        if not (os.path.exists(label_path) and os.path.getsize(label_path) > 0):
            print(f"Label file {label_path} is empty or does not exist.")
            continue

        image = cv2.imread(image_path)
        if image is None:  # cv2.imread returns None instead of raising
            print(f"Could not read image {image_path}; skipping.")
            continue

        labels = np.genfromtxt(label_path, dtype='float')
        if labels.size == 0:  # file held only whitespace
            print(f"Label file {label_path} is empty or does not exist.")
            continue
        # A single-line file is returned as a 1-D row; normalise to rows.
        labels = np.atleast_2d(labels)

        img_ht, img_wd = image.shape[:2]
        pascal_voc_bboxes = [yolo_to_pascal_voc(label[1:], img_wd, img_ht) for label in labels]
        class_labels = [int(label[0]) for label in labels]

        for i in range(factor):
            augmented = aug(image=image, bboxes=pascal_voc_bboxes, class_labels=class_labels)

            # The pipeline may change the image size (PadIfNeeded enlarges
            # small images), so the boxes must be normalised with the size of
            # the augmented image, not the original one.
            aug_ht, aug_wd = augmented['image'].shape[:2]

            augmented_image_path = os.path.join(upload_images_path, f"{image_stem}_aug_{i}.jpg")
            cv2.imwrite(augmented_image_path, augmented['image'])

            # The label file may end up empty when the box was moved out of the
            # frame; such image/label pairs are removed by the clean-up step.
            augmented_label_path = os.path.join(upload_labels_path, f"{image_stem}_aug_{i}.txt")
            with open(augmented_label_path, 'w') as file:
                for bbox, class_label in zip(augmented['bboxes'], augmented['class_labels']):
                    yolo_bbox = pascal_voc_to_yolo(bbox, aug_wd, aug_ht)
                    file.write(f"{int(class_label)} {' '.join(map(str, yolo_bbox))}\n")


# Run through all new images and delete those whose label file is missing or
# empty. This happens when an augmentation moved the object out of the frame;
# such images carry no annotation and are not worth uploading.

# Paths for images and labels within the upload directory
images_path = os.path.join(upload_path, 'images')
labels_path = os.path.join(upload_path, 'labels')

# List to store the names of deleted files
deleted_files = []

# Loop through each image file in the upload directory
for image_file in os.listdir(images_path):
    image_path = os.path.join(images_path, image_file)
    if not os.path.isfile(image_path):
        continue  # os.remove cannot delete directories

    # Swap only the final extension (str.replace would touch every ".jpg").
    label_file = os.path.splitext(image_file)[0] + '.txt'
    label_file_path = os.path.join(labels_path, label_file)

    # Keep the pair when the label file exists and is not empty. Only file
    # metadata is needed for this check, so the image itself is never decoded.
    if os.path.isfile(label_file_path) and os.path.getsize(label_file_path) > 0:
        continue

    # Delete the image and label file and add to the deleted files list
    os.remove(image_path)
    deleted_files.append(image_path)
    if os.path.isfile(label_file_path):
        os.remove(label_file_path)
        deleted_files.append(label_file_path)

# Output the list of deleted files at the end of the script
print("Files deleted where label does not exist:")
for file in deleted_files:
    print(file)


# Number of new images to upload

# Count the .jpg files
jpg_count = sum(1 for file in os.listdir(images_path) if file.endswith('.jpg'))

print(f"\n\nNumber of .jpg files to upload: {jpg_count}\n\n")

# Upload images and corresponding annotations.
# glob.escape keeps special characters in the directory name ("[", "*", "?")
# from being interpreted as wildcards.
image_glob = glob.glob(os.path.join(glob.escape(images_path), '*' + file_extension_type))
for image_path in image_glob:
    # File name without directory and without its final extension
    base_name = os.path.splitext(os.path.basename(image_path))[0]

    # The annotation has exactly the same base name, so test for that one file
    # directly instead of globbing an unescaped pattern.
    candidate_annotation = os.path.join(upload_path, 'labels', base_name + '.txt')
    annotation_file = candidate_annotation if os.path.isfile(candidate_annotation) else None

    if annotation_file:
        # Upload image and annotation
        print(project.single_upload(
            image_path=image_path,
            annotation_path=annotation_file,
            # optional parameters:
            annotation_labelmap=label_maps_path,
            # split='train',
            # num_retry_uploads=0,
            # batch_name='batch_name',
            # tag_names=['tag1', 'tag2'],
            # is_prediction=False,
        ))
    else:
        print(f"No matching annotation file found for {image_path}")

Ultralytics YOLOv8.0.196 🚀 Python-3.10.12 torch-2.1.0+cu121 CPU (Intel Xeon 2.20GHz)
Setup complete ✅ (2 CPUs, 12.7 GB RAM, 26.8/107.7 GB disk)


/content
/content/datasets
enter roboflow apikey here··········
enter roboflow workspace here··········
enter roboflow project here··········
enter roboflow project version here··········
enter roboflow download format here··········
enter desired padding around the bounding box here. 50 is a good start··········
loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in find_fish_bbox-1 to yolov8:: 100%|██████████| 2151/2151 [00:00<00:00, 47828.05it/s]





Extracting Dataset Version Zip to find_fish_bbox-1 in yolov8:: 100%|██████████| 33/33 [00:00<00:00, 2669.62it/s]




Class names written to /content/datasets/find_fish_bbox-1/label_maps.txt


Contents of label_maps.txt:

black_orange_fish
smaller_white_fish
striped_fish
white_fish
yellow_fish

Class Counts:
 {
    "3": 36,
    "4": 9,
    "0": 3,
    "1": 4,
    "2": 2
}

Underrepresented Class Factors:
 {
    "4": 4,
    "0": 12,
    "1": 9,
    "2": 18
}
Files deleted where label does not exist:
/content/datasets/find_fish_bbox-1/train/upload/images/PXL_20231012_075201332_TS_mp4-0_jpg.rf.aecd4ec8d2bdec78f76bfa164b302a0f_class_4_2_masked_aug_2.jpg
/content/datasets/find_fish_bbox-1/train/upload/labels/PXL_20231012_075201332_TS_mp4-0_jpg.rf.aecd4ec8d2bdec78f76bfa164b302a0f_class_4_2_masked_aug_2.txt
/content/datasets/find_fish_bbox-1/train/upload/images/PXL_20231012_075201332_TS_mp4-2_jpg.rf.8807edbacd38658b1b10570f4b97fb3a_class_0_1_masked_aug_11.jpg
/content/datasets/find_fish_bbox-1/train/upload/labels/PXL_20231012_075201332_TS_mp4-2_jpg.rf.8807edbacd38658b1b10570f4b97fb3a_class_0_1_masked_aug_1