# Data Pre-processing:

Converting the default *Military Aircraft Detection Dataset* into a YOLO-format dataset.

In [None]:
# Install third-party packages into the environment of the running kernel.
# NOTE(review): glob, shutil and os.path are standard-library modules and need no install;
# presumably cv2 is meant to come from opencv-python rather than cv-3 — TODO confirm and pin versions.
%pip install glob2 pytest-shutil cv-3 os.path2 tqdm pickleshare

Collecting pickleshare
  Downloading pickleshare-0.7.5-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading pickleshare-0.7.5-py2.py3-none-any.whl (6.9 kB)
Installing collected packages: pickleshare
Successfully installed pickleshare-0.7.5
[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import shutil
import cv2
import os
import yaml
from tqdm import tqdm

In [None]:
# Install Ultralytics into the kernel's environment, then run its built-in environment
# check, which reports the detected Python / torch / CUDA setup and system resources.
# NOTE(review): the version is not pinned, so a re-run may install a different release.
%pip install ultralytics
import ultralytics
ultralytics.checks()

Ultralytics YOLOv8.2.9 🚀 Python-3.11.7 torch-2.1.1+cu121 CUDA:0 (NVIDIA RTX A4000, 16109MiB)
Setup complete ✅ (8 CPUs, 44.1 GB RAM, 96.1/244.2 GB disk)


In [None]:
# Aircraft class names. A name's position in this array is used as its class number
# when labels are written, so the order must stay exactly as listed.
classes = np.array([
    'A10', 'A400M', 'AG600', 'AV8B', 'B1', 'B2', 'B52', 'Be200',
    'C130', 'C2', 'C17', 'C5', 'E2', 'E7', 'EF2000', 'F117',
    'F14', 'F15', 'F16', 'F18', 'F22', 'F35', 'F4', 'JAS39',
    'MQ9', 'Mig31', 'Mirage2000', 'P3', 'RQ4', 'Rafale', 'SR71', 'Su34',
    'Su57', 'Tu160', 'Tu95', 'Tornado', 'U2', 'US2', 'V22', 'XB70',
    'YF23', 'Vulcan', 'J20', 'KC135', 'J10', 'Su25', 'Su24',
])

# Total number of aircraft classes
print(classes.size)

47


**Using the files from dataset folder**

In [None]:
# Collect the annotation (.csv) and image (.jpg) files of the original dataset.
# Both lists are sorted so the conversion step can pair them by position.
csv_paths = sorted(glob.glob('dataset/*.csv'))
jpg_paths = sorted(glob.glob('dataset/*.jpg'))

# Report how many files of each kind were found
print(f'num of images: {len(csv_paths)} \nnum of jpgs: {len(jpg_paths)}')

# One image folder and one label folder per aircraft class
for cls in classes:
  for kind in ('images', 'labels'):
    os.makedirs(f'yolo_classes/{kind}/{cls}', exist_ok=True)

num of images: 13442 
num of jpgs: 13442


**Default csv labels: width, height, class, xmin, ymin, xmax, ymax**

**Target label structure: class_num, x_center, y_center, width, height** (box values normalized by the image width and height)

In [None]:
# Convert every original CSV annotation file into a YOLO-format txt label file and copy
# the matching image alongside it, grouped per class:
#   yolo_classes/images/<class>/<name>.jpg
#   yolo_classes/labels/<class>/<name>.txt
# Each output line is "class_num x_center y_center width height", where the box values are
# divided by the image width/height taken from the same CSV row.

# The two lists are paired by position, so they must describe the same files.
if len(csv_paths) != len(jpg_paths):
    raise ValueError(f'{len(csv_paths)} csv files but {len(jpg_paths)} jpg files')

# Class name -> class number (its position in `classes`); built once instead of searching per box
class_to_num = {name: num for num, name in enumerate(classes.tolist())}

for csv_path, jpg_path in tqdm(zip(csv_paths, jpg_paths), total=len(csv_paths)):
    stem = os.path.splitext(os.path.basename(csv_path))[0]
    if stem != os.path.splitext(os.path.basename(jpg_path))[0]:
        # A positional mispairing would silently attach labels to the wrong image
        raise ValueError(f'csv/jpg mismatch: {csv_path} vs {jpg_path}')

    annotations = np.array(pd.read_csv(csv_path))

    # The image is filed under the class of its last annotation row ('' when there are no rows)
    cls = annotations[-1][3] if len(annotations) else ''

    jpg_file_path = f'yolo_classes/images/{cls}/' + os.path.basename(jpg_path)
    txt_file_path = f'yolo_classes/labels/{cls}/' + stem + '.txt'
    shutil.copy(jpg_path, jpg_file_path)

    try:
        with open(txt_file_path, mode='w') as f:
            for annotation in annotations:
                # Row positions 1..7: width, height, class, xmin, ymin, xmax, ymax
                width, height, class_name, xmin, ymin, xmax, ymax = annotation[1:8]

                # Box centre and size in pixels
                x_center = 0.5*(xmin+xmax)
                y_center = 0.5*(ymin+ymax)
                b_width = xmax - xmin
                b_height = ymax - ymin

                class_num = class_to_num[class_name]
                f.write('{} {} {} {} {}\n'.format(class_num,
                                                  x_center/width,
                                                  y_center/height,
                                                  b_width/width,
                                                  b_height/height))
    except Exception as exc:
        # Keep the original error as the cause and say which file triggered it
        raise RuntimeError(f'failed to convert {csv_path} -> {txt_file_path}') from exc

print('finished')

In [None]:
# Sanity check: the per-class copies should be as numerous as the original dataset files
for label, pattern in (
    ('yolo_classes jpg', 'yolo_classes/images/*/*.jpg'),
    ('yolo_classes labels', 'yolo_classes/labels/*/*.txt'),
    ('Original dataset jpg', 'dataset/*.jpg'),
    ('Original dataset labels', 'dataset/*.csv'),
):
    print(label, len(glob.glob(pattern)))

yolo_classes jpg 13442
yolo_classes labels 13442
Original dataset jpg 13442
Original dataset labels 13442


In [None]:
!pip install split-folders



In [None]:
# Split the per-class dataset into train / val / test folders with an 80 / 10 / 10 ratio.
import splitfolders

input_folder = 'yolo_classes/images'
output_folder = 'yolo_format_dataset/images'
input_folder_labels = 'yolo_classes/labels'
output_folder_labels = 'yolo_format_dataset/labels'

# A three-value ratio gives train/val/test; a two-value tuple such as `(.8, .2)` would
# give only train/val. Images and labels are split by two separate calls that share the
# same seed and ratio.
for source_dir, target_dir in (
    (input_folder, output_folder),
    (input_folder_labels, output_folder_labels),
):
    splitfolders.ratio(source_dir, output=target_dir, seed=1337, ratio=(.8, .1, .1), group_prefix=None, move=False) # default values

Copying files: 13442 files [01:06, 203.39 files/s]
Copying files: 13442 files [00:09, 1357.25 files/s]


In [None]:
# Per-split file counts of yolo_format_dataset: images first, then labels,
# each followed by the total over the three splits
for kind, ext, total_label in (
    ('images', 'jpg', 'Images count'),
    ('labels', 'txt', 'Label counts'),
):
    split_total = 0
    for split in ('train', 'val', 'test'):
        found = len(glob.glob(f'yolo_format_dataset/{kind}/{split}/*/*.{ext}'))
        print(f'yolo_format_dataset {split}', found)
        split_total += found
    print(total_label, split_total)

yolo_format_dataset train 10735
yolo_format_dataset val 1320
yolo_format_dataset test 1387
Images count 13442
yolo_format_dataset train 10735
yolo_format_dataset val 1320
yolo_format_dataset test 1387
Label counts 13442


In [None]:
# Verify that, within every mode (train/val/test), each label file has an image with the
# same base name. Both lists are sorted and compared pair by pair; `counter` ends up as the
# number of pairs that were compared.
MODE = ['train', 'val', 'test']
counter = 0
for mode in MODE:
    txt_paths = sorted(glob.glob(f'yolo_format_dataset/labels/{mode}/*/*.txt'))
    jpgs = sorted(glob.glob(f'yolo_format_dataset/images/{mode}/*/*.jpg'))

    # zip() stops at the shorter list, so a difference in counts has to be reported explicitly
    if len(txt_paths) != len(jpgs):
        print(f'{mode}: {len(txt_paths)} txt files but {len(jpgs)} jpg files')

    for txt_path, jpg in tqdm(zip(txt_paths, jpgs), total=min(len(txt_paths), len(jpgs))):
        jpg_file_path = os.path.splitext(os.path.basename(jpg))[0]
        txt_file_path = os.path.splitext(os.path.basename(txt_path))[0]
        counter += 1
        if jpg_file_path != txt_file_path:
            print(f'{txt_file_path} is a mismatch')

print(counter)

10735it [00:00, 513905.76it/s]
1320it [00:00, 465446.09it/s]
1387it [00:00, 500947.18it/s]

13442





The final input dataset for training is rearranged into the following layout:



    yolo_final_dataset/

        images/
            train/*.jpg
            val/*.jpg
            test/*.jpg

        labels/
            train/*.txt
            val/*.txt
            test/*.txt  

In [None]:
# Flatten the split dataset into the layout YOLO training expects:
#   yolo_final_dataset/images/{train,val,test}/*.jpg
#   yolo_final_dataset/labels/{train,val,test}/*.txt
# The per-class sub-folder level of yolo_format_dataset is dropped while copying.
# MODE (['train', 'val', 'test']) comes from the verification cell above.
for mode in MODE:
    for kind, ext in (('images', 'jpg'), ('labels', 'txt')):
        dest_dir = f'yolo_final_dataset/{kind}/{mode}'
        os.makedirs(dest_dir, exist_ok=True)
        # Copy straight into the folder that was just created
        for src in glob.glob(f'yolo_format_dataset/{kind}/{mode}/*/*.{ext}'):
            shutil.copy(src, dest_dir)

In [None]:
# Per-split file counts of yolo_final_dataset: images first, then labels,
# each followed by the total over the three splits
for kind, ext, total_label in (
    ('images', 'jpg', 'Images count'),
    ('labels', 'txt', 'Label counts'),
):
    split_total = 0
    for split in ('train', 'val', 'test'):
        found = len(glob.glob(f'yolo_final_dataset/{kind}/{split}/*.{ext}'))
        print(f'yolo_final_dataset {split}', found)
        split_total += found
    print(total_label, split_total)

yolo_final_dataset train 10735
yolo_final_dataset val 1320
yolo_final_dataset test 1387
Images count 13442
yolo_final_dataset train 10735
yolo_final_dataset val 1320
yolo_final_dataset test 1387
Label counts 13442


In [None]:
# Deleting folder recursively even for read-only files
def rmtree(path):
    """Recursively delete ``path``, including read-only files and folders.

    Returns the string 'Folder does not exist' when ``path`` is missing;
    otherwise the tree is removed and nothing is returned.
    """
    if not os.path.exists(path):
        return 'Folder does not exist'

    dir_mode = 0o700
    file_mode = 0o600

    def _unlock_and_retry(failed_call, failed_path, _exc_info):
        # Clear the read-only bit on the entry, then repeat the call that failed
        os.chmod(failed_path, file_mode)
        failed_call(failed_path)

    # Make the top folder and every folder below it read/write before deleting
    os.chmod(path, dir_mode)
    for parent, subdirs, _files in os.walk(path):
        for name in subdirs:
            os.chmod(os.path.join(parent, name), dir_mode)

    shutil.rmtree(path, onerror=_unlock_and_retry)

In [None]:
# The intermediate yolo_classes and yolo_format_dataset folders are no longer needed, so delete them.
# rmtree returns 'Folder does not exist' for a folder that is already gone.
rmtree('yolo_classes')
rmtree('yolo_format_dataset')

'Folder does not exits'

In [None]:
# Change into /notebooks and unpack yolo_final_dataset.zip into a yolo_final_dataset folder.
# NOTE(review): the archive is produced by the make_archive cell further down, so on a fresh
# kernel this cell presumably needs a previously created or uploaded zip — TODO confirm cell order.
%cd /notebooks/

shutil.unpack_archive('/notebooks/yolo_final_dataset.zip','yolo_final_dataset','zip')

/notebooks


In [None]:
# Creating yaml file
def generate_yaml(train_path, val_path, names, nc, base):
    """Write the dataset description file ``training_data.yaml`` into ``base``.

    The file holds four entries: ``train`` and ``val`` (the given paths),
    ``names`` (the given class-name mapping) and ``nc`` (the given class count).
    An existing file of that name is overwritten.
    """
    yaml_path = os.path.join(base, 'training_data.yaml')
    data = dict(train=train_path, val=val_path, names=names, nc=nc)

    with open(yaml_path, 'w') as outfile:
        # Block style (one key per line) rather than inline flow style
        yaml.dump(data, outfile, default_flow_style=False)

# Folder that receives training_data.yaml, and the image folders the file points to
base = '/notebooks/'
img_path_train = '/notebooks/yolo_final_dataset/images/train'
img_path_val = '/notebooks/yolo_final_dataset/images/val'

# Class number -> class name, numbered in the order of `classes`
Alabels = classes.tolist()
names = dict(enumerate(Alabels))
nc = len(names)
generate_yaml(img_path_train, img_path_val, names, nc, base)

In [None]:
# Zip the contents of the yolo_final_dataset folder into yolo_final_dataset.zip
# (make_archive returns the path of the archive it created, which the cell displays)
shutil.make_archive('yolo_final_dataset', 'zip', 'yolo_final_dataset')

'/Users/taneya-pavitra/Documents/Personal/Learning-Study/MSc in Artificial Intelligence - University of Surrey/Year Feb 24 - Feb 25/Semester 2/Applied Machine Learning (EEEM068)/CourseWork/Aircraft_Classification/yolo_final_dataset.zip'

In [None]:
# The dataset has just been zipped, so the unpacked yolo_final_dataset folder is deleted as well.
# rmtree returns 'Folder does not exist' if the folder is already gone.
rmtree('yolo_final_dataset')

'Folder does not exits'