In [22]:
from pathlib import Path

import os
import xmltodict
from matplotlib import pyplot as plt
import splitfolders
import numpy as np
from sklearn.model_selection import train_test_split
import shutil

# Part 0: Set User-Defined Parameters

In [23]:
# Dataset Cropping Parameters
path_to_dataset = './original_fish_dataset'
target_cropped_dataset_root = './dataset_cropped'
target_uncropped_dataset_root = './dataset_uncropped'

# Dataset Splitting Parameters
split_cropped_dataset_root = './dataset_cropped_split'
split_uncropped_dataset_root = './dataset_uncropped_split'
train_dataset_split_size = 0.8
val_dataset_split_size = 0.1
test_dataset_split_size = 0.1

In [24]:
path_to_herring_root = os.path.join(path_to_dataset, 'Herring')
path_to_non_herring_root = os.path.join(path_to_dataset, 'Non-herring')

path_to_herring_images = os.path.join(path_to_herring_root, 'Images')
path_to_herring_data = os.path.join(path_to_herring_root, 'Xml')

path_to_non_herring_images = os.path.join(path_to_non_herring_root, 'Images')
path_to_non_herring_data = os.path.join(path_to_non_herring_root, 'Xml')

# Part 1: Create Correctly-Formatted Datasets From Original Images

In [12]:
# Cropped Dataset creation
fish_counter = {}

def crop_fish(image, xmin, xmax, ymin, ymax, fish_type):
    xmin = int(xmin)
    xmax = int(xmax)
    ymin = int(ymin)
    ymax = int(ymax)
    cropped_fish = image[ymin:ymax, xmin:xmax]
    save_img_target_dir = os.path.join(target_cropped_dataset_root, fish_type, 'images')
    save_data_target_dir = os.path.join(target_cropped_dataset_root, fish_type, 'labels')
    os.makedirs(save_img_target_dir, exist_ok=True)
    os.makedirs(save_data_target_dir, exist_ok=True)

    if fish_type not in fish_counter:
        fish_counter[fish_type] = 0
    else:
        fish_counter[fish_type] += 1

    file_name = str(fish_counter[fish_type])
    plt.imsave(os.path.join(save_img_target_dir, file_name+'.png'), cropped_fish)
    with open(os.path.join(save_data_target_dir, file_name+'.txt'), 'w+') as f:
        f.write(f"{fish_type} {(xmax-xmin)//2} {(ymax-ymin)//2} {xmax-xmin} {ymax-ymin}")
        f.close()

for img_root, data_root in [(path_to_non_herring_images, path_to_non_herring_data), (path_to_herring_images, path_to_herring_data)]:
    for file_obj in Path(img_root).iterdir():
        if file_obj.is_file():
            data_file_path = os.path.join(data_root, file_obj.stem+".xml")
            if not Path(data_file_path).is_file():
                continue

            with open(data_file_path, 'r') as xml_file:
                annotations = xmltodict.parse(xml_file.read())['annotation']

            img = plt.imread(file_obj).astype(float)

            if 'object' not in annotations:
                continue
            try:
                if type(annotations['object']) == list:
                    for fa in annotations['object']:
                        fish_type = fa['fish-type'].replace(' ', '-')
                        crop_fish(img, fa['bndbox']['xmin'], fa['bndbox']['xmax'], fa['bndbox']['ymin'], fa['bndbox']['ymax'], fish_type)
                else:
                    fa = annotations['object']
                    fish_type = fa['fish-type'].replace(' ', '-')
                    crop_fish(img, fa['bndbox']['xmin'], fa['bndbox']['xmax'], fa['bndbox']['ymin'], fa['bndbox']['ymax'], fish_type)
            except Exception as e:
                print(f"Caught Error {e}")

Caught Error zero-size array to reduction operation maximum which has no identity
Caught Error zero-size array to reduction operation maximum which has no identity
Caught Error zero-size array to reduction operation maximum which has no identity


In [39]:
# Uncropped Dataset Creation
fish_counter = {}

for img_root, data_root in [(path_to_non_herring_images, path_to_non_herring_data), (path_to_herring_images, path_to_herring_data)]:
    for file_obj in Path(img_root).iterdir():
        if file_obj.is_file():

            data_file_path = os.path.join(data_root, file_obj.stem+".xml")

            # if data_file_path == './original_fish_dataset/Non-herring/Xml/201503.20150517.215226032.49000_01_02.xml':
            #     print("Hi!")

            if not Path(data_file_path).is_file():
                continue

            with open(data_file_path, 'r') as xml_file:
                annotations = xmltodict.parse(xml_file.read())['annotation']


            save_img_target_dir = os.path.join(target_uncropped_dataset_root, 'ErrorFish', 'images')
            save_data_target_dir = os.path.join(target_uncropped_dataset_root, 'ErrorFish', 'labels')
            file_name = "ErrorFish"
            fish_counter["ErrorFish"] = -1

            if 'object' not in annotations:
                continue
            try:
                if type(annotations['object']) == list:
                    lines = []
                    first = True
                    if len(annotations['object']) == 0:
                        continue

                    for fa in annotations['object']:
                        fish_type = fa['fish-type'].replace(' ', '-')

                        if first:
                            first = False
                            if fish_type not in fish_counter:
                                fish_counter[fish_type] = 0
                            else:
                                fish_counter[fish_type] += 1

                        save_img_target_dir = os.path.join(target_uncropped_dataset_root, fish_type, 'images')
                        save_data_target_dir = os.path.join(target_uncropped_dataset_root, fish_type, 'labels')

                        xmin = int(fa['bndbox']['xmin'])
                        xmax = int(fa['bndbox']['xmax'])
                        ymin = int(fa['bndbox']['ymin'])
                        ymax = int(fa['bndbox']['ymax'])

                        os.makedirs(save_img_target_dir, exist_ok=True)
                        os.makedirs(save_data_target_dir, exist_ok=True)

                        file_name = str(fish_counter[fish_type])
                        lines.append(f"{fish_type} {(xmax-xmin)//2} {(ymax-ymin)//2} {xmax-xmin} {ymax-ymin}\n")

                    if len(lines) > 0:
                        lines[-1] = lines[-1][:-1]  # Remove newline at end of file
                        with open(os.path.join(save_data_target_dir, file_name+'.txt'), 'w+') as f:
                            f.writelines(lines)
                            f.close()

                else:
                    fa = annotations['object']
                    fish_type = fa['fish-type'].replace(' ', '-')

                    if fish_type not in fish_counter:
                        fish_counter[fish_type] = 0
                    else:
                        fish_counter[fish_type] += 1

                    save_img_target_dir = os.path.join(target_uncropped_dataset_root, fish_type, 'images')
                    save_data_target_dir = os.path.join(target_uncropped_dataset_root, fish_type, 'labels')

                    xmin = int(fa['bndbox']['xmin'])
                    xmax = int(fa['bndbox']['xmax'])
                    ymin = int(fa['bndbox']['ymin'])
                    ymax = int(fa['bndbox']['ymax'])

                    os.makedirs(save_img_target_dir, exist_ok=True)
                    os.makedirs(save_data_target_dir, exist_ok=True)

                    file_name = str(fish_counter[fish_type])

                    with open(os.path.join(save_data_target_dir, file_name+'.txt'), 'w+') as f:
                        f.write(f"{fish_type} {(xmax-xmin)//2} {(ymax-ymin)//2} {xmax-xmin} {ymax-ymin}")
                        f.close()
            except Exception as e:
                print(f"Caught {type(e)}: {e}")

            os.makedirs(save_img_target_dir, exist_ok=True)
            os.makedirs(save_data_target_dir, exist_ok=True)

            img = plt.imread(file_obj).astype(float)
            plt.imsave(os.path.join(save_img_target_dir, file_name+'.png'), img)

Caught <class 'KeyError'>: 'dead-scallop'
Caught <class 'KeyError'>: 'dead-scallop'
Caught <class 'KeyError'>: 'dead-scallop'
Caught <class 'KeyError'>: 'dead-scallop'


# Part 2: Splitting Dataset Into Train/Val/Test Sets

In [31]:
def create_split_dataset(unsplit_dataset_root, split_dataset_root):
    def split_fish_class_on_indices(indices, class_name, split_name):
        save_img_target_dir = os.path.join(class_name, 'images/')
        save_data_target_dir = os.path.join(class_name, 'labels/')

        target_split_folder = os.path.join(split_dataset_root, split_name, Path(class_name).name)
        target_split_folder_image_root = os.path.join(target_split_folder, 'images/')
        target_split_folder_data_root = os.path.join(target_split_folder, 'labels/')

        os.makedirs(target_split_folder_image_root, exist_ok=True)
        os.makedirs(target_split_folder_data_root, exist_ok=True)

        for index in indices:
            try:
                shutil.copy(save_img_target_dir+str(index)+'.png', target_split_folder_image_root)
                shutil.copy(save_data_target_dir+str(index)+'.txt', target_split_folder_data_root)
            except:
                print(f"Unable To Copy {class_name} for {index}.png/txt")
                continue

    for fish_type in Path(unsplit_dataset_root).iterdir():
        if fish_type.is_dir():
            fish_images_path = os.path.join(fish_type, 'images')
            num_fish_images = len([x for x in os.listdir(fish_images_path) if x[-4:] == '.png'])
            fish_indices = np.arange(num_fish_images)
            train_indices, val_test_indices = train_test_split(
                fish_indices,
                train_size=train_dataset_split_size,
                random_state=1337
            )
            val_indices, test_indices  = train_test_split(
                val_test_indices,
                train_size=val_dataset_split_size/(val_dataset_split_size+test_dataset_split_size),
                random_state=1338
            )
            split_fish_class_on_indices(train_indices, fish_type, 'train')
            split_fish_class_on_indices(val_indices, fish_type, 'val')
            split_fish_class_on_indices(test_indices, fish_type, 'test')

In [32]:
# Create Split Dataset For Cropped Images
create_split_dataset(target_cropped_dataset_root, split_cropped_dataset_root)

Unable To Copy dataset_cropped/roundfish for 1750.png/txt
Unable To Copy dataset_cropped/roundfish for 1695.png/txt
Unable To Copy dataset_cropped/roundfish for 2958.png/txt


In [41]:
# Create Split Dataset For Uncropped Images
create_split_dataset(target_uncropped_dataset_root, split_uncropped_dataset_root)

Unable To Copy dataset_uncropped/scallop for 964.png/txt
Unable To Copy dataset_uncropped/scallop for 554.png/txt
Unable To Copy dataset_uncropped/scallop for 369.png/txt
Unable To Copy dataset_uncropped/scallop for 1393.png/txt
Unable To Copy dataset_uncropped/scallop for 1260.png/txt
Unable To Copy dataset_uncropped/scallop for 853.png/txt
Unable To Copy dataset_uncropped/scallop for 418.png/txt
Unable To Copy dataset_uncropped/scallop for 1069.png/txt
Unable To Copy dataset_uncropped/scallop for 496.png/txt
Unable To Copy dataset_uncropped/scallop for 474.png/txt
Unable To Copy dataset_uncropped/scallop for 178.png/txt
Unable To Copy dataset_uncropped/scallop for 648.png/txt
Unable To Copy dataset_uncropped/scallop for 466.png/txt
Unable To Copy dataset_uncropped/scallop for 733.png/txt
Unable To Copy dataset_uncropped/scallop for 250.png/txt
Unable To Copy dataset_uncropped/scallop for 56.png/txt
Unable To Copy dataset_uncropped/scallop for 273.png/txt
Unable To Copy dataset_uncrop