# Imports

In [1]:
import os
import logging
import math
from importlib import reload
import glob
import random
import uuid
import shutil

# Logger Setup

In [2]:
# Configure root logging for this notebook.
# force=True (Python 3.8+) removes and closes any handlers already attached to
# the root logger before applying this configuration, so re-running the cell
# always takes effect. This replaces the shutdown()/reload(logging) workaround,
# which re-executes the logging module while other already-imported modules may
# still hold references to the objects created by the first import.
logging.basicConfig(
    format="{asctime} - {levelname} - {message}",
    style="{",
    datefmt="%Y-%m-%d %H:%M",
    level=logging.DEBUG,
    force=True,
)

# Named logger used by the cells below; its records propagate to the root handler.
logger = logging.getLogger('DATASET SAMPLER')

# Folder Locations

In [3]:
# Source folders: one '100MEDIA' directory per named dataset under ./dataset,
# resolved against the current working directory.
dataset_names = (
    'Horwood Farm July 2022',
    'Longleat September 2022',
    'Stype August 2022',
)
dataset_locations = [
    os.path.join(os.getcwd(), 'dataset', dataset_name, '100MEDIA')
    for dataset_name in dataset_names
]

# Folder that the sampled images are moved into.
subset_dataset_location = os.path.join(os.getcwd(), 'subset_dataset', 'images')

# Report which source folders are present; a missing folder is only logged.
for location in dataset_locations:
    if not os.path.exists(location):
        logger.warning(f'LOCATION \'{location}\' DOES NOT EXIST')
        continue
    logger.debug(f'LOCATION \'{location}\' EXISTS')

2024-08-23 07:26 - DEBUG - LOCATION 'e:\University\Masters\MSc Project\dataset\Horwood Farm July 2022\100MEDIA' EXISTS
2024-08-23 07:26 - DEBUG - LOCATION 'e:\University\Masters\MSc Project\dataset\Longleat September 2022\100MEDIA' EXISTS
2024-08-23 07:26 - DEBUG - LOCATION 'e:\University\Masters\MSc Project\dataset\Stype August 2022\100MEDIA' EXISTS


# Establish An Even Distribution of Images from Each Dataset

In [4]:
# Total number of images wanted across all source datasets.
TOTAL_NUMBER_OF_PICTURES = 18

# Count the folders that are actually sampled from. Deriving this from the
# number of entries under ./dataset minus one silently gives the wrong answer
# as soon as that directory gains or loses an unrelated file or folder.
NUMBER_OF_DATASETS = len(dataset_locations)

if NUMBER_OF_DATASETS == 0:
    raise ValueError('dataset_locations is empty; there is nothing to sample from')

# Round up so the combined sample never falls short of the requested total.
# math.ceil already returns an int, so no extra conversion is needed.
images_per_dataset = math.ceil(TOTAL_NUMBER_OF_PICTURES / NUMBER_OF_DATASETS)

logger.debug(f'GRABBING {images_per_dataset} IMAGES FROM {NUMBER_OF_DATASETS} DATASETS')

2024-08-23 07:26 - DEBUG - GRABBING 6 IMAGES FROM 3 DATASETS


# Move Image Subset to New Dataset
ONLY RUN THIS ONCE!
Running this cell more than once could move the same images again; however, because UUIDs are used to prevent name conflicts, it could be hard to tell that duplications have occurred.

In [5]:
# Move a random sample of images from every source folder into the subset folder.
# NOTE: this cell is not idempotent - every run moves another batch of files.

# Make sure the destination exists; otherwise the first move fails.
os.makedirs(subset_dataset_location, exist_ok=True)

for dataset in dataset_locations:
    # Match the extension case-insensitively: the source files are named '*.JPG',
    # which a plain '*.jpg' pattern misses on case-sensitive file systems.
    # glob.escape keeps any special characters in the folder path literal.
    image_paths = glob.glob(os.path.join(glob.escape(dataset), '*.[jJ][pP][gG]'))
    logger.info(f'FOUND {len(image_paths)} IMAGES IN \'{dataset}\'')

    # Never request more images than the folder holds. This also covers an
    # empty folder (sample size 0) and a folder holding exactly
    # images_per_dataset images, which previously lost one image to an
    # off-by-one or raised ValueError from random.sample.
    sample_size = min(len(image_paths), images_per_dataset)
    if sample_size < images_per_dataset:
        logger.warning(f'DIRECTORY \'{dataset}\' HAS LESS THAN {images_per_dataset}. ONLY EXTRACTING {sample_size}')

    image_locations = random.sample(image_paths, sample_size)

    for image_location in image_locations:
        logger.debug(f'MOVING IMAGE \'{image_location}\' TO \'{subset_dataset_location}\'')
        # A fresh UUID file name prevents collisions between files that share
        # a name across the source folders.
        shutil.move(image_location, os.path.join(subset_dataset_location, (str(uuid.uuid4()) + '.jpg')))
            

2024-08-23 07:26 - INFO - FOUND 791 IMAGES IN 'e:\University\Masters\MSc Project\dataset\Horwood Farm July 2022\100MEDIA'
2024-08-23 07:26 - DEBUG - MOVING IMAGE 'e:\University\Masters\MSc Project\dataset\Horwood Farm July 2022\100MEDIA\DJI_0454.JPG' TO 'e:\University\Masters\MSc Project\subset_dataset\images'
2024-08-23 07:26 - DEBUG - MOVING IMAGE 'e:\University\Masters\MSc Project\dataset\Horwood Farm July 2022\100MEDIA\DJI_0758.JPG' TO 'e:\University\Masters\MSc Project\subset_dataset\images'
2024-08-23 07:26 - DEBUG - MOVING IMAGE 'e:\University\Masters\MSc Project\dataset\Horwood Farm July 2022\100MEDIA\DJI_0614.JPG' TO 'e:\University\Masters\MSc Project\subset_dataset\images'
2024-08-23 07:26 - DEBUG - MOVING IMAGE 'e:\University\Masters\MSc Project\dataset\Horwood Farm July 2022\100MEDIA\DJI_0234.JPG' TO 'e:\University\Masters\MSc Project\subset_dataset\images'
2024-08-23 07:26 - DEBUG - MOVING IMAGE 'e:\University\Masters\MSc Project\dataset\Horwood Farm July 2022\100MEDIA\DJI