In [None]:
# Importing the required libraries
import os
import shutil
import random
from sklearn.model_selection import train_test_split

In [None]:
# Input/output locations, relative to the notebook's working directory
source_dir = '../unstructured_dataset'  # one sub-folder per class
output_dir = '../dataset'               # receives train/, val/ and test/

# Share of each class assigned to the train / validation / test splits
train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

In [None]:
# Ensure output_dir contains one folder per split; exist_ok makes re-running harmless
for split in ('train', 'val', 'test'):
    split_dir = os.path.join(output_dir, split)
    os.makedirs(split_dir, exist_ok=True)

In [None]:
# Split every class folder into train, val, and test sets
def copy_images(image_list, destination):
    """Copy the current class's images into ``output_dir/<destination>/<class>``.

    Uses the module-level ``class_folder`` and ``class_path`` set by the loop
    below, so it always acts on the class currently being processed.

    Args:
        image_list: File names (relative to ``class_path``) to copy.
        destination: Split name, e.g. ``'train'``, ``'val'`` or ``'test'``.
    """
    dest_dir = os.path.join(output_dir, destination, class_folder)
    os.makedirs(dest_dir, exist_ok=True)
    for image in image_list:
        src = os.path.join(class_path, image)
        try:
            shutil.copy(src, dest_dir)
        except PermissionError as exc:
            # Still best-effort (one unreadable file should not abort the
            # whole split), but report it so the gap is visible.
            print(f"Skipped {src}: {exc}")


for class_folder in sorted(os.listdir(source_dir)):
    class_path = os.path.join(source_dir, class_folder)
    if not os.path.isdir(class_path):
        continue

    # Files only, in sorted order. os.listdir returns entries in arbitrary
    # order, so a stable input order is what makes random_state=42 yield the
    # same split on every run; train_test_split does the shuffling itself.
    # A reproducible split also keeps re-runs from copying the same image
    # into a different split than last time.
    images = sorted(
        name for name in os.listdir(class_path)
        if os.path.isfile(os.path.join(class_path, name))
    )
    if not images:
        print(f"Skipping '{class_folder}': no files to split")
        continue

    train_images, temp_images = train_test_split(images, train_size=train_ratio, random_state=42)
    # The remainder is divided between val and test in proportion to their ratios.
    val_images, test_images = train_test_split(temp_images, test_size=test_ratio / (test_ratio + val_ratio), random_state=42)

    copy_images(train_images, 'train')
    copy_images(val_images, 'val')
    copy_images(test_images, 'test')

In [None]:
# Report how many image files each split contains
def count_images_in_folders(base_dir):
    """Print the total number of files in the train, test and val splits.

    Expects ``base_dir`` to contain ``train``, ``val`` and ``test`` folders,
    each holding one sub-folder per class. Only regular files directly inside
    a class sub-folder are counted; anything else is ignored.

    Args:
        base_dir: Directory that contains the three split folders.
    """
    totals = {'train': 0, 'val': 0, 'test': 0}

    for split_name in totals:
        split_root = os.path.join(base_dir, split_name)

        for entry in os.listdir(split_root):
            class_dir = os.path.join(split_root, entry)
            if not os.path.isdir(class_dir):
                # Loose files next to the class folders are not images of any class
                continue
            totals[split_name] += sum(
                1
                for item in os.listdir(class_dir)
                if os.path.isfile(os.path.join(class_dir, item))
            )

    train_total = totals['train']
    test_total = totals['test']
    val_total = totals['val']

    print(f"Total images across training sets: {train_total}")
    print(f"Total images across testing sets: {test_total}")
    print(f"Total images across validation sets: {val_total}")

# Count the folder the split step wrote to. Reusing output_dir keeps this cell
# in sync with the configuration cell instead of repeating a path literal that
# can drift from it.
base_dir = output_dir
count_images_in_folders(base_dir)

Total images across training sets: 16505
Total images across testing sets: 2070
Total images across validation sets: 2064


In [None]:
# Convert .JPG to .jpg
def convert_jpg_extensions(base_dir):
    """Rename every ``*.JPG`` file under ``base_dir`` to end in ``.jpg``.

    Walks ``base_dir`` recursively. Only the extension is changed; the rest of
    the file name keeps its original case. A file is left untouched (and
    reported) when a different file with the target name already exists, so no
    image is overwritten by the rename.

    Args:
        base_dir: Root directory to walk. A missing directory yields no files,
            so nothing is renamed.
    """
    upper_ext = ".JPG"
    for root, _dirs, files in os.walk(base_dir):
        for filename in files:
            if not filename.endswith(upper_ext):
                continue

            old_file = os.path.join(root, filename)
            new_file = os.path.join(root, filename[:-len(upper_ext)] + ".jpg")

            # On case-insensitive filesystems the target "exists" because it
            # is the very file being renamed; samefile tells the two apart.
            if os.path.exists(new_file) and not os.path.samefile(old_file, new_file):
                print(f"Skipped {old_file}: {new_file} already exists")
                continue

            os.rename(old_file, new_file)

# Normalise extensions in the folder the split step wrote to. os.walk yields
# nothing for a missing path, so a path that does not match output_dir would
# make this cell do nothing without any error.
base_dir = output_dir
convert_jpg_extensions(base_dir)