In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import os

# Global Variables
RAW_DATASET_PATH = "/content/drive/MyDrive/machine_learning/raw_dataset"
SPLIT_DATASET_PATH = "/content/drive/MyDrive/machine_learning/split_dataset"
TRAIN_PATH = os.path.join(SPLIT_DATASET_PATH, 'train')
VAL_PATH = os.path.join(SPLIT_DATASET_PATH, 'val')
SPICES_LIST = os.listdir(RAW_DATASET_PATH)
NUM_OF_SPICES = len(SPICES_LIST)

print('Spices total:', NUM_OF_SPICES)


Spices total: 25


In [6]:
# Function to count the number of images for each spice
def count_spices_images(dataset_path):
    index = 1
    total_images = 0
    print('-= Number of Images for Each Spice =-')
    for spices in SPICES_LIST:
        spices_path = os.path.join(dataset_path, spices)
        num_images = len(os.listdir(spices_path))
        print('{:2}. {:16} : {}'.format(index, spices, num_images))
        index += 1
        total_images += num_images
    print('\nTotal images:', total_images)

# Call the function with the raw dataset path
count_spices_images(RAW_DATASET_PATH)

-= Number of Images for Each Spice =-
 1. kapulaga         : 100
 2. biji_ketumbar    : 49
 3. lada             : 100
 4. serai            : 99
 5. kunyit           : 100
 6. lengkuas         : 128
 7. kemiri           : 99
 8. andaliman        : 90
 9. daun_ketumbar    : 100
10. cabai            : 100
11. bawang_putih     : 92
12. cengkeh          : 123
13. adas_bintang     : 108
14. temulawak        : 100
15. jahe             : 100
16. daun_salam       : 99
17. biji_adas        : 100
18. wijen            : 100
19. asam_jawa        : 89
20. pala             : 99
21. kayu_manis       : 142
22. vanili           : 110
23. kayu_secang      : 101
24. bawang_merah     : 112
25. kencur           : 100

Total images: 2540


In [7]:
# Clearing the Train-Val directory
from shutil import rmtree

def CleanDatasetDirectory(clean_data_path, train_path, val_path):  # Exclude the test_path parameter
    if os.path.exists(clean_data_path):  # Remove the old directory
        rmtree(clean_data_path)

    # Create new empty directories
    for spice in SPICES_LIST:
        train_spice_path = os.path.join(train_path, spice)
        os.makedirs(train_spice_path)

        val_spice_path = os.path.join(val_path, spice)
        os.makedirs(val_spice_path)

    print('Finished emptying old data.')

In [8]:
# Resizing data and saving in a temporary folder
from PIL import Image

def ResizeAndSaveData(source, dest, spices_name, image_size):
    os.makedirs(dest, exist_ok=True)
    count = 0
    zeros_padding = 4

    for spice_image in os.listdir(source):
        image_path = os.path.join(source, spice_image)
        img = Image.open(image_path).convert('RGB')

        # Generate a new JPEG name with zero-padding
        jpeg_name = spices_name + str(count).zfill(zeros_padding) + ".jpeg"
        image_dest_path = os.path.join(dest, jpeg_name)

        # Resize the image and save it to the destination
        img.resize(image_size).save(image_dest_path)

        count += 1

    print('Data resizing and saving completed.')

In [9]:
# Data Splitting Function (still Train-Dev Split using SPLIT_SIZE)
import random
from shutil import move

def SplitDataAndPrint(SOURCE, TRAINING, VALIDATION, SPLIT_SIZE, SPICES_NAME):  # You can use dev_size / test_size later

    # Get a list of directories in the source folder
    dir_list = os.listdir(SOURCE)
    randomized_dir_list = random.sample(dir_list, len(dir_list))

    # Remove 0 size images
    final_list = []
    for filename in randomized_dir_list:
        fullpath = os.path.join(SOURCE, filename)
        if os.path.getsize(fullpath) != 0:
            final_list.append(filename)
        else:
            print("{} is zero length, so ignoring.".format(filename))

    # Start Splitting (train-dev split)
    index_split = round(SPLIT_SIZE * len(final_list))
    for filename in final_list[:index_split]:
        source = os.path.join(SOURCE, filename)
        dest = os.path.join(TRAINING, filename)
        move(source, dest)

    for filename in final_list[index_split:]:
        source = os.path.join(SOURCE, filename)
        dest = os.path.join(VALIDATION, filename)
        move(source, dest)

    print('Data splitting completed for:', SPICES_NAME)

In [10]:
# Copy files to Train-Val Directory (Train-Val later when the dataset is large)
def CreateCleanDataset(data_path, train_path, val_path, image_size, split_size):  # Exclude test_path parameter
    temp_folder_path = '/tmp/convert_images'

    for spice in SPICES_LIST:
        source_path = os.path.join(data_path, spice)
        train_spice_path = os.path.join(train_path, spice)
        val_spice_path = os.path.join(val_path, spice)
        # test_spice_path = os.path.join(test_path, spice)

        # Resize and rename data
        ResizeAndSaveData(source_path, temp_folder_path, spice, image_size)

        # Split data into training and validation sets
        SplitDataAndPrint(temp_folder_path, train_spice_path, val_spice_path, split_size, spice)

In [11]:
IMAGE_SIZE = (384, 384)
SPLIT_SIZE = 0.7

CleanDatasetDirectory(SPLIT_DATASET_PATH, TRAIN_PATH, VAL_PATH)
CreateCleanDataset(RAW_DATASET_PATH, TRAIN_PATH, VAL_PATH, IMAGE_SIZE, SPLIT_SIZE)

Finished emptying old data.
Data resizing and saving completed.
Data splitting completed for: kapulaga
Data resizing and saving completed.
Data splitting completed for: biji_ketumbar
Data resizing and saving completed.
Data splitting completed for: lada
Data resizing and saving completed.
Data splitting completed for: serai
Data resizing and saving completed.
Data splitting completed for: kunyit
Data resizing and saving completed.
Data splitting completed for: lengkuas
Data resizing and saving completed.
Data splitting completed for: kemiri
Data resizing and saving completed.
Data splitting completed for: andaliman
Data resizing and saving completed.
Data splitting completed for: daun_ketumbar
Data resizing and saving completed.
Data splitting completed for: cabai
Data resizing and saving completed.
Data splitting completed for: bawang_putih




Data resizing and saving completed.
Data splitting completed for: cengkeh
Data resizing and saving completed.
Data splitting completed for: adas_bintang
Data resizing and saving completed.
Data splitting completed for: temulawak
Data resizing and saving completed.
Data splitting completed for: jahe
Data resizing and saving completed.
Data splitting completed for: daun_salam
Data resizing and saving completed.
Data splitting completed for: biji_adas
Data resizing and saving completed.
Data splitting completed for: wijen
Data resizing and saving completed.
Data splitting completed for: asam_jawa
Data resizing and saving completed.
Data splitting completed for: pala
Data resizing and saving completed.
Data splitting completed for: kayu_manis
Data resizing and saving completed.
Data splitting completed for: vanili
Data resizing and saving completed.
Data splitting completed for: kayu_secang
Data resizing and saving completed.
Data splitting completed for: bawang_merah
Data resizing and sav

In [12]:
def CheckTotalImages(folder_name, data_path):
    total_sum = 0
    for rootdir, dirs, files in os.walk(data_path):
        for subdir in dirs:
            path = os.path.join(rootdir, subdir)
            total_sum += len(os.listdir(path))
    print('Total Images in {}: {}'.format(folder_name, total_sum))
    return total_sum

train_count = CheckTotalImages('Train', TRAIN_PATH)
dev_count = CheckTotalImages('Val', VAL_PATH)
total_count = train_count + dev_count
ratio_train = round(train_count / total_count, 4)
ratio_dev = round(dev_count / total_count, 4)
print('Total Images in Clean Dataset: {}\nTrain Ratio: {}\nDev Ratio: {}'.format(total_count, ratio_train, ratio_dev))


Total Images in Train: 1776
Total Images in Val: 764
Total Images in Clean Dataset: 2540
Train Ratio: 0.6992
Dev Ratio: 0.3008
