# NOTEBOOK INI ADALAH BAGIAN DI MANA DATASET BERUPA GAMBAR AKAN DIEKSPOR MENJADI FILE HDF5 (.h5)

catatan awal:

> untuk file .h5
*     header untuk data 'X' : "x_images"
*     header untuk data 'y' : "y_labels"

> kode label
0. ceplok
1. kawung
2. lereng
3. nitik
4. parang
5. semen
6. lung-lungan

In [6]:
import h5py
import numpy as np
from PIL import Image
import os, os.path, shutil 
import random
from sklearn.utils import shuffle
import imageio

# Function Definitions

In [7]:
def download_all_images(folder_dir):
    """ Load every image found in the class sub-folders of a directory.

        Despite the name, nothing is downloaded: images are read from the
        local file system. Each immediate sub-folder of ``folder_dir`` is
        treated as one class, and the sub-folder name is used as the label
        of every image inside it. Each image is converted to RGB and
        resized to 224x224 before being turned into an array.

        Parameters:
        ---------------
        folder_dir        String, directory that contains one sub-folder
                          per class (a trailing path separator is optional)

        Returns:
        ----------
        x_images_array   list of N arrays of shape (224, 224, 3)
        y_images_array   list of N labels (the sub-folder names, as str)
    """
    x_images_array = []
    y_images_array = []
    # Sort both listings: os.listdir() order is arbitrary, and callers that
    # shuffle with a fixed seed need a stable input order to be reproducible.
    for folder_image in sorted(os.listdir(folder_dir)):
        class_dir = os.path.join(folder_dir, folder_image)
        # Stray files next to the class folders are not classes; skip them
        # instead of crashing in os.listdir() below.
        if not os.path.isdir(class_dir):
            continue
        for filename in sorted(os.listdir(class_dir)):
            # Image.open() keeps the file handle open until the image is
            # closed; the context manager releases it once pixels are copied.
            with Image.open(os.path.join(class_dir, filename)) as img:
                arr_img = np.array(img.convert('RGB').resize((224, 224)))
            x_images_array.append(arr_img)
            y_images_array.append(folder_image)
            print(f"working on {folder_image}/{filename}: {np.shape(arr_img)}")

    print("Done. retrieve " + str(len(x_images_array)) + " datas")
    return x_images_array, y_images_array

In [8]:
def store_many_hdf5(file_location, images, labels):
    """ Store images and their labels in a new HDF5 file.

        The file is written to ``<file_location>.h5`` (an existing file is
        overwritten) with two datasets: "x_images" and "y_labels". Both
        are stored as unsigned 8-bit integers.

        Parameters:
        ---------------
        file_location    path to the HDF5 file without the ".h5" suffix, string
        images           images to be stored, array-like with one entry per image
        labels           integer label codes to be stored, one per image
    """
    num_images = len(images)

    # Convert once up front; the inputs may be plain Python lists and would
    # otherwise be turned into arrays separately for the shape and the data.
    image_data = np.asarray(images)
    label_data = np.asarray(labels)

    # The context manager closes the file even if a dataset cannot be
    # created, so a failed run does not leave the file handle open.
    with h5py.File(f"{file_location}.h5", "w") as file:
        file.create_dataset(
            "x_images", image_data.shape, h5py.h5t.STD_U8BE, data=image_data
        )
        file.create_dataset(
            "y_labels", label_data.shape, h5py.h5t.STD_U8BE, data=label_data
        )
    print(f"{num_images} datas has been stored to HDF5")

In [9]:
def split_train_val(origin_dir, dest_dir, num_val=220):
    """
        Split image files into train and validation set.

        For every class sub-folder of ``origin_dir``, ``num_val`` randomly
        chosen files are MOVED into a sub-folder of the same name under
        ``dest_dir``; the remaining files stay where they are.

        Parameters:
        ---------------
        origin_dir --> origin directory that contains the whole dataset,
                       one sub-folder per class
        dest_dir --> destination directory that receives the VALIDATION
                     files (class sub-folders are created when missing)
        num_val --> number of files to move per class (default 220)

        Returns:
        ----------
        NONE

        Raises:
        ----------
        ValueError when a class folder holds fewer than ``num_val`` files
    """
    for folder_image in os.listdir(origin_dir):
        src_dir = os.path.join(origin_dir, folder_image)
        # Only directories are classes; ignore stray files in origin_dir.
        if not os.path.isdir(src_dir):
            continue
        dst_dir = os.path.join(dest_dir, folder_image)

        # Sample from the folder being split (not from a fixed location),
        # and fail with a clear message before anything is moved.
        candidates = os.listdir(src_dir)
        if len(candidates) < num_val:
            raise ValueError(
                f"folder {folder_image} holds {len(candidates)} files, "
                f"cannot move {num_val} of them"
            )

        os.makedirs(dst_dir, exist_ok=True)
        for filename in random.sample(candidates, num_val):
            shutil.move(os.path.join(src_dir, filename),
                        os.path.join(dst_dir, filename))

        print(f"SIZE ORIGIN FOLDER {folder_image}: {str(np.shape(os.listdir(src_dir)))}")
        print(f"SIZE DEST FOLDER {folder_image}: {str(np.shape(os.listdir(dst_dir)))}")

# Retrieve dataset and store to HDF5

In [11]:
def create_h5_dataset(source_image, h5_destination, h5_filename):
    """
    Load the images of a directory as X and y data, then save them to an h5 file.

    The labels are converted to integers, and images and labels are
    shuffled together with a fixed seed before being stored.

    Parameters:
    source_image -> String, image directory (one sub-folder per label code)
    h5_destination -> String, h5 destination directory (created when missing)
    h5_filename -> String, h5 filename without the ".h5" suffix

    Return:
    None
    """

    print("Working...")

    # Make sure the output directory exists BEFORE the slow image loading,
    # so a missing folder does not waste the whole run.
    if h5_destination:
        os.makedirs(h5_destination, exist_ok=True)

    X, y = download_all_images(source_image)

    # labels arrive as folder names (strings); store them as integer codes
    y = np.array(y).astype(int)

    # shuffle images and labels in unison; fixed seed keeps runs comparable
    X, y = shuffle(X, y, random_state=0)

    # print shape
    print(f"X Shape: {np.shape(X)}\ny Shape: {np.shape(y)}")

    # store dataset; os.path.join works with or without a trailing separator
    dest_location = os.path.join(h5_destination, h5_filename)
    store_many_hdf5(dest_location, X, y)

    print("DONE")

In [None]:
# Export the training images to C:/h5_file/train_geometri_augmAllFeatures_600.h5
source = "C:\\Training2\\"
dest = "C:/h5_file/"
filename = "train_geometri_augmAllFeatures_600"
create_h5_dataset(source, dest, filename)

# Export the validation images to C:/h5_file/val_geometri_augmAllFeatures_100.h5
# NOTE: the drive letter must be followed by a backslash; "C:Testing2" would be
# resolved relative to the current directory of drive C instead of its root.
source = "C:\\Testing2\\"
dest = "C:/h5_file/"
filename = "val_geometri_augmAllFeatures_100"
create_h5_dataset(source, dest, filename)