In [1]:
def read_imgdata_label(hdf5_path, image_path, label_pattern, shuffle_data=True):
    '''
    Collect the image files matching a glob and give each one a binary label.

    An image is labelled 1 when the part of its file name before the first
    "." matches ``label_pattern`` (fnmatch-style wildcards), otherwise 0.

    Parameters
    ----------
    hdf5_path : str
        Not used by this function; kept in the signature so existing callers
        keep working. Example: '/home/santosh/test/ML/dog_dataset.hdf5'
    image_path : str
        Glob pattern selecting the image files.
        Example: '/home/santosh/test/ML/images/merged_images/*.jpg'
    label_pattern : str
        fnmatch pattern tested against the file-name prefix. Example: 'car*'
    shuffle_data : bool
        When True, paths and labels are shuffled together so that each path
        keeps its own label.

    Returns
    -------
    (imgpath, labels)
        Two equally long sequences: the file paths and their 0/1 labels.
        They are tuples when ``shuffle_data`` is True and lists otherwise.
    '''
    # Read the images and labels from a given folder
    imgpath = glob.glob(image_path)

    labels = []
    for path in imgpath:
        # basename() copes with every separator the platform accepts
        # (e.g. both "/" and "\\" on Windows), unlike splitting on os.path.sep.
        prefix = os.path.basename(path).split(".")[0]
        labels.append(1 if fnmatch.fnmatch(prefix, label_pattern) else 0)

    # Shuffle the data
    if shuffle_data:
        if not imgpath:
            # zip(*[]) produces nothing to unpack; an empty match is a valid
            # (if unhelpful) result, so return empty sequences instead of
            # failing with an obscure ValueError.
            return (), ()
        paired = list(zip(imgpath, labels))
        shuffle(paired)
        imgpath, labels = zip(*paired)
    return imgpath, labels

In [2]:
def slice_data(imgpath, labels, slicep=(60, 20, 20)):
    '''
    Split image paths and labels into consecutive train/validation/test parts.

    Parameters
    ----------
    imgpath : sequence
        Image paths. Its length determines the split boundaries.
    labels : sequence
        Labels aligned with ``imgpath``; both are cut at the same indices.
    slicep : tuple
        Percentages ``(train, validation, test)``. Only the first two are
        read: the test part is whatever remains after the train and
        validation parts, so ``slicep[2]`` is implied rather than enforced.

    Returns
    -------
    dict
        Keys 'train_imgdata', 'train_labels', 'val_imgdata', 'val_labels',
        'test_imgdata' and 'test_labels', each holding a slice of the
        corresponding input.
    '''
    total = len(imgpath)

    # Integer arithmetic avoids float truncation such as
    # int(0.29 * 100) == 28, which silently shifted items between splits.
    # The outer int() keeps non-integer percentages usable as slice indices.
    train_end = int(total * slicep[0] // 100)
    val_end = int(total * (slicep[0] + slicep[1]) // 100)

    # Paths and labels share one pair of boundaries so they stay aligned.
    imgvector = {
        'train_imgdata': imgpath[:train_end],
        'train_labels': labels[:train_end],
        'val_imgdata': imgpath[train_end:val_end],
        'val_labels': labels[train_end:val_end],
        'test_imgdata': imgpath[val_end:],
        'test_labels': labels[val_end:],
    }
    return imgvector


In [3]:
def create_hdf5(imgvector, data_order='tf', out_path=None):
    '''
    Create the HDF5 file layout for the train/validation/test splits.

    Image datasets ("train_img", "val_img", "test_img") and "train_mean" are
    only allocated here; nothing is written into them. The label datasets
    ("train_labels", "val_labels", "test_labels") are created and filled
    from ``imgvector``.

    Parameters
    ----------
    imgvector : dict
        Must provide 'train_imgdata', 'val_imgdata', 'test_imgdata' (their
        lengths size the datasets) and 'train_labels', 'val_labels',
        'test_labels' (written to the file).
    data_order : str
        'th' for Theano (3, 224, 224 per image) or 'tf' for Tensorflow
        (224, 224, 3 per image).
    out_path : str, optional
        Where to write the file. When omitted, the module-level
        ``hdf5_path`` variable is used, as before.

    Raises
    ------
    ValueError
        If ``data_order`` is neither 'th' nor 'tf'. This is checked before
        the output file is opened, so an existing file is not truncated by
        a call that cannot succeed.
    '''
    # check the order of data and chose proper data shape to save images
    if data_order == 'th':
        img_dims = (3, 224, 224)
    elif data_order == 'tf':
        img_dims = (224, 224, 3)
    else:
        raise ValueError(
            "data_order must be 'th' or 'tf', got %r" % (data_order,))

    # Backward-compatible fallback to the global the notebook defines.
    target = hdf5_path if out_path is None else out_path
    splits = ('train', 'val', 'test')

    # The context manager closes the file even if a dataset call fails.
    with h5py.File(target, mode='w') as hdf5_file:
        for split in splits:
            count = len(imgvector[split + '_imgdata'])
            # uint8, not int8: 8-bit pixel values span 0..255, which a
            # signed 8-bit dataset cannot represent.
            # NOTE(review): assumes the images stored later are 8-bit
            # (e.g. as loaded by cv2) -- confirm against the writer code.
            hdf5_file.create_dataset(split + '_img', (count,) + img_dims, np.uint8)

        hdf5_file.create_dataset("train_mean", img_dims, np.float32)

        for split in splits:
            count = len(imgvector[split + '_imgdata'])
            hdf5_file.create_dataset(split + '_labels', (count,), np.int8)
            hdf5_file[split + '_labels'][...] = imgvector[split + '_labels']

In [4]:
from random import shuffle
import glob
import fnmatch
import os
import matplotlib.pyplot as plt
import cv2
import h5py
import numpy as np

if __name__ == '__main__':
    # --- Run configuration -------------------------------------------------
    # File names whose prefix matches this pattern are labelled 1, others 0.
    label_pattern = 'car*'
    # Glob selecting the input images.
    image_path = 'G:/ML/images/merged_images/*.jpg'
    # Output file; create_hdf5 reads this module-level name.
    hdf5_path = 'G:/ML/dog_dataset.hdf5'
    # Shuffle the images (together with their labels) before splitting.
    shuffle_data = True

    # --- Pipeline: list + label -> split -> write HDF5 skeleton -------------
    imgpath, labels = read_imgdata_label(
        hdf5_path=hdf5_path,
        image_path=image_path,
        label_pattern=label_pattern,
        shuffle_data=shuffle_data,
    )
    imgvector = slice_data(imgpath, labels, slicep=(60, 20, 20))
    create_hdf5(imgvector, data_order='tf')
    