In [2]:
from random import shuffle
import glob


In [3]:
shuffle_data = True  # shuffle the addresses before saving
hdf5_path = 'train/dataset.hdf5'  # address to where you want to save the hdf5 file
cat_dog_train_path = 'train/*.jpg'


In [4]:
# read addresses and labels from the 'train' folder
addrs = glob.glob(cat_dog_train_path)
labels = [1 if 'cat' in addr else 0 for addr in addrs]  # 0 = Cat, 1 = Dog


In [5]:
labels

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [6]:
# Divide the hata into 60% train, 20% validation, and 20% test
#train_addrs = addrs[0:int(0.6*len(addrs))]
#train_labels = labels[0:int(0.6*len(labels))]

# Divide the data into 100% train
train_addrs = addrs[0:int(len(addrs))]
train_labels = labels[0:int(len(labels))]

print(train_addrs)
print(train_labels)


['train\\cat.0.jpg', 'train\\cat.1.jpg', 'train\\cat.10.jpg', 'train\\cat.2.jpg', 'train\\cat.3.jpg', 'train\\cat.4.jpg', 'train\\cat.5.jpg', 'train\\cat.6.jpg', 'train\\cat.7.jpg', 'train\\cat.8.jpg', 'train\\cat.9.jpg', 'train\\dog.0.jpg', 'train\\dog.1.jpg', 'train\\dog.10.jpg', 'train\\dog.2.jpg', 'train\\dog.3.jpg', 'train\\dog.4.jpg', 'train\\dog.5.jpg', 'train\\dog.6.jpg', 'train\\dog.7.jpg', 'train\\dog.8.jpg', 'train\\dog.9.jpg']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [7]:
train_shape = (len(train_addrs), 224, 224, 3)
print(train_shape)

(22, 224, 224, 3)


In [8]:
import numpy as np
import h5py

# open a hdf5 file and create arrays
hdf5_file = h5py.File(hdf5_path, mode='w')
print(hdf5_file)

<HDF5 file "dataset.hdf5" (mode r+)>


In [9]:
hdf5_file.create_dataset("train_img", train_shape, np.int8)

<HDF5 dataset "train_img": shape (22, 224, 224, 3), type "|i1">

In [10]:
hdf5_file.create_dataset("train_labels", (len(train_addrs),), np.int8)
hdf5_file["train_labels"][...] = train_labels

### Now, it's time to read images one by one, apply preprocessing (only resize in our code) and then save it.

In [15]:
import cv2
# loop over train addresses
for i in range(len(train_addrs)):
    # print how many images are saved every 1000 images
    if i % 1000 == 0 and i > 1:
        print ('Train data: {}/{}'.format(i, len(train_addrs)))
    # read an image and resize to (224, 224)
    # cv2 load images as BGR, convert it to RGB
    addr = train_addrs[i]
    img = cv2.imread(addr)
    img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # add any image pre-processing here
    # if the data order is Theano, axis orders should change
    #if data_order == 'th':
    #    img = np.rollaxis(img, 2)
    # save the image and calculate the mean so far
    hdf5_file["train_img"][i, ...] = img[None]
    #mean += img / float(len(train_labels))

hdf5_file.close()

### Open the HDF5 for read

In [24]:
import h5py
import numpy as np
hdf5_path = 'train/dataset.hdf5'
subtract_mean = False
# open the hdf5 file
hdf5_file = h5py.File(hdf5_path, "r")
# subtract the training mean
if subtract_mean:
    mm = hdf5_file["train_mean"][0, ...]
    mm = mm[np.newaxis, ...]
# Total number of samples
data_num = hdf5_file["train_img"].shape[0]
data_shape = hdf5_file["train_img"].shape
print(' Number of Samples ',data_num)
print(' Samples shape ',data_shape)

label_num = hdf5_file["train_labels"].shape

print(' Lables shape ',label_num)
print('Lables:')
print(hdf5_file["train_labels"])




hdf5_file.close()

 Number of Samples  22
 Samples shape  (22, 224, 224, 3)
 Lables shape  (22,)
Lables:
<HDF5 dataset "train_labels": shape (22,), type "|i1">
