# Saving and Loading Image Files with an HDF5 File

The data was downloaded from https://www.kaggle.com/c/cifar-10/data
and is stored under `../data/cifar-10/`, relative to this notebook.

The CIFAR-10 data consists of 60,000 32x32 color images in 10 classes, with 6,000 images per class.

 * The train folder contains 50,000 images.
 * The test folder contains 10,000 images.


In [None]:
import glob
import pandas as pd
import os


image_path  = '../data/cifar-10/'
image_files = '*.png'
labels_file = 'trainLabels.csv'


# Read the image paths and labels.
# glob returns matches in an arbitrary, filesystem-dependent order, so sort them
# to get the same listing on every run and every machine.
images  = sorted(glob.glob(image_path + 'train/' + image_files))
labels  = pd.read_csv(image_path + labels_file)

# Match the labels to the image paths.
# Each file is named '<id>.png'. Build the id -> label lookup once instead of
# filtering the whole DataFrame for every image (which is quadratic in the number
# of images). drop_duplicates keeps the first row per id, as the original
# `.values[0]` lookup did. An id missing from the CSV raises KeyError.
label_by_id = labels.drop_duplicates('id').set_index('id')['label'].to_dict()
labels_sorted = [label_by_id[int(os.path.splitext(os.path.basename(img))[0])] for img in images]



In [None]:
# Report how many image files the glob over the train folder matched.
n_train_files = len(images)
print("No of images in train folder {}".format(n_train_files))

In [None]:
import pandas as pd

# Encode the label strings as integers.
# factorize(sort=True) numbers the classes in sorted order - the same numbering the
# one-hot + argmax round trip produced - without building a dense one-hot matrix.
# It also returns the class names, so `label_names[code]` recovers the original
# string, and it handles an empty label list instead of raising.
labels_coded, label_names = pd.factorize(pd.Series(labels_sorted), sort=True)

In [None]:
from random import Random, shuffle  # `shuffle` stays importable for any cell that still uses it

## Shuffle the data set
# A dedicated, seeded generator makes the split reproducible across kernel restarts
# (given the same ordering of `images`) and leaves the global random state untouched.
SPLIT_SEED = 42
sh = list(zip(images, labels_coded))
if not sh:
    # zip(*[]) cannot be unpacked into two names; fail with an explicit message instead.
    raise ValueError("No images found to split; check that the train folder is populated")
Random(SPLIT_SEED).shuffle(sh)
imgs, lbls = zip(*sh)

## Create train data set
# Fraction of the shuffled samples kept for training; the remainder is held out.
train_size = 0.95
split_at = int(train_size * len(imgs))  # computed once so both halves share one boundary
train_images = imgs[:split_at]
train_labels = lbls[:split_at]

## create test data set
test_images = imgs[split_at:]
test_labels = lbls[split_at:]





In [None]:
# Report the sizes of the two partitions produced by the split.
for template, subset in (("Our training set size {}", train_images),
                         ("Our test set size {}", test_images)):
    print(template.format(len(subset)))




# Create HDF5 file

In [None]:
import numpy as np
import h5py

import os

# Image width x height is 32 x 32, with 3 channels
train_shape = (len(train_images), 32, 32, 3)
test_shape  = (len(test_images), 32, 32, 3)


hdf5_path = '../data/cifar-10/hdf5/cifar_10.h5'

# h5py does not create missing parent directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(hdf5_path), exist_ok=True)

# Open the HDF5 file for writing (mode 'w' replaces an existing file) and create the datasets
hdf5_file = h5py.File(hdf5_path, mode = 'w')

# 8-bit pixel values span 0..255, which does not fit in a signed int8; store them unsigned.
hdf5_file.create_dataset("train_images", train_shape, np.uint8)
hdf5_file.create_dataset("test_images", test_shape, np.uint8)

# Per-pixel mean of the training images; one value per (row, column, channel).
hdf5_file.create_dataset("train_mean", train_shape[1:], np.float32)

# Labels are integer class codes for the 10 classes, so int8 is sufficient.
# They are already in memory, so write them immediately.
hdf5_file.create_dataset("train_labels", (len(train_labels),), np.int8)
hdf5_file["train_labels"][...] = train_labels
hdf5_file.create_dataset("test_labels", (len(test_labels),), np.int8)
hdf5_file["test_labels"][...] = test_labels






## Load and save the images

In [None]:
import cv2
def _read_image(path):
    """Read one image file with OpenCV, failing loudly if it cannot be decoded.

    cv2.imread reports a missing or unreadable file by returning None rather than
    raising, which would otherwise surface later as an obscure error.
    Channels are returned in OpenCV's BGR order and are stored as-is.

    Raises:
        IOError: if the file at `path` could not be read.
    """
    image = cv2.imread(path)
    if image is None:
        raise IOError("Could not read image file {}".format(path))
    return image


# Running per-pixel mean of the training images, accumulated in float32.
mean = np.zeros(train_shape[1:], np.float32)

## Load train image
train_ds = hdf5_file["train_images"]  # look the dataset up once, not once per image
for i, path in enumerate(train_images):
    if i % 1000 == 0 and i > 0:
        print("Train data {}/{}".format(i, len(train_images)))

    image = _read_image(path)
    train_ds[i, ...] = image
    # Each image contributes 1/N of itself, so `mean` ends up as the average image.
    mean += image / float(len(train_images))

## Load test image
test_ds = hdf5_file["test_images"]
for i, path in enumerate(test_images):
    if i % 1000 == 0 and i > 0:
        print("Test data {}/{}".format(i, len(test_images)))

    # Read from the held-out paths; indexing train_images here would fill the
    # test set with training images.
    test_ds[i, ...] = _read_image(path)


    




In [None]:
# Write the accumulated per-pixel training mean into its dataset, then close the file.
mean_ds = hdf5_file["train_mean"]
mean_ds[...] = mean
hdf5_file.close()

## Access HDF5 File for Processing
 * Using Adience data
 * Feed into a `keras.preprocessing.image.ImageDataGenerator`

In [None]:
import h5py
# Location of the Adience HDF5 file opened in the next cell.
# NOTE(review): this rebinds `hdf5_path`, which earlier in the notebook pointed at the CIFAR-10 file.
hdf5_path = '../../data/Adience/hdf5/adience.h5'

In [None]:
# Open the file read-only.
f = h5py.File(hdf5_path, "r")

## Take the three top-level objects positionally, in the order the file yields them.
## This only works while the file holds exactly three top-level entries. Looking each
## one up by name, f["<name>"], is the more robust way - TODO confirm the names stored
## in this file before switching.
train_images, train_labels, train_mean = f.values()

In [None]:
# Echo `train_images` as the cell output; nothing is sliced or loaded here.
train_images

In [None]:
# Peek at the first five labels.
train_labels[:5]

In [None]:
from keras.preprocessing.image import ImageDataGenerator

In [None]:
# Every option is spelled out explicitly. Per-sample centering is the only
# transformation switched on; all other flags are off and all ranges are zero.
generator_options = {
    # normalization
    "featurewise_center": False,
    "samplewise_center": True,
    "featurewise_std_normalization": False,
    "samplewise_std_normalization": False,
    "zca_whitening": False,
    "zca_epsilon": 1e-6,
    # geometric / colour augmentation
    "rotation_range": 0.,
    "width_shift_range": 0.,
    "height_shift_range": 0.,
    "shear_range": 0.,
    "zoom_range": 0.,
    "channel_shift_range": 0.,
    "fill_mode": 'nearest',
    "cval": 0.,
    "horizontal_flip": False,
    "vertical_flip": False,
    # value scaling / custom hook
    "rescale": None,
    "preprocessing_function": None,
    # layout of the image arrays
    "data_format": "channels_last",
}
generator = ImageDataGenerator(**generator_options)

In [None]:
## Fit the generator on a small sample (the first 50 images).
## NOTE(review): per the Keras documentation, fit() is only required when
## featurewise_center, featurewise_std_normalization or zca_whitening is enabled.
## All three are False in the generator configured in this notebook, so this call
## is likely unnecessary - confirm before removing.
fit_batch = train_images[:50]    # test first 50
generator.fit(fit_batch)

In [None]:
# Build a batch iterator over the first 50 image/label pairs.
subset_images = train_images[:50]  # test first 50
subset_labels = train_labels[:50]
_gen = generator.flow(subset_images, subset_labels)

In [None]:
# Pull one batch from the iterator and display its first element.
sample = next(_gen)
sample[0]

In [None]:
## NOTE(review): len(sample) counts the elements of the object the iterator returned,
## not image dimensions. Presumably `sample` is an (images, labels) pair, because
## flow() was given both arrays, which would make this 2. Inspect sample[0].shape
## for the batch and image dimensions - TODO confirm.
len(sample)