# Loading image files and saving them into an HDF5 file

The data was downloaded from https://www.kaggle.com/c/cifar-10/data
and is stored under '../data', relative to this notebook.

The CIFAR-10 data consists of 60,000 32x32 color images in 10 classes, with 6000 images per class.
The train folder contains 50,000 images.
The test folder contains 10,000 images.


In [7]:
import glob
import pandas as pd
import os


image_path  = '../data/cifar-10/'
image_files = '*.png'
labels_file = 'trainLabels.csv'


# Read the image paths and the id -> label table
images  = glob.glob(image_path + 'train/' + image_files)
labels  = pd.read_csv(image_path + labels_file)

# Match the labels to the image paths.
# Every image is named '<id>.png'. Build the id -> label lookup once so each
# image costs a single dict access instead of a filter over the whole table.
# drop_duplicates keeps the first row per id, matching the previous
# "first match wins" behaviour. An image whose id is missing from the CSV
# raises KeyError.
label_by_id = dict(
    labels.drop_duplicates(subset='id')[['id', 'label']].itertuples(index=False, name=None)
)
labels_sorted = [
    label_by_id[int(os.path.splitext(os.path.basename(img))[0])]
    for img in images
]



In [8]:
# Report how many image files were found in the train folder
n_train_files = len(images)
print("No of images in train folder {}".format(n_train_files))

No of images in train folder 50000


In [9]:
import pandas as pd

# Turn the label strings into integer class ids:
# one-hot encode them (one column per distinct label), then take the
# position of the hot column in each row as that row's class id.
one_hot_labels = pd.get_dummies(labels_sorted)
labels_coded = one_hot_labels.values.argmax(axis=1)

In [10]:
from random import shuffle

from random import Random

# Fixed seed so the shuffle, and therefore the train/test split, is identical
# on every Restart & Run All.
SPLIT_SEED = 42

## Shuffle the data set
# Order the (path, label) pairs by path first: glob returns files in an
# arbitrary, filesystem-dependent order, which would defeat the fixed seed.
sh = sorted(zip(images, labels_coded), key=lambda pair: pair[0])
Random(SPLIT_SEED).shuffle(sh)
imgs, lbls = zip(*sh)

## Create train data set
train_size = 0.95
# Index of the first held-out sample; computed once so images and labels
# are guaranteed to be cut at the same position.
split_at = int(train_size * len(imgs))
train_images = imgs[:split_at]
train_labels = lbls[:split_at]

## create test data set (the remaining share of the labelled train folder)
test_images = imgs[split_at:]
test_labels = lbls[split_at:]





In [11]:
# Show how many samples ended up in each split
n_train, n_test = len(train_images), len(test_images)
print("Our training set size {}".format(n_train))
print("Our test set size {}".format(n_test))




Our training set size 47500
Our test set size 2500


# Create HDF5 file

In [17]:
import numpy as np
import h5py

# Image width x height is 32 x 32, with 3 channels
train_shape = (len(train_images), 32, 32, 3)
test_shape  = (len(test_images), 32, 32, 3)


hdf5_path = '../data/cifar-10/hdf5/cifar_10.h5'

# h5py does not create missing parent directories, so make sure the target
# folder exists before opening the file.
os.makedirs(os.path.dirname(hdf5_path), exist_ok=True)

# Open the HDF5 file for writing (any existing file is truncated) and
# pre-allocate one dataset per array we want to store.
hdf5_file = h5py.File(hdf5_path, mode = 'w')

# Decoded 8-bit images hold unsigned values 0..255, which do not fit in a
# signed int8 (max 127); store them as uint8 so bright pixels survive.
hdf5_file.create_dataset("train_images", train_shape, np.uint8)
hdf5_file.create_dataset("test_images", test_shape, np.uint8)

# Per-pixel mean of the training images; filled in after all images are read
hdf5_file.create_dataset("train_mean", train_shape[1:], np.float32)

# Labels are small integer class ids, so int8 is sufficient
hdf5_file.create_dataset("train_labels", (len(train_labels),), np.int8)
hdf5_file["train_labels"][...] = train_labels
hdf5_file.create_dataset("test_labels", (len(test_labels),), np.int8)
hdf5_file["test_labels"][...] = test_labels






## Load and save the images

In [18]:
import cv2
def _load_image(path):
    """Read one image file with OpenCV.

    Returns the decoded pixel array. OpenCV delivers colour channels in
    BGR order, and the array is stored as-is (no conversion to RGB).

    Raises IOError if the file is missing or cannot be decoded.
    """
    image = cv2.imread(path)
    # cv2.imread reports failure by returning None instead of raising
    if image is None:
        raise IOError("Could not read image {}".format(path))
    return image


# Look the datasets up once instead of on every loop iteration
train_dset = hdf5_file["train_images"]
test_dset = hdf5_file["test_images"]

# Running per-pixel mean of the training images
mean = np.zeros(train_shape[1:], np.float32)
n_train = float(len(train_images))

## Load train image
for i, path in enumerate(train_images):
    if i % 1000 == 0 and i > 0:
        print("Train data {}/{}".format(i, len(train_images)))

    image = _load_image(path)
    train_dset[i, ...] = image
    # Each image contributes 1/N of its pixel values to the mean
    mean += image / n_train

## Load test image
for i, path in enumerate(test_images):
    if i % 1000 == 0 and i > 0:
        print("Test data {}/{}".format(i, len(test_images)))

    # Read from test_images: the previous version indexed train_images here,
    # which silently filled the test dataset with training pictures.
    test_dset[i, ...] = _load_image(path)


    




Train data 1000/47500
Train data 2000/47500
Train data 3000/47500
Train data 4000/47500
Train data 5000/47500
Train data 6000/47500
Train data 7000/47500
Train data 8000/47500
Train data 9000/47500
Train data 10000/47500
Train data 11000/47500
Train data 12000/47500
Train data 13000/47500
Train data 14000/47500
Train data 15000/47500
Train data 16000/47500
Train data 17000/47500
Train data 18000/47500
Train data 19000/47500
Train data 20000/47500
Train data 21000/47500
Train data 22000/47500
Train data 23000/47500
Train data 24000/47500
Train data 25000/47500
Train data 26000/47500
Train data 27000/47500
Train data 28000/47500
Train data 29000/47500
Train data 30000/47500
Train data 31000/47500
Train data 32000/47500
Train data 33000/47500
Train data 34000/47500
Train data 35000/47500
Train data 36000/47500
Train data 37000/47500
Train data 38000/47500
Train data 39000/47500
Train data 40000/47500
Train data 41000/47500
Train data 42000/47500
Train data 43000/47500
Train data 44000/475

In [19]:
# Store the accumulated per-pixel training mean, then close the file
train_mean_dset = hdf5_file["train_mean"]
train_mean_dset[...] = mean
hdf5_file.close()