# Using Hobbit with a Data Generator
When you have a dataset that is too large to load into memory in its entirety, it is helpful to use a generator to stream data from your hard drive. Hobbit allows you to train using generators, and this tutorial shows how. It adapts code from the [Keras CIFAR10 Example](https://github.com/fchollet/keras/blob/master/examples/cifar10_cnn.py "Keras CIFAR10 Example").

## Storing data as HDF5
We will store the data in HDF5 format so we can read parts of the file without having to load the entire dataset. We also store the data pre-processed and shuffled for more efficiency at training time.

In [11]:
from __future__ import print_function
import keras
from keras.datasets import cifar10
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
import h5py
import numpy as np
from hobbit.algorithms import Hyperband
from hobbit import Hyperparameter

# Number of target categories used for the one-hot label encoding below.
num_classes = 10

# Fetch the dataset; it arrives shuffled and already split into train/test.
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# Turn the integer class vectors into binary (one-hot) class matrices.
y_train, y_test = (keras.utils.to_categorical(labels, num_classes)
                   for labels in (y_train, y_test))

# Cast the images to float32 and divide every value by 255.
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

x_train shape: (50000, 32, 32, 3)
50000 train samples
10000 test samples


In [None]:
path_to_hdf5 = './cifar10.h5'
# Open explicitly in 'w' (create/truncate) mode. Without a mode, h5py >= 3.0
# opens the file read-only, so create_dataset() would fail; older versions
# default to append mode, where re-running this cell fails because the
# datasets already exist.
with h5py.File(path_to_hdf5, 'w') as f:
    # One dataset per array, stored with the array's own shape and dtype so
    # slices can later be read back without loading the whole file.
    f.create_dataset(name='x_train', shape=x_train.shape, dtype=x_train.dtype, data=x_train)
    f.create_dataset(name='y_train', shape=y_train.shape, dtype=y_train.dtype, data=y_train)
    f.create_dataset(name='x_test', shape=x_test.shape, dtype=x_test.dtype, data=x_test)
    f.create_dataset(name='y_test', shape=y_test.shape, dtype=y_test.dtype, data=y_test)

## Building a generator to stream data
Let's build a simple generator for the CIFAR10 dataset. We use Python's generator syntax, where a function containing a `yield` statement returns a generator. We want the generator to return samples indefinitely.

In [6]:
def get_hdf5_generator(x, y, batch_size=100):
    """Yield ``(x_batch, y_batch)`` slices forever, cycling over the data in order.

    Args:
        x: Sliceable container of inputs (e.g. an h5py dataset or numpy array).
        y: Sliceable container of targets; ``y.shape[0]`` defines the number
            of samples.
        batch_size: Maximum number of samples per batch. The last batch of
            each pass is shorter when the sample count is not a multiple of
            ``batch_size``.

    Yields:
        Tuples ``(x[from_:to_], y[from_:to_])`` covering consecutive,
        non-overlapping ranges; after the final range the sequence restarts
        at index 0.

    Raises:
        ValueError: On the first ``next()`` call, if ``batch_size`` is smaller
            than 1 or there are no samples (either would otherwise loop
            forever without yielding, or fail with an opaque division error).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer, got %r' % (batch_size,))
    num_samples = y.shape[0]
    if num_samples == 0:
        raise ValueError('cannot generate batches from an empty dataset')
    while True:
        # Stepping the range directly avoids the float division + ceil, which
        # drops the trailing partial batch under Python 2 integer division.
        for from_ in range(0, num_samples, batch_size):
            to_ = min(from_ + batch_size, num_samples)
            yield x[from_:to_], y[from_:to_]

## The model

In [16]:
def get_model(hparams):
    """Build and compile the CIFAR10 convolutional network for one hyperparameter set.

    The input shape is taken from the module-level ``x_train`` and the size of
    the output layer from the module-level ``num_classes``.

    Args:
        hparams: Mapping with the keys ``'num_layer_1_units'`` (number of
            filters in the first convolutional stage; the second stage uses
            twice as many), ``'dropout_1'`` (dropout rate after each
            convolutional stage), ``'dropout_2'`` (dropout rate after the
            dense layer) and ``'lr'`` (RMSprop learning rate).

    Returns:
        A ``Sequential`` model compiled with categorical cross-entropy loss,
        an RMSprop optimizer and the accuracy metric.
    """
    num_filters = hparams['num_layer_1_units']

    model = Sequential()

    # First convolutional stage: two 3x3 convolutions, pooling, dropout.
    model.add(Conv2D(num_filters, (3, 3), padding='same',
                     input_shape=x_train.shape[1:]))
    model.add(Activation('relu'))
    model.add(Conv2D(num_filters, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(hparams['dropout_1']))

    # Second convolutional stage: same layout with twice as many filters.
    model.add(Conv2D(2*num_filters, (3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Conv2D(2*num_filters, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(hparams['dropout_1']))

    # Classifier head: dense layer, dropout, softmax over the classes.
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(hparams['dropout_2']))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    # Use the RMSprop class itself rather than the lowercase `rmsprop` alias:
    # the class name is available in both old and new Keras releases, whereas
    # the alias is missing from newer ones.
    opt = keras.optimizers.RMSprop(lr=hparams['lr'], decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    return model

## Setting up Hobbit

In [17]:
# Directory handed to Hyperband as `repo_dir`.
repo_dir = './cifar_repo'
# Search space: one Hyperparameter per key that get_model() reads from `hparams`.
hparam_ranges = [Hyperparameter(name='num_layer_1_units', distr_args=[np.arange(16,128,16)], distribution='choice'),
                 Hyperparameter(name='lr', distr_args=[0.00001, 0.001], distribution='log-uniform'),
                 Hyperparameter(name='dropout_1', distr_args=[0., 0.8]),
                 Hyperparameter(name='dropout_2', distr_args=[0., 0.8])]
batch_size = 100

# The file is only read from here on, so open it explicitly read-only instead
# of relying on h5py's version-dependent default mode. It must stay open while
# Hyperband runs because the generators slice the datasets lazily.
with h5py.File(path_to_hdf5, 'r') as f:
    # Integer ceiling division: one pass over the data, counting a trailing
    # partial batch. Unlike `/` followed by ceil, this is also correct under
    # Python 2 integer division.
    num_train_batches = (f['x_train'].shape[0] + batch_size - 1) // batch_size
    num_test_batches = (f['x_test'].shape[0] + batch_size - 1) // batch_size

    # Generator arguments are given once positionally (tuple) and once by
    # keyword (dict) -- presumably Hobbit accepts either form; verify against
    # the Hobbit documentation.
    hband = Hyperband(model_function=get_model,
                      hparam_ranges=hparam_ranges,
                      repo_dir=repo_dir,
                      generator_function=get_hdf5_generator,
                      train_gen_args=(f['x_train'], f['y_train'], batch_size),
                      valid_gen_args={'x': f['x_test'], 'y': f['y_test'], 'batch_size': batch_size},
                      steps_per_epoch=num_train_batches,
                      validation_steps=num_test_batches)

    tab = hband.run(R=20, eta=3)

Epoch 1/2
 71/500 [===>..........................] - ETA: 271s - loss: 2.3032 - acc: 0.1114

KeyboardInterrupt: 

In [13]:
path_to_hdf5 = './cifar10.h5'