This is an example of how to train a basic image classification model with Keras and TensorFlow, using vineyard as storage for images.
First of all, let's import everything we will need.

In [None]:
import os
import tensorflow as tf
import glob
import cv2
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Activation, MaxPooling2D, Dropout

First, create the needed directories and define some global variables such as image size, batch size and paths.

In [None]:
# Image size for rescaling. Feel free to increase image size in order to check how fast vineyard array writes and reads are.
IMAGE_SIZE = (64, 64)

# Batch size for training an image classification model.
BATCH_SIZE = 32

# Where our data live.
DATA_PATH = "data/"

# Where our images live
IMAGE_PATH = "data/flower_photos"

# Where trained models live
MODEL_PATH = "data/trained_models"

# Create the working directories. makedirs(exist_ok=True) also creates missing
# parents and does nothing when the directory is already there, so re-running
# this cell is safe and there is no window between "check" and "create".
for directory in (DATA_PATH, IMAGE_PATH, MODEL_PATH):
    os.makedirs(directory, exist_ok=True)


We have to download the flower image dataset.

In [None]:
import shutil

print("[STATUS] downloading image data...")
data_dir = tf.keras.utils.get_file(
    'flower_photos',
    'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    untar=True)

# Move the extracted folder to IMAGE_PATH. The location reported by get_file
# is used instead of a hard-coded ~/.keras path, and the move is skipped when
# IMAGE_PATH already holds data, so re-running this cell is a no-op.
# NOTE(review): assumes get_file returns the extracted 'flower_photos'
# directory when untar=True -- confirm for the installed Keras version.
if not (os.path.isdir(IMAGE_PATH) and os.listdir(IMAGE_PATH)):
    if os.path.isdir(IMAGE_PATH):
        # Remove the empty placeholder directory so the move can rename onto it.
        os.rmdir(IMAGE_PATH)
    # Unlike os.system("mv ..."), shutil.move raises if the move fails.
    shutil.move(str(data_dir), IMAGE_PATH)
print("[STATUS] downloading image data finished...")

We move on by getting the class names from the image data directory. We then perform basic one-hot encoding of the image labels.

In [None]:
# Get labels/classes: every sub-directory of IMAGE_PATH is one class.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# give each class the same index on every run and every machine.
labels = sorted(
    name for name in os.listdir(IMAGE_PATH)
    if os.path.isdir(os.path.join(IMAGE_PATH, name))
)

# Encode labels: class name -> integer index, and one one-hot row per index.
labels_dict = {name: index for index, name in enumerate(labels)}
number_of_classes = len(labels)
one_hot_encodings = np.eye(number_of_classes, dtype=np.float32)

The next step is to load the images, perform a basic preprocessing step and collect them in NumPy arrays, which are stored as vineyard arrays further below.

In [None]:
# Empty lists to hold images and labels
images = []
labels = []

print("[STATUS] image loading and basic preprocessing...")

# loop over the training data sub-folders
for current_label in labels_dict:

    class_image_paths = glob.glob(os.path.join(IMAGE_PATH, current_label, "*.jpg"))

    # The one-hot row is the same for every image of this class.
    class_encoding = one_hot_encodings[labels_dict[current_label]]

    # loop over the images per class
    for image_path in class_image_paths:

        # cv2.imread returns None (instead of raising) when a file cannot be
        # decoded; skip such files rather than crashing in cv2.resize.
        image = cv2.imread(image_path)
        if image is None:
            print("[WARNING] skipping unreadable image: {}".format(image_path))
            continue

        # Resize to a fixed size.
        # NOTE(review): cv2.resize takes (width, height); this only matters
        # if IMAGE_SIZE is ever made non-square.
        image = cv2.resize(image, IMAGE_SIZE)

        # Here is where you may add any kind of image preprocessing you need.

        # update the lists of images and labels together so they stay aligned
        images.append(image.astype(np.float32))
        labels.append(class_encoding)

    print("[STATUS] processed folder: {}".format(current_label))

# Fail with a clear message instead of np.stack's "need at least one array".
if not images:
    raise RuntimeError("no readable .jpg images found under {}".format(IMAGE_PATH))

# Create two numpy arrays with all images and all labels respectively.
images = np.stack(images, axis=0) / 255.0  # Scale channel values to 0.0 - 1.0
labels = np.stack(labels, axis=0)

print("[STATUS] completed image resizing...")

We will now split our dataset into training and validation sets.

In [None]:
# Draw one random ordering of the sample indices and apply it to both arrays,
# so that every image keeps its own label.
randomize = np.arange(images.shape[0])
np.random.shuffle(randomize)
images, labels = images[randomize], labels[randomize]

# The first 80% of the shuffled samples are used for training, the rest for validation.
train_max_indx = int(labels.shape[0] * 0.8)

train_images, validate_images = images[:train_max_indx], images[train_max_indx:]
train_labels, validate_labels = labels[:train_max_indx], labels[train_max_indx:]

# Report the shapes of the image arrays first, then of the label arrays.
for message, array in (
    ("[STATUS] train images array shape {}", train_images),
    ("[STATUS] validate images array shape {}", validate_images),
    ("[STATUS] train labels array shape {}", train_labels),
    ("[STATUS] validate labels array shape {}", validate_labels),
):
    print(message.format(array.shape))


In [None]:
### Prepare the vineyard store
import vineyard

# The socket to connect to is read from the environment; a missing
# VINEYARD_IPC_SOCKET variable raises KeyError here.
ipc_socket = os.environ['VINEYARD_IPC_SOCKET']
vineyard_store = vineyard.connect(ipc_socket)

Now is the time to store the image data and labels as vineyard arrays. Images are indexed by image id (1st dimension) and by the
two image axes (2nd and 3rd dimensions). The three colour values of a pixel are stored together as one structured record in each
array cell, and the one-hot label of an image is stored as one structured record per image. During training we load image batches
of BATCH_SIZE images, so the arrays are read in slices of BATCH_SIZE along the image-id dimension.

In [None]:
# Reinterpret the last axis (three float32 colour values) as one structured
# record per pixel. The record is stored under a field named 'rgb' because
# the batch generator in this notebook reads the data back through ['rgb'].
rgb_record = np.dtype([("rgb", [("", np.float32)] * 3)])
train_image_view = train_images.view(rgb_record)
validate_image_view = validate_images.view(rgb_record)

# put() takes the value first; the name is a keyword argument and is only
# attached to persisted objects. The names match the ones looked up when the
# arrays are read back ("train_image_array", "validate_image_array", ...).
# NOTE(review): assumes vineyard round-trips NumPy structured dtypes -- TODO confirm.
vineyard_store.put(train_image_view, persist=True, name='train_image_array')
vineyard_store.put(validate_image_view, persist=True, name='validate_image_array')

print("[STATUS] images vineyard arrays are ready.")

# One record per image holding the whole one-hot row under a field named
# 'label'. The record width follows the label array instead of assuming
# exactly five classes.
label_record = np.dtype([("label", [("", np.float32)] * train_labels.shape[1])])
train_labels_view = train_labels.view(label_record)
validate_labels_view = validate_labels.view(label_record)

vineyard_store.put(train_labels_view, persist=True, name='train_label_array')
vineyard_store.put(validate_labels_view, persist=True, name='validate_label_array')

print("[STATUS] labels vineyard arrays are ready.")

In [None]:
### Prepare the vineyard store

# Connect again using the socket named by VINEYARD_IPC_SOCKET
# (presumably this is where the training side starts -- TODO confirm).
socket_path = os.environ['VINEYARD_IPC_SOCKET']
vineyard_store = vineyard.connect(socket_path)

We will need a data generator that will feed training and validation data into the model while training.

In [None]:
def generator(images_obj, labels_obj, shape, batch_size=32):
    """
    Yield (images, labels) batches forever, cycling over the dataset.

    Parameters:
        images_obj: sliceable image store; a slice must expose an 'rgb'
            field holding three float32 values per pixel.
        labels_obj: sliceable label store; a slice must expose a 'label'
            field holding `number_of_classes` float32 values per image.
        shape: number of samples to cycle over.
        batch_size: number of samples per batch. The last batch of a pass
            is shorter when `shape` is not a multiple of `batch_size`.

    Yields:
        (x_train, y_train): float32 arrays of shape
        (n, IMAGE_SIZE[0], IMAGE_SIZE[1], 3) and (n, number_of_classes),
        where n is the size of the current batch.

    Raises:
        ValueError: on the first `next()` if `shape` or `batch_size` is not
            positive (the loop below would otherwise never yield).
    """
    if shape <= 0 or batch_size <= 0:
        raise ValueError(
            "shape and batch_size must be positive, got shape={} batch_size={}".format(
                shape, batch_size))

    while True:  # Loop forever so the generator never terminates

        # Get index to start each batch
        for offset in range(0, shape, batch_size):

            # Size of this batch only. It is kept in its own variable so that
            # the short last batch does not shrink batch_size for every
            # following pass over the data.
            current_size = min(batch_size, shape - offset)
            stop = offset + current_size

            # Convert the structured records back to plain float32 arrays.
            # NOTE(review): the field names 'rgb' and 'label' must match the
            # dtype of the stored arrays -- confirm against the writer.
            x_train = images_obj[offset:stop]['rgb'].\
                view(np.float32).reshape(current_size, IMAGE_SIZE[0], IMAGE_SIZE[1], 3)

            y_train = labels_obj[offset:stop]['label'].\
                view(np.float32).reshape(current_size, number_of_classes)

            # The generator-y part: yield the next training batch
            yield x_train, y_train

We will create generators for train and validation data.

In [None]:
# Open image and label arrays from vineyard.
# The arrays are looked up by name, so the name is passed as the `name`
# keyword; a bare positional string would be taken as an object id.
train_images = vineyard_store.get(name="train_image_array")
train_labels = vineyard_store.get(name="train_label_array")

validate_images = vineyard_store.get(name="validate_image_array")
validate_labels = vineyard_store.get(name="validate_label_array")

# Create generators: one cycling over the training samples and one over the
# validation samples, both producing batches of BATCH_SIZE.
train_generator = generator(images_obj=train_images,
                            labels_obj=train_labels,
                            shape=train_images.shape[0],
                            batch_size=BATCH_SIZE)


validate_generator = generator(images_obj=validate_images,
                               labels_obj=validate_labels,
                               shape=validate_images.shape[0],
                               batch_size=BATCH_SIZE)

We will now define a function that creates an image classification model (taken from https://www.kaggle.com/alxmamaev/flowers-recognition) using Keras with the TensorFlow backend.
Because we don't perform any of the image preprocessing or data augmentation commonly used for image classification problems,
the model is not expected to achieve great accuracy. Great accuracy is out of the scope of this notebook, which just presents how we can employ
vineyard as storage for images in order to train a model with TensorFlow and Keras.

In [None]:
def create_model(input_shape, num_of_classes):
    """
    Build and compile the small convolutional classifier used in this notebook.

    Two convolution / ReLU / max-pooling stages are followed by dropout, a
    64-unit dense layer and a softmax output with `num_of_classes` units.
    The model is compiled with categorical cross-entropy loss, the rmsprop
    optimizer and the accuracy metric.

    Parameters:
        input_shape: shape of one input image, passed to the first Conv2D layer.
        num_of_classes: number of units in the output layer.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()

    # The layers, listed in the order in which they are stacked.
    layer_stack = [
        Conv2D(32, (3, 3), padding='same', input_shape=input_shape, name='conv2d_1'),
        Activation('relu'),
        MaxPooling2D(pool_size=(2, 2), name='maxpool2d_1'),
        Conv2D(32, (3, 3), name='conv2d_2'),
        Activation('relu'),
        MaxPooling2D(pool_size=(2, 2), name='maxpool2d_2'),
        Dropout(0.5),
        Flatten(),
        Dense(64),
        Activation('relu'),
        Dropout(0.5),
        Dense(num_of_classes),
        Activation('softmax'),
    ]
    for layer in layer_stack:
        model.add(layer)

    model.compile(
        loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )
    return model

We proceed by creating a model with a checkpoint callback and train it by passing
the train and validation generators as arguments.

In [None]:
model = create_model(input_shape=images[0].shape, num_of_classes=number_of_classes)

# Keep only the best model seen so far on disk.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    MODEL_PATH + "/flower_model.h5", save_best_only=True
)

model.summary()

# Number of batches in one full pass over each set, rounded up so that the
# shorter last batch is included and a set smaller than BATCH_SIZE still
# gets one step (floor division would give 0).
train_steps = -(-train_images.shape[0] // BATCH_SIZE)
validate_steps = -(-validate_images.shape[0] // BATCH_SIZE)

# Model.fit accepts Python generators directly; fit_generator is deprecated.
model.fit(
        train_generator,
        steps_per_epoch=train_steps,
        epochs=5,
        validation_data=validate_generator,
        validation_steps=validate_steps,
        callbacks=[checkpoint_cb])

In [None]:
# Saving the model to vineyard.
# put() takes the value first; the name is a keyword argument and is only
# attached to persisted objects.
# NOTE(review): assumes vineyard can serialize a Keras model object -- TODO confirm.
vineyard_store.put(model, persist=True, name='flower_classification_keras_model')