In [None]:
import h5py
import pickle
import numpy as np

## Load Dataset

In [None]:
# training data
# A context manager guarantees the HDF5 handle is released even when a
# dataset key is missing or the read fails part-way.
with h5py.File('dataset/train_data.h5', 'r') as h5f:
    x_train = h5f['x_train'][:]  # [:] reads the whole dataset into memory
    y_train = h5f['y_train'][:]

print('x_train:', x_train.shape)
print('y_train:', y_train.shape)

In [None]:
# validation data
# A context manager guarantees the HDF5 handle is released even when a
# dataset key is missing or the read fails part-way.
with h5py.File('dataset/val_data.h5', 'r') as h5f:
    x_val = h5f['x_val'][:]  # [:] reads the whole dataset into memory
    y_val = h5f['y_val'][:]

print('x_val:', x_val.shape)
print('y_val:', y_val.shape)

## Load vocabulary and embeddings

In [None]:
def read_glove_vecs(glove_file):
    """Load word vectors from a whitespace-separated GloVe-style text file.

    Every non-empty line is expected to contain a token followed by the
    components of its vector, e.g. ``the 0.418 0.24968 ...``.

    Args:
        glove_file: Path of the embeddings text file.

    Returns:
        dict mapping each word (str) to a 1-D ``np.float32`` array.
    """
    print('Creating word to vec map...')
    word_to_vec_map = {}
    # Be explicit about the encoding: relying on the platform default breaks
    # on systems where it is not UTF-8 as soon as a non-ASCII token appears.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Skip blank lines (e.g. a trailing newline) instead of
                # failing with IndexError.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float32)
    print('Done!')
    return word_to_vec_map

In [None]:
# Build the path of the GloVe vectors file, then parse it into a dict.
glove_path = '{}/glove.6B.100d.txt'.format('dataset')
word_to_vec_map = read_glove_vecs(glove_path)

In [None]:
# Give the special tokens random vectors with the same shape as a GloVe vector.
size = word_to_vec_map['unk'].shape

# Tokens are processed in a fixed order so the random draws stay in the
# same sequence for a given numpy random state.
for special_token in ('<sos>', '<eos>', '<pad>'):
    word_to_vec_map[special_token] = np.random.uniform(low=-1.0, high=1.0, size=size)

In [None]:
# Read the pickled vocabulary bundle.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('dataset/vocabulary.pickle', 'rb') as vocab_file:
    vocabulary_dict = pickle.load(vocab_file)

# Unpack the three entries stored in the bundle.
vocabulary, word_to_index, index_to_word = (
    vocabulary_dict[key]
    for key in ('vocabulary', 'word_to_index', 'index_to_word')
)

## Modify caption vector

Randomly choose one of the available captions (five per image in this dataset) for each image.

In [None]:
def choose_random_caption(data):
    """Pick one caption uniformly at random for every image.

    Args:
        data: Sequence indexed as ``data[image][caption]``. Each image may
            have any positive number of captions.

    Returns:
        np.ndarray holding the selected caption of each image, in input order.

    Raises:
        ValueError: If an image has no captions (raised by ``np.random.randint``).
    """
    # Sample over the captions actually present rather than a hard-coded 5:
    # with fewer captions the fixed bound could index out of range, and with
    # more it would never pick the later ones. For exactly five captions the
    # random draws are the same as before.
    return np.array([
        captions[np.random.randint(0, len(captions))]
        for captions in data
    ])

# Replace the caption arrays with one randomly selected caption per image.
# NOTE(review): y_train / y_val are rebound in place, so this cell is not
# idempotent -- re-running it feeds the already-reduced arrays back into
# choose_random_caption. Reload the data before running it again.
y_train = choose_random_caption(y_train)
y_val = choose_random_caption(y_val)

print('y_train:', y_train.shape)
print('y_val:', y_val.shape)

## Training Data

In [None]:
# Input to encoder is the numpy array of the image
# (this is an alias of x_train, not a copy).
encoder_input_data = x_train
encoder_input_data.shape

The decoder's input and output data are identical except for a shift of one time-step. Both can therefore be taken from the same numpy array by slicing, which saves memory because slicing merely creates different 'views' of the same underlying data.

In [None]:
# Decoder input: every position of each caption except the last one.
decoder_input_data = y_train[:, :-1]
decoder_input_data.shape

In [None]:
# Decoder target: every position of each caption except the first one,
# i.e. the decoder input shifted by one time-step.
decoder_output_data = y_train[:, 1:]
decoder_output_data.shape

## Define Image Model (Encoder)

In [None]:
from keras.models import Model
from keras.layers import Conv2D, Dense, Input, Dropout, GRU, Activation
from keras.layers.embeddings import Embedding

from capsule_layers import CapsuleLayer, PrimaryCap, Length

In [None]:
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping

In [None]:
from keras import backend as K

In [None]:
# Shape of a single image: x_train's shape without its leading (sample) axis.
encoder_input_shape = x_train.shape[1:]
# Number of capsules in the final capsule layer (passed as num_capsule below).
n_class = 10
# Value passed as `routings` to the capsule layer below.
routings = 3

In [None]:
# placeholder for input image
# The layer is named explicitly because the training cell feeds the model
# through a dict keyed by 'encoder_input'; without the name Keras assigns an
# auto-generated one (e.g. 'input_1') and fit() cannot match the dict key.
encoder_input = Input(shape=encoder_input_shape, name='encoder_input')

In [None]:
# Layer 1-3: Just some conventional Conv2D layers
# All three use 'valid' padding and ReLU; strides shrink from 4 to 2 to 1.
conv1 = Conv2D(filters=96, kernel_size=13, strides=4, padding='valid', activation='relu', name='conv1')(encoder_input)
conv2 = Conv2D(filters=96, kernel_size=5, strides=2, padding='valid', activation='relu', name='conv2')(conv1)
conv3 = Conv2D(filters=256, kernel_size=9, strides=1, padding='valid', activation='relu', name='conv3')(conv2)
conv3

In [None]:
# Layer 4: Conv2D layer with `squash` activation, then reshape to [None, num_capsule, dim_capsule]
# (PrimaryCap comes from the project-local capsule_layers module.)
primary_caps = PrimaryCap(conv3, dim_capsule=8, n_channels=32, kernel_size=9, strides=2, padding='valid')

# Layer 5: Capsule layer. Routing algorithm works here.
# Produces n_class capsules of dimension 16.
caption_caps = CapsuleLayer(num_capsule=n_class, dim_capsule=16, routings=routings, name='caption_caps')(primary_caps)
caption_caps

In [None]:
# Final encoder tensor. Length is defined in capsule_layers; presumably it
# reduces each capsule vector to its norm -- TODO confirm.
encoder_output = Length(name='capsnet')(caption_caps)
encoder_output

## Define caption model (Decoder)

### Define embedding layer

In [None]:
# Size of the embedding table: one row per vocabulary entry plus one extra row.
vocabulary_length = len(vocabulary) + 1  # adding 1 to fit Keras embedding (requirement)
# Width of the embedding table, read from the 'unk' vector.
embedding_dimensions = word_to_vec_map['unk'].shape[0]  # define dimensionality of GloVe word vectors (= 100)

In [None]:
# Row i of the matrix is the vector of the word whose index is i; words that
# have no pre-trained vector fall back to the 'unk' vector. Rows that no
# word maps to stay all-zero.
unk_vector = word_to_vec_map['unk']
embedding_matrix = np.zeros((vocabulary_length, embedding_dimensions))
for token, row in word_to_index.items():
    embedding_matrix[row, :] = word_to_vec_map.get(token, unk_vector)

In [None]:
# Embedding layer whose weights are frozen (trainable=False) and taken from
# embedding_matrix.
decoder_embedding = Embedding(vocabulary_length, embedding_dimensions, trainable=False)
# Build the layer explicitly so its weights exist before set_weights is called.
decoder_embedding.build((None,))
decoder_embedding.set_weights([embedding_matrix])

### Define RNN models

In [None]:
# Internal state size of GRU layers in the RNN
# (also the output width of the transfer-map Dense layer defined below).
state_size = 512

In [None]:
# Placeholder for an externally supplied initial GRU state.
# NOTE(review): this name is reassigned to the transfer-map output in the
# "Connect and Create the Training Model" section, after which this
# placeholder is no longer reachable through the name.
decoder_initial_state = Input(shape=(state_size,), name='decoder_initial_state')

In [None]:
# Placeholder for the decoder's token sequences; shape=(None,) leaves the
# sequence length unspecified.
decoder_input = Input(shape=(None, ), name='decoder_input')

In [None]:
# Three GRU layers of equal width; each returns its full output sequence so
# it can feed the next layer (and, for the last one, the output Dense layer).
decoder_gru1 = GRU(state_size, name='decoder_gru1', return_sequences=True)
decoder_gru2 = GRU(state_size, name='decoder_gru2', return_sequences=True)
decoder_gru3 = GRU(state_size, name='decoder_gru3', return_sequences=True)

In [None]:
# Output projection to vocabulary size. The activation is linear, so the
# outputs are raw logits; the softmax is applied inside the loss function.
decoder_dense = Dense(vocabulary_length, activation='linear', name='decoder_output')

In [None]:
# Dense layer that maps the encoder output to a vector of width state_size
# (tanh-activated), used as the initial state of the GRU layers.
decoder_transfer_map = Dense(state_size, activation='tanh', name='decoder_transfer_map')

## Connect and Create the Training Model

In [None]:
# Map the encoder output to the GRU state size.
# NOTE(review): this rebinds decoder_initial_state, replacing the Input
# placeholder of the same name created earlier in the notebook.
decoder_initial_state = decoder_transfer_map(encoder_output)

In [None]:
# Start the decoder-network with its input-layer.
net = decoder_input

# Connect the embedding-layer.
net = decoder_embedding(net)

In [None]:
# Connect all the GRU layers; each one starts from the same initial state.
net = decoder_gru1(net, initial_state=decoder_initial_state)
net = decoder_gru2(net, initial_state=decoder_initial_state)
net = decoder_gru3(net, initial_state=decoder_initial_state)
net

In [None]:
# Connect the final dense layer, which projects each time-step to one
# score (logit) per vocabulary entry.
decoder_output = decoder_dense(net)
decoder_output

In [None]:
# Model 1: end-to-end training model (image + decoder tokens -> decoder output).
model_train = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_output])

In [None]:
# Model 2: encoder only (image -> encoder_output).
model_encoder = Model(inputs=[encoder_input], outputs=[encoder_output])

In [None]:
# Model 3: stand-alone decoder (decoder tokens + initial state -> decoder output).
# A dedicated placeholder is created here because `decoder_initial_state`
# has been rebound to the transfer-map output and is no longer an Input
# tensor, so it cannot serve as a model input.
decoder_state_input = Input(shape=(state_size,), name='decoder_state_input')

# Re-apply the shared decoder layers on top of the new placeholder; the
# layer objects (and therefore their weights) are the ones used by model_train.
net = decoder_embedding(decoder_input)
net = decoder_gru1(net, initial_state=decoder_state_input)
net = decoder_gru2(net, initial_state=decoder_state_input)
net = decoder_gru3(net, initial_state=decoder_state_input)
decoder_output = decoder_dense(net)

# NOTE(review): model_encoder emits encoder_output, not the transfer-mapped
# state, so inference code must apply decoder_transfer_map between the two
# models -- confirm this is the intended split.
model_decoder = Model(inputs=[decoder_input, decoder_state_input], outputs=[decoder_output])

In [None]:
# Print the layer-by-layer overview of the training model.
model_train.summary()

In [None]:
import tensorflow as tf

In [None]:
def sparse_cross_entropy(y_true, y_pred):
    """
    Mean sparse softmax cross-entropy over every position of every sequence.

    y_true is expected to be a 2-rank tensor of integer tokens with
    shape [batch_size, sequence_length].

    y_pred is the decoder's output, expected to be a 3-rank tensor with
    shape [batch_size, sequence_length, num_words]. It is passed as
    `logits`, i.e. un-normalised scores; the softmax is applied inside
    the TensorFlow op.

    Returns a scalar tensor.
    """
    # Per-position loss with one entry for each label in y_true.
    per_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=y_pred,
        labels=y_true,
    )

    # Reduce explicitly to a single scalar rather than relying on however
    # Keras would reduce a non-scalar loss.
    return tf.reduce_mean(per_token_loss)

In [None]:
# RMSprop optimizer with a learning rate of 1e-3.
optimizer = RMSprop(lr=1e-3)

In [None]:
# Placeholder for the target token ids: 2-D, int64, with both dimensions
# left unspecified.
decoder_target = K.placeholder(dtype='int64', shape=(None, None))

In [None]:
# Compile with the custom loss and hand Keras the integer target placeholder
# defined above as the target tensor.
model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy,
                    target_tensors=[decoder_target])

## Callback Functions

In [None]:
# Checkpoint callback: writes weights only, and only when val_loss improves.
path_checkpoint = '21_checkpoint.keras'
callback_checkpoint = ModelCheckpoint(filepath=path_checkpoint,
                                      monitor='val_loss',
                                      verbose=1,
                                      save_weights_only=True,
                                      save_best_only=True)

In [None]:
# Early-stopping callback monitoring val_loss with a patience of 3 epochs.
callback_early_stopping = EarlyStopping(monitor='val_loss',
                                        patience=3, verbose=1)

In [None]:
# TensorBoard logging to ./21_logs/; histograms and graph output are disabled.
callback_tensorboard = TensorBoard(log_dir='./21_logs/',
                                   histogram_freq=0,
                                   write_graph=False)

In [None]:
# Callbacks handed to model_train.fit.
callbacks = [callback_early_stopping,
             callback_checkpoint,
             callback_tensorboard]

## Train model

In [None]:
# Best-effort resume: when the checkpoint cannot be loaded (for instance on
# a first run) report the problem and carry on with the current weights.
try:
    model_train.load_weights(path_checkpoint)
except Exception as exc:
    print("Error trying to load checkpoint.", exc, sep="\n")

In [None]:
# Inputs keyed by the name of the input layer each array feeds.
x_data = dict(encoder_input=encoder_input_data,
              decoder_input=decoder_input_data)

# Targets keyed by the name of the output layer.
y_data = dict(decoder_output=decoder_output_data)

In [None]:
# Validate on the held-out set loaded at the top of the notebook instead of
# the undefined `validation_split` name. The validation captions are split
# into decoder input / target exactly like the training captions. Lists are
# used so the arrays are matched to the model's inputs and outputs by
# position: [encoder_input, decoder_input] -> [decoder_output].
validation_data = (
    [x_val, y_val[:, :-1]],
    [y_val[:, 1:]],
)

model_train.fit(x=x_data,
                y=y_data,
                batch_size=640,
                epochs=10,
                validation_data=validation_data,
                callbacks=callbacks)

In [None]:
# TODO: Add tokenizer