In [1]:
import h5py
import pickle
import numpy as np

## Load Dataset

In [2]:
# training data
# Open the HDF5 file in a context manager so the handle is released even if
# one of the dataset reads raises.
with h5py.File('dataset/train_data.h5', 'r') as h5f:
    # Slicing with [:] reads each dataset into a NumPy array while the file is open.
    x_train = h5f['x_train'][:]
    y_train = h5f['y_train'][:]

print('x_train:', x_train.shape)
print('y_train:', y_train.shape)

x_train: (10000, 250, 250, 1)
y_train: (10000, 5, 16)


In [3]:
# validation data
# Open the HDF5 file in a context manager so the handle is released even if
# one of the dataset reads raises.
with h5py.File('dataset/val_data.h5', 'r') as h5f:
    # Slicing with [:] reads each dataset into a NumPy array while the file is open.
    x_val = h5f['x_val'][:]
    y_val = h5f['y_val'][:]

print('x_val:', x_val.shape)
print('y_val:', y_val.shape)

x_val: (2500, 250, 250, 1)
y_val: (2500, 5, 16)


## Load vocabulary and embeddings

In [4]:
def read_glove_vecs(glove_file):
    """Parse a GloVe text file into a word -> vector dictionary.

    Each non-empty line is split on whitespace; the first token is the word
    and the remaining tokens are parsed as float32 vector components.

    Args:
        glove_file: Path to the GloVe embeddings text file.

    Returns:
        dict mapping each word (str) to a 1-D ``np.float32`` array.
    """
    print('Creating word to vec map...')
    word_to_vec_map = {}
    # Read as UTF-8 explicitly so non-ASCII tokens decode the same way on
    # every platform, regardless of the locale's default encoding.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Skip blank lines (e.g. a trailing newline) instead of
                # failing with IndexError on fields[0].
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float32)
    print('Done!')
    return word_to_vec_map

In [5]:
# load embeddings
# Build the path first, then parse the GloVe file into the word -> vector map.
glove_path = '{}/glove.6B.100d.txt'.format('dataset')
word_to_vec_map = read_glove_vecs(glove_path)

Creating word to vec map...
Done!


In [6]:
# assign embeddings values to tokens
# Each special token gets its own vector drawn uniformly from [-1, 1) with the
# same shape as the 'unk' embedding. No seed is set in this cell, so the
# values depend on the current NumPy global random state.
size = word_to_vec_map['unk'].shape

for token in ('<sos>', '<eos>', '<pad>'):
    word_to_vec_map[token] = np.random.uniform(low=-1.0, high=1.0, size=size)

In [7]:
# load vocabulary
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('dataset/vocabulary.pickle', 'rb') as vocab_file:
    vocabulary_dict = pickle.load(vocab_file)

# Unpack the three entries stored in the pickled dictionary.
vocabulary, word_to_index, index_to_word = (
    vocabulary_dict[key]
    for key in ('vocabulary', 'word_to_index', 'index_to_word')
)

## Modify caption vector

Choose one caption out of five per image.

In [8]:
def choose_random_caption(data):
    """Pick one caption per image uniformly at random.

    Args:
        data: Sequence of per-image caption groups, indexable as
            ``data[i][j]`` (image ``i``, caption ``j``).

    Returns:
        np.ndarray holding one randomly chosen caption for each image.
    """
    # Draw the index from the actual number of captions in each group rather
    # than a hard-coded 5, so groups with fewer captions cannot raise
    # IndexError. For groups of five the random draws are unchanged.
    return np.array(
        [captions[np.random.randint(0, len(captions))] for captions in data]
    )

# Keep a single randomly chosen caption per image for both splits.
# NOTE: this overwrites y_train / y_val in place; re-running the cell would
# apply the selection again to the already-reduced arrays.
y_train, y_val = choose_random_caption(y_train), choose_random_caption(y_val)

for split_label, split_captions in (('y_train:', y_train), ('y_val:', y_val)):
    print(split_label, split_captions.shape)

y_train: (10000, 16)
y_val: (2500, 16)


## Training Data

In [9]:
# The encoder consumes the image arrays directly (no copy is made; these are
# just additional names for x_train and x_val).
encoder_input_data, encoder_input_data_val = x_train, x_val

The decoder's input and output sequences are identical except that they are shifted by one time-step. We can therefore slice the same NumPy array for both: slicing creates different 'views' of the same data in memory rather than copies, which saves memory.

In [10]:
# Decoder input: every token of each caption except the last one.
all_but_last = np.s_[:, :-1]
decoder_input_data = y_train[all_but_last]
decoder_input_data_val = y_val[all_but_last]

In [11]:
# Decoder target: every token of each caption except the first one, i.e. the
# input sequence shifted left by one time-step.
all_but_first = np.s_[:, 1:]
decoder_output_data = y_train[all_but_first]
decoder_output_data_val = y_val[all_but_first]

## Define Image Model (Encoder)

In [12]:
import tensorflow as tf
from keras import backend as K
from keras.models import Model
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from keras.layers import Conv2D, Dense, Input, Dropout, GRU, Activation
from keras.layers.embeddings import Embedding

from capsule_layers import CapsuleLayer, PrimaryCap, Length

Using TensorFlow backend.


In [102]:
# Reset TensorFlow's default graph and the Keras session before (re)building
# the model in the cells below.
tf.reset_default_graph()
K.clear_session()

In [103]:
# Per-sample image shape: x_train's shape without its leading sample axis.
encoder_input_shape = x_train.shape[1:]
# Number of capsules in the final capsule layer (passed as num_capsule).
n_class = 10
# Number of routing iterations passed to CapsuleLayer.
routings = 3

In [104]:
# placeholder for input image; its name 'encoder_input' is the key under
# which the image arrays are fed in the x_data dictionaries
encoder_input = Input(shape=encoder_input_shape, name='encoder_input')

In [105]:
def connect_encoder():
    """Wire the image encoder onto the module-level `encoder_input` tensor.

    Three Conv2D layers feed a primary-capsule layer, then a routed capsule
    layer with `n_class` capsules, and finally a `Length` layer.

    Returns:
        The output tensor of the `Length` layer named 'capsnet'.
    """
    # Conventional convolutional front end: (name, filters, kernel_size, strides).
    conv_specs = (
        ('conv1', 96, 13, 4),
        ('conv2', 96, 5, 2),
        ('conv3', 256, 9, 1),
    )
    features = encoder_input
    for layer_name, n_filters, kernel, stride in conv_specs:
        features = Conv2D(filters=n_filters, kernel_size=kernel, strides=stride,
                          padding='valid', activation='relu', name=layer_name)(features)

    # Primary capsules (PrimaryCap comes from capsule_layers; the original
    # notebook describes it as a Conv2D layer with `squash` activation whose
    # output is reshaped to [None, num_capsule, dim_capsule]).
    primary = PrimaryCap(features, dim_capsule=8, n_channels=32,
                         kernel_size=9, strides=2, padding='valid')

    # Capsule layer in which the routing algorithm runs.
    routed = CapsuleLayer(num_capsule=n_class, dim_capsule=16,
                          routings=routings, name='caption_caps')(primary)

    return Length(name='capsnet')(routed)

In [106]:
# Build the encoder graph once; this tensor is reused by model_train and model_encoder.
encoder_output = connect_encoder()

## Define caption model (Decoder)

### Define embedding layer

In [107]:
def create_embedding_layer():
    """Build a frozen Keras Embedding layer initialised from the GloVe vectors.

    Row ``index`` of the weight matrix holds the vector of the word mapped to
    ``index`` in `word_to_index`; words missing from `word_to_vec_map` use the
    'unk' vector. Rows with no word assigned stay at zero.

    Returns:
        A non-trainable `Embedding` layer with its weights already set.
    """
    unk_vector = word_to_vec_map['unk']
    # One row more than the vocabulary size (the original notebook notes this
    # is needed to fit the Keras Embedding layer).
    n_rows = len(vocabulary) + 1
    # Dimensionality of the GloVe word vectors.
    n_dims = unk_vector.shape[0]

    weights = np.zeros((n_rows, n_dims))
    for word, index in word_to_index.items():
        weights[index, :] = word_to_vec_map.get(word, unk_vector)

    decoder_embedding = Embedding(n_rows, n_dims, trainable=False)
    # The layer must be built before weights can be assigned to it.
    decoder_embedding.build((None,))
    decoder_embedding.set_weights([weights])

    return decoder_embedding

In [108]:
# Instantiate the frozen embedding layer that connect_decoder() applies to decoder_input.
decoder_embedding = create_embedding_layer()

### Define RNN models

In [109]:
# Internal state size of GRU layers in the RNN; also the output width of the
# Dense layer (decoder_transfer_map) that produces their initial state.
state_size = 512

In [110]:
# Input through which the standalone decoder model (model_decoder) receives
# the initial GRU state.
decoder_initial_state = Input(shape=(state_size,), name='decoder_initial_state')

In [111]:
# Dense layer (tanh) that maps the encoder output to a state_size vector; in
# model_train its output is used as the GRUs' initial state.
decoder_transfer_map = Dense(state_size, activation='tanh', name='decoder_transfer_map')

In [112]:
# Input for the caption token sequence fed to the embedding layer;
# shape=(None,) leaves the sequence length unspecified.
decoder_input = Input(shape=(None, ), name='decoder_input')

In [113]:
# Three GRU layers of width state_size; return_sequences=True makes each one
# emit an output for every time-step rather than only the last.
decoder_gru1 = GRU(state_size, name='decoder_gru1', return_sequences=True)
decoder_gru2 = GRU(state_size, name='decoder_gru2', return_sequences=True)
decoder_gru3 = GRU(state_size, name='decoder_gru3', return_sequences=True)

In [114]:
# Linear projection to one score per vocabulary word; the outputs are passed
# as logits to the loss, which applies the softmax itself.
# NOTE(review): the embedding layer is built with len(vocabulary) + 1 rows but
# this layer has len(vocabulary) outputs - confirm that every target token
# index is below len(vocabulary).
decoder_dense = Dense(len(vocabulary), activation='linear', name='decoder_output')

## Connect Decoder

In [115]:
def connect_decoder(initial_state):
    """Chain embedding -> three GRU layers -> dense output on `decoder_input`.

    Args:
        initial_state: Tensor used as the initial state of each of the three
            GRU layers.

    Returns:
        Output tensor of `decoder_dense`.
    """
    sequence = decoder_embedding(decoder_input)

    # Stack the GRU layers; every layer starts from the same initial state.
    for gru_layer in (decoder_gru1, decoder_gru2, decoder_gru3):
        sequence = gru_layer(sequence, initial_state=initial_state)

    # Final dense layer: projects each time-step to one score per
    # vocabulary word.
    return decoder_dense(sequence)

In [116]:
# Model 1: end-to-end training model. The encoder output is mapped through
# decoder_transfer_map and used as the decoder's initial state.
decoder_output = connect_decoder(initial_state=decoder_transfer_map(encoder_output))
model_train = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_output])

In [117]:
# Model 2: encoder only - maps an input image to the encoder output tensor.
model_encoder = Model(inputs=[encoder_input], outputs=[encoder_output])

In [118]:
# Model 3: decoder only - takes the token sequence plus an externally supplied
# initial state. It reuses the same layer objects as Model 1.
# Note: this rebinds `decoder_output`, which previously held Model 1's output tensor.
decoder_output = connect_decoder(initial_state=decoder_initial_state)
model_decoder = Model(inputs=[decoder_input, decoder_initial_state], outputs=[decoder_output])

In [119]:
# Print the layer-by-layer summary (output shapes, parameter counts) of the training model.
model_train.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 250, 250, 1)  0                                            
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 60, 60, 96)   16320       encoder_input[0][0]              
__________________________________________________________________________________________________
conv2 (Conv2D)                  (None, 28, 28, 96)   230496      conv1[0][0]                      
__________________________________________________________________________________________________
conv3 (Conv2D)                  (None, 20, 20, 256)  1990912     conv2[0][0]                      
__________________________________________________________________________________________________
primarycap

In [120]:
def sparse_cross_entropy(y_true, y_pred):
    """Mean sparse softmax cross-entropy over every token position.

    Args:
        y_true: Rank-2 tensor of integer tokens with shape
            [batch_size, sequence_length] (the desired output).
        y_pred: Rank-3 decoder output with shape
            [batch_size, sequence_length, num_words]; it is passed to
            TensorFlow as unnormalised logits.

    Returns:
        Scalar tensor: the mean of the per-token losses.
    """
    # Per-token loss, a rank-2 tensor of shape [batch_size, sequence_length].
    per_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=y_pred, labels=y_true)

    # Reduce explicitly to a single scalar over the whole rank-2 tensor
    # instead of relying on how Keras would reduce it across the batch axis.
    return tf.reduce_mean(per_token_loss)

In [121]:
# RMSprop optimizer with a learning rate of 1e-3.
optimizer = RMSprop(lr=1e-3)

In [122]:
# int64 placeholder for the target token ids, handed to compile() as the
# target tensor; both dimensions are left unspecified.
decoder_target = K.placeholder(dtype='int64', shape=(None, None))

In [123]:
# Compile with the custom sparse loss; the integer target placeholder is
# supplied explicitly through target_tensors.
model_train.compile(
    optimizer=optimizer,
    loss=sparse_cross_entropy,
    target_tensors=[decoder_target],
    metrics=['accuracy'],
)

## Callback Functions

In [124]:
# Checkpoint file; the same path is used later to try to restore weights.
path_checkpoint = '21_checkpoint.keras'
# Save weights only, and only when val_loss improves on the best value so far.
callback_checkpoint = ModelCheckpoint(filepath=path_checkpoint,
                                      monitor='val_loss',
                                      verbose=1,
                                      save_weights_only=True,
                                      save_best_only=True)

In [125]:
# Stop training once val_loss has failed to improve for 3 consecutive epochs.
callback_early_stopping = EarlyStopping(monitor='val_loss',
                                        patience=3, verbose=1)

In [126]:
# Write TensorBoard logs to ./21_logs/ with histograms and graph export disabled.
callback_tensorboard = TensorBoard(log_dir='./21_logs/',
                                   histogram_freq=0,
                                   write_graph=False)

In [127]:
# Callbacks passed to model_train.fit().
callbacks = [callback_early_stopping,
             callback_checkpoint,
             callback_tensorboard]

## Train model

In [128]:
# Best-effort resume: restore weights from an earlier checkpoint if possible.
# Any failure (e.g. no checkpoint file yet) is reported and training then
# starts from the current weights.
try:
    model_train.load_weights(path_checkpoint)
except Exception as error:
    print("Error trying to load checkpoint.")
    print(error)

Error trying to load checkpoint.
Unable to open file (unable to open file: name = '21_checkpoint.keras', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


In [129]:
# Training inputs keyed by the names of the model's Input layers.
x_data = {
    'encoder_input': encoder_input_data,
    'decoder_input': decoder_input_data
}

# Training targets keyed by the name of the output Dense layer.
y_data = {
    'decoder_output': decoder_output_data
}

In [130]:
# Validation inputs keyed by the names of the model's Input layers.
x_data_val = {
    'encoder_input': encoder_input_data_val,
    'decoder_input': decoder_input_data_val
}

# Validation targets keyed by the name of the output Dense layer.
y_data_val = {
    'decoder_output': decoder_output_data_val
}

In [131]:
# Train for up to 10 epochs in batches of 100, validating after each epoch;
# the callbacks handle early stopping, checkpointing and TensorBoard logging.
model_train.fit(x=x_data,
                y=y_data,
                batch_size=100,
                epochs=10,
                callbacks=callbacks,
                validation_data=(x_data_val, y_data_val)
               )

Train on 10000 samples, validate on 2500 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 3.35006, saving model to 21_checkpoint.keras
Epoch 2/10

Epoch 00002: val_loss improved from 3.35006 to 3.02572, saving model to 21_checkpoint.keras
Epoch 3/10

Epoch 00003: val_loss improved from 3.02572 to 2.85571, saving model to 21_checkpoint.keras
Epoch 4/10

Epoch 00004: val_loss improved from 2.85571 to 2.75591, saving model to 21_checkpoint.keras
Epoch 5/10
  700/10000 [=>............................] - ETA: 5:44 - loss: 2.4753

KeyboardInterrupt: 

In [None]:
# TODO: Add tokenizer