## Prepare the inputs

- encoder_input_data: 2D array of shape `(num_images * 5, 512)`
- decoder_input_data: 3D array of shape `(num_captions, max_words_in_sentence, num_words)`
- decoder_output_data: same as decoder_input_data but offset by one timestep. decoder_target_data[:, t, :] will be the same as decoder_input_data[:, t + 1, :]

## Image Preprocessing

obtain bottleneck features

In [1]:
import numpy as np

In [2]:
# Since there are 5 captions per image, duplicate the bottleneck features
def duplicate_bottleneck_features(features):
    num_captions = 5 # 5 stands for number of captions per image
    num_rows = features.shape[0] * num_captions 

    features_dup = np.zeros((num_rows, features.shape[1]))
    for i, image in enumerate(features):
        for j in range(num_captions):
            features_dup[i*num_captions + j] = image
    return features_dup    

In [3]:
bottleneck_features = np.load('bottleneck_features/Flicker8k_bottleneck_features_VGG16_avgpooling.npz')
bottleneck_features_train = bottleneck_features["train"]
bottleneck_features_validation = bottleneck_features["validation"]
bottleneck_features_test = bottleneck_features["test"]

bottleneck_features_train_dup = duplicate_bottleneck_features(bottleneck_features_train)
bottleneck_features_validation_dup = duplicate_bottleneck_features(bottleneck_features_validation)
bottleneck_features_test_dup = duplicate_bottleneck_features(bottleneck_features_test)

In [4]:
print(bottleneck_features_train_dup.shape)
print(bottleneck_features_validation_dup.shape)
print(bottleneck_features_test_dup.shape)

(30000, 512)
(5000, 512)
(5000, 512)


## Word Embedding

In [5]:
from caption_utils import *
train_fns_list, dev_fns_list, test_fns_list = load_split_lists()

train_captions_raw, dev_captions_raw, test_captions_raw = get_caption_split()
vocab = create_vocab(train_captions_raw)
token2idx, idx2token = vocab_to_index(vocab)     
captions_data = (train_captions_raw.copy(), dev_captions_raw.copy(), test_captions_raw.copy())
train_captions, dev_captions, test_captions = process_captions(captions_data, token2idx)

  assert(idx2token[idx] == token, "token2idx and idx2token not equivalent")
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [6]:
print(len(vocab))

2531


## Caption Preprocessing


In [7]:
captions_onehot = np.load('preprocessed_captions/Flicker8k_onehot_'+str(len(vocab))+'_words.npz')
train_captions_onehot = captions_onehot["train"]
validation_captions_onehot = captions_onehot["validation"]
test_captions_onehot = captions_onehot["test"]

train_captions_onehot = train_captions_onehot.astype(np.float32)
validation_captions_onehot = validation_captions_onehot.astype(np.float32)
test_captions_onehot = test_captions_onehot.astype(np.float32)

In [8]:
print(train_captions_onehot.shape)
print(validation_captions_onehot.shape)
print(test_captions_onehot.shape)

(30000, 39, 2531)
(5000, 39, 2531)
(5000, 39, 2531)


In [9]:
print(train_captions_onehot.dtype, validation_captions_onehot.dtype, test_captions_onehot.dtype)

float32 float32 float32


In [10]:
def captions_onehot_split(captions_onehot):
    """ returns decoder input data and decoder target data """
    return captions_onehot[:, :-1, :], captions_onehot[:, :, :]

### Training captions - > decoder input, target data  

In [11]:
train_decoder_input, train_decoder_target = captions_onehot_split(train_captions_onehot)
validation_decoder_input, validation_decoder_target = captions_onehot_split(validation_captions_onehot)
test_decoder_input, test_decoder_target = captions_onehot_split(test_captions_onehot)

In [12]:
train_encoder_output = bottleneck_features_train_dup.astype(np.float32)

In [163]:
test_decoder_input = np.argmax(test_decoder_input, axis=-1)
train_decoder_input = np.argmax(train_decoder_input, axis=-1)
validation_decoder_input = np.argmax(validation_decoder_input, axis=-1)

(5000, 38)

In [14]:
print("Decoder Input", train_decoder_input.shape, train_decoder_input.dtype)
print("Decoder Target", train_decoder_target.shape, train_decoder_target.dtype)
print("Encoder Output", train_encoder_output.shape, train_encoder_output.dtype)

Decoder Input (30000, 38, 2531) float32
Decoder Target (30000, 39, 2531) float32
Encoder Output (30000, 512) float32


## Build Model

In [None]:
batch_size = 64  # Batch size for training.
epochs = 5  # Number of epochs to train for.
latent_dim = 300  # Latent dimensionality of the encoding space.
num_samples = train_encoder_output.shape[0]

-------------------------
# Experiment 

## Parameters

In [121]:
from keras.models import Model, Sequential
from keras.layers import Input, LSTM, GRU, Dense, Embedding, BatchNormalization, RepeatVector, Concatenate, TimeDistributed, Merge
from keras import backend as K
import tensorflow as tf

In [94]:
emb_size = 150
lstm_size = 300
vocab_size = len(vocab)
max_length = train_decoder_target.shape[1]
print(max_length)

39


In [95]:
K.clear_session()

## Image Embedding

In [96]:
image_input = Input(shape=(train_encoder_output.shape[1], ), dtype='float32')
print(image_input)
img_emb = Dense(emb_size, activation='relu')(image_input)
img_emb = RepeatVector(1)(img_emb)
print(img_emb)

Tensor("input_1:0", shape=(?, 512), dtype=float32)
Tensor("repeat_vector_1/Tile:0", shape=(?, 1, 150), dtype=float32)


## Word Embedding

In [97]:
caption_inputs = Input(shape=(max_length-1, ), dtype='float32')
print(caption_inputs)
word_emb = Embedding(input_dim=vocab_size, output_dim=emb_size)(caption_inputs)
print(word_emb)

Tensor("input_2:0", shape=(?, 38), dtype=float32)
Tensor("embedding_1/Gather:0", shape=(?, 38, 150), dtype=float32)


In [98]:
seq_input = Concatenate(axis=1)([img_emb, word_emb])
print(seq_input)

Tensor("concatenate_1/concat:0", shape=(?, 39, 150), dtype=float32)


In [99]:
gru_cell = GRU(lstm_size, return_sequences=True)(seq_input)
seq_out = TimeDistributed(Dense(vocab_size))(gru_cell)
print(seq_out.shape)

(?, 39, 2531)


In [100]:
Model(inputs=[train_encoder_output, train_decoder_input],
              outputs=[train_decoder_target])

TypeError: unhashable type: 'numpy.ndarray'

In [None]:
train_captions = np.zeros((train_captions_onehot.shape[0], train_captions_onehot.shape[1]))
for i, onehot_caption in enumerate(train_captions_onehot):
    train_captions[i] = np.argmax(onehot_caption, axis=1)

In [155]:
K.clear_session()

# Image -> Image embedding
img_emb = Sequential()
img_emb.add(Dense(emb_size, input_dim=train_encoder_output.shape[1], activation='relu'))
img_emb.add(RepeatVector(1))

# Sentence to Word embedding
word_emb = Sequential()
word_emb.add(Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=max_length-1))

# Merge img_emb and word_emb
seq_in = Sequential()
seq_in.add(Merge([img_emb, word_emb], mode='concat', concat_axis=1))
#seq_in.add(Concatenate([img_emb, word_emb]))

# RNN Layer
seq_in.add(GRU(lstm_size, return_sequences=True))
seq_in.add(TimeDistributed(Dense(vocab_size)))

seq_in.compile(optimizer='adam',
               loss='categorical_crossentropy',
               metrics=['accuracy'])

  


In [156]:
seq_in.fit([train_encoder_output, train_decoder_input], [train_decoder_target], epochs=5, batch_size=64, verbose=2)

ValueError: Error when checking input: expected embedding_1_input to have 2 dimensions, but got array with shape (30000, 38, 2531)

In [159]:
print(img_emb.summary())
print(word_emb.summary())
print(seq_in.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 150)               76950     
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 1, 150)            0         
Total params: 76,950
Trainable params: 76,950
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 38, 150)           379650    
Total params: 379,650
Trainable params: 379,650
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_1 (Merge)              (None, 39, 150)           0

In [160]:
from keras.utils import plot_model
plot_model(seq_in, to_file='model.png')

keras.legacy.layers.Merge

In [126]:
print(Merge.__doc__)

A `Merge` layer can be used to merge a list of tensors
    into a single tensor, following some merge `mode`.
    # Example
    ```python
    model1 = Sequential()
    model1.add(Dense(32, input_dim=32))
    model2 = Sequential()
    model2.add(Dense(32, input_dim=32))
    merged_model = Sequential()
    merged_model.add(Merge([model1, model2], mode='concat', concat_axis=1))
    ```
    # Arguments
        layers: Can be a list of Keras tensors or
            a list of layer instances. Must be more
            than one layer/tensor.
        mode: String or lambda/function. If string, must be one
            of: 'sum', 'mul', 'concat', 'ave', 'cos', 'dot', 'max'.
            If lambda/function, it should take as input a list of tensors
            and return a single tensor.
        concat_axis: Integer, axis to use in mode `concat`.
        dot_axes: Integer or tuple of integers,
            axes to use in mode `dot` or `cos`.
        output_shape: Either a shape tuple (tuple of intege