In [29]:
import numpy as np

### Load Dataset

In [6]:
all_data = np.load('train_dev_test.npz')

In [7]:
train_encoder_output = all_data['train_encoder_output']
train_decoder_input = all_data['train_decoder_input']
train_decoder_target = all_data['train_decoder_target']
validation_encoder_output = all_data['validation_encoder_output']
validation_decoder_input = all_data['validation_decoder_input']
validation_decoder_target = all_data['validation_decoder_target']
test_encoder_output = all_data['test_encoder_output']
test_decoder_input = all_data['test_decoder_input']
test_decoder_target = all_data['test_decoder_target']

In [89]:
from caption_utils import *
train_fns_list, dev_fns_list, test_fns_list = load_split_lists()

train_captions_raw, dev_captions_raw, test_captions_raw = get_caption_split()
vocab = create_vocab(train_captions_raw)
token2idx, idx2token = vocab_to_index(vocab)    
captions_data = (train_captions_raw.copy(), dev_captions_raw.copy(), test_captions_raw.copy())
train_captions, dev_captions, test_captions = process_captions(captions_data, token2idx)

## Model Architecture

In [203]:
from keras.models import Model, Sequential
from keras.layers import Input, Dense, BatchNormalization, RepeatVector, Concatenate, Merge, Masking
from keras.layers import LSTM, GRU, Embedding, TimeDistributed, Bidirectional
from keras import backend as K

#### Parameters

In [204]:
emb_size = 300
lstm_size = 300
vocab_size = len(vocab)
max_length = train_decoder_target.shape[1]

lr = 0.001
dropout_rate = 0.2
batch_size = 64
n_epochs = 20

### Model

In [220]:
K.clear_session()

In [221]:
# Image -> Image embedding
image_input = Input(shape=(train_encoder_output.shape[1], ), name='image_input')
print("Image Input shape", image_input.shape)
img_emb = Dense(emb_size, activation='relu')(image_input)
img_emb = RepeatVector(1)(img_emb)
print(img_emb.shape)

Image Input shape (?, 512)
(?, 1, 300)


In [222]:
# Sentence to Word embedding
caption_inputs = Input(shape=(None, ), name='caption_input')
print("Caption Input Shape", caption_inputs.shape)
word_emb = Embedding(input_dim=vocab_size, output_dim=emb_size)(caption_inputs)
print(word_emb.shape)

Caption Input Shape (?, ?)
(?, ?, 300)


In [223]:
# Merge img_emb and word_emb
seq_input = Concatenate(axis=1)([img_emb, word_emb])
seq_input = BatchNormalization()(seq_input)
seq_input = Masking(mask_value=0., input_shape=(None, emb_size))(seq_input)
print(seq_input.shape)

(?, ?, 300)


In [224]:
# Sequence to Sequence
gru_cell = Bidirectional(LSTM(lstm_size, return_sequences=True))(seq_input)
gru_cell = BatchNormalization()(gru_cell)
seq_out = TimeDistributed(Dense(vocab_size, activation='softmax'))(gru_cell)

print(seq_out.shape)

(?, ?, 2531)


In [225]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image_input (InputLayer)        (None, 512)          0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 300)          153900      image_input[0][0]                
__________________________________________________________________________________________________
caption_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)  (None, 1, 300)       0           dense_1[0][0]                    
__________________________________________________________________________________________________
embedding_

In [226]:
from keras.utils import plot_model
plot_model(model, to_file='model.png', show_shapes=True)

In [227]:
model = Model(inputs=[image_input,caption_inputs],outputs=[seq_out])

model.compile(optimizer='rmsprop',
               loss='categorical_crossentropy',
               metrics=['accuracy'])

In [228]:
model.fit([train_encoder_output[:100,:], train_decoder_input[:100,:]], [train_decoder_target[:100,:,:]], 
#           validation_data=([validation_encoder_output, validation_decoder_input], [validation_decoder_target]),
           epochs=n_epochs, batch_size=batch_size, verbose=2)

Epoch 1/20
 - 6s - loss: 2.3584 - acc: 0.0308
Epoch 2/20
 - 3s - loss: 1.3561 - acc: 0.0908
Epoch 3/20
 - 3s - loss: 0.9781 - acc: 0.1226
Epoch 4/20
 - 3s - loss: 0.6944 - acc: 0.1723
Epoch 5/20
 - 3s - loss: 0.5033 - acc: 0.2072
Epoch 6/20
 - 3s - loss: 0.3304 - acc: 0.2441
Epoch 7/20
 - 3s - loss: 0.1960 - acc: 0.2997
Epoch 8/20
 - 3s - loss: 0.1353 - acc: 0.3115
Epoch 9/20
 - 3s - loss: 0.1226 - acc: 0.3133
Epoch 10/20
 - 3s - loss: 0.0668 - acc: 0.3264
Epoch 11/20
 - 3s - loss: 0.0488 - acc: 0.3282
Epoch 12/20
 - 3s - loss: 0.0382 - acc: 0.3279
Epoch 13/20
 - 3s - loss: 0.0307 - acc: 0.3287
Epoch 14/20
 - 4s - loss: 0.0236 - acc: 0.3297
Epoch 15/20
 - 4s - loss: 0.0189 - acc: 0.3295
Epoch 16/20
 - 4s - loss: 0.0163 - acc: 0.3292
Epoch 17/20
 - 5s - loss: 0.0136 - acc: 0.3295
Epoch 18/20
 - 4s - loss: 0.0115 - acc: 0.3295
Epoch 19/20
 - 3s - loss: 0.0090 - acc: 0.3297
Epoch 20/20
 - 4s - loss: 0.0078 - acc: 0.3297


<keras.callbacks.History at 0x4db0d3ac8>

### Test the Model

In [229]:
def _generate_seq(encoder_output):
    generated_sentence = []
    start, end = token2idx['<bos>'], token2idx['<eos>']
    current = start
    while len(generated_sentence) < 20:
        X = [encoder_output.reshape(1, 512), np.array([current])]
        
        predicted = model.predict(X)
        
        current = np.argmax(predicted, axis=-1)[0][1]
        
        if current == end:
            break
        generated_sentence.append(idx2token[current])
            
    return ' '.join(generated_sentence)

In [230]:
for i in range(0, 200, 5):
    res = _generate_seq(train_encoder_output[i, :])
    print(res, len(res.split()))

<bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> 20
<bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> 20
<bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> 20
<bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> 20
dog dog dog dog dog dog dog dog dog dog dog dog dog dog dog dog dog dog dog dog 20
person person person person person person person person person person person person person person person person person person person person 20
<bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> 20
<bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> 20
<bos> <bos> <bos> <bos> <bos> <bos> 

In [231]:
for i in range(20):
    for j in range(5):
        print(intseq_to_caption(idx2token, train_captions[train_fns_list[i]][j]))

<bos> a black dog is running after a white dog in the snow <eos>
<bos> black dog chasing brown dog through snow <eos>
<bos> two dogs chase each other across the snowy ground <eos>
<bos> two dogs play together in the snow <eos>
<bos> two dogs running through a low lying body of water <eos>
<bos> a little baby plays croquet <eos>
<bos> a little girl plays croquet next to a truck <eos>
<bos> the child is playing <unk> by the truck <eos>
<bos> the kid is in front of a car with a put and a ball <eos>
<bos> the little boy is playing with a croquet <unk> and ball beside the car <eos>
<bos> a brown dog in the snow has something hot pink in its mouth <eos>
<bos> a brown dog in the snow holding a pink hat <eos>
<bos> a brown dog is holding a pink shirt in the snow <eos>
<bos> a dog is carrying something pink in its mouth while walking through the snow <eos>
<bos> a dog with something pink in its mouth is looking forward <eos>
<bos> a brown dog is running along a beach <eos>
<bos> a brown dog wea