In [15]:
import numpy as np

### Load Dataset

In [16]:
all_data = np.load('train_dev_test.npz')

In [17]:
train_encoder_output = all_data['train_encoder_output']
train_decoder_input = all_data['train_decoder_input']
train_decoder_target = all_data['train_decoder_target']
validation_encoder_output = all_data['validation_encoder_output']
validation_decoder_input = all_data['validation_decoder_input']
validation_decoder_target = all_data['validation_decoder_target']
test_encoder_output = all_data['test_encoder_output']
test_decoder_input = all_data['test_decoder_input']
test_decoder_target = all_data['test_decoder_target']

In [18]:
from caption_utils import *
train_fns_list, dev_fns_list, test_fns_list = load_split_lists()

train_captions_raw, dev_captions_raw, test_captions_raw = get_caption_split()
vocab = create_vocab(train_captions_raw)
token2idx, idx2token = vocab_to_index(vocab)    
captions_data = (train_captions_raw.copy(), dev_captions_raw.copy(), test_captions_raw.copy())
train_captions, dev_captions, test_captions = process_captions(captions_data, token2idx)

## Model Architecture

In [36]:
from keras.models import Model, Sequential
from keras.layers import Input, Dense, BatchNormalization, RepeatVector, Concatenate, Merge, Masking
from keras.layers import LSTM, GRU, Embedding, TimeDistributed, Bidirectional
from keras import backend as K
from keras import optimizers
from keras.utils import plot_model

#### Parameters

In [37]:
emb_size = 300
lstm_size = 300
vocab_size = len(vocab)
max_length = train_decoder_target.shape[1]

lr = 0.001
dropout_rate = 0.2
batch_size = 64
n_epochs = 20

### Model

In [38]:
K.clear_session()

In [39]:
# Image -> Image embedding
image_input = Input(shape=(train_encoder_output.shape[1], ), name='image_input')
print("Image Input shape", image_input.shape)
img_emb = Dense(emb_size, activation='relu')(image_input)
img_emb = RepeatVector(1)(img_emb)
print(img_emb.shape)

Image Input shape (?, 512)
(?, 1, 300)


In [40]:
# Sentence to Word embedding
caption_inputs = Input(shape=(None, ), name='caption_input')
print("Caption Input Shape", caption_inputs.shape)
word_emb = Embedding(input_dim=vocab_size, output_dim=emb_size)(caption_inputs)
print(word_emb.shape)

Caption Input Shape (?, ?)
(?, ?, 300)


In [41]:
# Merge img_emb and word_emb
seq_input = Concatenate(axis=1)([img_emb, word_emb])
seq_input = BatchNormalization()(seq_input)
seq_input = Masking(mask_value=0., input_shape=(None, emb_size))(seq_input)
print(seq_input.shape)

(?, ?, 300)


In [42]:
# Sequence to Sequence
gru_cell = Bidirectional(LSTM(lstm_size, return_sequences=True))(seq_input)
gru_cell = BatchNormalization()(gru_cell)
seq_out = TimeDistributed(Dense(vocab_size, activation='softmax'))(gru_cell)

print(seq_out.shape)

(?, ?, 2531)


In [43]:
rmsprop = optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=1e-6)

In [44]:
model = Model(inputs=[image_input,caption_inputs], outputs=[seq_out])

model.compile(optimizer=rmsprop,
               loss='categorical_crossentropy',
               metrics=['accuracy'])

In [45]:
model.summary()
#plot_model(model, to_file='model.png', show_shapes=True)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image_input (InputLayer)        (None, 512)          0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 300)          153900      image_input[0][0]                
__________________________________________________________________________________________________
caption_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)  (None, 1, 300)       0           dense_1[0][0]                    
__________________________________________________________________________________________________
embedding_

In [56]:
model.fit([train_encoder_output[:10,:], train_decoder_input[:10,:]], [train_decoder_target[:10,:,:]], 
#           validation_data=([validation_encoder_output, validation_decoder_input], [validation_decoder_target]),
           epochs=100, batch_size=batch_size, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f7aefe902e8>

### Test the Model

In [51]:
def _generate_seq(encoder_output):
    generated_sentence = []
    start, end = token2idx['<bos>'], token2idx['<eos>']
    current = start
    while len(generated_sentence) < 20:
        X = [encoder_output.reshape(1, 512), np.array([current])]
        
        predicted = model.predict(X)
        
        current = np.argmax(predicted, axis=-1)[0][1]
        
        if current == end:
            break
        generated_sentence.append(idx2token[current])
            
    return ' '.join(generated_sentence)

In [57]:
for i in range(0, 10, 5):
    res = _generate_seq(train_encoder_output[i, :])
    print(res, len(res.split()))

two dogs <bos> two dogs <bos> two dogs <bos> two dogs <bos> two dogs <bos> two dogs <bos> two dogs 20
the <bos> the <bos> the <bos> the <bos> the <bos> the <bos> the <bos> the <bos> the <bos> the <bos> 20


In [53]:
for i in range(20):
    for j in range(5):
        print(intseq_to_caption(idx2token, train_captions[train_fns_list[i]][j]))

<bos> a black dog is running after a white dog in the snow <eos>
<bos> black dog chasing brown dog through snow <eos>
<bos> two dogs chase each other across the snowy ground <eos>
<bos> two dogs play together in the snow <eos>
<bos> two dogs running through a low lying body of water <eos>
<bos> a little baby plays croquet <eos>
<bos> a little girl plays croquet next to a truck <eos>
<bos> the child is playing <unk> by the truck <eos>
<bos> the kid is in front of a car with a put and a ball <eos>
<bos> the little boy is playing with a croquet <unk> and ball beside the car <eos>
<bos> a brown dog in the snow has something hot pink in its mouth <eos>
<bos> a brown dog in the snow holding a pink hat <eos>
<bos> a brown dog is holding a pink shirt in the snow <eos>
<bos> a dog is carrying something pink in its mouth while walking through the snow <eos>
<bos> a dog with something pink in its mouth is looking forward <eos>
<bos> a brown dog is running along a beach <eos>
<bos> a brown dog wea