In [1]:
import numpy as np
from PIL import Image

### Load Dataset

In [2]:
# Open the pre-computed train/validation/test archive; the individual arrays
# are pulled out of it by key in the next cell.
all_data = np.load('train_dev_test.npz')

In [3]:
# Unpack the archive one split at a time. Every split provides the same three
# arrays: encoder output (fed to the image branch of the model), decoder
# input and decoder target.
train_encoder_output, train_decoder_input, train_decoder_target = (
    all_data['train_encoder_output'],
    all_data['train_decoder_input'],
    all_data['train_decoder_target'],
)

validation_encoder_output, validation_decoder_input, validation_decoder_target = (
    all_data['validation_encoder_output'],
    all_data['validation_decoder_input'],
    all_data['validation_decoder_target'],
)

test_encoder_output, test_decoder_input, test_decoder_target = (
    all_data['test_encoder_output'],
    all_data['test_decoder_input'],
    all_data['test_decoder_target'],
)

In [4]:
# NOTE(review): the wildcard import hides where names come from. The helpers
# used in this cell (load_split_lists, get_caption_split, create_vocab,
# vocab_to_index) appear to be provided by caption_utils - confirm, then
# consider importing them explicitly.
from caption_utils import *

# Per-split lists (presumably image file names - verify in caption_utils).
(train_fns_list,
 dev_fns_list,
 test_fns_list) = load_split_lists()

# Raw captions for the same three splits.
(train_captions_raw,
 dev_captions_raw,
 test_captions_raw) = get_caption_split()

# The vocabulary is derived from the training captions only; the two lookup
# tables map between tokens and integer indices.
vocab = create_vocab(train_captions_raw)
token2idx, idx2token = vocab_to_index(vocab)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


## Model Architecture

In [5]:
from keras import backend as K
from keras.layers import Input, Dense, BatchNormalization, RepeatVector, Concatenate, Merge
from keras.layers import LSTM, GRU, Embedding, TimeDistributed
from keras.models import Model, Sequential
from keras.optimizers import Adam

#### Parameters

In [6]:
# Network dimensions.
emb_size, lstm_size = 150, 300  # embedding width / recurrent-layer units
vocab_size = len(vocab)
max_length = train_decoder_target.shape[1]  # length of axis 1 of the targets

# Optimisation settings.
lr, dropout_rate = 0.001, 0.2
batch_size, n_epochs = 64, 20

### Model

In [14]:
# Reset the Keras backend session before (re)building the network, so that
# re-running the model cells below starts from a clean state.
K.clear_session()

In [15]:
# Image branch: project each encoder output vector to `emb_size` dimensions,
# then wrap it as a length-1 sequence so it can later be concatenated with the
# word embeddings along axis 1.
img_emb = Sequential([
    Dense(
        units=emb_size,
        activation='relu',
        input_dim=train_encoder_output.shape[1],
    ),
    RepeatVector(1),
])

In [16]:
# Caption branch: look up an `emb_size`-dimensional vector for each of the
# `max_length - 1` token indices in a decoder-input row.
word_emb = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=emb_size,
        input_length=max_length - 1,
    ),
])

In [17]:
# Joint model: put the image vector in front of the word vectors
# (concatenation along axis 1) and batch-normalise the combined sequence.
# NOTE(review): `Merge` is the legacy Keras merge layer; moving to
# `Concatenate` means rebuilding the branches with the functional API -
# confirm against the installed Keras version before upgrading.
model = Sequential([
    Merge([img_emb, word_emb], mode='concat', concat_axis=1),
    BatchNormalization(),
])

  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
# Recurrent decoder: a GRU that emits one state per time step ...
model.add(
    GRU(
        units=lstm_size,
        return_sequences=True,
        dropout=dropout_rate,
        recurrent_dropout=dropout_rate,
    )
)
# ... normalised ...
model.add(BatchNormalization())
# ... and mapped, step by step, to a softmax over the vocabulary.
model.add(
    TimeDistributed(
        Dense(units=vocab_size, activation='softmax')
    )
)

In [19]:
# `summary()` writes its table to stdout itself and returns None, so wrapping
# it in print() only appended a stray "None" line after each table.
img_emb.summary()
word_emb.summary()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 150)               76950     
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 1, 150)            0         
Total params: 76,950
Trainable params: 76,950
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 38, 150)           379650    
Total params: 379,650
Trainable params: 379,650
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_1 (Merge)              (None, 39, 150)           0

In [22]:
from keras.utils import plot_model
# Write a diagram of the assembled network, annotated with tensor shapes,
# to model.png.
plot_model(
    model,
    show_shapes=True,
    to_file='model.png',
)

In [23]:
# Build the optimizer explicitly so the `lr` hyperparameter from the
# parameters cell actually takes effect; the string shortcut 'adam' always
# uses the library's default learning rate and silently ignores `lr`.
model.compile(optimizer=Adam(lr=lr),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

### Train the Model

In [None]:
# Train on (encoder output, decoder input) -> decoder target, evaluating on
# the validation split after every epoch.
model.fit(
    x=[train_encoder_output, train_decoder_input],
    y=[train_decoder_target],
    batch_size=batch_size,
    epochs=n_epochs,
    verbose=2,
    validation_data=(
        [validation_encoder_output, validation_decoder_input],
        [validation_decoder_target],
    ),
)

### Add code to save the model 

In [None]:
from keras.callbacks import ModelCheckpoint

In [None]:
# Checkpoint callback: writes to the path below only when the monitored
# metric improves (save_best_only=True) and reports each save (verbose=1).
# NOTE(review): it has no effect until it is passed to model.fit via
# `callbacks=[checkpointer]`.
checkpointer = ModelCheckpoint(
    filepath='saved_models/weights.best.VGG16.flicker8k',
    save_best_only=True,
    verbose=1,
)

# TODO: call model.fit with callbacks=[checkpointer] so the best weights are actually saved

### Load the Model with the Best Validation Loss

In [None]:
# The network built above is bound to `model`; no `VGG16_model` is defined in
# the cells above, so the original call failed with a NameError.
model.load_weights('saved_models/weights.best.VGG16.flicker8k')

### Test the Model

In [25]:
# Sanity check: show the shape of each test array, one per line.
print(
    test_encoder_output.shape,
    test_decoder_input.shape,
    test_decoder_target.shape,
    sep='\n',
)

(5000, 512)
(5000, 38)
(5000, 39, 2531)
