In [1]:
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras import backend as K
from keras.layers import Input, Dense, Embedding, Reshape, Concatenate, LSTM, TimeDistributed 
import tensorflow as tf
from keras.callbacks import Callback

Using TensorFlow backend.


In [2]:
class RestoreCkptCallback(Callback):
    """Keras callback that restores a TensorFlow checkpoint when training begins.

    The backend session and a ``tf.train.Saver`` are both captured when the
    callback is constructed, so the callback should be created only after the
    graph it is meant to restore has been built.

    Args:
        pretrained_file: Checkpoint path handed to ``Saver.restore``. A falsy
            value (``None`` or ``''``) turns the callback into a no-op.
    """

    def __init__(self, pretrained_file):
        # Let the base class set up its own attributes before adding ours;
        # the original skipped this call.
        super(RestoreCkptCallback, self).__init__()
        self.pretrained_file = pretrained_file
        self.sess = K.get_session()
        # NOTE(review): a Saver built with default arguments presumably covers
        # the variables that exist at this moment only - confirm that this
        # matches the contents of the checkpoint being restored.
        self.saver = tf.train.Saver()

    def on_train_begin(self, logs=None):
        """Restore the checkpoint into the captured session, if a path was given."""
        if not self.pretrained_file:
            return
        self.saver.restore(self.sess, self.pretrained_file)
        print('load weights: OK.')

In [3]:
# Sizes of the token vocabulary and of the answer set.
vocab_size = 6000
n_answers = 6000

# Fixed number of token ids fed to the question input.
max_question_length = 100

In [4]:
# Build VGG16 with ImageNet weights, keeping its fully connected top layers.
CNN = VGG16(weights='imagenet', include_top=True)
# Only removes the classification layer from model.layers: the model's own
# output is not rewired by this, so downstream code must take its features
# from a specific layer's output rather than from `CNN.output`.
# NOTE(review): this relies on `CNN.layers` being the model's actual list and
# not a copy - confirm for the installed Keras version.
CNN.layers.pop()

<keras.layers.core.Dense at 0x7f4f4e5695c0>

In [7]:
# Width shared by the image and question embeddings. They are concatenated
# along axis 1 below, which requires their last dimensions to match.
embedding_dim = 256

# Image branch: take the output of VGG16's 'fc2' layer. The layer is looked up
# by name instead of via `CNN.layers[-1]`, so the result does not depend on
# whether an earlier `CNN.layers.pop()` actually mutated the model's layer list.
image_features = CNN.get_layer('fc2').output
image_embedding = Dense(units=embedding_dim)(image_features)
# Add a length-1 sequence axis so the image can be prepended to the question.
image_embedding = Reshape((1, embedding_dim))(image_embedding)

# Question branch: integer token ids -> learned embeddings.
question_input = Input(shape=(max_question_length,), dtype='int32')
question_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(question_input)

# Sequence fed to the LSTM: the image embedding first, then the question tokens.
image_question_embedding = Concatenate(axis=1)([image_embedding, question_embedding])
question_features, last_h, last_c = LSTM(
    units=512,
    return_sequences=True,
    return_state=True,
)(image_question_embedding)

# Per-timestep softmax over the vocabulary.
question_pred = TimeDistributed(Dense(units=vocab_size, activation='softmax'))(question_features)
model = Model(inputs=[CNN.input, question_input], outputs=[question_pred])

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
# Created after the graph above exists, because the callback builds its Saver
# in its constructor.
restore_ckpt_callback = RestoreCkptCallback(pretrained_file='./weights/oreilly/model-500')

# NOTE(review): `answer_pred` is created after `model` was built and compiled
# and is not one of its outputs, so `model` neither trains nor produces it.
# It is also created after `restore_ckpt_callback`. Confirm whether it should
# be wired into a model.
answer_pred = Dense(units=n_answers, activation='softmax')(last_h)

In [6]:
# Print the layer-by-layer summary of `model`.
# NOTE(review): this cell's execution count (In [6]) is lower than that of the
# model-building cell (In [7]), so the output shown below may come from an
# earlier version of the model - re-run after a kernel restart to confirm.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 224, 224, 64) 1792        input_1[0][0]                    
__________________________________________________________________________________________________
block1_conv2 (Conv2D)           (None, 224, 224, 64) 36928       block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_pool (MaxPooling2D)      (None, 112, 112, 64) 0           block1_conv2[0][0]               
__________________________________________________________________________________________________
block2_con