In [1]:
import tensorflow as tf
import h5py as h5
import keras.backend as K
import numpy as np

Using TensorFlow backend.


In [2]:
# Length of the integer question sequence fed to the network; also the number
# of times the image embedding is repeated in the 'feed_CNN_to_all' mode.
MAX_QUESTION_LEN = 26

# Number of word ids: input_dim of the question embedding and the number of
# units (classes) of the per-timestep word classifier.
VOCAB_SIZE = 12602

In [26]:
class VQANet:
    """Two-headed Keras model: per-timestep word prediction and answer classification.

    A 4096-d image feature vector is projected to ``question_embed_dim`` and
    merged with the embedded question before a single LSTM. The LSTM's
    per-step outputs feed a softmax over ``VOCAB_SIZE`` words, and its final
    hidden state feeds two 1000-unit ELU layers followed by a softmax over
    ``n_answers`` classes.

    Args:
        combine_type: How the image embedding is merged with the question:
            ``'show_and_tell'`` prepends it as one extra timestep (the
            corresponding extra word prediction at the end is dropped);
            ``'feed_CNN_to_all'`` concatenates it to the word embedding at
            every timestep.
        question_embed_dim: Size of the word embedding and of the image
            projection.
        lstm_dim: Number of LSTM units.
        n_answers: Number of answer classes.

    Raises:
        ValueError: If ``combine_type`` is not one of ``COMBINE_TYPES``.

    The compiled model is available as ``self.model``. ``VOCAB_SIZE`` and
    ``MAX_QUESTION_LEN`` are read from module scope when the graph is built.
    """

    # Supported ways of merging the image embedding with the question.
    COMBINE_TYPES = ('show_and_tell', 'feed_CNN_to_all')

    def __init__(self, combine_type, question_embed_dim, lstm_dim, n_answers):
        self.combine_type = combine_type
        self.question_embed_dim = question_embed_dim
        self.lstm_dim = lstm_dim
        self.n_answers = n_answers
        self.build()

    def build(self):
        """Create the Keras graph, store it in ``self.model`` and compile it."""
        # Fail early with a clear message: without this check an unknown mode
        # left `concat_axis` unbound and only crashed at the Concatenate call.
        if self.combine_type not in self.COMBINE_TYPES:
            raise ValueError(
                "combine_type must be one of {!r}, got {!r}".format(
                    self.COMBINE_TYPES, self.combine_type))

        image_features = tf.keras.layers.Input(shape=(4096,),
                                               dtype='float32')

        # Project the image features into the same space as the word embeddings.
        image_embedding = tf.keras.layers.Dense(units=self.question_embed_dim,
                                                activation='elu',
                                                name='image_embedding')(inputs=image_features)

        if self.combine_type == 'show_and_tell':
            # Image becomes a single extra timestep in front of the question.
            concat_axis = 1
            image_embedding = tf.keras.layers.Reshape((1, self.question_embed_dim))(inputs=image_embedding)
        else:  # 'feed_CNN_to_all'
            # Image is appended to the features of every question timestep.
            concat_axis = -1
            image_embedding = tf.keras.layers.RepeatVector(MAX_QUESTION_LEN)(inputs=image_embedding)

        question_input = tf.keras.layers.Input(shape=(MAX_QUESTION_LEN,),
                                               dtype='int32',
                                               name='question_input')

        question_embedding = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE,
                                                       output_dim=self.question_embed_dim,
                                                       input_length=MAX_QUESTION_LEN,
                                                       name='question_embedding')(inputs=question_input)

        image_question_embedding = tf.keras.layers.Concatenate(axis=concat_axis,
                                                               name='image_question_embedding')(inputs=[image_embedding, question_embedding])

        # return_state=True yields (sequence of outputs, final hidden state,
        # final cell state); the cell state is not used.
        question_features, last_h, _ = tf.keras.layers.LSTM(units=self.lstm_dim,
                                                            return_sequences=True,
                                                            return_state=True,
                                                            name='question_generator')(inputs=image_question_embedding)

        # The name is set on the TimeDistributed wrapper (not only the inner
        # Dense) so the output, and therefore the loss/metric names Keras
        # reports, does not get an auto-numbered `time_distributed_N` name.
        word_dense = tf.keras.layers.Dense(units=VOCAB_SIZE,
                                           activation='softmax',
                                           name='word_classifier_dense')
        question_pred = tf.keras.layers.TimeDistributed(layer=word_dense,
                                                        name='word_classifier')(inputs=question_features)

        if self.combine_type == 'show_and_tell':
            # The prepended image step makes the sequence one step longer than
            # the question; drop the last prediction to get MAX_QUESTION_LEN steps.
            question_pred = tf.keras.layers.Lambda(lambda x: x[:, :-1, :],
                                                   name='ignore_last_word')(inputs=question_pred)

        answer_fc1 = tf.keras.layers.Dense(units=1000,
                                           activation='elu',
                                           name='answer_dense_1')(inputs=last_h)

        answer_fc2 = tf.keras.layers.Dense(units=1000,
                                           activation='elu',
                                           name='answer_dense_2')(inputs=answer_fc1)

        answer_pred = tf.keras.layers.Dense(units=self.n_answers,
                                            activation='softmax',
                                            name='answer_classifier')(inputs=answer_fc2)

        self.model = tf.keras.Model(inputs=[image_features, question_input],
                                    outputs=[question_pred, answer_pred])

        # The same loss and metric are applied to both outputs.
        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam', metrics=['accuracy'])

    @staticmethod
    def _has_samples(data):
        """Return True when ``data`` is neither None nor an empty container."""
        if data is None:
            return False
        try:
            # len() rather than truthiness: bool() of a NumPy array is ambiguous.
            return len(data) > 0
        except TypeError:
            return True

    def train(self, x_train, y_train, x_val, y_val, batch_size, epochs):
        """Fit ``self.model`` and return whatever ``fit`` returns.

        Args:
            x_train: Model inputs, ``[image_features, questions]``.
            y_train: Targets, ``[one-hot question words, one-hot answers]``.
            x_val: Validation inputs in the same layout as ``x_train``.
            y_val: Validation targets in the same layout as ``y_train``.
            batch_size: Samples per gradient update.
            epochs: Number of passes over the training data.

        Returns:
            The value returned by ``self.model.fit`` (a Keras ``History``).

        When both ``x_val`` and ``y_val`` are non-empty they are used as the
        validation set. Otherwise (``None``, ``[]``, or only one of the two
        supplied) 20% of the training data is held out via
        ``validation_split``, which was the previous unconditional behaviour.
        """
        fit_kwargs = dict(x=x_train,
                          y=y_train,
                          batch_size=batch_size,
                          epochs=epochs,
                          verbose=1,
                          shuffle=True)

        if self._has_samples(x_val) and self._has_samples(y_val):
            fit_kwargs['validation_data'] = (x_val, y_val)
        else:
            fit_kwargs['validation_split'] = 0.2

        return self.model.fit(**fit_kwargs)

In [27]:
# Model sizes passed to VQANet.
question_embed_dim = 256   # word-embedding / image-projection width
lstm_dim = 512             # LSTM units
n_answers = 1000           # answer classes (also num_classes of the one-hot answers)

# Training-run settings.
n_train = 1000             # number of leading rows read from each training dataset
batch_size = 32
epochs = 10

In [28]:
# Read the first `n_train` questions, their image indices and their answers.
# The files are opened in `with` blocks so the HDF5 handles are closed as soon
# as the slices have been copied into memory.
with h5.File("../../data/data_train_val/data_prepro.h5", "r") as ques:
    ques_train = ques['ques_train'][:n_train]
    # The -1 suggests the stored image positions are 1-based — TODO confirm
    # against the preprocessing script.
    ques_to_image_train = ques['img_pos_train'][:n_train] - 1

    # NOTE(review): if the answer ids are 1-based as well, id == n_answers
    # would be out of range for to_categorical and class 0 would never be
    # used — confirm and subtract 1 if so.
    ans_train = tf.keras.utils.to_categorical(y=ques['answers'][:n_train],
                                              num_classes=n_answers)

with h5.File("../../data/data_train_val/data_img.h5", "r") as img_feat:
    # The whole image-feature dataset is materialised before the per-question
    # rows are picked out (indices may repeat and are not necessarily sorted).
    img_train = np.array(img_feat['images_train'])[ques_to_image_train]

print(img_train.shape)

model = VQANet(combine_type='feed_CNN_to_all',
               question_embed_dim=question_embed_dim,
               lstm_dim=lstm_dim,
               n_answers=n_answers)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which only added a stray "None" line).
model.model.summary()

# NOTE(review): the word targets are the unshifted input question. In
# 'feed_CNN_to_all' mode the prediction at step t has already received word t
# as input, so the word head can learn to copy its input — confirm whether the
# targets should be shifted by one step.
# The one-hot word targets have shape (n_train, MAX_QUESTION_LEN, VOCAB_SIZE).
# Empty x_val / y_val: the model holds out part of the training data instead.
model.train(x_train=[img_train, ques_train],
            y_train=[tf.keras.utils.to_categorical(y=ques_train, num_classes=VOCAB_SIZE), ans_train],
            x_val=[],
            y_val=[],
            batch_size=batch_size,
            epochs=epochs)

(1000, 4096)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
image_embedding (Dense)         (None, 256)          1048832     input_9[0][0]                    
__________________________________________________________________________________________________
question_input (InputLayer)     (None, 26)           0                                            
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)  (None, 26, 256)      0           image_embedding[0][0]            
________________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 800 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
160/800 [=====>........................] - ETA: 24s - loss: 5.9384 - time_distributed_5_loss: 1.7796 - answer_classifier_loss: 4.1588 - time_distributed_5_acc: 0.7228 - answer_classifier_acc: 0.2313

KeyboardInterrupt: 