In [1]:
import tensorflow as tf
import h5py as h5
import keras.backend as K
import numpy as np

Using TensorFlow backend.


In [2]:
# Longest question, in tokens; fixes the length of the question input sequence.
MAX_QUESTION_LEN = 26
# Number of distinct word ids; sizes the question embedding table and the
# per-timestep word classifier.
VOCAB_SIZE = 12602

In [29]:
class VQANet:
    """Visual question answering network built with ``tf.keras``.

    The only supported architecture is ``'show-and-tell'``: the image feature
    vector is projected into the word-embedding space, prepended to the
    embedded question as an extra first timestep, and the combined sequence is
    run through an LSTM. The LSTM's final hidden state feeds a two-layer
    classifier over ``n_answers`` answers.

    Args:
        combine_type: Name of the image/question fusion scheme. Must be one of
            ``SUPPORTED_COMBINE_TYPES``.
        question_embed_dim: Size of the word embedding (and of the projected
            image embedding, so the two can be concatenated along time).
        lstm_dim: Number of LSTM units.
        n_answers: Number of answer classes in the output softmax.

    Raises:
        ValueError: If ``combine_type`` is not supported.
    """

    # combine_type values that build() knows how to construct.
    SUPPORTED_COMBINE_TYPES = ('show-and-tell',)

    def __init__(self, combine_type, question_embed_dim, lstm_dim, n_answers):
        self.combine_type = combine_type
        self.question_embed_dim = question_embed_dim
        self.lstm_dim = lstm_dim
        self.n_answers = n_answers
        self.build()

    def build(self):
        """Construct and compile ``self.model`` for the configured combine type.

        Raises:
            ValueError: If ``self.combine_type`` is not supported. Failing here
                avoids leaving the instance without a ``model`` attribute,
                which would only surface later as an AttributeError in
                ``train``.
        """
        if self.combine_type not in self.SUPPORTED_COMBINE_TYPES:
            raise ValueError(
                "Unsupported combine_type {!r}; expected one of {}".format(
                    self.combine_type, self.SUPPORTED_COMBINE_TYPES))

        # --- Image branch: 4096-d feature vector -> one pseudo-word embedding.
        image_features = tf.keras.layers.Input(shape=(4096,),
                                               dtype='float32')

        image_embedding = tf.keras.layers.Dense(
            units=self.question_embed_dim,
            activation='elu',
            name='image_embedding')(inputs=image_features)

        # Add a time axis of length 1 so the image can be concatenated with
        # the question sequence along axis=1.
        image_embedding = tf.keras.layers.Reshape(
            (1, self.question_embed_dim))(image_embedding)

        # --- Question branch: integer word ids -> embeddings.
        question_input = tf.keras.layers.Input(shape=(MAX_QUESTION_LEN,),
                                               dtype='int32',
                                               name='question_input')

        question_embedding = tf.keras.layers.Embedding(
            input_dim=VOCAB_SIZE,
            output_dim=self.question_embed_dim,
            input_length=MAX_QUESTION_LEN,
            name='question_embedding')(inputs=question_input)

        # Image embedding comes first, followed by the question tokens.
        image_question_embedding = tf.keras.layers.Concatenate(
            axis=1,
            name='image_question_embedding')(
                inputs=[image_embedding, question_embedding])

        # With return_state=True the LSTM yields (sequence, last_h, last_c);
        # the cell state is not needed.
        question_features, last_h, _ = tf.keras.layers.LSTM(
            units=self.lstm_dim,
            return_sequences=True,
            return_state=True,
            name='question_generator')(inputs=image_question_embedding)

        # --- Question-generation head.
        # NOTE: this head is built but is NOT one of the model outputs below,
        # so it is neither trained nor evaluated at present.
        question_pred = tf.keras.layers.TimeDistributed(
            layer=tf.keras.layers.Dense(units=VOCAB_SIZE,
                                        activation='softmax',
                                        name='word_classifier'))(
                                            inputs=question_features)

        # question_pred[:-1] ignores the last output. Need to add <START> and <END>.
        question_pred = tf.keras.layers.Lambda(
            lambda x: x[:, :-1, :],
            name='ignore_last_word')(inputs=question_pred)

        # --- Answer head: final hidden state -> dense -> softmax over answers.
        answer_fc1 = tf.keras.layers.Dense(units=1000,
                                           activation='elu',
                                           name='answer_dense_1')(inputs=last_h)

        answer_pred = tf.keras.layers.Dense(
            units=self.n_answers,
            activation='softmax',
            name='answer_classifier')(inputs=answer_fc1)

        self.model = tf.keras.Model(inputs=[image_features, question_input],
                                    outputs=[answer_pred])

        optimizer = tf.keras.optimizers.Adam(lr=0.001)
        self.model.compile(loss='categorical_crossentropy',
                           optimizer=optimizer, metrics=['accuracy'])

    @staticmethod
    def _has_validation_data(x_val, y_val):
        """Return True when both validation inputs and targets are non-empty."""
        def _non_empty(data):
            # len() is used instead of truthiness so numpy arrays work too.
            return data is not None and len(data) > 0

        return _non_empty(x_val) and _non_empty(y_val)

    def train(self, x_train, y_train, x_val, y_val, batch_size, epochs):
        """Fit the model.

        Args:
            x_train: Training inputs, passed to ``Model.fit`` as ``x``.
            y_train: Training targets, passed to ``Model.fit`` as ``y``.
            x_val: Validation inputs. When this and ``y_val`` are both
                non-empty they are used as an explicit validation set.
            y_val: Validation targets (see ``x_val``).
            batch_size: Mini-batch size.
            epochs: Number of passes over the training data.

        If no validation data is supplied (``None`` or empty), 20% of the
        training data is held out via ``validation_split`` instead.
        """
        fit_kwargs = dict(x=x_train,
                          y=y_train,
                          batch_size=batch_size,
                          epochs=epochs,
                          verbose=1,
                          shuffle=True)

        if self._has_validation_data(x_val, y_val):
            fit_kwargs['validation_data'] = (x_val, y_val)
        else:
            fit_kwargs['validation_split'] = 0.2

        self.model.fit(**fit_kwargs)

In [31]:
# Model dimensions.
question_embed_dim = 256  # word / image embedding width
lstm_dim = 512            # LSTM hidden units
n_answers = 1000          # answer classes in the output softmax

# Optimisation schedule.
batch_size = 32
epochs = 100

# Number of leading training rows read from the HDF5 files.
n_train = 500

In [32]:
# Read the first n_train questions, their image indices and answers.
# Slicing an h5py dataset yields in-memory numpy arrays, so the file can be
# closed as soon as the block exits (the original left both files open).
with h5.File("../../data/data_train_val/data_prepro.h5", "r") as ques:
    ques_train = ques['ques_train'][:n_train]
    # Shift to 0-based row indices into the image feature matrix
    # (presumably the file stores 1-based positions; verify against the
    # preprocessing script).
    ques_to_image_train = ques['img_pos_train'][:n_train] - 1

    # NOTE(review): if 'answers' is also 1-based, labels equal to n_answers
    # would be out of range for to_categorical; confirm the label range.
    ans_train = tf.keras.utils.to_categorical(y=ques['answers'][:n_train],
                                              num_classes=n_answers)

# Load all image features into memory, then gather one row per question.
with h5.File("../../data/data_train_val/data_img.h5", "r") as img_feat:
    img_train = np.array(img_feat['images_train'])[ques_to_image_train]

print(img_train.shape)

model = VQANet(combine_type='show-and-tell',
               question_embed_dim=question_embed_dim,
               lstm_dim=lstm_dim,
               n_answers=n_answers)

# Issue: ques_train input and output are the same.
# No explicit validation set is supplied here (x_val / y_val are empty).
model.train(x_train=[img_train, ques_train],
            y_train=[ans_train],
            x_val=[],
            y_val=[],
            batch_size=batch_size,
            epochs=epochs)

(500, 4096)
Train on 400 samples, validate on 100 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100


Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
