In [1]:
import tensorflow as tf
import h5py as h5
import keras.backend as K
import numpy as np

Using TensorFlow backend.


In [2]:
# Input-encoding constants shared by the model definition.
# Number of token positions in the `question_input` tensor.
MAX_QUESTION_LEN = 26
# Number of distinct word ids: embedding `input_dim` and word-classifier width.
VOCAB_SIZE = 12602

In [3]:
# def qa_loss(y_true, y_pred):
#     print(y_true[0])
#     print(y_pred[0])
#     q_true, a_true = y_true[0], y_true[1]
#     q_pred, a_pred = y_pred[0][:-1], y_pred[1]
    
#     q_loss = K.sum(q_true * K.log(q_pred), axis=-1)
#     q_loss = K.sum(q_loss, axis=-1)
#     a_loss = K.sum(a_true * K.log(a_pred), axis=-1)
#     loss = K.sum(y_true * K.log(y_pred), axis=-1)
#     return loss

In [7]:
class VQANet:
    """Visual question answering network built with ``tf.keras``.

    The only supported ``combine_type`` is ``'show-and-tell'``: the image
    feature vector is projected into the word-embedding space, prepended to
    the embedded question as an extra time step, and the resulting sequence is
    run through an LSTM. The LSTM's final hidden state feeds a small MLP that
    classifies the answer.

    Args:
        combine_type: How image and question are fused. Must be one of
            ``SUPPORTED_COMBINE_TYPES``.
        question_embed_dim: Width of the word embedding; the image projection
            uses the same width so both can be concatenated.
        lstm_dim: Number of LSTM units.
        n_answers: Number of answer classes (width of the softmax output).

    Raises:
        ValueError: If ``combine_type`` is not supported.
    """

    SUPPORTED_COMBINE_TYPES = ('show-and-tell',)

    def __init__(self, combine_type, question_embed_dim, lstm_dim, n_answers):
        self.combine_type = combine_type
        self.question_embed_dim = question_embed_dim
        self.lstm_dim = lstm_dim
        self.n_answers = n_answers
        self.build()

    def build(self):
        """Create and compile ``self.model`` for the configured ``combine_type``.

        Raises:
            ValueError: If ``self.combine_type`` is not supported. Failing here
                avoids a confusing ``AttributeError`` on ``self.model`` later.
        """
        if self.combine_type not in self.SUPPORTED_COMBINE_TYPES:
            raise ValueError(
                "Unsupported combine_type {!r}; expected one of {}".format(
                    self.combine_type, self.SUPPORTED_COMBINE_TYPES))

        if self.combine_type == 'show-and-tell':
            self._build_show_and_tell()

    def _build_show_and_tell(self):
        """Build the image-as-first-token LSTM model and compile it."""
        layers = tf.keras.layers

        image_features = layers.Input(shape=(4096,), dtype='float32')

        # Project the image features to the word-embedding width and give them
        # a time axis of length 1 so they can act as the first sequence step.
        image_embedding = layers.Dense(units=self.question_embed_dim,
                                       activation='elu',
                                       name='image_embedding')(image_features)
        image_embedding = layers.Reshape(
            (1, self.question_embed_dim))(image_embedding)

        question_input = layers.Input(shape=(MAX_QUESTION_LEN,),
                                      dtype='int32',
                                      name='question_input')
        question_embedding = layers.Embedding(
            input_dim=VOCAB_SIZE,
            output_dim=self.question_embed_dim,
            input_length=MAX_QUESTION_LEN,
            name='question_embedding')(question_input)

        # Sequence of length MAX_QUESTION_LEN + 1: image step, then the words.
        image_question_embedding = layers.Concatenate(
            axis=1,
            name='image_question_embedding')([image_embedding,
                                              question_embedding])

        # return_state=True yields (sequence_output, last_h, last_c).
        question_features, last_h, _ = layers.LSTM(
            units=self.lstm_dim,
            return_sequences=True,
            return_state=True,
            name='question_generator')(image_question_embedding)

        # Per-step word distribution. NOTE: this branch is NOT part of the
        # model outputs below, so it is neither trained nor predicted; it is
        # kept for the planned question-generation objective.
        question_pred = layers.TimeDistributed(
            layer=layers.Dense(units=VOCAB_SIZE,
                               activation='softmax',
                               name='word_classifier'))(question_features)

        # Drop the last time step so the length matches MAX_QUESTION_LEN.
        # TODO: add <START> and <END> tokens.
        question_pred = layers.Lambda(lambda x: x[:, :-1, :],
                                      name='ignore_last_word')(question_pred)

        answer_fc1 = layers.Dense(units=1000,
                                  activation='elu',
                                  name='answer_dense_1')(last_h)
        answer_fc2 = layers.Dense(units=1000,
                                  activation='elu',
                                  name='answer_dense_2')(answer_fc1)
        answer_pred = layers.Dense(units=self.n_answers,
                                   activation='softmax',
                                   name='answer_classifier')(answer_fc2)

        self.model = tf.keras.Model(inputs=[image_features, question_input],
                                    outputs=[answer_pred])

        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam', metrics=['accuracy'])

    @staticmethod
    def _has_samples(data):
        """Return True when ``data`` is neither ``None`` nor empty."""
        return data is not None and len(data) > 0

    def train(self, x_train, y_train, x_val, y_val, batch_size, epochs):
        """Fit the model and return the Keras ``History`` object.

        Args:
            x_train: Training inputs, passed to ``fit`` as ``x``.
            y_train: Training targets, passed to ``fit`` as ``y``.
            x_val: Validation inputs. When both ``x_val`` and ``y_val`` are
                non-empty they are used as explicit validation data.
            y_val: Validation targets (see ``x_val``).
            batch_size: Mini-batch size.
            epochs: Number of passes over the training data.

        Returns:
            Whatever ``self.model.fit`` returns.
        """
        fit_kwargs = dict(x=x_train,
                          y=y_train,
                          batch_size=batch_size,
                          epochs=epochs,
                          verbose=1,
                          shuffle=True)

        if self._has_samples(x_val) and self._has_samples(y_val):
            fit_kwargs['validation_data'] = (x_val, y_val)
        else:
            # No explicit validation set supplied: hold out 20% of the
            # training data, as the original implementation always did.
            fit_kwargs['validation_split'] = 0.2

        return self.model.fit(**fit_kwargs)

In [5]:
# Data subset: number of leading samples sliced from the HDF5 datasets.
n_train = 1000

# Model dimensions handed to VQANet.
n_answers = 1000
lstm_dim = 512
question_embed_dim = 256

# Optimisation settings handed to VQANet.train.
epochs = 10
batch_size = 32

In [8]:
# Load the first `n_train` questions, their image indices and answers.
# The context manager closes the HDF5 file once the slices are in memory.
with h5.File("../../data/data_train_val/data_prepro.h5", "r") as ques:
    ques_train = ques['ques_train'][:n_train]
    # Shift by one to index rows of the image feature array.
    # NOTE(review): presumably `img_pos_train` is 1-based - confirm.
    ques_to_image_train = ques['img_pos_train'][:n_train] - 1
    answers_train = ques['answers'][:n_train]

# One-hot encode the answers to match the model's softmax output.
# NOTE(review): assumes every answer id lies in [0, n_answers) - confirm
# against the preprocessing script.
ans_train = tf.keras.utils.to_categorical(answers_train,
                                          num_classes=n_answers)

# Gather one image feature row per question.
with h5.File("../../data/data_train_val/data_img.h5", "r") as img_feat:
    img_train = np.array(img_feat['images_train'])[ques_to_image_train]

print(img_train.shape)

model = VQANet(combine_type='show-and-tell',
               question_embed_dim=question_embed_dim,
               lstm_dim=lstm_dim,
               n_answers=n_answers)

# summary() prints the table itself; wrapping it in print() adds a stray "None".
model.model.summary()

# The model has a single output (the answer distribution), so the target is
# `ans_train` alone.
# TODO: once question generation is an output, its target would be
# `ques_train` itself, i.e. the same data as the input. That still needs
# <START>/<END> handling.
model.train(x_train=[img_train, ques_train],
            y_train=ans_train,
            x_val=[],
            y_val=[],
            batch_size=batch_size,
            epochs=epochs)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
image_embedding (Dense)         (None, 256)          1048832     input_2[0][0]                    
__________________________________________________________________________________________________
question_input (InputLayer)     (None, 26)           0                                            
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 1, 256)       0           image_embedding[0][0]            
__________________________________________________________________________________________________
question_e

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 800 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
