# Visual Question Answering 

### Baseline Approach: A Bag of Words Model

This notebook is simply an execution of the code to build VQA model, I would highly encourage you to read the [full post here](https://sominwadhwa.github.io/blog/2018/01/01/de/)

<p align="center">
  <img src="https://github.com/sominwadhwa/sominwadhwa.github.io/blob/master/assets/vqa/6.jpg?raw=true"/>
</p>

In [11]:
import sys, warnings
warnings.filterwarnings("ignore")
from random import shuffle, sample
import pickle as pk
import gc

import numpy as np
import pandas as pd
import scipy.io
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Reshape
from keras.layers.recurrent import LSTM
from keras.layers import Merge
from keras.layers.merge import Concatenate
from keras.optimizers import SGD
from keras.utils import np_utils, generic_utils
from progressbar import Bar, ETA, Percentage, ProgressBar    
from keras.models import model_from_json
from sklearn.preprocessing import LabelEncoder
import spacy
#from spacy.en import English

from src.utils import freq_answers, grouped, get_questions_sum, get_images_matrix, get_answers_sum
from src.extract_features import get_questions_matrix_sum, get_images_matrix, get_answers_matrix, get_questions_tensor_timeseries

In [12]:
training_questions = open("preprocessed/v2/ques_train.txt","rb").read().decode('utf8').splitlines()
training_questions_len = open("preprocessed/v2/ques_train_len.txt","rb").read().decode('utf8').splitlines()
answers_train = open("preprocessed/v2/answer_train.txt","rb").read().decode('utf8').splitlines()
images_train = open("preprocessed/v2/images_coco_id.txt","rb").read().decode('utf8').splitlines()
img_ids = open('preprocessed/v2/coco_vgg_IDMap.txt').read().splitlines()
vgg_path = "data/coco/vgg_feats.mat"

In [13]:
sample(list(zip(images_train, training_questions, answers_train)), 5)

[('372870', 'Are these tree branches?', 'no'),
 ('18367', "What is on the person's head?", 'hat'),
 ('330004',
  'Is there a decorative floral border at the top of the wall?',
  'yes'),
 ('394269', "Is there a children's cup here?", 'no'),
 ('579623', 'Is there room for at least one more passenger here?', 'yes')]

In [14]:
nlp = spacy.load("en_core_web_md")
print ("Loaded WordVec")

Loaded WordVec


In [15]:
vgg_features = scipy.io.loadmat(vgg_path)
img_features = vgg_features['feats']
id_map = dict()
print ("Loaded VGG Weights")

Loaded VGG Weights


In [16]:
gc.collect()

278

In [17]:
upper_lim = 1000 #Number of most frequently occurring answers in COCOVQA (Covering >80% of the total data)
training_questions, answers_train, images_train = freq_answers(training_questions, 
                                                               answers_train, images_train, upper_lim)
training_questions_len, training_questions, answers_train, images_train = (list(t) for t in zip(*sorted(zip(training_questions_len, 
                                                                                                          training_questions, answers_train, 
                                                                                                          images_train))))
print (len(training_questions), len(answers_train),len(images_train))

387861 387861 387861


In [18]:
lbl = LabelEncoder()
lbl.fit(answers_train)
nb_classes = len(list(lbl.classes_))
pk.dump(lbl, open('preprocessed/v2/label_encoder_lstm.sav','wb'))

In [19]:
batch_size               =      128
img_dim                  =     4096
word2vec_dim             =      300
#max_len                 =       30 # Required only when using Fixed-Length Padding

num_hidden_nodes_mlp     =     1024
num_hidden_nodes_lstm    =      512
num_layers_mlp           =        3
num_layers_lstm          =        3
dropout                  =       0.5
activation_mlp           =     'tanh'


In [20]:
# Change the following based on your usage, THESE WILL DIRECTLY AFFECT THE DURATION OF NETWORK TRAINING
num_epochs               =         1 
log_interval             =         1 

In [21]:
for ids in img_ids:
    id_split = ids.split()
    id_map[id_split[0]] = int(id_split[1])

In [22]:
image_model = Sequential()
image_model.add(Reshape(input_shape = (img_dim,), target_shape=(img_dim,)))
image_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 4096)              0         
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


In [23]:
language_model = Sequential()
language_model.add(LSTM(output_dim=num_hidden_nodes_lstm, 
                        return_sequences=True, input_shape=(None, word2vec_dim)))

for i in range(num_layers_lstm-2):
    language_model.add(LSTM(output_dim=num_hidden_nodes_lstm, return_sequences=True))
language_model.add(LSTM(output_dim=num_hidden_nodes_lstm, return_sequences=False))

language_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, None, 512)         1665024   
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 512)         2099200   
_________________________________________________________________
lstm_3 (LSTM)                (None, 512)               2099200   
Total params: 5,863,424
Trainable params: 5,863,424
Non-trainable params: 0
_________________________________________________________________


In [24]:
model = Sequential()
model.add(Merge([language_model, image_model], mode='concat', concat_axis=1))
for i in range(num_layers_mlp):
    model.add(Dense(num_hidden_nodes_mlp, init='uniform'))
    model.add(Activation('tanh'))
    model.add(Dropout(0.5))
model.add(Dense(upper_lim))
model.add(Activation("softmax"))

In [25]:
model_dump = model.to_json()
open('lstm_structure'  + '.json', 'w').write(model_dump)

5720

In [26]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_1 (Merge)              (None, 4608)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              4719616   
_________________________________________________________________
activation_1 (Activation)    (None, 1024)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
activation_2 (Activation)    (None, 1024)              0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 1024)              0         
__________

In [18]:
for k in range(num_epochs):
    progbar = generic_utils.Progbar(len(training_questions))
    for ques_batch, ans_batch, im_batch in zip(grouped(training_questions, batch_size, 
                                                       fillvalue=training_questions[-1]), 
                                               grouped(answers_train, batch_size, 
                                                       fillvalue=answers_train[-1]), 
                                               grouped(images_train, batch_size, fillvalue=images_train[-1])):
        timestep = len(nlp(ques_batch[-1]))
        X_ques_batch = get_questions_tensor_timeseries(ques_batch, nlp, timestep)
        #print (X_ques_batch.shape)
        X_img_batch = get_images_matrix(im_batch, id_map, img_features)
        Y_batch = get_answers_sum(ans_batch, lbl)
        loss = model.train_on_batch([X_ques_batch, X_img_batch], Y_batch)
        progbar.add(batch_size, values=[('train loss', loss)])
    if k%log_interval == 0:
        model.save_weights("weights/LSTM" + "_epoch_{:02d}.hdf5".format(k))
model.save_weights("weights/MLP" + "_epoch_{:02d}.hdf5".format(k))

  2560/387861 [..............................] - ETA: 10765s - train loss: 2.4371

KeyboardInterrupt: 

In [27]:
model = model_from_json(open('lstm_structure.json').read())
#model.load_weights('weights/') #Pass in your weights file
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

print ("Model Loaded with Weights")
model.summary()

Model Loaded with Weights
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_1 (Merge)              (None, 4608)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              4719616   
_________________________________________________________________
activation_1 (Activation)    (None, 1024)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
activation_2 (Activation)    (None, 1024)              0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 1024)         

In [28]:
val_imgs = open('preprocessed/v2/val_images_coco_id.txt','rb').read().decode('utf-8').splitlines()
val_ques = open('preprocessed/v2/ques_val.txt','rb').read().decode('utf-8').splitlines()
val_ans = open('preprocessed/v2/answer_val.txt','rb').read().decode('utf-8').splitlines()

In [29]:
label_encoder = pk.load(open('preprocessed/v2/label_encoder_lstm.sav','rb'))

In [30]:
y_pred = []
batch_size = 128 

#print ("Word2Vec Loaded!")

widgets = ['Evaluating ', Percentage(), ' ', Bar(marker='#',left='[',right=']'), ' ', ETA()]
pbar = ProgressBar(widgets=widgets)

In [31]:
for qu_batch,an_batch,im_batch in pbar(zip(grouped(val_ques, batch_size, 
                                                   fillvalue=val_ques[0]), 
                                           grouped(val_ans, batch_size, 
                                                   fillvalue=val_ans[0]), 
                                           grouped(val_imgs, batch_size, 
                                                   fillvalue=val_imgs[0]))):
    timesteps = len(nlp(qu_batch[-1]))
    X_ques_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps)
    X_i_batch = get_images_matrix(im_batch, id_map, img_features)
    X_batch = [X_ques_batch, X_i_batch]
    y_predict = model.predict_classes(X_batch, verbose=0)
    y_pred.extend(label_encoder.inverse_transform(y_predict))
    
correct_val = 0.0
total = 0
f1 = open('res.txt','w')

for pred, truth, ques, img in zip(y_pred, val_ans, val_ques, val_imgs):
    t_count = 0
    for _truth in truth.split(';'):
        if pred == truth:
            t_count += 1 
    if t_count >=2:
        correct_val +=1
    else:
        correct_val += float(t_count)/3

    total +=1

    try:
        f1.write(str(ques))
        f1.write('\n')
        f1.write(str(img))
        f1.write('\n')
        f1.write(str(pred))
        f1.write('\n')
        f1.write(str(truth))
        f1.write('\n')
        f1.write('\n')
    except:
        pass

print ("Accuracy: ", round((correct_val/total)*100,2)*)
f1.write('Final Accuracy is ' + str(round(correct_val/total),2)*100)
f1.close()

Accuracy:  54.79
