In [2]:
import sys
from random import shuffle
import pickle as pk
import gc

import numpy as np
import scipy.io
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.utils import np_utils, generic_utils
from progressbar import Bar, ETA, Percentage, ProgressBar    
from keras.models import model_from_json
from sklearn.preprocessing import LabelEncoder
import spacy
#from spacy.en import English

from src.utils import freq_answers, grouped, get_questions_sum, get_images_matrix, get_answers_sum
from extract_features import get_questions_matrix_sum, get_images_matrix, get_answers_matrix


In [3]:
def read_utf8_lines(path):
    """Return the lines of the UTF-8 text file at `path`, without line endings.

    The file is read in binary mode and decoded explicitly so the result does
    not depend on the platform's default encoding, and the handle is closed as
    soon as the content has been read.
    """
    with open(path, "rb") as handle:
        return handle.read().decode('utf8').splitlines()


# Parallel lists: entry i of each one belongs to the same training sample
# (they are zipped and shuffled together further down).
training_questions = read_utf8_lines("preprocessed/v2/ques_train.txt")
answers_train = read_utf8_lines("preprocessed/v2/answer_train.txt")
images_train = read_utf8_lines("preprocessed/v2/images_coco_id.txt")
# One whitespace-separated "<id> <integer index>" record per line; parsed into
# `id_map` in a later cell.
img_ids = read_utf8_lines('preprocessed/v2/coco_vgg_IDMap.txt')
# Location of the .mat file with the pre-computed VGG image features.
vgg_path = "data/coco/vgg_feats.mat"

In [4]:
# Peek at the first five (question, answer) pairs; slicing before zipping
# avoids building the full list of pairs just to display five of them.
list(zip(training_questions[:5], answers_train[:5]))

[('What is this photo taken looking through?', 'net'),
 ('What position is this man playing?', 'pitcher'),
 ('What color is the players shirt?', 'orange'),
 ('Is this man a professional baseball player?', 'yes'),
 ('What color is the snow?', 'white')]

In [5]:
# Load the spaCy "en_core_web_md" pipeline. `nlp` is handed to the question
# featurisation helpers (get_questions_sum / get_questions_matrix_sum) below.
nlp = spacy.load("en_core_web_md")
print ("Loaded WordVec")

Loaded WordVec


In [6]:
# loadmat returns a dict of the variables stored in the .mat file; the image
# features used by this notebook live under the 'feats' key.
vgg_features = scipy.io.loadmat(vgg_path)
img_features = vgg_features['feats']
# Starts empty; a later cell fills it from `img_ids` (string id -> int index).
id_map = {}
print ("Loaded VGG Weights")

Loaded VGG Weights


In [7]:
# Fit the answer-string -> class-index encoder and persist it so the
# evaluation cells can map predicted class indices back to answer strings.
# NOTE(review): the encoder is fitted on `answers_train` as it is at this point
# in the notebook, i.e. before the most-frequent-answer filtering cell runs, so
# `nb_classes` counts every distinct answer rather than `upper_lim` — confirm
# this is intended before changing it (saved weights depend on nb_classes).
lbl = LabelEncoder()
lbl.fit(answers_train)
nb_classes = len(list(lbl.classes_))
# Context manager guarantees the pickle is flushed and the handle closed.
with open('preprocessed/v2/label_encoder.sav', 'wb') as encoder_file:
    pk.dump(lbl, encoder_file)

In [8]:
# Force a full garbage-collection pass after the large loads above; the value
# displayed is gc.collect()'s return value (number of unreachable objects found).
gc.collect()

6

In [9]:
# Number of most frequently occurring answers in COCO-VQA to keep
# (covering >85% of the total data).
upper_lim = 1500
# freq_answers returns the three parallel lists restricted accordingly; the
# original names are rebound, so this cell is not idempotent.
filtered = freq_answers(training_questions, answers_train, images_train, upper_lim)
training_questions, answers_train, images_train = filtered
print(*map(len, (training_questions, answers_train, images_train)))

398120 398120 398120


In [10]:
# Hyper-parameters of the MLP built and trained below.
# Width of every hidden Dense layer.
num_hidden_units = 1024
# Number of Dense + Activation + Dropout groups added after the first Dense layer.
num_hidden_layers = 3
# Samples per mini-batch handed to `grouped` / train_on_batch.
batch_size = 128
# Rate passed to every Dropout layer.
dropout = 0.5
# Non-linearity used in the hidden groups.
activation = 'tanh'
# Image-feature part of the network input width (input_dim = word2vec_dim + img_dim).
# NOTE(review): presumably the width of each VGG feature vector — confirm against vgg_feats.mat.
img_dim = 4096
# Question-feature part of the network input width.
# NOTE(review): presumably the spaCy vector size of en_core_web_md — confirm.
word2vec_dim = 300

In [11]:
# Number of passes over the training set made by the training loop.
num_epochs = 1
# Weights are checkpointed after every epoch k for which k % log_interval == 0.
log_interval = 1

In [12]:
# Each record in `img_ids` is whitespace-separated: the first field becomes the
# (string) key and the second field, parsed as an int, becomes the value.
# Fields beyond the second are ignored.
id_map.update(
    (fields[0], int(fields[1]))
    for fields in map(str.split, img_ids)
)

In [15]:
# Baseline MLP: concatenated (question, image) features in, softmax over the
# `nb_classes` answer classes out.
# NOTE(review): the first Dense layer is followed directly by Dropout with no
# Activation, unlike the hidden groups added in the loop — confirm this is
# intended before changing it, since saved weights were trained this way.
layer_stack = [
    Dense(num_hidden_units, input_dim=word2vec_dim+img_dim, kernel_initializer='uniform'),
    Dropout(dropout),
]
for _ in range(num_hidden_layers):
    layer_stack += [
        Dense(num_hidden_units, kernel_initializer='uniform'),
        Activation(activation),
        Dropout(dropout),
    ]
layer_stack += [
    Dense(nb_classes, kernel_initializer='uniform'),
    Activation('softmax'),
]

model = Sequential()
for layer in layer_stack:
    model.add(layer)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
#tensorboard = TensorBoard(log_dir='/output/Graph', histogram_freq=0, write_graph=True, write_images=True)

In [16]:
# Persist the architecture (not the weights) as JSON so the evaluation cells
# can rebuild the model with model_from_json.
model_dump = model.to_json()
# Context manager guarantees the JSON is flushed and the handle closed.
with open('baseline_mlp' + '.json', 'w') as model_file:
    chars_written = model_file.write(model_dump)
# Keep the cell's displayed value: number of characters written.
chars_written

3292

In [17]:
# Load previously saved weights into the freshly built model, so any training
# run after this cell continues from them rather than from the random init.
# NOTE(review): the training loop in this notebook saves "MLP_epoch_{k:02d}.hdf5";
# confirm this epoch-99 checkpoint exists and matches the current architecture.
model.load_weights('weights/MLP_epoch_99.hdf5')

In [14]:
# Train the MLP on mini-batches whose features are built on the fly.
# Per epoch: reshuffle the data, train batch by batch, then checkpoint the
# weights whenever k % log_interval == 0.
for k in range(num_epochs):
    # One shared permutation keeps the three parallel lists aligned.
    index_shuffle = list(range(len(training_questions)))
    shuffle(index_shuffle)
    training_questions = [training_questions[i] for i in index_shuffle]
    answers_train = [answers_train[i] for i in index_shuffle]
    images_train = [images_train[i] for i in index_shuffle]

    num_samples = len(training_questions)
    samples_seen = 0
    progbar = generic_utils.Progbar(num_samples)
    # NOTE(review): `grouped` presumably yields tuples of `batch_size` items and
    # pads the final one with `fillvalue` (here: the last sample) — confirm.
    batches = zip(
        grouped(training_questions, batch_size, fillvalue=training_questions[-1]),
        grouped(answers_train, batch_size, fillvalue=answers_train[-1]),
        grouped(images_train, batch_size, fillvalue=images_train[-1]),
    )
    for ques_batch, ans_batch, im_batch in batches:
        X_ques_batch = get_questions_sum(ques_batch, nlp)
        X_img_batch = get_images_matrix(im_batch, id_map, img_features)
        # Question features first, image features second, matching
        # input_dim = word2vec_dim + img_dim of the first Dense layer.
        X_batch = np.hstack((X_ques_batch, X_img_batch))
        Y_batch = get_answers_sum(ans_batch, lbl)
        loss = model.train_on_batch(X_batch, Y_batch)
        # Advance the bar by the number of real samples only, so a padded
        # final batch does not push the counter past its target.
        step = min(batch_size, num_samples - samples_seen)
        samples_seen += step
        progbar.add(step, values=[('train loss', loss)])

    if k % log_interval == 0:
        model.save_weights("weights/MLP" + "_epoch_{:02d}.hdf5".format(k))

# Always keep the weights of the last epoch. Skipped when that epoch was already
# checkpointed inside the loop, and when no epoch ran at all (the loop variable
# would be undefined in that case).
if num_epochs > 0 and (num_epochs - 1) % log_interval != 0:
    model.save_weights("weights/MLP" + "_epoch_{:02d}.hdf5".format(num_epochs - 1))

  1664/398120 [..............................] - ETA: 2:58:21 - train loss: 4.6438

KeyboardInterrupt: 

In [None]:
# Rebuild the network from its saved JSON architecture, restore the trained
# weights, and compile it with the same loss/optimizer used for training.
with open('baseline_mlp.json') as model_file:
    model = model_from_json(model_file.read())
model.load_weights('weights/MLP_epoch_99.hdf5')
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

print ("Model Loaded with Weights")


In [None]:
# Validation split as three parallel lists (image id, question, answer);
# each file is closed as soon as it has been read.
with open('preprocessed/v2/val_images_coco_id.txt', 'rb') as val_file:
    val_imgs = val_file.read().decode('utf-8').splitlines()
with open('preprocessed/v2/ques_val.txt', 'rb') as val_file:
    val_ques = val_file.read().decode('utf-8').splitlines()
with open('preprocessed/v2/answer_val.txt', 'rb') as val_file:
    val_ans = val_file.read().decode('utf-8').splitlines()

In [None]:
# Restore the encoder pickled by the training part of this notebook.
# SECURITY: pickle.load can execute arbitrary code — only load a file that
# this notebook (or another trusted source) produced.
with open('preprocessed/v2/label_encoder.sav', 'rb') as encoder_file:
    label_encoder = pk.load(encoder_file)

In [None]:
# Predict an answer string for every validation question, batch by batch.
y_pred = []
batch_size = 128

print ("Word2Vec Loaded!")

widgets = ['Evaluating ', Percentage(), ' ', Bar(marker='#',left='[',right=']'), ' ', ETA()]
pbar = ProgressBar(widgets=widgets)

# Materialise the batches so the progress bar knows how many there are
# (a bare zip has no len(), leaving Percentage/ETA without a total).
# NOTE(review): `grouped` presumably pads the final batch with `fillvalue`, so
# `y_pred` may end up slightly longer than `val_ques` — consumers that zip it
# against the validation lists are unaffected.
val_batches = list(zip(grouped(val_ques, batch_size, fillvalue=val_ques[0]),
                       grouped(val_imgs, batch_size, fillvalue=val_imgs[0])))

for qu_batch, im_batch in pbar(val_batches):
    # NOTE(review): training featurises questions with get_questions_sum while
    # this uses get_questions_matrix_sum — confirm both produce the same features.
    X_q_batch = get_questions_matrix_sum(qu_batch, nlp)
    # Pass the feature array (`img_features`), as the training loop does, not
    # the whole dict returned by loadmat (`vgg_features`).
    X_i_batch = get_images_matrix(im_batch, id_map, img_features)
    # Same column order as in training: question features, then image features.
    X_batch = np.hstack((X_q_batch, X_i_batch))
    y_predict = model.predict_classes(X_batch, verbose=0)
    # Map class indices back to answer strings.
    y_pred.extend(label_encoder.inverse_transform(y_predict))

# Score the predictions and write a per-question report to res.txt.
# A ground-truth entry may hold several ';'-separated answers: a prediction
# matching at least two of them scores 1, otherwise matches/3.
correct_val = 0.0
total = 0

# Explicit UTF-8 lets non-ASCII questions/answers be written instead of being
# silently dropped; the context manager closes the file even on error.
with open('res.txt', 'w', encoding='utf-8') as f1:
    # zip stops at the shortest input, so any padded tail of `y_pred` is ignored.
    for pred, truth, ques, img in zip(y_pred, val_ans, val_ques, val_imgs):
        # Compare against each individual answer (`_truth`), not the whole
        # ';'-joined string, which could never match a multi-answer entry.
        t_count = sum(1 for _truth in truth.split(';') if pred == _truth)
        if t_count >= 2:
            correct_val += 1
        else:
            correct_val += float(t_count)/3

        total += 1

        # One record per question: question, image id, prediction, ground
        # truth, followed by a blank separator line.
        f1.write(str(ques))
        f1.write('\n')
        f1.write(str(img))
        f1.write('\n')
        f1.write(str(pred))
        f1.write('\n')
        f1.write(str(truth))
        f1.write('\n')
        f1.write('\n')

    # Guard against an empty validation set instead of dividing by zero.
    accuracy = correct_val/total if total else 0.0
    print ("Accuracy: ", accuracy)
    f1.write('Final Accuracy is ' + str(accuracy))