In [3]:
import json
import zipfile
import random
import numpy as np
import h5py
from collections import Counter, defaultdict
from time import time
from collections import defaultdict
import random
import torch
from torch.autograd import Variable
import torch.nn as nn
import gzip
import os
import torch.optim as optim

# Experiment switches read by the cells further down.
# USING_CBOW: when True the LSTM cells are skipped and the MLP input is the
# image feature vector concatenated with the CBOW bag-of-words vector.
#only using cbow + MLP 
USING_CBOW = False
# LSTM_TRAINED_WITH_CBOW_EMBEDDINGS: when True the CBOW model is trained first
# and the LSTM reads its word vectors from that model.
#use (CBOW -> LSTM) + MLP
LSTM_TRAINED_WITH_CBOW_EMBEDDINGS = True
# UPDATE_CBOW_EMBEDDING: when True the LSTM gets its own embedding layer
# initialised from the CBOW weights, with its own optimizer parameter group.
UPDATE_CBOW_EMBEDDING = True

# Directory the split files and image features are read from (note that the
# split files themselves are written under the relative 'data/' directory).
DATA_PATH = '..' + os.sep + 'NLP1-2017-VQA' + os.sep + 'data' + os.sep

# Load the raw questions and annotations from the first member of each zip
# archive. The member stream is opened in its own `with` so that it is closed
# explicitly instead of lingering until garbage collection.
with zipfile.ZipFile('./data/v2_Questions_Train_mscoco.zip', 'r') as file:
    with file.open(file.namelist()[0]) as member:
        qdata = json.load(member)

with zipfile.ZipFile('./data/v2_Annotations_Train_mscoco.zip', 'r') as file:
    with file.open(file.namelist()[0]) as member:
        adata = json.load(member)
    
#statistics
# Distinct values seen in the full annotation set.
question_types = {ann['question_type'] for ann in adata['annotations']}
multiple_choice_answers = {ann['multiple_choice_answer'] for ann in adata['annotations']}
answer_types = {ann['answer_type'] for ann in adata['annotations']}

# Frequency tables: per answer, per answer type, and per (answer type, answer).
answer2count = defaultdict(int)
answertypes2count = defaultdict(int)
top_answers_per_type = defaultdict(lambda: defaultdict(int))
for ann in adata['annotations']:
    answer2count[ann['multiple_choice_answer']] += 1
    answertypes2count[ann['answer_type']] += 1
    top_answers_per_type[ann['answer_type']][ann['multiple_choice_answer']] += 1

    
#dataset creation
# Draw a balanced subset of the raw data: n questions for each of the three
# answer types, visited in a seeded random order, each tagged with a randomly
# chosen split name. Both RNGs are seeded, so the statement order below
# determines the resulting subset.
start_time = time()
idx = list(range(0,len(qdata['questions'])))
random.seed(42)
random.shuffle(idx)

np.random.seed(42)
splits = ['train', 'valid', 'test']

# Number of examples to keep per answer type.
n = 20000
qdata_small = {'questions': list()}
adata_small = {'annotations': list()}
a_type_counts = {'yes/no': 0, 'number': 0, 'other': 0}

while len(qdata_small['questions']) < 3*n:
    # Take the next candidate index; list.pop() raises IndexError if the pool
    # is exhausted before every answer type has reached its quota.
    i = idx.pop()
    
    at = adata['annotations'][i]['answer_type'] 
    
    # Skip candidates whose answer type already has n examples.
    if a_type_counts[at] < n:
        
        # For yes/no questions keep only those answered exactly 'yes' or 'no'.
        if at == 'yes/no' and adata['annotations'][i]['multiple_choice_answer'] not in ['yes', 'no']:
            continue
            
        adata_small['annotations'].append(adata['annotations'][i])
        qdata_small['questions'].append(qdata['questions'][i])
        
        # Pick the split with probabilities 0.8 / 0.15 / 0.05 and record it on
        # both records (the appended dicts are the originals, not copies).
        split = np.random.choice(splits, p=(.8, .15, .05))
        adata_small['annotations'][-1]['split'] = split
        qdata_small['questions'][-1]['split'] = split
        
        a_type_counts[at] += 1
        
# Tests
# Sanity checks: both lists hold 3*n entries and each answer type has exactly n.
assert len(qdata_small['questions']) == len(adata_small['annotations']) == 3*n, "Inconsitent Lengths."
a_type_counts = {'yes/no': 0, 'number': 0, 'other': 0}
for ann in adata_small['annotations']:
    a_type_counts[ann['answer_type']] += 1
assert a_type_counts['yes/no'] == a_type_counts['number'] == a_type_counts['other'] == n, "Inconsistent Answer Type Lengths."

print("Data Creation Looks good! Time Taken %.2f" %(time()-start_time))


# Recompute the summary statistics, this time over the balanced subset.
question_types = {ann['question_type'] for ann in adata_small['annotations']}
multiple_choice_answers = {ann['multiple_choice_answer'] for ann in adata_small['annotations']}
answer_types = {ann['answer_type'] for ann in adata_small['annotations']}

# Frequency tables: per answer, per answer type, and per (answer type, answer).
answer2count = defaultdict(int)
answertypes2count = defaultdict(int)
top_answers_per_type = defaultdict(lambda: defaultdict(int))
for ann in adata_small['annotations']:
    answer2count[ann['multiple_choice_answer']] += 1
    answertypes2count[ann['answer_type']] += 1
    top_answers_per_type[ann['answer_type']][ann['multiple_choice_answer']] += 1


#saving
# One empty container per split, for questions and for annotations.
qdata_small_splits = {name: {'questions': list()} for name in ('train', 'valid', 'test')}
adata_small_splits = {name: {'annotations': list()} for name in ('train', 'valid', 'test')}

# Route every question/annotation pair to the split recorded on it, checking
# that the two records agree on both the split and the question id.
for i, question in enumerate(qdata_small['questions']):
    annotation = adata_small['annotations'][i]
    split = question['split']

    assert split == annotation['split'], "Inconsistent Splits."
    assert annotation['question_id'] == question['question_id'], "Inconsistent IDs."

    qdata_small_splits[split]['questions'].append(question)
    adata_small_splits[split]['annotations'].append(annotation)


# Report the size of each split.
for size_format, name in (
    ("Training Set Size: %i", 'train'),
    ("\nValidation Set Size: %i", 'valid'),
    ("\nTest Set Size: %i", 'test'),
):
    print(size_format %(len(qdata_small_splits[name]['questions'])))


# Write each split as gzip-compressed JSON: annotations first, then questions.
for split in ['train', 'valid', 'test']:
    outputs = (
        ('data/vqa_annotatons_' + split + '.gzip', adata_small_splits[split]),
        ('data/vqa_questions_' + split + '.gzip', qdata_small_splits[split]),
    )
    for out_path, payload in outputs:
        with gzip.GzipFile(out_path, 'w') as file:
            file.write(json.dumps(payload).encode('utf-8'))

# Record the distinct image ids referenced by the selected questions.
image_ids = {q['image_id'] for q in qdata_small['questions']}

image_ids_json = {'image_ids': list(image_ids)}
with open('data/image_ids_vqa.json', 'w') as file:
    json.dump(image_ids_json, file)
    
    
#read data
def _read_gzip_json(path):
    """Decompress the file at `path` and parse its UTF-8 contents as JSON."""
    with gzip.open(path, 'rb') as handle:
        return json.loads(handle.read().decode('utf-8'))


# Training and test splits, questions and annotations.
qdata_train = _read_gzip_json(DATA_PATH + 'vqa_questions_train.gzip')
adata_train = _read_gzip_json(DATA_PATH + 'vqa_annotatons_train.gzip')
qdata_test = _read_gzip_json(DATA_PATH + 'vqa_questions_test.gzip')
adata_test = _read_gzip_json(DATA_PATH + 'vqa_annotatons_test.gzip')
    
#print(qdata_train['questions'][0])
#print("#1: ", adata_train['annotations'][0])

# Summary statistics over the training split; answer2count feeds the answer
# vocabulary built below.
question_types = {ann['question_type'] for ann in adata_train['annotations']}
multiple_choice_answers = {ann['multiple_choice_answer'] for ann in adata_train['annotations']}
answer_types = {ann['answer_type'] for ann in adata_train['annotations']}

# Frequency tables: per answer, per answer type, and per (answer type, answer).
answer2count = defaultdict(int)
answertypes2count = defaultdict(int)
top_answers_per_type = defaultdict(lambda: defaultdict(int))
for ann in adata_train['annotations']:
    answer2count[ann['multiple_choice_answer']] += 1
    answertypes2count[ann['answer_type']] += 1
    top_answers_per_type[ann['answer_type']][ann['multiple_choice_answer']] += 1
    
    
# Seed PyTorch's RNG so parameter initialisation is reproducible.
torch.manual_seed(1)

# w2i maps each question word to an integer index, e.g. "how are you" -> [1, 3, 4].
# While it uses this factory, looking up an unseen word assigns it the next
# free index (the current size of the dict).
w2i = defaultdict(lambda: len(w2i))

# Index 0 is reserved for the padding token.
w2i['pad'] = 0
# This lookup registers "<unk>" as the next index (1) and keeps it in UNK.
UNK = w2i["<unk>"]

#answer to index dictionary
# Index 0 is reserved for 'pad' and index 1 for '<unk>'.
ans_dict = {'pad': 0, '<unk>': 1}

#index to answer dictionary (keys are the indices rendered as strings)
rev_ans_dict = {'0': 'pad', '1': '<unk>'}


def answer_to_idx(answer2count):
    """Add every answer seen more than 5 times to the answer vocabulary.

    Updates the module-level `ans_dict` (answer -> index) and `rev_ans_dict`
    (str(index) -> answer) in place. New answers are numbered from the first
    index after those already in `ans_dict`, so the reserved 'pad' and
    '<unk>' entries keep their own indices instead of being shared with the
    first frequent answer. Answers already present are left untouched, which
    makes repeated calls harmless.

    Args:
        answer2count: mapping from answer string to its number of occurrences.

    Returns:
        The updated `ans_dict`.
    """
    next_idx = max(ans_dict.values()) + 1
    for ans, freq in answer2count.items():
        if freq > 5 and ans not in ans_dict:
            ans_dict[ans] = next_idx
            rev_ans_dict[str(next_idx)] = ans
            next_idx += 1
    return ans_dict

#also include 'unk here', not sure if it's right???
def ans_to_onehot(ans_idx):
    """Return a float vector of length `no_ans` that is 1 at `ans_idx`, 0 elsewhere.

    `ans_idx` is the integer index of an answer; `no_ans` is read from module
    scope.
    """
    one_hot = np.zeros((no_ans,))
    one_hot[ans_idx] = 1
    return one_hot


# Convert each question into a list of word indices paired with its answer index.
def read_data_with_answers(t_data, t_answer_data, training ):
    """Yield one `(word_indices, answer_index)` pair per question.

    Args:
        t_data: sequence of question records, each with a 'question' string.
        t_answer_data: sequence of annotation records, indexed in parallel,
            each with a 'multiple_choice_answer'.
        training: when equal to True, words are looked up directly in `w2i`;
            otherwise words missing from `w2i` are mapped to the "<unk>" index.

    The answer index comes from `ans_dict`; answers not in it yield 0.
    """
    for pos, entry in enumerate(t_data):
        # Lower-case the text and keep only what precedes the first '?'.
        text = entry['question'].lower().split("?", 1)[0]
        tokens = text.split(" ")

        # Right-pad with 'pad' up to 10 tokens; longer questions are not cut.
        tokens.extend(['pad'] * (10 - len(tokens)))

        answer_idx = ans_dict.get(t_answer_data[pos]['multiple_choice_answer'], 0)

        # Equality (not truthiness) is kept on purpose to match the contract.
        if training == True:
            word_ids = [w2i[tok] for tok in tokens]
        else:
            word_ids = [w2i[tok] if tok in w2i else w2i["<unk>"] for tok in tokens]

        yield (word_ids, answer_idx)
             


#answer vocabulary 
# Fill the answer dictionaries from the training-split answer counts.
ans_dict = answer_to_idx(answer2count)

# Size of the answer dictionary (its reserved 'pad' / '<unk>' keys are counted
# too); used below as the width of the classifiers' output layers.
no_ans = len(ans_dict)

# Encode the training questions first: w2i still hands out a new index for
# every unseen word at this point, so this pass builds the word vocabulary.
data_ans_train = list(read_data_with_answers(qdata_train['questions'], adata_train['annotations'], training = True))
# Freeze the vocabulary: from here on unseen words resolve to UNK instead of
# getting a fresh index.
w2i = defaultdict(lambda: UNK, w2i)
data_ans_test = list(read_data_with_answers(qdata_test['questions'], adata_test['annotations'], training = False))

# Vocabulary size, used to size the embedding tables.
nwords = len(w2i)
    
# After training: turn lists of word indices into CBOW bag-of-words vectors.
def get_word_embedding_layer(data):
    """Return a `(len(data), no_embeddings)` array of bag-of-words vectors.

    `data` is a sequence of word-index lists, one per question. Each row of
    the result is the vector the module-level `model` leaves on `model.bows`
    after a forward pass over that question.
    """
    bow_matrix = np.zeros((len(data), no_embeddings))

    for row, word_ids in enumerate(data):
        # The forward pass is run for its side effect of refreshing model.bows;
        # the returned scores are not needed here.
        model(Variable(torch.LongTensor([word_ids])))
        bow_matrix[row] = model.bows.data.numpy()

    #FOR TRAINING on UNIQUE WORDS
    return bow_matrix

Data Creation Looks good! Time Taken 1.50
Training Set Size: 48061

Validation Set Size: 8977

Test Set Size: 2962


In [None]:
# Width of the word embeddings; also passed as the LSTM hidden size further down.
no_embeddings = 1024   
# Filled by the CBOW training loop with one value per epoch: 1 - training
# accuracy (an error rate, despite the name).
all_losses = []

if LSTM_TRAINED_WITH_CBOW_EMBEDDINGS or USING_CBOW:
    
    class CBOW(nn.Module):
        """Bag-of-words classifier: summed word embeddings fed to a linear layer."""

        def __init__(self, vocab_size, embedding_dim, output_dim):
            super().__init__()
            # Index 0 is the padding row of the embedding table.
            self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
            self.linear = nn.Linear(embedding_dim, output_dim)
            # Placeholder; every forward pass replaces it with the latest sum.
            self.bows = torch.zeros(1, embedding_dim)

        def forward(self, inputs):
            """Return the logits for `inputs` and keep the summed embeddings on `self.bows`."""
            self.bows = self.embeddings(inputs).sum(1)
            return self.linear(self.bows)


    # Instantiate the CBOW classifier: one embedding row per vocabulary word,
    # no_embeddings-wide vectors, and one logit per answer-dictionary entry.
    model = CBOW(nwords, no_embeddings, no_ans)
    print(model)


    def evaluate(model, data):
        """Evaluate a model on a data set.

        Args:
            model: callable taking a `(1, n_words)` LongTensor Variable and
                returning a `(1, n_classes)` score tensor.
            data: sequence of `(word_indices, tag)` pairs.

        Returns:
            A tuple `(correct, total, accuracy, scores)` where `scores` is the
            output for the last example. For empty `data` the accuracy is 0.0
            and `scores` is None instead of raising.
        """
        correct = 0.0
        # Stays None when there is nothing to score.
        scores = None

        for words, tag in data:
            lookup_tensor = Variable(torch.LongTensor([words]))
            scores = model(lookup_tensor)
            predict = scores.data.numpy().argmax(axis=1)[0]

            if predict == tag:
                correct += 1

        total = len(data)
        accuracy = correct / total if total else 0.0
        return correct, total, accuracy, scores


    # Plain SGD over all CBOW parameters.
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    # 50 passes over the training set, one example per update.
    for ITER in range(50):

        #random.shuffle(data_ans_train)
        train_loss, correct = 0.0, 0
        start = time()

        for words, tag in data_ans_train:
            loss = nn.CrossEntropyLoss()
            lookup_tensor = Variable(torch.LongTensor([words]))
            target = Variable(torch.LongTensor([tag]))

            # forward pass
            scores = model(lookup_tensor)
            output = loss(scores, target)
            train_loss += output.data[0]

            # running count of correct top-1 predictions
            predict = scores.data.numpy().argmax(axis=1)[0]
            correct += int(predict == tag)

            # backward pass
            model.zero_grad()
            output.backward()

            # update weights
            optimizer.step()

        accuracy = correct/len(data_ans_train)
        print("iter %r: train loss/sent=%.4f, time=%.2fs" %
              (ITER, train_loss/len(data_ans_train), time()-start))
        print('correct', accuracy)

        # all_losses tracks the error rate (1 - accuracy) per epoch.
        all_losses.append(1 - accuracy)
# if iter % plot_every == 0:
#         all_losses.append(current_loss / plot_every)
#         current_loss = 0  

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Plot the per-epoch values collected in all_losses (the training loop stores
# 1 - accuracy there, i.e. an error rate rather than a loss).
plt.figure()
plt.plot(all_losses)

CBOW (
  (embeddings): Embedding(6941, 1024, padding_idx=0)
  (linear): Linear (1024 -> 532)
)


In [None]:
# After training: turn lists of word indices into CBOW bag-of-words vectors.
def get_word_embedding_layer(data):
    """Return a `(len(data), no_embeddings)` array of bag-of-words vectors.

    `data` is a sequence of word-index lists, one per question. Each row of
    the result is the vector the module-level `model` leaves on `model.bows`
    after a forward pass over that question.
    """
    bow_matrix = np.zeros((len(data), no_embeddings))

    for row, word_ids in enumerate(data):
        # The forward pass is run for its side effect of refreshing model.bows;
        # the returned scores are not needed here.
        model(Variable(torch.LongTensor([word_ids])))
        bow_matrix[row] = model.bows.data.numpy()

    return bow_matrix

#test, get_word_embedding_layer get correct answer   
#print((get_word_embedding_layer(list([data_ans_train[0][0]]))).shape)   #size 1x64

#read from image
path_to_h5_file = DATA_PATH + 'VQA_image_features.h5'
path_to_json_file = DATA_PATH + 'VQA_img_features2id.json'

# Image feature matrix, indexed by h5 id. np.asarray reads the dataset into
# memory, so the HDF5 file is closed as soon as the copy has been made instead
# of being left open for the rest of the session.
with h5py.File(path_to_h5_file, 'r') as h5_file:
    img_features = np.asarray(h5_file['img_features'])

# Mapping from image id (as found in the question/answer data) to h5 id.
with open(path_to_json_file, 'r') as f:
    visual_feat_mapping = json.load(f)['VQA_imgid2id']

        
# print(model.embeddings)
# print(model.embeddings.weight.data)
# print(type(model.embeddings.weight.data))
# word_embeddings = nn.Embedding(6941, 64, padding_idx = 0)
# word_embeddings.weight = nn.Parameter(model.embeddings.weight.data)

# print(word_embeddings)    





In [None]:
if not USING_CBOW:

    class LSTMTagger(nn.Module):
        """LSTM over a question's word embeddings with a per-step answer classifier.

        The source of the word embeddings depends on the module-level flags:
        - LSTM_TRAINED_WITH_CBOW_EMBEDDINGS False: a fresh embedding layer.
        - LSTM_TRAINED_WITH_CBOW_EMBEDDINGS and UPDATE_CBOW_EMBEDDING True: an
          embedding layer owned by this module, initialised from the CBOW
          weights, so that it can be fine-tuned.
        - LSTM_TRAINED_WITH_CBOW_EMBEDDINGS True, UPDATE_CBOW_EMBEDDING False:
          the CBOW model's embedding layer is read directly.
        """

        def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
            super(LSTMTagger, self).__init__()
            self.hidden_dim = hidden_dim

            if not LSTM_TRAINED_WITH_CBOW_EMBEDDINGS:
                self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx = 0)
            elif UPDATE_CBOW_EMBEDDING:
                self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx = 0)
                # Copy the weights: wrapping the CBOW tensor itself would share
                # storage, so fine-tuning here would also rewrite the CBOW model.
                self.word_embeddings.weight = nn.Parameter(model.embeddings.weight.data.clone())

            # The LSTM takes word embeddings as inputs, and outputs hidden states
            # with dimensionality hidden_dim.
            self.lstm = nn.LSTM(embedding_dim, hidden_dim)

            # The linear layer that maps from hidden state space to tag space
            self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
            self.hidden = self.init_hidden()

        def init_hidden(self):
            """Return a zeroed (hidden state, cell state) pair.

            The axes semantics are (num_layers, minibatch_size, hidden_dim).
            """
            return (torch.autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                    torch.autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

        def forward(self, sentence):
            """Return log-probabilities over answers for every position of `sentence`.

            `sentence` is a 1-D tensor of word indices; the result has one row
            per word.
            """
            # Use this module's own embedding layer whenever it has one. Reading
            # model.embeddings in the fine-tuning case would leave
            # self.word_embeddings without gradients, so the optimizer group
            # built for it would never change anything.
            if LSTM_TRAINED_WITH_CBOW_EMBEDDINGS and not UPDATE_CBOW_EMBEDDING:
                embeds = model.embeddings(sentence)
            else:
                embeds = self.word_embeddings(sentence)

            lstm_out, self.hidden = self.lstm(
                embeds.view(len(sentence), 1, -1), self.hidden)
            tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
            tag_scores = nn.functional.log_softmax(tag_space)
            return tag_scores

    # Embedding width and hidden size are both no_embeddings.
    LSTMmodel = LSTMTagger(no_embeddings, no_embeddings, nwords, no_ans)
    loss_function = nn.NLLLoss()
    #optimizer = optim.SGD(LSTMmodel.parameters(), lr=0.05)

    if UPDATE_CBOW_EMBEDDING or LSTM_TRAINED_WITH_CBOW_EMBEDDINGS:
        # LSTM and output layer always train at 1e-3 in this configuration.
        param_groups = [
            {'params': LSTMmodel.lstm.parameters(), 'lr' : 1e-3},
            {'params': LSTMmodel.hidden2tag.parameters(),'lr' : 1e-3},
        ]
        if UPDATE_CBOW_EMBEDDING:
            # The embeddings get a much smaller step and come first in the list.
            param_groups.insert(
                0, {'params': LSTMmodel.word_embeddings.parameters(), 'lr': 1e-5})
        optimizer = optim.SGD(param_groups, momentum=0.9)
    else:
        # No CBOW involvement: a single learning rate for everything.
        optimizer = optim.SGD(LSTMmodel.parameters(), lr=0.05)
        

    # Up to 1000 epochs of 5000 randomly drawn examples each; training stops
    # early once an epoch's accuracy exceeds 0.85.
    for epoch in range(1000):

        #random.shuffle(data_ans_train)
        train_loss = 0.0
        # `time` is the function imported via `from time import time`, so it is
        # called directly (`time.time()` would raise AttributeError).
        start = time()
        correct = 0.0
        #for words, tag in data_ans_train[0:10]:
        for random_idx in range(5000):

            # Sample one training example uniformly at random.
            word_idx = random.randint(0,len(data_ans_train)-1)
            words = data_ans_train[word_idx][0]
            tag = data_ans_train[word_idx][1]

            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            LSTMmodel.zero_grad()

            # Also, we need to clear out the hidden state of the LSTM,
            # detaching it from its history on the last instance.
            LSTMmodel.hidden = LSTMmodel.init_hidden()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Variables of word indices. Keep the words before the first padding
            # index (0); with no padding present, keep the first 10 words.
            try:
                expand_length = words.index(0)
                words = words[0:expand_length]
            except ValueError:
                words = words[0:10]
                expand_length = 10

            lookup_tensor = Variable(torch.LongTensor([words])).view(-1)
            # The same answer is the target at every time step.
            targets = Variable(torch.LongTensor([tag])).expand(expand_length)

            # Step 3. Run our forward pass.
            tag_scores = LSTMmodel(lookup_tensor)#.view(1, 10, no_ans)
            # The prediction is read from the last time step.
            tag_predict = tag_scores.data.numpy()[-1].argmax()

            if (tag_predict == tag):
                correct += 1


            loss = loss_function(tag_scores, targets)
            loss.backward()
            train_loss += loss.data[0]

            optimizer.step()

        if (correct/5000 > 0.85):
            break

        if (epoch % 2 == 0):
            print("iter %r: train loss/sent=%.4f, time=%.2fs" %
                      (epoch, train_loss/5000, time()-start))
            print('correct:', correct/5000)

In [None]:
#test LSTM result
if not USING_CBOW:
    # Print (predicted index, true index) for the first 8 training examples.
    for words, tag in data_ans_train[0:8]:

        LSTMmodel.hidden = LSTMmodel.init_hidden()

        # Keep the words before the first padding index (0); with no padding
        # present, keep the first 10 words.
        if 0 in words:
            expand_length = words.index(0)
            words = words[0:expand_length]
        else:
            words = words[0:10]
            expand_length = 10

        lookup_tensor1 = Variable(torch.LongTensor([words])).view(-1)
        targets = Variable(torch.LongTensor([tag])).expand(expand_length)

        # Forward pass; the prediction is read from the last time step.
        tag_scores = LSTMmodel(lookup_tensor1)#.view(1, 10, no_ans)
        tag_predict = tag_scores.data.numpy()[-1].argmax()
        print(tag_predict, tag)
    


In [None]:
#build MLP network

class Net(torch.nn.Module):
    """Two-layer perceptron: linear layer, ReLU, linear output layer."""

    def __init__(self, n_feature, n_hidden, n_output):
        super().__init__()
        # hidden layer followed by the output layer
        self.hidden = torch.nn.Linear(n_feature, n_hidden)
        self.out = torch.nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Return the unnormalised output scores for input `x`."""
        activated = nn.functional.relu(self.hidden(x))
        return self.out(activated)

In [None]:
# Width of one image feature vector, read from the feature matrix itself.
# `img_feat` is only assigned inside the training loop further down, so using
# it here raises NameError on a fresh run.
# NOTE(review): assumes img_features is 2-D (one row per image) - confirm.
img_feat_dim = img_features.shape[1]

# The question vector is the CBOW bag-of-words vector (no_embeddings wide) or
# the LSTM's score vector (no_ans wide), depending on the configuration.
if USING_CBOW:
    n_input = img_feat_dim + no_embeddings
else:
    n_input = img_feat_dim + no_ans

n_output = len(ans_dict)
n_hidden_size = 200
learning_rate = 0.001

word_img_model = Net(n_input, n_hidden_size, n_output)

loss_fn = torch.nn.CrossEntropyLoss()
# Only the MLP's parameters are handed to this optimizer.
optimizer = torch.optim.SGD(word_img_model.parameters(), lr=learning_rate)

# Placeholders kept from the original cell; nothing below reads them.
temp_x = Variable()
temp_y = Variable()

# Train the MLP on [question vector, image features] -> answer index.
# 100 epochs of 5000 randomly drawn training examples, one example per update.
# The question vector is converted to a numpy array before it reaches the MLP,
# and the optimizer only holds the MLP's parameters, so the CBOW / LSTM models
# are not updated here.
for ITER in range(100):
    
    train_loss = 0.0
    #loss = []
    #print('start!')
    count_err = 0
    #for idx, adata in enumerate(adata_train['annotations'][0:3]):
    for random_idx in range(5000):
        
        # Sample one example; data_ans_train and adata_train['annotations']
        # are indexed in parallel.
        # NOTE(review): `adata` rebinds the module-level name that held the
        # raw annotations loaded at the top of the file.
        idx = random.randint(0,len(data_ans_train)-1)
        adata = adata_train['annotations'][idx]
                
        #get text vector:
        if USING_CBOW:
            #try to use cbow:
            question_word_vector = get_word_embedding_layer(list([data_ans_train[idx][0]])).reshape( no_embeddings, ) 
        else:
            #try to use LSTM        
            words = data_ans_train[idx][0]
            tag = data_ans_train[idx][1]
            LSTMmodel.hidden = LSTMmodel.init_hidden()
            # Keep the words before the first padding index (0); with no
            # padding present, keep the first 10 words.
            try:
                expand_length = words.index(0)
                words = words[0:words.index(0)]
            except ValueError as e:
                words = words[0:10]
                expand_length = 10

            lookup_tensor1 = Variable(torch.LongTensor([words])).view(-1) 
            # `targets` is built but not used in this loop.
            targets = Variable(torch.LongTensor([tag])).expand(expand_length)

            # Step 3. Run our forward pass.
            tag_scores = LSTMmodel(lookup_tensor1)#.view(1, 10, no_ans)
            # The question vector is the LSTM's score row for the last time step.
            question_word_vector = tag_scores.data.numpy()[-1]

            # Normalise the LSTM vector: subtract its mean, divide by its range.
            question_word_vector = (question_word_vector - question_word_vector.mean()) / (np.max(question_word_vector) - np.min(question_word_vector))

        #get image vector
        h5_id = visual_feat_mapping[str(adata['image_id'])]        
        img_feat = img_features[h5_id]
        
        # Concatenate the question vector and the image vector into one input.
        img_word_vector = np.concatenate((question_word_vector, img_feat), axis=0)
        
        # Target answer index; answers missing from ans_dict fall back to 0.
        answer_index = 0 if not(adata['multiple_choice_answer'] in ans_dict) else ans_dict[adata['multiple_choice_answer']]
        
        output_vector = np.array([answer_index])#.reshape(1,1)
        x = Variable(torch.from_numpy(img_word_vector).type(torch.FloatTensor))#.cuda()
        y = Variable(torch.from_numpy(output_vector))#.cuda()        
        
        y_pred = word_img_model(x).view(1,n_output)
        # Every 5th epoch, print (predicted answer, true answer) for the first
        # 5 samples.
        if (ITER  % 5 == 0) and (random_idx < 5):
            print(rev_ans_dict[str(y_pred.data.numpy().argmax())], adata['multiple_choice_answer'])
        
        # Count an error whenever the predicted answer string differs from the
        # annotated one.
        if rev_ans_dict[str(y_pred.data.numpy().argmax())] != adata['multiple_choice_answer']:
            count_err += 1
        #if idx % 1000 == 0:
        #    print('idx ' + str(idx))
        loss = loss_fn(y_pred, y)
            
        train_loss += loss.data[0]
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()   
    
    
    # Report the mean loss and error rate over the 5000 samples every 2nd epoch.
    if (ITER  % 2 == 0):
        print('{:>5}'.format(ITER),' loss: ', train_loss/5000)#len(adata_train['annotations']))
        print(ITER,' err: ', count_err/5000)#len(adata_train['annotations']))
        #pass
        #print('{:>5}'.format(ITER),' loss: ', train_loss)



        

In [None]:
from IPython.display import Image
from IPython.core.display import HTML 

# Image metadata, looked up by image id rendered as a string; show_predict
# reads each entry's 'coco_url'.
with open('./data/imgid2imginfo.json', 'r') as file:
    imgid2info = json.load(file)

def show_predict(idx, question = None):
    """Print the model's answer for training example `idx` and display its image.

    Args:
        idx: index into the training questions; always selects the image.
        question: optional replacement question text. When given, it is
            encoded the same way the training questions were (lower-cased, cut
            at the first '?', split on spaces, unknown words mapped to
            "<unk>") and used instead of the stored question; no reference
            answer is printed in that case.

    Prints the question, the reference answer, the prediction from question
    and image together, and the prediction from the question alone (image
    features zeroed), then displays the image from its COCO URL.
    """
    if question is None:
        ans = adata_train['annotations'][idx]['multiple_choice_answer']
        question = qdata_train['questions'][idx]['question']
        # Word indices produced when the training data was encoded.
        words = data_ans_train[idx][0]
    else:
        ans = ''
        tokens = question.lower().split("?", 1)[0].split(" ")
        words = [w2i[tok] if tok in w2i else w2i["<unk>"] for tok in tokens]
    img_id = qdata_train['questions'][idx]['image_id']

    print(question)
    print('real answer: ',ans)

    if USING_CBOW:
        # CBOW question vector
        question_word_vector = get_word_embedding_layer([words]).reshape( no_embeddings, )
    else:
        LSTMmodel.hidden = LSTMmodel.init_hidden()
        # Keep the words before the first padding index (0); with no padding
        # present, keep the first 10 words.
        try:
            words = words[0:words.index(0)]
        except ValueError:
            words = words[0:10]

        lookup_tensor1 = Variable(torch.LongTensor([words])).view(-1)

        # The question vector is the LSTM's score row for the last time step.
        tag_scores = LSTMmodel(lookup_tensor1)
        question_word_vector = tag_scores.data.numpy()[-1]

        # Normalise the LSTM vector: subtract its mean, divide by its range.
        question_word_vector = (question_word_vector - question_word_vector.mean()) / (np.max(question_word_vector) - np.min(question_word_vector))

    #get image vector
    h5_id = visual_feat_mapping[str(img_id)]
    img_feat = img_features[h5_id]

    # Prediction from the question vector and image vector together.
    img_word_vector = np.concatenate((question_word_vector, img_feat), axis=0)
    print(img_word_vector.shape, question_word_vector.shape, img_feat.shape)
    x = Variable(torch.from_numpy(img_word_vector).type(torch.FloatTensor))
    predict_y = word_img_model(x)
    print('model prediction: ', rev_ans_dict[str(predict_y.data.numpy().argmax())])

    # Prediction from the question alone: the image part is replaced by zeros
    # of the same shape as the real feature vector.
    zero_question_img_vector = np.zeros(img_feat.shape)
    img_word_vector = np.concatenate((question_word_vector, zero_question_img_vector), axis=0)
    x_word = Variable(torch.from_numpy(img_word_vector).type(torch.FloatTensor))#.cuda()
    predict_y_word = word_img_model(x_word)
    print('word: ', rev_ans_dict[str(predict_y_word.data.numpy().argmax())])

    display(Image(url= imgid2info[str(img_id)]['coco_url']))
# Show the prediction for each of the first 100 training examples.
for i in range(100):
    #show_predict(i, 'how many people are there')
    show_predict(i)
    