# Visual Qustion Answering Dataset

VQA Homepage http://visualqa.org/download.html

Annotations taken from [Training annotations 2017 v2.0](http://visualqa.org/data/mscoco/vqa/v2_Annotations_Train_mscoco.zip)

Questions taken from [Training questions 2017 v2.0](http://visualqa.org/data/mscoco/vqa/v2_Questions_Train_mscoco.zip)

![title](img/vqa_examples.jpg)

In [38]:
import json
import zipfile
import random
import numpy as np
import h5py
from collections import Counter, defaultdict
from time import time
from collections import defaultdict
import random
import torch
from torch.autograd import Variable
import torch.nn as nn
import gzip
import os

#only using cbow + MLP 
USING_CBOW = False
#use (CBOW -> LSTM) + MLP
LSTM_TRAINED_WITH_CBOW_EMBEDDINGS = True
UPDATE_CBOW_EMBEDDING = True

DATA_PATH = '..' + os.sep + 'NLP1-2017-VQA' + os.sep + 'data' + os.sep

In [39]:
with zipfile.ZipFile('./data/v2_Questions_Train_mscoco.zip', 'r') as file:
    qdata = json.load(file.open(file.namelist()[0]))

with zipfile.ZipFile('./data/v2_Annotations_Train_mscoco.zip', 'r') as file:
    adata = json.load(file.open(file.namelist()[0])) 

### Preprocessing

* Spelling correction (using Bing Speller) of question and answer strings
* Question normalization (first char uppercase, last char ‘?’)
* Answer normalization (all chars lowercase, no period except as decimal point, number words —> digits, strip articles (a, an the))
* Adding apostrophe if a contraction is missing it (e.g., convert "dont" to "don't")

## Data Exploration

### Annotation Data

In [6]:
print("# Datapoints: ", len(adata['annotations']))
print("Datapoint keys: ", adata['annotations'][0].keys())

# Datapoints:  443757
Datapoint keys:  dict_keys(['question_type', 'multiple_choice_answer', 'answers', 'image_id', 'answer_type', 'question_id'])


Let's look at some datapoints:

In [7]:
print("#1: ", adata['annotations'][0])
print("\n#2: ", adata['annotations'][1])
print("\n#3: ", adata['annotations'][2])

#1:  {'question_type': 'what is this', 'multiple_choice_answer': 'net', 'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 458752, 'answer_type': 'other', 'question_id': 458752000}

#2:  {'question_type': 'what', 'multiple_choice_answer': 'pitcher', 'answers': [{'answer': 'pitcher', 'answer_confidence': 'yes', 'answer_id': 1}, {'answer': 'catcher', 'answer_confidence': 'no', 'answer_

### Question Data

In [8]:
print("# Datapoints: ", len(qdata['questions']))
print("\nDatapoint keys: ", qdata['questions'][0].keys())

# Datapoints:  443757

Datapoint keys:  dict_keys(['image_id', 'question', 'question_id'])


Let's look at some datapoints

In [9]:
print("#1: ", qdata['questions'][0])
print("\n#2: ", qdata['questions'][1])
print("\n#3: ", qdata['questions'][2])

#1:  {'image_id': 458752, 'question': 'What is this photo taken looking through?', 'question_id': 458752000}

#2:  {'image_id': 458752, 'question': 'What position is this man playing?', 'question_id': 458752001}

#3:  {'image_id': 458752, 'question': 'What color is the players shirt?', 'question_id': 458752002}


### Dataset Statistics

In [10]:
question_types = set()
multiple_choice_answers = set()
answer2count = defaultdict(int)
answer_types = set()
answertypes2count = defaultdict(int)
top_answers_per_type = defaultdict(lambda: defaultdict(int))
for ann in adata['annotations']:
    question_types.add(ann['question_type'])
    
    multiple_choice_answers.add(ann['multiple_choice_answer'])
    
    answer2count[ann['multiple_choice_answer']] += 1
    answer_types.add(ann['answer_type'])
    
    answertypes2count[ann['answer_type']] += 1
    top_answers_per_type[ann['answer_type']][ann['multiple_choice_answer']] += 1
    
adata['annotations']

[{'answer_type': 'other',
  'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3},
   {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6},
   {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}],
  'image_id': 458752,
  'multiple_choice_answer': 'net',
  'question_id': 458752000,
  'question_type': 'what is this'},
 {'answer_type': 'other',
  'answers': [{'answer': 'pitcher',
    'answer_confidence': 'yes',
    'answer_id': 1},
   {'answer': 'catcher', 'answer_confidence': 'no', 'ans

#### Question Types

In [11]:
print("# Unique Question Types: ", len(question_types))
print(question_types)

# Unique Question Types:  65
{'what room is', 'is there', 'are they', 'how many people are in', 'where are the', 'what is the person', 'what is the man', 'what are', 'is this person', 'what number is', 'does the', 'what', 'does this', 'what is', 'is it', 'why is the', 'what animal is', 'why', 'what is the woman', 'what color are the', 'what is the', 'none of the above', 'is there a', 'is the person', 'what is on the', 'how many people are', 'what sport is', 'can you', 'has', 'are', 'is the', 'what brand', 'is that a', 'where is the', 'what is this', 'is he', 'is this', 'is', 'is the woman', 'what kind of', 'could', 'are there any', 'what type of', 'what does the', 'how many', 'are there', 'are these', 'are the', 'what color is the', 'do', 'what color is', 'is the man', 'what is in the', 'what color', 'what are the', 'do you', 'what is the color of the', 'what time', 'was', 'is this a', 'how', 'which', 'who is', 'is this an', 'what is the name'}


#### Answer Types

In [12]:
print("Answer Types: ", answer_types)
print("Answer Type Counts: ", Counter(answertypes2count).most_common())
for t in list(answer_types):
    print("\nType '%s' Top 50 Answers %s" %(t, Counter(top_answers_per_type[t]).most_common(50)))

Answer Types:  {'yes/no', 'other', 'number'}
Answer Type Counts:  [('other', 219269), ('yes/no', 166882), ('number', 57606)]

Type 'yes/no' Top 50 Answers [('yes', 84615), ('no', 82263), ('africa', 1), ('not', 1), ('cutting apples', 1), ('cups', 1)]

Type 'other' Top 50 Answers [('white', 8915), ('blue', 5455), ('red', 5201), ('black', 5066), ('brown', 3814), ('green', 3750), ('yellow', 2792), ('gray', 2113), ('nothing', 1814), ('right', 1760), ('frisbee', 1641), ('baseball', 1597), ('left', 1563), ('none', 1562), ('tennis', 1502), ('wood', 1449), ('orange', 1425), ('bathroom', 1230), ('pizza', 1203), ('pink', 1201), ('kitchen', 1093), ('cat', 933), ('dog', 890), ('water', 888), ('man', 885), ('skateboarding', 884), ('grass', 879), ('skiing', 866), ('kite', 793), ('silver', 773), ('black and white', 766), ('surfing', 762), ('horse', 708), ('living room', 702), ('skateboard', 701), ('phone', 697), ('snow', 641), ('wii', 636), ('giraffe', 636), ('woman', 632), ('standing', 627), ('surfbo

#### Answers

In [13]:
print("# Unique Answers: ", len(multiple_choice_answers))
print("\nSome Answers: ", list(np.random.choice(list(multiple_choice_answers), 100)))
print("\nTop 100 Common Answers: ", Counter(answer2count).most_common(100))

# Unique Answers:  22531

Some Answers:  ['wakecom', 'motorcycles', 'gaming', 'hair nets', 'fifa', 'bluebird', '117', 'insert', 'alifeinbalancenet', '828', 'mile', 'salt and pepper shakers', 'holding her arm', 'uneaten', 'skull and bone', '7th player', 'on yogurt', 'bowness', '41', 'go potty', 'lazyboy', 'kitty hawk', 'delivered', 'bridget samuels', 'iberia', 'ratchaphakhinai road soi 1', '3.25', 'at her friend', 'rice, broccoli chicken', 'standard for equestrians for safety', 'italian', 'cedar st', 'carrot and asparagus', 'hanging out', 'bananas, peaches, strawberries, blueberries, grapes, apples, oranges', '6/12/2009', 'coronation', 'pepperidge farms', 'balancing', 'waveland', 'ripped', 'town', 'keep clean', 'keep clear', 'red house', 'american doughnut kitchen', 'printing', 'pia', '1000 pounds', '2 in front', "baltimore's best crab cake", 'dinner fork', 'ceiling', 'blue shirt', '60 watt', '2 stories', 'kickstands', 'war', 'covered it', 'about to fly', 'beer', 'yellow blue red', 'gre

## Dataset Creation

The subset will follow the same structure as the original VQA dataset. This is:

* Answer
    * Question Type
    * Majority Answer
    * Answer Type
    * Answer Candidates
        * Given Answer
        * Confidence
        * Answerer ID
        
        
* Question
    * Question
    * Image ID
   
   
* Images
    * ResNet Image Features (Size: 2048)
    

In order to train your models on your machine with a CPU (or if you have a GPU), we need to reduce the size of the Dataset. We will reduce the original dataset in the following way:
* 20k Q/A of answer type _yes/no_
* 20k Q/A of answer type _number_
* 20k Q/A of answer type _other_

The total number of Q/A will then be 60000. We will divide into training, validation and test split. The ratio between the splits will be approximately: 80%, 15%, 5% respectively.

In [14]:
start_time = time()
idx = list(range(0,len(qdata['questions'])))
random.seed(42)
random.shuffle(idx)

np.random.seed(42)
splits = ['train', 'valid', 'test']

n = 20000
qdata_small = {'questions': list()}
adata_small = {'annotations': list()}
a_type_counts = {'yes/no': 0, 'number': 0, 'other': 0}

while len(qdata_small['questions']) < 3*n:
    i = idx.pop()
    
    at = adata['annotations'][i]['answer_type'] 
    
    if a_type_counts[at] < n:
        
        if at == 'yes/no' and adata['annotations'][i]['multiple_choice_answer'] not in ['yes', 'no']:
            continue
            
        adata_small['annotations'].append(adata['annotations'][i])
        qdata_small['questions'].append(qdata['questions'][i])
        
        split = np.random.choice(splits, p=(.8, .15, .05))
        adata_small['annotations'][-1]['split'] = split
        qdata_small['questions'][-1]['split'] = split
        
        a_type_counts[at] += 1
        
# Tests
assert len(qdata_small['questions']) == len(adata_small['annotations']) == 3*n, "Inconsitent Lengths."
a_type_counts = {'yes/no': 0, 'number': 0, 'other': 0}
for ann in adata_small['annotations']:
    a_type_counts[ann['answer_type']] += 1
assert a_type_counts['yes/no'] == a_type_counts['number'] == a_type_counts['other'] == n, "Inconsistent Answer Type Lengths."

print("Data Creation Looks good! Time Taken %.2f" %(time()-start_time))

Data Creation Looks good! Time Taken 1.59


Let's look at some examples to verify this is the same data. Calculating the statistics again.

#### Annotations Small Dataset

In [15]:
print("# Datapoints: ", len(adata_small['annotations']))
print("\nDatapoint keys: ", adata_small['annotations'][0].keys())
print("\n#1: ", adata_small['annotations'][0])
print("\n#2: ", adata_small['annotations'][1])
print("\n#3: ", adata_small['annotations'][2])

# Datapoints:  60000

Datapoint keys:  dict_keys(['question_type', 'multiple_choice_answer', 'answers', 'image_id', 'answer_type', 'question_id', 'split'])

#1:  {'question_type': 'what', 'multiple_choice_answer': 'tea', 'answers': [{'answer': 'brunch', 'answer_confidence': 'maybe', 'answer_id': 1}, {'answer': 'tea', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'tea time', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'brunch', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'breakfast', 'answer_confidence': 'maybe', 'answer_id': 5}, {'answer': 'tea', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'teatime', 'answer_confidence': 'yes', 'answer_id': 7}, {'answer': 'lunch', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'reception', 'answer_confidence': 'maybe', 'answer_id': 9}, {'answer': 'breakfast', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 228478, 'answer_type': 'other', 'question_id': 228478002, 'split': 'train'}

#2:  

#### Questions Small Dataset

In [16]:
print("# Datapoints: ", len(qdata_small['questions']))
print("\nDatapoint keys: ", qdata_small['questions'][0].keys())
print("\n#1: ", qdata_small['questions'][0])
print("\n#2: ", qdata_small['questions'][1])
print("\n#3: ", qdata_small['questions'][2])

# Datapoints:  60000

Datapoint keys:  dict_keys(['image_id', 'question', 'question_id', 'split'])

#1:  {'image_id': 228478, 'question': 'What English meal is this likely for?', 'question_id': 228478002, 'split': 'train'}

#2:  {'image_id': 540769, 'question': 'Is there a bell on the train?', 'question_id': 540769000, 'split': 'test'}

#3:  {'image_id': 111756, 'question': 'What color is his uniform?', 'question_id': 111756005, 'split': 'train'}


### Dataset Statistics Small Dataset

In [17]:
question_types = set()
multiple_choice_answers = set()
answer2count = defaultdict(int)
answer_types = set()
answertypes2count = defaultdict(int)
top_answers_per_type = defaultdict(lambda: defaultdict(int))
for ann in adata_small['annotations']:
    question_types.add(ann['question_type'])
    
    multiple_choice_answers.add(ann['multiple_choice_answer'])
    
    answer2count[ann['multiple_choice_answer']] += 1
    answer_types.add(ann['answer_type'])
    
    answertypes2count[ann['answer_type']] += 1
    top_answers_per_type[ann['answer_type']][ann['multiple_choice_answer']] += 1

#### Quesiton Types Small Dataset

In [18]:
print("# Unique Question Types: ", len(question_types))
print(question_types)

# Unique Question Types:  65
{'what room is', 'is there', 'what are', 'where are the', 'is this person', 'what is the person', 'what is the man', 'what number is', 'are they', 'how many people are in', 'does the', 'what', 'does this', 'what is', 'is it', 'what animal is', 'why', 'what is the woman', 'why is the', 'what color are the', 'what is the name', 'what is the', 'none of the above', 'is there a', 'what sport is', 'can you', 'what is on the', 'how many people are', 'is the person', 'are', 'has', 'is the', 'is that a', 'where is the', 'what is this', 'is he', 'is this', 'is', 'is the woman', 'what kind of', 'could', 'are there any', 'what type of', 'what does the', 'how many', 'are there', 'are these', 'are the', 'what color is the', 'do', 'what color is', 'is the man', 'what is in the', 'what color', 'what are the', 'do you', 'what time', 'what is the color of the', 'is this a', 'how', 'was', 'which', 'who is', 'is this an', 'what brand'}


#### Answer Types Small Dataset

In [19]:
print("Answer Types: ", answer_types)
print("Answer Type Counts: ", Counter(answertypes2count).most_common())
for t in list(answer_types):
    print("\nType '%s' Top 50 Answers %s" %(t, Counter(top_answers_per_type[t]).most_common(50)))

Answer Types:  {'yes/no', 'other', 'number'}
Answer Type Counts:  [('other', 20000), ('yes/no', 20000), ('number', 20000)]

Type 'yes/no' Top 50 Answers [('yes', 10178), ('no', 9822)]

Type 'other' Top 50 Answers [('white', 823), ('red', 494), ('black', 460), ('blue', 449), ('green', 355), ('brown', 331), ('yellow', 266), ('gray', 190), ('right', 154), ('frisbee', 152), ('nothing', 151), ('left', 144), ('baseball', 134), ('none', 132), ('orange', 130), ('wood', 127), ('tennis', 123), ('pink', 119), ('pizza', 118), ('kitchen', 113), ('bathroom', 106), ('cat', 90), ('water', 86), ('dog', 85), ('skiing', 84), ('grass', 84), ('surfing', 80), ('skateboarding', 78), ('horse', 75), ('black and white', 74), ('kite', 73), ('surfboard', 72), ('silver', 71), ('man', 69), ('living room', 66), ('woman', 65), ('giraffe', 64), ('table', 63), ('wii', 61), ('apple', 58), ('snow', 58), ('phone', 57), ('skateboard', 56), ('hat', 56), ('broccoli', 54), ('snowboarding', 53), ('eating', 53), ('cow', 52), ('

#### Answers Small Dataset

In [20]:
print("# Unique Answers: ", len(multiple_choice_answers))
print("\nSome Answers: ", list(np.random.choice(list(multiple_choice_answers), 100)))
print("\nTop 100 Common Answers: ", Counter(answer2count).most_common(100))

# Unique Answers:  5691

Some Answers:  ['306', 'taste', 'yellow brick', '2 inches', 'weathered', 'ketchup and mustard', 'cowboy', 'return to newcis wrexham', 'lot', 'vw', 'garage door', 'watching tv', 'kitchenaid', 'styrofoam', 'sticks', 'cucumber', 'bracelet', 'knives', 'transportation', 'on man', 'owner', 'on bike', 'jets', 'kayaking', 'audrey hepburn', "it's fun", '500', '60', 'red white blue', 'leaf', 'hanes', 'front left', '120126-8', 'hotel samara', '35 mph', 'average', 'parking meter', 'peacock attack', 'marlins', 'copyright', 'raft', 'groom', 'lucky luke', 'no bed', 'man', '9:54', 'bnp parib', '11:05', 'sporting event', 'pizza rustica', 'photography', 'short', 'smoothie', 'burgers', '369', 'innova', 'g', 'bar', 'chinese', 'to grip board', 'condos', 'for display', 'between legs', '120', 'stop wash your hands', 'carrots and onions', 'overcast', 'vest', '215', 'in bag', 'tugboat', 'piper', 'ski poles', 'stadium', 'humans', 'dashes', 'landing', 'rural', 'riding horses', 'domestic'

## Saving

In [21]:
import gzip

### Splitting

In [22]:
qdata_small_splits = {\
                      'train': {'questions': list()}, 
                      'valid': {'questions': list()}, 
                      'test': {'questions': list()}
                     }

adata_small_splits = {\
                      'train': {'annotations': list()}, 
                      'valid': {'annotations': list()}, 
                      'test': {'annotations': list()}
                     }

for i in range(len(qdata_small['questions'])):
    
    split = qdata_small['questions'][i]['split']
    assert split == adata_small['annotations'][i]['split'], "Inconsistent Splits."
    assert adata_small['annotations'][i]['question_id'] == qdata_small['questions'][i]['question_id'], "Inconsistent IDs."
    
    qdata_small_splits[split]['questions'].append(qdata_small['questions'][i])
    adata_small_splits[split]['annotations'].append(adata_small['annotations'][i])
    
        
print("Training Set Size: %i" %(len(qdata_small_splits['train']['questions'])))
print("\nValidation Set Size: %i" %(len(qdata_small_splits['valid']['questions'])))
print("\nTest Set Size: %i" %(len(qdata_small_splits['test']['questions'])))

Training Set Size: 48061

Validation Set Size: 8977

Test Set Size: 2962


### Write out the files

In [34]:
for split in ['train', 'valid', 'test']:
    
    with gzip.GzipFile('data/vqa_annotatons_' + split + '.gzip', 'w') as file:
        file.write(json.dumps(adata_small_splits[split]).encode('utf-8'))
        
    with gzip.GzipFile('data/vqa_questions_' + split + '.gzip', 'w') as file:
        file.write(json.dumps(qdata_small_splits[split]).encode('utf-8'))

Get list of all image ids

In [35]:
image_ids = set()
for q in qdata_small['questions']:
    image_ids.add(q['image_id'])

image_ids_json = {'image_ids': list(image_ids)}
with open('data/image_ids_vqa.json', 'w') as file:
    json.dump(image_ids_json, file)

In [40]:
#read data
with gzip.open(DATA_PATH + 'vqa_questions_train.gzip', 'rb') as file:
    file_content = file.read().decode('utf-8')
    qdata_train = json.loads(file_content)

with gzip.GzipFile(DATA_PATH + 'vqa_annotatons_train.gzip', 'r') as file:
    adata_train = json.loads(file.read().decode('utf-8')) 
    
with gzip.open(DATA_PATH + 'vqa_questions_test.gzip', 'rb') as file:
    file_content = file.read().decode('utf-8')
    qdata_test = json.loads(file_content)
    
with gzip.GzipFile(DATA_PATH + 'vqa_annotatons_test.gzip', 'r') as file:
    adata_test = json.loads(file.read().decode('utf-8')) 
    


In [41]:
#print(qdata_train['questions'][0])
#print("#1: ", adata_train['annotations'][0])

question_types = set()
multiple_choice_answers = set()
answer2count = defaultdict(int)
answer_types = set()
answertypes2count = defaultdict(int)
top_answers_per_type = defaultdict(lambda: defaultdict(int))
for ann in adata_train['annotations']:
    question_types.add(ann['question_type'])
    
    multiple_choice_answers.add(ann['multiple_choice_answer'])
    
    answer2count[ann['multiple_choice_answer']] += 1
    answer_types.add(ann['answer_type'])
    
    answertypes2count[ann['answer_type']] += 1
    top_answers_per_type[ann['answer_type']][ann['multiple_choice_answer']] += 1

In [42]:
from collections import defaultdict
import time
import random
import torch
from torch.autograd import Variable
import torch.nn as nn

In [43]:
torch.manual_seed(1)

#w2i is the dict that change vocabulary used in question to index, exmpale: how are you -> {1,3,4}
w2i = defaultdict(lambda: len(w2i))

w2i['pad'] = 0
UNK = w2i["<unk>"]

#answer to index dictionary
ans_dict = {'pad': 0, '<unk>': 1}
#ans_dict = {'unk': 0}

#index to answer dictionary
rev_ans_dict = {'0': 'pad', '1': '<unk>'}
#rev_ans_dict = {'0': 'unk'}

#answer which occur more than 5 times, then it will be add to answer dictionary
def answer_to_idx(answer2count):
    count = 1
    for ans in answer2count: 
        if answer2count[ans] > 5:
            ans_dict[ans] = count
            rev_ans_dict[str(count)] = ans
            count = count + 1
    return ans_dict

#also include 'unk here', not sure if it's right???
def ans_to_onehot(ans_idx):
    #ans_idx is the integer index of an answer
    ans1h = np.zeros(no_ans).reshape(no_ans)
    ans1h[ans_idx] = 1
    return ans1h


#change vocabular in question sets to index
#it will return a list with every words in question changing to index and also the anser index
def read_data_with_answers(t_data, t_answer_data, training ):
    for idx in range(len(t_data)):
        question = t_data[idx]['question']
        question = question.lower().split("?",1)[0]
        
        question_with_pad = question.split(" ")
        question_lenth = len(question_with_pad)
        
        #adding pad token to the question until the length of the sentence is = 10
        for idx_1 in range(10-question_lenth):
            question_with_pad.append('pad')        
            
        ans_yield = 0
        
        if t_answer_data[idx]['multiple_choice_answer'] in ans_dict:
            ans_yield = ans_dict[t_answer_data[idx]['multiple_choice_answer']]
        
        if training == True:
            yield ([w2i[x] for x in question_with_pad], ans_yield)   
        else:
            yield ([w2i[x] if x in w2i else w2i["<unk>"] for x in question_with_pad ], ans_yield)
             


#answer vocabulary 
ans_dict = answer_to_idx(answer2count)

#number of answers which occur more than 5 times in training set
no_ans = len(ans_dict)

data_ans_train = list(read_data_with_answers(qdata_train['questions'], adata_train['annotations'], training = True))
w2i = defaultdict(lambda: UNK, w2i)
data_ans_test = list(read_data_with_answers(qdata_test['questions'], adata_test['annotations'], training = False))

nwords = len(w2i)
no_embeddings = 64

In [44]:
## ecekt
#FOR TRAINING on UNIQUE WORDS
import torch.optim as optim

if LSTM_TRAINED_WITH_CBOW_EMBEDDINGS or USING_CBOW:
    
    class CBOW(nn.Module):
        
        def __init__(self, vocab_size, embedding_dim, output_dim):
            super(CBOW, self).__init__()
            self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx = 0)
            self.linear = nn.Linear(embedding_dim, output_dim)
            self.bows = torch.FloatTensor(1, embedding_dim).zero_()

        def forward(self, inputs):
            embeds = self.embeddings(inputs)
            #print(embeds.size)
            bow = torch.sum(embeds, 1)
            self.bows = bow
            logits = self.linear(bow)
            return logits


    model = CBOW(nwords, no_embeddings, no_ans)
    print(model)


    def evaluate(model, data):
        """Evaluate a model on a data set."""
        correct = 0.0
 
        for words, tag in data:
            lookup_tensor = Variable(torch.LongTensor([words]))
            scores = model(lookup_tensor)
            predict = scores.data.numpy().argmax(axis=1)[0]

            if predict == tag:
                correct += 1

        return correct, len(data), correct/len(data), scores


    optimizer = optim.SGD(model.parameters(), lr=0.01)

    for ITER in range(50):

        #random.shuffle(data_ans_train)
        train_loss = 0.0
        start = time.time()
        correct = 0
        
        for words, tag in data_ans_train:
            # forward pass
            lookup_tensor = Variable(torch.LongTensor([words]))
            scores = model(lookup_tensor)
            loss = nn.CrossEntropyLoss()
            target = Variable(torch.LongTensor([tag]))
            
            predict = scores.data.numpy().argmax(axis=1)[0]
            if predict == tag:
                correct += 1
            
            
            output = loss(scores, target)
            train_loss += output.data[0]

            # backward pass
            model.zero_grad()
            output.backward()

            # update weights
            optimizer.step()

        print("iter %r: train loss/sent=%.4f, time=%.2fs" % 
              (ITER, train_loss/len(data_ans_train), time.time()-start))
        print('correct', correct/len(data_ans_train))
        
        


CBOW (
  (embeddings): Embedding(6941, 64, padding_idx=0)
  (linear): Linear (64 -> 532)
)
iter 0: train loss/sent=3.0422, time=64.71s
correct 0.32591914442063213
iter 1: train loss/sent=2.5146, time=60.34s
correct 0.35313455816566447
iter 2: train loss/sent=2.3563, time=58.32s
correct 0.36349639000436945
iter 3: train loss/sent=2.2627, time=59.89s
correct 0.37042508478808184
iter 4: train loss/sent=2.1952, time=59.53s
correct 0.37837331724267076
iter 5: train loss/sent=2.1420, time=65.89s
correct 0.3831380953371757
iter 6: train loss/sent=2.0976, time=61.27s
correct 0.3895258109485862
iter 7: train loss/sent=2.0592, time=76.78s
correct 0.39499802334533196
iter 8: train loss/sent=2.0250, time=80.64s
correct 0.40051184952456254
iter 9: train loss/sent=1.9940, time=59.84s
correct 0.40486048979421985
iter 10: train loss/sent=1.9654, time=59.55s
correct 0.40866815089157527
iter 11: train loss/sent=1.9387, time=79.86s
correct 0.4127879153575664
iter 12: train loss/sent=1.9137, time=68.08s
c

In [48]:
#after training, get word embedding layer through cbow, input should be a question_to_index_list(w2i?)
def get_word_embedding_layer(data):
    
    #get bow vector from each question, so question_bow will be a matrix 
    #each row of matrix is a vector coming from sum of word embeddings from each questions
    question_bow = np.zeros((len(data), no_embeddings))
    
    #iterate each question, so we can append each bow of question to question_bow
    for idx, words in enumerate(data):
        lookup_tensor = Variable(torch.LongTensor([words]))
        scores = model(lookup_tensor)
        question_bow[idx] = (model.bows.data.numpy())
    
    return question_bow

In [49]:
#test, get_word_embedding_layer get correct answer   
print((get_word_embedding_layer(list([data_ans_train[0][0]]))).shape)   #size 1x64

(1, 64)


In [50]:
#read from image
path_to_h5_file = DATA_PATH + 'VQA_image_features.h5'
path_to_json_file = DATA_PATH + 'VQA_img_features2id.json'

#get image feature from h5_id 
img_features = np.asarray(h5py.File(path_to_h5_file, 'r')['img_features'])

#get h5_id from image_id, which can see in the answer data 
with open(path_to_json_file, 'r') as f:
     visual_feat_mapping = json.load(f)['VQA_imgid2id']


In [51]:
#test, for example, using image_id = 228478 to get h5_id through visual_feat_mapping
h5_id = visual_feat_mapping[str(228478)]

#using h5_id to get img_feature though img_features
img_feat = img_features[h5_id]

In [82]:
# print(model.embeddings)
# print(model.embeddings.weight.data)
# print(type(model.embeddings.weight.data))
# word_embeddings = nn.Embedding(6941, 64, padding_idx = 0)
# word_embeddings.weight = nn.Parameter(model.embeddings.weight.data)

# print(word_embeddings)    

Embedding(6941, 64, padding_idx=0)

 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.9944 -0.2694 -0.6491  ...  -1.0952 -1.0703  0.6404
 1.0174  0.5826 -0.6679  ...   0.2636  0.0931  0.5431
          ...             ⋱             ...          
 0.5202  0.8511 -2.0028  ...   0.3447 -0.2092 -1.8826
-1.1139  0.0141  0.1658  ...  -0.4416 -1.0183 -0.4967
 1.2959  0.7685 -0.7355  ...  -0.2037 -0.6773 -0.0022
[torch.FloatTensor of size 6941x64]

<class 'torch.FloatTensor'>
Embedding(6941, 64, padding_idx=0)


In [83]:
if not USING_CBOW:

    class LSTMTagger(nn.Module):

        def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
            super(LSTMTagger, self).__init__()
            self.hidden_dim = hidden_dim

            if not LSTM_TRAINED_WITH_CBOW_EMBEDDINGS:
                self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx = 0)
    
            if LSTM_TRAINED_WITH_CBOW_EMBEDDINGS and UPDATE_CBOW_EMBEDDING:
                self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx = 0)
                self.word_embeddings.weight = nn.Parameter(model.embeddings.weight.data)
                          

            # The LSTM takes word embeddings as inputs, and outputs hidden states
            # with dimensionality hidden_dim.
            self.lstm = nn.LSTM(embedding_dim, hidden_dim)

            # The linear layer that maps from hidden state space to tag space
            self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
            self.hidden = self.init_hidden()

        def init_hidden(self):
            # Before we've done anything, we dont have any hidden state.
            # Refer to the Pytorch documentation to see exactly
            # why they have this dimensionality.
            # The axes semantics are (num_layers, minibatch_size, hidden_dim)
            return (torch.autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                    torch.autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

        def forward(self, sentence):

            if LSTM_TRAINED_WITH_CBOW_EMBEDDINGS:
                embeds = model.embeddings(sentence)
            else:
                embeds = self.word_embeddings(sentence)

            lstm_out, self.hidden = self.lstm(
                embeds.view(len(sentence), 1, -1), self.hidden)
            tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
            tag_scores = nn.functional.log_softmax(tag_space)
            return tag_scores

    LSTMmodel = LSTMTagger(no_embeddings, no_embeddings, nwords, no_ans)
    loss_function = nn.NLLLoss()
    #optimizer = optim.SGD(LSTMmodel.parameters(), lr=0.05)
    
    if UPDATE_CBOW_EMBEDDING:
            
        optimizer = optim.SGD([
                    {'params': LSTMmodel.word_embeddings.parameters(), 'lr': 1e-5},
                    {'params': LSTMmodel.lstm.parameters(), 'lr' : 1e-3},
                    {'params': LSTMmodel.hidden2tag.parameters(),'lr' : 1e-3}
                ], momentum=0.9)

    elif LSTM_TRAINED_WITH_CBOW_EMBEDDINGS:
        
        optimizer = optim.SGD([
                    {'params': LSTMmodel.lstm.parameters(), 'lr' : 1e-3},
                    {'params': LSTMmodel.hidden2tag.parameters(),'lr' : 1e-3}
                ], momentum=0.9)
    else:
        optimizer = optim.SGD(LSTMmodel.parameters(), lr=0.05)
        

    for epoch in range(1000):  # again, normally you would NOT do 300 epochs, it is toy data

        #random.shuffle(data_ans_train)
        train_loss = 0.0
        start = time.time()
        correct = 0.0
        #for words, tag in data_ans_train[0:10]:
        for random_idx in range(5000):        

            #shuffle data
            word_idx = random.randint(0,len(data_ans_train)-1)
            words = data_ans_train[word_idx][0]
            tag = data_ans_train[word_idx][1]

            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            LSTMmodel.zero_grad()

            # Also, we need to clear out the hidden state of the LSTM,
            # detaching it from its history on the last instance.
            LSTMmodel.hidden = LSTMmodel.init_hidden()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Variables of word indices.
            try:
                expand_length = words.index(0)
                words = words[0:words.index(0)]
            except ValueError as e:
                words = words[0:10]
                expand_length = 10

            lookup_tensor = Variable(torch.LongTensor([words])).view(-1) 
            targets = Variable(torch.LongTensor([tag])).expand(expand_length)

            # Step 3. Run our forward pass.
            tag_scores = LSTMmodel(lookup_tensor)#.view(1, 10, no_ans)
            tag_predict = tag_scores.data.numpy()[-1].argmax()

            if (tag_predict == tag):
                correct += 1


            loss = loss_function(tag_scores, targets)
            loss.backward()
            train_loss += loss.data[0]

            optimizer.step()

        if (correct/5000 > 0.85):
            break    

        if (epoch % 2 == 0):
            print("iter %r: train loss/sent=%.4f, time=%.2fs" % 
                      (epoch, train_loss/5000, time.time()-start))
            print('correct:', correct/5000)

iter 0: train loss/sent=3.9940, time=30.76s
correct: 0.2248
iter 2: train loss/sent=2.6706, time=26.88s
correct: 0.3774
iter 4: train loss/sent=2.5615, time=28.30s
correct: 0.3724
iter 6: train loss/sent=2.4466, time=28.80s
correct: 0.3886
iter 8: train loss/sent=2.4631, time=27.80s
correct: 0.3954
iter 10: train loss/sent=2.3978, time=27.77s
correct: 0.3864
iter 12: train loss/sent=2.4383, time=27.88s
correct: 0.374
iter 14: train loss/sent=2.4307, time=27.88s
correct: 0.3848
iter 16: train loss/sent=2.3356, time=23.62s
correct: 0.3958
iter 18: train loss/sent=2.3459, time=24.39s
correct: 0.4036
iter 20: train loss/sent=2.3495, time=27.70s
correct: 0.386
iter 22: train loss/sent=2.3547, time=23.59s
correct: 0.386
iter 24: train loss/sent=2.2767, time=23.40s
correct: 0.4094
iter 26: train loss/sent=2.3183, time=23.59s
correct: 0.3932
iter 28: train loss/sent=2.3306, time=23.50s
correct: 0.397
iter 30: train loss/sent=2.1954, time=25.07s
correct: 0.4098
iter 32: train loss/sent=2.2842, 

iter 268: train loss/sent=1.9110, time=34.27s
correct: 0.4738
iter 270: train loss/sent=1.9255, time=33.86s
correct: 0.4754
iter 272: train loss/sent=1.9282, time=30.50s
correct: 0.472
iter 274: train loss/sent=1.8929, time=30.97s
correct: 0.4786
iter 276: train loss/sent=1.8983, time=32.17s
correct: 0.4808
iter 278: train loss/sent=1.8814, time=32.18s
correct: 0.4716
iter 280: train loss/sent=1.9575, time=37.54s
correct: 0.4724
iter 282: train loss/sent=1.8942, time=34.29s
correct: 0.4762
iter 284: train loss/sent=1.8870, time=31.34s
correct: 0.4844
iter 286: train loss/sent=1.8903, time=31.97s
correct: 0.4804
iter 288: train loss/sent=1.9487, time=31.81s
correct: 0.475
iter 290: train loss/sent=1.9006, time=30.07s
correct: 0.4672
iter 292: train loss/sent=1.9336, time=32.40s
correct: 0.482
iter 294: train loss/sent=1.8735, time=32.95s
correct: 0.4898
iter 296: train loss/sent=1.9019, time=33.67s
correct: 0.4892
iter 298: train loss/sent=1.9129, time=33.55s
correct: 0.4654
iter 300: t

KeyboardInterrupt: 

In [None]:
#test LSTM result
if not USING_CBOW:
    for words, tag in data_ans_train[0:8]:

        LSTMmodel.hidden = LSTMmodel.init_hidden()
        try:
            expand_length = words.index(0)
            words = words[0:words.index(0)]
        except ValueError as e:
            words = words[0:10]
            expand_length = 10

        lookup_tensor1 = Variable(torch.LongTensor([words])).view(-1) 
        targets = Variable(torch.LongTensor([tag])).expand(expand_length)

        # Step 3. Run our forward pass.
        tag_scores = LSTMmodel(lookup_tensor1)#.view(1, 10, no_ans)
        tag_predict = tag_scores.data.numpy()[-1].argmax()
        print(tag_predict, tag)
    
    


In [None]:
#build MLP network

class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.out = torch.nn.Linear(n_hidden, n_output)   # output layer

    def forward(self, x):
        x = nn.functional.relu(self.hidden(x))      # activation function for hidden layer
        x = self.out(x)
        return x

In [None]:
if USING_CBOW:
    n_input = img_feat.shape[0] + no_embeddings
else:
    n_input = img_feat.shape[0] + no_ans

n_output = len(ans_dict)
n_hidden_size = 200
learning_rate = 0.001

word_img_model = Net(n_input, n_hidden_size, n_output)

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(word_img_model.parameters(), lr=learning_rate)

temp_x = Variable()
temp_y = Variable()

for ITER in range(100):
    
    train_loss = 0.0
    #loss = []
    #print('start!')
    count_err = 0
    #for idx, adata in enumerate(adata_train['annotations'][0:3]):
    for random_idx in range(5000):
        
        idx = random.randint(0,len(data_ans_train)-1)
        adata = adata_train['annotations'][idx]
                
        #get text vector:
        if USING_CBOW:
            #try to use cbow:
            question_word_vector = get_word_embedding_layer(list([data_ans_train[idx][0]])).reshape( no_embeddings, ) 
        else:
            #try to use LSTM        
            words = data_ans_train[idx][0]
            tag = data_ans_train[idx][1]
            LSTMmodel.hidden = LSTMmodel.init_hidden()
            try:
                expand_length = words.index(0)
                words = words[0:words.index(0)]
            except ValueError as e:
                words = words[0:10]
                expand_length = 10

            lookup_tensor1 = Variable(torch.LongTensor([words])).view(-1) 
            targets = Variable(torch.LongTensor([tag])).expand(expand_length)

            # Step 3. Run our forward pass.
            tag_scores = LSTMmodel(lookup_tensor1)#.view(1, 10, no_ans)
            question_word_vector = tag_scores.data.numpy()[-1]

            #LSTM word vecotr normalize
            question_word_vector = (question_word_vector - question_word_vector.mean()) / (np.max(question_word_vector) - np.min(question_word_vector))

        #get image vector
        h5_id = visual_feat_mapping[str(adata['image_id'])]        
        img_feat = img_features[h5_id]
        
        #concatenate word vecotr and image vector : (64,)+(2048,) = (2112,)    
        img_word_vector = np.concatenate((question_word_vector, img_feat), axis=0)
        
        #get answer's word index
        answer_index = 0 if not(adata['multiple_choice_answer'] in ans_dict) else ans_dict[adata['multiple_choice_answer']]
        
        output_vector = np.array([answer_index])#.reshape(1,1)
        x = Variable(torch.from_numpy(img_word_vector).type(torch.FloatTensor))#.cuda()
        y = Variable(torch.from_numpy(output_vector))#.cuda()        
        
        y_pred = word_img_model(x).view(1,n_output)
        if (ITER  % 5 == 0) and (random_idx < 5):
            print(rev_ans_dict[str(y_pred.data.numpy().argmax())], adata['multiple_choice_answer'])
        
        if rev_ans_dict[str(y_pred.data.numpy().argmax())] != adata['multiple_choice_answer']:
            count_err += 1
        #if idx % 1000 == 0:
        #    print('idx ' + str(idx))
        loss = loss_fn(y_pred, y)
            
        train_loss += loss.data[0]
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()   
    
    
    if (ITER  % 2 == 0):
        print('{:>5}'.format(ITER),' loss: ', train_loss/5000)#len(adata_train['annotations']))
        print(ITER,' err: ', count_err/5000)#len(adata_train['annotations']))
        #pass
        #print('{:>5}'.format(ITER),' loss: ', train_loss)



        

In [None]:
from IPython.display import Image
from IPython.core.display import HTML 

with open('./data/imgid2imginfo.json', 'r') as file:
    imgid2info = json.load(file)

def show_predict(idx, question = None):
    #idx = 1
    
    if question == None:       
        ans = adata_train['annotations'][idx]['multiple_choice_answer']
    else:
        ans = ''
        
    if question == None:
        question = qdata_train['questions'][idx]['question']
    img_id =  qdata_train['questions'][idx]['image_id']
    
    print(question)
    print('real answer: ',ans)
    
    #print(w2i['question'])
    
    if USING_CBOW:
        #CBOW word vector
        word_idx = []
        question_split = question.split(' ')
        for word in question_split:
            word_idx += [w2i[word]]
            
        question_word_vector = get_word_embedding_layer([word_idx]).reshape( no_embeddings, ) 
    else:
        words = data_ans_train[idx][0]
        tag = data_ans_train[idx][1]
        LSTMmodel.hidden = LSTMmodel.init_hidden()
        try:
            expand_length = words.index(0)
            words = words[0:words.index(0)]
        except ValueError as e:
            words = words[0:10]
            expand_length = 10

        lookup_tensor1 = Variable(torch.LongTensor([words])).view(-1) 
        targets = Variable(torch.LongTensor([tag])).expand(expand_length)

        # Step 3. Run our forward pass.
        tag_scores = LSTMmodel(lookup_tensor1)#.view(1, 10, no_ans)
        question_word_vector = tag_scores.data.numpy()[-1]

        #LSTM word vecotr normalize
        question_word_vector = (question_word_vector - question_word_vector.mean()) / (np.max(question_word_vector) - np.min(question_word_vector))
    
    
    
    #get image vector
    h5_id = visual_feat_mapping[str(img_id)]
    img_feat = img_features[h5_id]
    #print(img_feat)tag_scores
    #concatenate word vecotr and image vector
    img_word_vector = np.concatenate((question_word_vector, img_feat), axis=0)
    print(img_word_vector.shape, question_word_vector.shape, img_feat.shape)
    #predict the output
    x = Variable(torch.from_numpy(img_word_vector).type(torch.FloatTensor))
    predict_y = word_img_model(x)        
    print('model prediction: ', rev_ans_dict[str(predict_y.data.numpy().argmax())])
    #print(predict_y.max())
    '''
    #using image vector to predict the output
    zero_question_word_vector = np.zeros(no_embeddings,)
    img_word_vector = np.concatenate((zero_question_word_vector, img_feat), axis=0)
    x_img = Variable(torch.from_numpy(img_word_vector).type(torch.FloatTensor))#.cuda()
    predict_y_img = word_img_model(x_img)
    print('img: ', rev_ans_dict[str(predict_y_img.data.numpy().argmax())])
    '''
    #using word vecotr to predict the output
    zero_question_img_vector = np.zeros(2048,)
    img_word_vector = np.concatenate((question_word_vector, zero_question_img_vector), axis=0)
    x_word = Variable(torch.from_numpy(img_word_vector).type(torch.FloatTensor))#.cuda()
    predict_y_word = word_img_model(x_word)
    print('word: ', rev_ans_dict[str(predict_y_word.data.numpy().argmax())])

    display(Image(url= imgid2info[str(img_id)]['coco_url']))
for i in range(100):
    #show_predict(i, 'how many people are there')
    show_predict(i)
    