# Visual Qustion Answering Dataset

VQA Homepage http://visualqa.org/download.html

Annotations taken from [Training annotations 2017 v2.0](http://visualqa.org/data/mscoco/vqa/v2_Annotations_Train_mscoco.zip)

Questions taken from [Training questions 2017 v2.0](http://visualqa.org/data/mscoco/vqa/v2_Questions_Train_mscoco.zip)

![title](img/vqa_examples.jpg)

In [3]:
import json
import zipfile
import random
import numpy as np
import h5py
from collections import Counter, defaultdict
from time import time
from collections import defaultdict
import random
import torch
from torch.autograd import Variable
import torch.nn as nn
import gzip
import os

GENSIM = False
DATA_PATH = '..' + os.sep + 'NLP1-2017-VQA' + os.sep + 'data' + os.sep

if GENSIM:
    from gensim.models.keyedvectors import KeyedVectors
    # Load Google's pre-trained Word2Vec model.
    w2v_model = KeyedVectors.load_word2vec_format(DATA_PATH + 'GoogleNews-vectors-negative300.bin', binary=True, limit=200000)

In [4]:
with zipfile.ZipFile('./data/v2_Questions_Train_mscoco.zip', 'r') as file:
    qdata = json.load(file.open(file.namelist()[0]))

with zipfile.ZipFile('./data/v2_Annotations_Train_mscoco.zip', 'r') as file:
    adata = json.load(file.open(file.namelist()[0])) 

### Preprocessing

* Spelling correction (using Bing Speller) of question and answer strings
* Question normalization (first char uppercase, last char ‘?’)
* Answer normalization (all chars lowercase, no period except as decimal point, number words —> digits, strip articles (a, an the))
* Adding apostrophe if a contraction is missing it (e.g., convert "dont" to "don't")

## Data Exploration

### Annotation Data

In [5]:
print("# Datapoints: ", len(adata['annotations']))
print("Datapoint keys: ", adata['annotations'][0].keys())

# Datapoints:  443757
Datapoint keys:  dict_keys(['question_type', 'multiple_choice_answer', 'answers', 'image_id', 'answer_type', 'question_id'])


Let's look at some datapoints:

In [6]:
print("#1: ", adata['annotations'][0])
print("\n#2: ", adata['annotations'][1])
print("\n#3: ", adata['annotations'][2])

#1:  {'question_type': 'what is this', 'multiple_choice_answer': 'net', 'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 458752, 'answer_type': 'other', 'question_id': 458752000}

#2:  {'question_type': 'what', 'multiple_choice_answer': 'pitcher', 'answers': [{'answer': 'pitcher', 'answer_confidence': 'yes', 'answer_id': 1}, {'answer': 'catcher', 'answer_confidence': 'no', 'answer_

### Question Data

In [7]:
print("# Datapoints: ", len(qdata['questions']))
print("\nDatapoint keys: ", qdata['questions'][0].keys())

# Datapoints:  443757

Datapoint keys:  dict_keys(['image_id', 'question', 'question_id'])


Let's look at some datapoints

In [8]:
print("#1: ", qdata['questions'][0])
print("\n#2: ", qdata['questions'][1])
print("\n#3: ", qdata['questions'][2])

#1:  {'image_id': 458752, 'question': 'What is this photo taken looking through?', 'question_id': 458752000}

#2:  {'image_id': 458752, 'question': 'What position is this man playing?', 'question_id': 458752001}

#3:  {'image_id': 458752, 'question': 'What color is the players shirt?', 'question_id': 458752002}


### Dataset Statistics

In [9]:
question_types = set()
multiple_choice_answers = set()
answer2count = defaultdict(int)
answer_types = set()
answertypes2count = defaultdict(int)
top_answers_per_type = defaultdict(lambda: defaultdict(int))
for ann in adata['annotations']:
    question_types.add(ann['question_type'])
    
    multiple_choice_answers.add(ann['multiple_choice_answer'])
    
    answer2count[ann['multiple_choice_answer']] += 1
    answer_types.add(ann['answer_type'])
    
    answertypes2count[ann['answer_type']] += 1
    top_answers_per_type[ann['answer_type']][ann['multiple_choice_answer']] += 1
    
adata['annotations']

[{'answer_type': 'other',
  'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3},
   {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6},
   {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}],
  'image_id': 458752,
  'multiple_choice_answer': 'net',
  'question_id': 458752000,
  'question_type': 'what is this'},
 {'answer_type': 'other',
  'answers': [{'answer': 'pitcher',
    'answer_confidence': 'yes',
    'answer_id': 1},
   {'answer': 'catcher', 'answer_confidence': 'no', 'ans

#### Question Types

In [10]:
print("# Unique Question Types: ", len(question_types))
print(question_types)

# Unique Question Types:  65
{'does this', 'what sport is', 'is the woman', 'is he', 'what type of', 'what number is', 'what room is', 'are the', 'what is the color of the', 'what brand', 'is the', 'what is the man', 'do you', 'are there any', 'what time', 'is the person', 'does the', 'what is the woman', 'could', 'is', 'who is', 'what is in the', 'is this an', 'what is the', 'how many people are', 'what color', 'how many', 'is the man', 'what is on the', 'is this', 'are there', 'how many people are in', 'what is the name', 'can you', 'is this person', 'is it', 'what color are the', 'is this a', 'what', 'why is the', 'what color is', 'what color is the', 'is there a', 'was', 'are they', 'what kind of', 'which', 'do', 'what are', 'what is this', 'what does the', 'is there', 'where are the', 'why', 'has', 'are these', 'is that a', 'what is the person', 'how', 'none of the above', 'are', 'where is the', 'what are the', 'what is', 'what animal is'}


#### Answer Types

In [11]:
print("Answer Types: ", answer_types)
print("Answer Type Counts: ", Counter(answertypes2count).most_common())
for t in list(answer_types):
    print("\nType '%s' Top 50 Answers %s" %(t, Counter(top_answers_per_type[t]).most_common(50)))

Answer Types:  {'number', 'other', 'yes/no'}
Answer Type Counts:  [('other', 219269), ('yes/no', 166882), ('number', 57606)]

Type 'number' Top 50 Answers [('1', 12520), ('2', 12194), ('3', 6527), ('0', 4860), ('4', 4112), ('5', 2359), ('6', 1452), ('10', 972), ('7', 937), ('8', 907), ('12', 519), ('9', 514), ('20', 430), ('11', 360), ('15', 303), ('many', 258), ('25', 246), ('13', 242), ('30', 194), ('14', 175), ('50', 173), ('16', 147), ('100', 133), ('24', 127), ('18', 103), ('17', 82), ('40', 68), ('21', 64), ('200', 63), ('lot', 54), ('2 feet', 51), ('22', 51), ('19', 50), ('one way', 49), ('23', 49), ('27', 48), ('28', 45), ('38', 44), ('35', 42), ('10 feet', 38), ('55', 36), ('3 feet', 36), ('45', 33), ('26', 33), ('29', 31), ('60', 31), ('2010', 29), ('34', 27), ('32', 27), ('10:20', 27)]

Type 'other' Top 50 Answers [('white', 8915), ('blue', 5455), ('red', 5201), ('black', 5066), ('brown', 3814), ('green', 3750), ('yellow', 2792), ('gray', 2113), ('nothing', 1814), ('right', 

#### Answers

In [12]:
print("# Unique Answers: ", len(multiple_choice_answers))
print("\nSome Answers: ", list(np.random.choice(list(multiple_choice_answers), 100)))
print("\nTop 100 Common Answers: ", Counter(answer2count).most_common(100))

# Unique Answers:  22531

Some Answers:  ['putting it in', 'train car', 'floor', 'asmfc', 'lanyards', 'croissant', '1 hr', 'white, yellow and blue', 'wrinkled', 'first republic bank', 'duck crossing', 'lotic', 'garda', 'mexicana', 'clean-shaven', 'park', 'pimp hat', 'tomatoes and sausages', 'behind cat', 'wood burning', 'minor', 'sir', 'doberman', '89', 'doily', '5 pm', "he's jumping", 'indianapolis', '601', 'no currency', 'stationary store beauty clinic', '58666', 'stripe', 'not working', 'german', 'sleigh ride', 'bearded', 'lengthwise', 'not very warm', 'hockey', 'no red sign', 'flying toy plane', 'bali', 'biggby', 'behind toilet', 'traffic fines doubled in work zones', '3 for 20', 'another shirt', 'defrosting', 'airline tags', '404n19909', 'batting', 'newbury road no through road worth', 'signal boats', '11:40', 'skate park', 'white face', 'opening freezer', 'underline', '47832', 'out of service', 'stools', 'hornets', '5:48', '742', 'fort', 'trash bag', "st bride's avenue ec4", 'buy

## Dataset Creation

The subset will follow the same structure as the original VQA dataset. This is:

* Answer
    * Question Type
    * Majority Answer
    * Answer Type
    * Answer Candidates
        * Given Answer
        * Confidence
        * Answerer ID
        
        
* Question
    * Question
    * Image ID
   
   
* Images
    * ResNet Image Features (Size: 2048)
    

In order to train your models on your machine with a CPU (or if you have a GPU), we need to reduce the size of the Dataset. We will reduce the original dataset in the following way:
* 20k Q/A of answer type _yes/no_
* 20k Q/A of answer type _number_
* 20k Q/A of answer type _other_

The total number of Q/A will then be 60000. We will divide into training, validation and test split. The ratio between the splits will be approximately: 80%, 15%, 5% respectively.

In [13]:
start_time = time()
idx = list(range(0,len(qdata['questions'])))
random.seed(42)
random.shuffle(idx)

np.random.seed(42)
splits = ['train', 'valid', 'test']

n = 20000
qdata_small = {'questions': list()}
adata_small = {'annotations': list()}
a_type_counts = {'yes/no': 0, 'number': 0, 'other': 0}

while len(qdata_small['questions']) < 3*n:
    i = idx.pop()
    
    at = adata['annotations'][i]['answer_type'] 
    
    if a_type_counts[at] < n:
        
        if at == 'yes/no' and adata['annotations'][i]['multiple_choice_answer'] not in ['yes', 'no']:
            continue
            
        adata_small['annotations'].append(adata['annotations'][i])
        qdata_small['questions'].append(qdata['questions'][i])
        
        split = np.random.choice(splits, p=(.8, .15, .05))
        adata_small['annotations'][-1]['split'] = split
        qdata_small['questions'][-1]['split'] = split
        
        a_type_counts[at] += 1
        
# Tests
assert len(qdata_small['questions']) == len(adata_small['annotations']) == 3*n, "Inconsitent Lengths."
a_type_counts = {'yes/no': 0, 'number': 0, 'other': 0}
for ann in adata_small['annotations']:
    a_type_counts[ann['answer_type']] += 1
assert a_type_counts['yes/no'] == a_type_counts['number'] == a_type_counts['other'] == n, "Inconsistent Answer Type Lengths."

print("Data Creation Looks good! Time Taken %.2f" %(time()-start_time))

Data Creation Looks good! Time Taken 1.53


Let's look at some examples to verify this is the same data. Calculating the statistics again.

#### Annotations Small Dataset

In [14]:
print("# Datapoints: ", len(adata_small['annotations']))
print("\nDatapoint keys: ", adata_small['annotations'][0].keys())
print("\n#1: ", adata_small['annotations'][0])
print("\n#2: ", adata_small['annotations'][1])
print("\n#3: ", adata_small['annotations'][2])

# Datapoints:  60000

Datapoint keys:  dict_keys(['question_type', 'multiple_choice_answer', 'answers', 'image_id', 'answer_type', 'question_id', 'split'])

#1:  {'question_type': 'what', 'multiple_choice_answer': 'tea', 'answers': [{'answer': 'brunch', 'answer_confidence': 'maybe', 'answer_id': 1}, {'answer': 'tea', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'tea time', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'brunch', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'breakfast', 'answer_confidence': 'maybe', 'answer_id': 5}, {'answer': 'tea', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'teatime', 'answer_confidence': 'yes', 'answer_id': 7}, {'answer': 'lunch', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'reception', 'answer_confidence': 'maybe', 'answer_id': 9}, {'answer': 'breakfast', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 228478, 'answer_type': 'other', 'question_id': 228478002, 'split': 'train'}

#2:  

#### Questions Small Dataset

In [15]:
print("# Datapoints: ", len(qdata_small['questions']))
print("\nDatapoint keys: ", qdata_small['questions'][0].keys())
print("\n#1: ", qdata_small['questions'][0])
print("\n#2: ", qdata_small['questions'][1])
print("\n#3: ", qdata_small['questions'][2])

# Datapoints:  60000

Datapoint keys:  dict_keys(['image_id', 'question', 'question_id', 'split'])

#1:  {'image_id': 228478, 'question': 'What English meal is this likely for?', 'question_id': 228478002, 'split': 'train'}

#2:  {'image_id': 540769, 'question': 'Is there a bell on the train?', 'question_id': 540769000, 'split': 'test'}

#3:  {'image_id': 111756, 'question': 'What color is his uniform?', 'question_id': 111756005, 'split': 'train'}


### Dataset Statistics Small Dataset

In [16]:
question_types = set()
multiple_choice_answers = set()
answer2count = defaultdict(int)
answer_types = set()
answertypes2count = defaultdict(int)
top_answers_per_type = defaultdict(lambda: defaultdict(int))
for ann in adata_small['annotations']:
    question_types.add(ann['question_type'])
    
    multiple_choice_answers.add(ann['multiple_choice_answer'])
    
    answer2count[ann['multiple_choice_answer']] += 1
    answer_types.add(ann['answer_type'])
    
    answertypes2count[ann['answer_type']] += 1
    top_answers_per_type[ann['answer_type']][ann['multiple_choice_answer']] += 1

#### Quesiton Types Small Dataset

In [17]:
print("# Unique Question Types: ", len(question_types))
print(question_types)

# Unique Question Types:  65
{'does this', 'what sport is', 'is the woman', 'is he', 'what type of', 'what number is', 'what room is', 'are the', 'what brand', 'do you', 'is the', 'what is the man', 'what is the color of the', 'what animal is', 'are there any', 'what time', 'is the person', 'does the', 'what is the woman', 'could', 'is', 'who is', 'what is in the', 'is this an', 'what is the', 'how many people are', 'what color', 'how many', 'is the man', 'are there', 'is this', 'what is on the', 'what is the name', 'how many people are in', 'is it', 'what color are the', 'is this a', 'what', 'why is the', 'which', 'what color is the', 'what color is', 'what kind of', 'is there a', 'what are', 'do', 'are they', 'was', 'what is this', 'where are the', 'is there', 'why', 'what does the', 'has', 'are these', 'is that a', 'what is the person', 'how', 'none of the above', 'can you', 'are', 'where is the', 'what are the', 'what is', 'is this person'}


#### Answer Types Small Dataset

In [18]:
print("Answer Types: ", answer_types)
print("Answer Type Counts: ", Counter(answertypes2count).most_common())
for t in list(answer_types):
    print("\nType '%s' Top 50 Answers %s" %(t, Counter(top_answers_per_type[t]).most_common(50)))

Answer Types:  {'number', 'other', 'yes/no'}
Answer Type Counts:  [('other', 20000), ('yes/no', 20000), ('number', 20000)]

Type 'number' Top 50 Answers [('1', 4298), ('2', 4281), ('3', 2270), ('0', 1677), ('4', 1382), ('5', 817), ('6', 510), ('8', 337), ('7', 330), ('10', 321), ('12', 190), ('9', 170), ('11', 135), ('20', 134), ('15', 97), ('25', 89), ('13', 87), ('many', 81), ('30', 77), ('14', 61), ('50', 56), ('16', 53), ('24', 52), ('100', 50), ('18', 46), ('17', 35), ('21', 24), ('27', 23), ('200', 23), ('19', 19), ('2 feet', 19), ('40', 19), ('10 feet', 19), ('lot', 17), ('3 feet', 16), ('35', 16), ('22', 16), ('one way', 15), ('5 years', 14), ('23', 14), ('28', 13), ('2012', 12), ('55', 12), ('old', 12), ('38', 12), ('2016', 11), ('12:00', 11), ('10:20', 11), ('29', 10), ('2010', 10)]

Type 'other' Top 50 Answers [('white', 823), ('red', 494), ('black', 460), ('blue', 449), ('green', 355), ('brown', 331), ('yellow', 266), ('gray', 190), ('right', 154), ('frisbee', 152), ('nothi

#### Answers Small Dataset

In [19]:
print("# Unique Answers: ", len(multiple_choice_answers))
print("\nSome Answers: ", list(np.random.choice(list(multiple_choice_answers), 100)))
print("\nTop 100 Common Answers: ", Counter(answer2count).most_common(100))

# Unique Answers:  5691

Some Answers:  ['piper', 'yd02', '3:22', 'rolled', '10:47', 'singing', 'on counter', 'yamaha', 'rays', 'content', 'gothic', 'on rack', '3:40', '25 cents', 'va', 'slazeng', 'leaning', 'planes', 'talking', 'british', 'for cleaning', '500 feet', 'doorway', 'in front of him', 'river', '36', 'no pancakes', '176', 'miller', '120126-8', 'tuna fish', 'plants', 'feather', 'bottle opener', 'a4(e) a46', 'ski pole', 'white and pink', 'hanes', 'way', 'paint bucket', "i hate feeling like puppet for someone else's pleasure", 'hiding', 'advertisement', 'baked choco', 'freedom', 'cigarette', '11:12', 'licorice pieces', 'trim', 'ph-bea', 'no bridge', 'multiple', 'clothes', 'cab', 'banana and orange', 'person and dog', 'think', 'photography', '43034', '545', '1403', 'book', 'towed', 'goose', 'surfboards', '80', 'ruffle', 'eat cake', 'nintendo', 'keyboard, mouse, booklet', 'preparing to surf', 'inbound', 'no ladder', 'arms', 'tent', 'england', "it's raining", 'toilet', '108', 'to 

## Saving

In [20]:
import gzip

### Splitting

In [21]:
qdata_small_splits = {\
                      'train': {'questions': list()}, 
                      'valid': {'questions': list()}, 
                      'test': {'questions': list()}
                     }

adata_small_splits = {\
                      'train': {'annotations': list()}, 
                      'valid': {'annotations': list()}, 
                      'test': {'annotations': list()}
                     }

for i in range(len(qdata_small['questions'])):
    
    split = qdata_small['questions'][i]['split']
    assert split == adata_small['annotations'][i]['split'], "Inconsistent Splits."
    assert adata_small['annotations'][i]['question_id'] == qdata_small['questions'][i]['question_id'], "Inconsistent IDs."
    
    qdata_small_splits[split]['questions'].append(qdata_small['questions'][i])
    adata_small_splits[split]['annotations'].append(adata_small['annotations'][i])
    
        
print("Training Set Size: %i" %(len(qdata_small_splits['train']['questions'])))
print("\nValidation Set Size: %i" %(len(qdata_small_splits['valid']['questions'])))
print("\nTest Set Size: %i" %(len(qdata_small_splits['test']['questions'])))

Training Set Size: 48061

Validation Set Size: 8977

Test Set Size: 2962


### Write out the files

In [22]:
for split in ['train', 'valid', 'test']:
    
    with gzip.GzipFile('data/vqa_annotatons_' + split + '.gzip', 'w') as file:
        file.write(json.dumps(adata_small_splits[split]).encode('utf-8'))
        
    with gzip.GzipFile('data/vqa_questions_' + split + '.gzip', 'w') as file:
        file.write(json.dumps(qdata_small_splits[split]).encode('utf-8'))

Get list of all image ids

In [2]:
image_ids = set()
for q in qdata_small['questions']:
    image_ids.add(q['image_id'])

image_ids_json = {'image_ids': list(image_ids)}
with open('data/image_ids_vqa.json', 'w') as file:
    json.dump(image_ids_json, file)

NameError: name 'qdata_small' is not defined

In [4]:
#read data
with gzip.open(DATA_PATH + 'vqa_questions_train.gzip', 'rb') as file:
    file_content = file.read().decode('utf-8')
    qdata_train = json.loads(file_content)

with gzip.GzipFile(DATA_PATH + 'vqa_annotatons_train.gzip', 'r') as file:
    adata_train = json.loads(file.read().decode('utf-8')) 
    
with gzip.open(DATA_PATH + 'vqa_questions_test.gzip', 'rb') as file:
    file_content = file.read().decode('utf-8')
    qdata_test = json.loads(file_content)
    
with gzip.GzipFile(DATA_PATH + 'vqa_annotatons_test.gzip', 'r') as file:
    adata_test = json.loads(file.read().decode('utf-8')) 
    


In [5]:
#print(qdata_train['questions'][0])
#print("#1: ", adata_train['annotations'][0])

question_types = set()
multiple_choice_answers = set()
answer2count = defaultdict(int)
answer_types = set()
answertypes2count = defaultdict(int)
top_answers_per_type = defaultdict(lambda: defaultdict(int))
for ann in adata_train['annotations']:
    question_types.add(ann['question_type'])
    
    multiple_choice_answers.add(ann['multiple_choice_answer'])
    
    answer2count[ann['multiple_choice_answer']] += 1
    answer_types.add(ann['answer_type'])
    
    answertypes2count[ann['answer_type']] += 1
    top_answers_per_type[ann['answer_type']][ann['multiple_choice_answer']] += 1

In [6]:
from collections import defaultdict
import time
import random
import torch
from torch.autograd import Variable
import torch.nn as nn

In [10]:
torch.manual_seed(1)

#w2i is the dict that change vocabulary used in question to index, exmpale: how are you -> {1,3,4}
w2i = defaultdict(lambda: len(w2i))
UNK = w2i["<unk>"]

w2i['pad'] = 0
w2i['unk'] = 1

#answer to index dictionary
ans_dict = {'pad': 0, 'unk': 1}

#index to answer dictionary
rev_ans_dict = {'0': 'pad', '1': 'unk'}

#answer which occur more than 5 times, then it will be add to answer dictionary
def answer_to_idx(answer2count):
    count = 1
    for ans in answer2count: 
        if answer2count[ans] > 5:
            ans_dict[ans] = count
            rev_ans_dict[str(count)] = ans
            count = count + 1
    return ans_dict

#also include 'unk here', not sure if it's right???
def ans_to_onehot(ans_idx):
    #ans_idx is the integer index of an answer
    ans1h = np.zeros(no_ans).reshape(no_ans)
    ans1h[ans_idx] = 1
    return ans1h


#change vocabular in question sets to index
#it will return a list with every words in question changing to index and also the anser index
def read_data_with_answers(t_data, t_answer_data, training ):
    for idx in range(len(t_data)):
        question = t_data[idx]['question']
        question = question.lower().split("?",1)[0]
        
        question_with_pad = question.split(" ")
        question_lenth = len(question_with_pad)
        
        #adding pad token to the question until the length of the sentence is = 10
        for idx in range(10-question_lenth):
            question_with_pad.append('pad')        
            
        ans_yield = 0
        
        if t_answer_data[idx]['multiple_choice_answer'] in ans_dict:
            ans_yield = ans_dict[t_answer_data[idx]['multiple_choice_answer']]
        
        if training == True:
            yield ([w2i[x] for x in question_with_pad], ans_yield)   
        else:
            yield ([w2i[x] if x in w2i else w2i["<unk>"] for x in question_with_pad ], ans_yield)
             


#answer vocabulary 
ans_dict = answer_to_idx(answer2count)

#number of answers which occur more than 5 times in training set
no_ans = len(ans_dict)

data_ans_train = list(read_data_with_answers(qdata_train['questions'], adata_train['annotations'], training = True))
w2i = defaultdict(lambda: UNK, w2i)
data_ans_test = list(read_data_with_answers(qdata_test['questions'], adata_test['annotations'], training = False))

nwords = len(w2i)
no_embeddings = 128

In [11]:
print('length of ans_dict: ', len(ans_dict))
print('length of rev_ans_dict: ', len(rev_ans_dict))

print('data_ans_train[1](the word in question 2 transfering to index, and the answer to index: ', data_ans_train[1])
print('\'blue\' to index: ', ans_dict['blue'])
print('index \'1\' to word: ', rev_ans_dict['1'])
print('number of words in question set: ',len(w2i))
print([adata_train['annotations'][0]['multiple_choice_answer']])
print(ans_dict[adata_train['annotations'][0]['multiple_choice_answer']])
#test if pad works
print(data_ans_train[0])

length of ans_dict:  532
length of rev_ans_dict:  531
data_ans_train[1](the word in question 2 transfering to index, and the answer to index:  ([3, 10, 6, 11, 12, 0, 0, 0, 0, 0], 4)
'blue' to index:  2
index '1' to word:  tea
number of words in question set:  6942
['tea']
1
([3, 4, 5, 6, 7, 8, 9, 0, 0, 0], 3)


In [None]:
## ecekt
#FOR TRAINING on UNIQUE WORDS
import torch.optim as optim

if not GENSIM:
    class CBOW(nn.Module):
        
        def __init__(self, vocab_size, embedding_dim, output_dim):
            super(CBOW, self).__init__()
            self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx = 0)
            self.linear = nn.Linear(embedding_dim, output_dim)
            self.bows = torch.FloatTensor(1, embedding_dim).zero_()

        def forward(self, inputs):
            embeds = self.embeddings(inputs)
            bow = torch.sum(embeds, 1)
            self.bows = bow
            logits = self.linear(bow)
            return logits


    model = CBOW(nwords, no_embeddings, no_ans)
    print(model)


    def evaluate(model, data):
        """Evaluate a model on a data set."""
        correct = 0.0
 
        for words, tag in data:
            lookup_tensor = Variable(torch.LongTensor([words]))
            scores = model(lookup_tensor)
            predict = scores.data.numpy().argmax(axis=1)[0]

            if predict == tag:
                correct += 1

        return correct, len(data), correct/len(data), scores


    optimizer = optim.SGD(model.parameters(), lr=0.01)

    for ITER in range(10):

        random.shuffle(data_ans_train)
        train_loss = 0.0
        start = time.time()

        for words, tag in data_ans_train:
            # forward pass
            lookup_tensor = Variable(torch.LongTensor([words]))
            scores = model(lookup_tensor)
            loss = nn.CrossEntropyLoss()
            target = Variable(torch.LongTensor([tag]))
            output = loss(scores, target)
            train_loss += output.data[0]

            # backward pass
            model.zero_grad()
            output.backward()

            # update weights
            optimizer.step()

        print("iter %r: train loss/sent=%.4f, time=%.2fs" % 
              (ITER, train_loss/len(data_ans_train), time.time()-start))

        # evaluate
        #_,_,acc = evaluate(model, data_ans_train[0:5])
        if train_loss/len(data_ans_train) < 0.8:
            break
        
        


CBOW (
  (embeddings): Embedding(6942, 128, padding_idx=0)
  (linear): Linear (128 -> 532)
)


In [None]:
#after training, get word embedding layer through cbow, input should be a question_to_index_list(w2i?)
def get_word_embedding_layer(data):
    
    #get bow vector from each question, so question_bow will be a matrix 
    #each row of matrix is a vector coming from sum of word embeddings from each questions
    question_bow = np.zeros((len(data), no_embeddings))
    
    #iterate each question, so we can append each bow of question to question_bow
    for idx, words in enumerate(data):
        lookup_tensor = Variable(torch.LongTensor([words]))
        scores = model(lookup_tensor)
        question_bow[idx] = (model.bows.data.numpy())
    
    return question_bow


In [67]:
#test, get_word_embedding_layer get correct answer

test_list = []
for words, tag in data_ans_train[0:1]:
    print(type(data_ans_train[0]))
    print(list([data_ans_train[0][0]]))
    print(list([[data_ans_train[0][0][0]]]))
    test_list.append(words)
    print(test_list)

testt = 0
for idx in range(10):
    testt += get_word_embedding_layer(list([[data_ans_train[0][0][idx]]]))[0][1]
    #print(testt) 
    
#should return 2*65 matrix, and each row of matrix representing the sum of word embedding from each question
#for example: data_ans_train[0] get the 1*65 vector, get_word_embedding_layer[0]
#data_ans_train[1] get the 1*65 vector, get_word_embedding_layer[1]
print(get_word_embedding_layer(list([data_ans_train[0][0]])))              

<class 'tuple'>
[[3, 4, 5, 6, 7, 8, 9, 0, 0, 0]]
[[3]]
[[3, 4, 5, 6, 7, 8, 9, 0, 0, 0]]
[[-1.61548305 -2.07910514 -0.11669727 -2.22792268 -2.53716707 -0.26058912
  -1.24331737  4.34085369 -0.66317624 -1.63041115  2.36206007 -0.37234175
   1.80023277 -2.86193752  3.01508331 -2.57743907  0.30831712  1.1135056
  -1.45923674 -1.35851622  3.79527664  3.30250001 -0.17097592  1.11875939
   1.35550725  0.38081491  0.28333387 -1.35274363  1.9018538  -1.07246399
   0.11453389 -0.95025915 -0.32716173  1.59819341 -1.49640918 -0.89957869
  -2.44294024 -1.35186863 -2.2796948   2.23046041  0.93174827  2.602319
   3.94282389  5.30142689 -1.79519558  0.21656412 -2.32232308 -0.29674119
   3.86593056  2.95946479  0.57894468  0.74976832 -0.56801325  1.33837521
   0.55932957  0.37691545 -1.60357571 -4.47538567 -0.74917185 -4.18814659
  -1.77070522  0.6368956  -0.86265135 -1.88297462]]


In [68]:
#read from image
path_to_h5_file = DATA_PATH + 'VQA_image_features.h5'
path_to_json_file = DATA_PATH + 'VQA_img_features2id.json'

#get image feature from h5_id 
img_features = np.asarray(h5py.File(path_to_h5_file, 'r')['img_features'])

#get h5_id from image_id, which can see in the answer data 
with open(path_to_json_file, 'r') as f:
     visual_feat_mapping = json.load(f)['VQA_imgid2id']


In [69]:
#test, for example, using image_id = 228478 to get h5_id through visual_feat_mapping
h5_id = visual_feat_mapping[str(228478)]

#using h5_id to get img_feature though img_features
img_feat = img_features[h5_id]
print(img_feat.shape)
print(img_features.shape)
print(adata_train['annotations'][0])
print(0 if not('ss' in ans_dict) else ans_dict['ss'])

(2048,)
(39423, 2048)
{'question_type': 'what', 'multiple_choice_answer': 'tea', 'answers': [{'answer': 'brunch', 'answer_confidence': 'maybe', 'answer_id': 1}, {'answer': 'tea', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'tea time', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'brunch', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'breakfast', 'answer_confidence': 'maybe', 'answer_id': 5}, {'answer': 'tea', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'teatime', 'answer_confidence': 'yes', 'answer_id': 7}, {'answer': 'lunch', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'reception', 'answer_confidence': 'maybe', 'answer_id': 9}, {'answer': 'breakfast', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 228478, 'answer_type': 'other', 'question_id': 228478002, 'split': 'train'}
0


In [None]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx = 0)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                torch.autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = nn.functional.log_softmax(tag_space)
        return tag_scores
    
#LSTMmodel = LSTMTagger(no_embeddings, no_embeddings, nwords, no_ans)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(LSTMmodel.parameters(), lr=0.05)



for epoch in range(700):  # again, normally you would NOT do 300 epochs, it is toy data

    random.shuffle(data_ans_train)
    train_loss = 0.0
    start = time.time()
    FIRST = True
    for words, tag in data_ans_train[0:10000]:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        LSTMmodel.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        LSTMmodel.hidden = LSTMmodel.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        if len(words) > 10:
            words = words[0:10]
        
        lookup_tensor = Variable(torch.LongTensor([words])).view(-1) 
        targets = Variable(torch.LongTensor([tag]))#.expand(10)

        # Step 3. Run our forward pass.
        tag_scores = LSTMmodel(lookup_tensor).view(1, 10, no_ans)
        #tag_scores = LSTMmodel(lookup_tensor).view(10, no_ans)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        try:
            question_lengths = Variable(torch.LongTensor([words.index(0)]))
        except ValueError:
            question_lengths = Variable(torch.LongTensor([10]))    
        
        last_out = torch.gather(tag_scores, 1, question_lengths.view(-1,1,1).expand(1,1,no_ans)-1)
        last_out = last_out.view(1, no_ans)
        '''
        if (epoch % 10 == 0) and FIRST:
            print(last_out.data.numpy().argmax())
            print(targets)
            print('--------')
            FIRST = False
        '''   
        loss = loss_function(last_out, targets)
        loss.backward()
        train_loss += loss.data[0]
            
        optimizer.step()
        
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % 
              (epoch, train_loss/1000, time.time()-start))

iter 0: train loss/sent=0.2142, time=134.02s
iter 1: train loss/sent=0.2667, time=135.90s
iter 2: train loss/sent=0.2390, time=138.03s
iter 3: train loss/sent=0.2779, time=134.83s
iter 4: train loss/sent=0.2120, time=134.06s
iter 5: train loss/sent=0.2200, time=133.79s
iter 6: train loss/sent=0.2421, time=120.64s
iter 7: train loss/sent=0.2552, time=124.84s
iter 8: train loss/sent=0.2651, time=132.31s
iter 9: train loss/sent=0.1893, time=133.49s
iter 10: train loss/sent=0.2388, time=133.25s
iter 11: train loss/sent=0.1923, time=135.08s
iter 12: train loss/sent=0.1972, time=114.40s
iter 13: train loss/sent=0.2066, time=103.45s
iter 14: train loss/sent=0.1937, time=128.69s
iter 15: train loss/sent=0.2152, time=110.82s
iter 16: train loss/sent=0.1991, time=107.92s
iter 17: train loss/sent=0.2001, time=131.08s
iter 18: train loss/sent=0.1978, time=134.07s
iter 19: train loss/sent=0.2244, time=133.96s
iter 20: train loss/sent=0.1878, time=160.78s
iter 21: train loss/sent=0.1563, time=127.18

iter 178: train loss/sent=9.6488, time=106.86s
iter 179: train loss/sent=10.0311, time=94.16s
iter 180: train loss/sent=10.0201, time=94.11s
iter 181: train loss/sent=10.0540, time=99.09s
iter 182: train loss/sent=10.4593, time=109.37s
iter 183: train loss/sent=10.0220, time=103.79s
iter 184: train loss/sent=9.9957, time=105.55s
iter 185: train loss/sent=9.7708, time=94.06s
iter 186: train loss/sent=9.8117, time=94.01s
iter 187: train loss/sent=9.4738, time=94.44s
iter 188: train loss/sent=9.5468, time=94.78s
iter 189: train loss/sent=9.8110, time=94.27s
iter 190: train loss/sent=9.8744, time=97.05s
iter 191: train loss/sent=10.1701, time=111.89s
iter 192: train loss/sent=10.0716, time=108.79s
iter 193: train loss/sent=10.1672, time=98.20s
iter 194: train loss/sent=9.9350, time=111.94s
iter 195: train loss/sent=9.6583, time=107.17s
iter 196: train loss/sent=9.5431, time=100.07s
iter 197: train loss/sent=9.5795, time=108.83s
iter 198: train loss/sent=9.6131, time=94.97s
iter 199: train 

KeyboardInterrupt: 

In [None]:
#get lstm result
def get_lstm_result(data):            
    question_lstm_result = np.zeros((len(data), no_ans))
    
    #transfer lstm result to numpy 
    for idx, words in enumerate(data):
        
        if len(words) > 10:
            words = words[0:10]        
        
        lookup_tensor = Variable(torch.LongTensor([words])).view(-1) 
        tag_scores = LSTMmodel(lookup_tensor).view(1, 10, no_ans)
        
        try:
            question_lengths = Variable(torch.LongTensor([words.index(0)]))
        except ValueError:
            question_lengths = Variable(torch.LongTensor([10]))    
        
        last_out = torch.gather(tag_scores, 1, question_lengths.view(-1,1,1).expand(1,1,no_ans)-1)
        question_lstm_result[idx] = last_out.view(1,no_ans).data.numpy()
        #print(question_lstm_result.argmax())
    
    return question_lstm_result

correct = 0
for idx in range(1000):
    if (data_ans_train[idx][1] == get_lstm_result(list([data_ans_train[idx][0]])).argmax()):
        correct += 1
        
print(correct/1000)


In [None]:
#build network
class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.out = torch.nn.Linear(n_hidden, n_output)   # output layer

    def forward(self, x):
        x = nn.functional.relu(self.hidden(x))      # activation function for hidden layer
        x = self.out(x)
        return x

In [None]:
n_input = img_feat.shape[0] + no_embeddings
n_output = len(ans_dict)
n_hidden_size = 200
learning_rate = 0.01

word_img_model = Net(n_input, n_hidden_size, n_output)

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(word_img_model.parameters(), lr=learning_rate)

temp_x = Variable()
temp_y = Variable()

for ITER in range(600):
    
    train_loss = 0.0
    #loss = []
    #print('start!')
    count_err = 0
    for idx, adata in enumerate(adata_train['annotations']):
        #preparing input
        #if idx % 100 == 0:
        #    print(idx)
        
        #get text vector:
        #question = qdata_train['questions'][idx]['question']
        #data_ans_train is a list which contain question word to index, and tag(answers to index)
        #only use data_ans_train[idx][0] to get the list of question word to index
        #question word_vecotr is got from the matrix from cbow, and sum it up
        question_word_vector = get_word_embedding_layer(list([data_ans_train[idx][0]])).reshape( no_embeddings, )       
        
        #get image vector
        h5_id = visual_feat_mapping[str(adata['image_id'])]        
        img_feat = img_features[h5_id]
        
        #concatenate word vecotr and image vector : (64,)+(2048,) = (2112,)    
        img_word_vector = np.concatenate((question_word_vector, img_feat), axis=0)
        
        #output_vector = np.zeros((1,1))
        #output_vector[ans_to_idx_dict[adata['multiple_choice_answer']]] = 1
        answer_index = 0 if not(adata['multiple_choice_answer'] in ans_dict) else ans_dict[adata['multiple_choice_answer']]
        output_vector = np.array([answer_index])#.reshape(1,1)
        x = Variable(torch.from_numpy(img_word_vector).type(torch.FloatTensor))#.cuda()
        y = Variable(torch.from_numpy(output_vector))#.cuda()        
        
        y_pred = word_img_model(x).view(1,n_output)
        if (ITER  % 100 == 0) or (ITER == 999):
            pass
            #print(question,' image id:', (adata['image_id']))
            #print(ans_soft_max[y_pred.data.numpy().argmax()], adata['multiple_choice_answer'])
            
        if rev_ans_dict[str(y_pred.data.numpy().argmax())] != adata['multiple_choice_answer']:
            count_err += 1
        #if idx % 1000 == 0:
        #    print('idx ' + str(idx))
        loss = loss_fn(y_pred, y)
            
        train_loss += loss.data[0]
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()   
        
        #optimizer.zero_grad()
        #loss.backward()
        #optimizer.step()       
        #loss.append()
    
    
    if (ITER  % 100 == 0):
        print('{:>5}'.format(ITER),' loss: ', train_loss)
        print('err: ', count_err)
        pass
        #print('{:>5}'.format(ITER),' loss: ', train_loss)



        

In [None]:
from IPython.display import Image
from IPython.core.display import HTML 

with open('./data/imgid2imginfo.json', 'r') as file:
    imgid2info = json.load(file)

def show_predict(idx, question = None):
    #idx = 1
    
    if question == None:       
        ans = adata_train['annotations'][idx]['multiple_choice_answer']
    else:
        ans = ''
        
    if question == None:
        question = qdata_train['questions'][idx]['question']
    img_id =  qdata_train['questions'][idx]['image_id']
    
    print(question)
    print(ans)
    word_idx = []
    question_split = question.split(' ')
    for word in question_split:
        word_idx += [w2i[word]]
    #print(w2i['question'])
    if question == None:
        question_word_vector = get_word_embedding_layer(list([data_ans_train[idx][0]])).reshape( no_embeddings, ) 
    else:
        question_word_vector = get_word_embedding_layer([word_idx]).reshape( no_embeddings, ) 
    #get image vector
    h5_id = visual_feat_mapping[str(img_id)]
    img_feat = img_features[h5_id]
    #print(img_feat)
    #concatenate word vecotr and image vector
    img_word_vector = np.concatenate((question_word_vector, img_feat), axis=0)
    
    #predict the output
    x = Variable(torch.from_numpy(img_word_vector).type(torch.FloatTensor))
    predict_y = word_img_model(x)        
    print('model: ', rev_ans_dict[str(predict_y.data.numpy().argmax())])
    #print(predict_y.max())
   
    #using image vector to predict the output
    zero_question_word_vector = np.zeros(no_embeddings,)
    img_word_vector = np.concatenate((zero_question_word_vector, img_feat), axis=0)
    x_img = Variable(torch.from_numpy(img_word_vector).type(torch.FloatTensor))#.cuda()
    predict_y_img = word_img_model(x_img)
    print('img: ', rev_ans_dict[str(predict_y_img.data.numpy().argmax())])
    
    #using word vecotr to predict the output
    zero_question_img_vector = np.zeros(2048,)
    img_word_vector = np.concatenate((question_word_vector, zero_question_img_vector), axis=0)
    x_word = Variable(torch.from_numpy(img_word_vector).type(torch.FloatTensor))#.cuda()
    predict_y_word = word_img_model(x_word)
    print('word: ', rev_ans_dict[str(predict_y_word.data.numpy().argmax())])

    display(Image(url= imgid2info[str(img_id)]['coco_url']))
for i in range(100):
    show_predict(i, 'how many people are there')
    #show_predict(i)
    