# Visual Qustion Answering Dataset

VQA Homepage http://visualqa.org/download.html

Annotations taken from [Training annotations 2017 v2.0](http://visualqa.org/data/mscoco/vqa/v2_Annotations_Train_mscoco.zip)

Questions taken from [Training questions 2017 v2.0](http://visualqa.org/data/mscoco/vqa/v2_Questions_Train_mscoco.zip)

![title](img/vqa_examples.jpg)

In [47]:
import json
import zipfile
import random
import numpy as np
import h5py
from collections import Counter, defaultdict
from time import time
from collections import defaultdict
import time
import random
import torch
from torch.autograd import Variable
import torch.nn as nn
import gzip
import os

GENSIM = True
DATA_PATH = '..' + os.sep + 'NLP1-2017-VQA' + os.sep + 'data' + os.sep

if GENSIM:
    from gensim.models.keyedvectors import KeyedVectors
    # Load Google's pre-trained Word2Vec model.
    w2v_model = KeyedVectors.load_word2vec_format(DATA_PATH + 'GoogleNews-vectors-negative300.bin', binary=True, limit=100000)

In [None]:
with zipfile.ZipFile('./data/v2_Questions_Train_mscoco.zip', 'r') as file:
    qdata = json.load(file.open(file.namelist()[0]))

with zipfile.ZipFile('./data/v2_Annotations_Train_mscoco.zip', 'r') as file:
    adata = json.load(file.open(file.namelist()[0])) 

### Preprocessing

* Spelling correction (using Bing Speller) of question and answer strings
* Question normalization (first char uppercase, last char ‘?’)
* Answer normalization (all chars lowercase, no period except as decimal point, number words —> digits, strip articles (a, an the))
* Adding apostrophe if a contraction is missing it (e.g., convert "dont" to "don't")

## Data Exploration

### Annotation Data

In [1]:
print("# Datapoints: ", len(adata['annotations']))
print("Datapoint keys: ", adata['annotations'][0].keys())

NameError: name 'adata' is not defined

Let's look at some datapoints:

In [4]:
print("#1: ", adata['annotations'][0])
print("\n#2: ", adata['annotations'][1])
print("\n#3: ", adata['annotations'][2])

#1:  {'question_type': 'what is this', 'multiple_choice_answer': 'net', 'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 458752, 'answer_type': 'other', 'question_id': 458752000}

#2:  {'question_type': 'what', 'multiple_choice_answer': 'pitcher', 'answers': [{'answer': 'pitcher', 'answer_confidence': 'yes', 'answer_id': 1}, {'answer': 'catcher', 'answer_confidence': 'no', 'answer_

### Question Data

In [5]:
print("# Datapoints: ", len(qdata['questions']))
print("\nDatapoint keys: ", qdata['questions'][0].keys())

# Datapoints:  443757

Datapoint keys:  dict_keys(['image_id', 'question', 'question_id'])


Let's look at some datapoints

In [6]:
print("#1: ", qdata['questions'][0])
print("\n#2: ", qdata['questions'][1])
print("\n#3: ", qdata['questions'][2])

#1:  {'image_id': 458752, 'question': 'What is this photo taken looking through?', 'question_id': 458752000}

#2:  {'image_id': 458752, 'question': 'What position is this man playing?', 'question_id': 458752001}

#3:  {'image_id': 458752, 'question': 'What color is the players shirt?', 'question_id': 458752002}


### Dataset Statistics

In [7]:
question_types = set()
multiple_choice_answers = set()
answer2count = defaultdict(int)
answer_types = set()
answertypes2count = defaultdict(int)
top_answers_per_type = defaultdict(lambda: defaultdict(int))
for ann in adata['annotations']:
    question_types.add(ann['question_type'])
    
    multiple_choice_answers.add(ann['multiple_choice_answer'])
    
    answer2count[ann['multiple_choice_answer']] += 1
    answer_types.add(ann['answer_type'])
    
    answertypes2count[ann['answer_type']] += 1
    top_answers_per_type[ann['answer_type']][ann['multiple_choice_answer']] += 1

#### Question Types

In [8]:
print("# Unique Question Types: ", len(question_types))
print(question_types)

# Unique Question Types:  65
{'is that a', 'is this', 'does this', 'is', 'what color is', 'is the person', 'what does the', 'what is the name', 'what sport is', 'what animal is', 'how many people are in', 'what number is', 'what is', 'is this an', 'what is in the', 'was', 'what color is the', 'what is the man', 'are these', 'what room is', 'is it', 'what', 'is the man', 'is there a', 'is he', 'what is on the', 'can you', 'what kind of', 'is this a', 'none of the above', 'why is the', 'what brand', 'where is the', 'are there any', 'which', 'what is the color of the', 'what color are the', 'what is the woman', 'who is', 'do', 'are the', 'what type of', 'what color', 'what are the', 'what time', 'is this person', 'are there', 'are', 'what is the person', 'what is this', 'how many people are', 'is the', 'where are the', 'could', 'has', 'how many', 'why', 'is there', 'what are', 'are they', 'do you', 'does the', 'is the woman', 'how', 'what is the'}


#### Answer Types

In [9]:
print("Answer Types: ", answer_types)
print("Answer Type Counts: ", Counter(answertypes2count).most_common())
for t in list(answer_types):
    print("\nType '%s' Top 50 Answers %s" %(t, Counter(top_answers_per_type[t]).most_common(50)))

Answer Types:  {'other', 'number', 'yes/no'}
Answer Type Counts:  [('other', 219269), ('yes/no', 166882), ('number', 57606)]

Type 'other' Top 50 Answers [('white', 8915), ('blue', 5455), ('red', 5201), ('black', 5066), ('brown', 3814), ('green', 3750), ('yellow', 2792), ('gray', 2113), ('nothing', 1814), ('right', 1760), ('frisbee', 1641), ('baseball', 1597), ('left', 1563), ('none', 1562), ('tennis', 1502), ('wood', 1449), ('orange', 1425), ('bathroom', 1230), ('pizza', 1203), ('pink', 1201), ('kitchen', 1093), ('cat', 933), ('dog', 890), ('water', 888), ('man', 885), ('skateboarding', 884), ('grass', 879), ('skiing', 866), ('kite', 793), ('silver', 773), ('black and white', 766), ('surfing', 762), ('horse', 708), ('living room', 702), ('skateboard', 701), ('phone', 697), ('snow', 641), ('wii', 636), ('giraffe', 636), ('woman', 632), ('standing', 627), ('surfboard', 622), ('eating', 607), ('cake', 601), ('food', 599), ('apple', 586), ('sunny', 584), ('broccoli', 572), ('table', 564),

#### Answers

In [10]:
print("# Unique Answers: ", len(multiple_choice_answers))
print("\nSome Answers: ", list(np.random.choice(list(multiple_choice_answers), 100)))
print("\nTop 100 Common Answers: ", Counter(answer2count).most_common(100))

# Unique Answers:  22531

Some Answers:  ['congress', 'colonials', 'dragonair', 'african american', 'cigarette', 'comic', 'chiquita and del monte', 'tilted', '3 days', 'cosmic ln', 'tennis clothes', 'emmanuel n photo', 'supply', 'mon-sat 8am-6pm', '007', 'love seat', 'medical', 'posing for photo', 'because they slaughter them for meat', 'syrup', 'changes in traffic', 'circus', 'green bay', 'airplanes', '488', 'taos', '2:14', 'coke and water', 'v', 'paddle', 'sheep and goat', '350', 'instruments', '05:04', 'building sandcastle', 'white, blue, and red', 'riding', 'on bed', 'housecat', 'roman', 'chicken, broccoli, pasta', 'taking stretch', 'spt', 'pillowcase', '617-497-4111', 'burlap', 'a place to stand', 'casino', '1890', 'crochet', 'no ball', 'tusk holes', 'eric berne', 'cake sale', 'chip wagon', 'stay back', '2 brunette', 'near city', 'at beach', 'under mom', 'independent', 'to play', 'boy on right', 'cleanliness', '1st base', 'wwwclaykessackcom', 'uphill', 'apple identification', 'sid

## Dataset Creation

The subset will follow the same structure as the original VQA dataset. This is:

* Answer
    * Question Type
    * Majority Answer
    * Answer Type
    * Answer Candidates
        * Given Answer
        * Confidence
        * Answerer ID
        
        
* Question
    * Question
    * Image ID
   
   
* Images
    * ResNet Image Features (Size: 2048)
    

In order to train your models on your machine with a CPU (or if you have a GPU), we need to reduce the size of the Dataset. We will reduce the original dataset in the following way:
* 20k Q/A of answer type _yes/no_
* 20k Q/A of answer type _number_
* 20k Q/A of answer type _other_

The total number of Q/A will then be 60000. We will divide into training, validation and test split. The ratio between the splits will be approximately: 80%, 15%, 5% respectively.

In [11]:
start_time = time()
idx = list(range(0,len(qdata['questions'])))
random.seed(42)
random.shuffle(idx)

np.random.seed(42)
splits = ['train', 'valid', 'test']

n = 20000
qdata_small = {'questions': list()}
adata_small = {'annotations': list()}
a_type_counts = {'yes/no': 0, 'number': 0, 'other': 0}

while len(qdata_small['questions']) < 3*n:
    i = idx.pop()
    
    at = adata['annotations'][i]['answer_type'] 
    
    if a_type_counts[at] < n:
        
        if at == 'yes/no' and adata['annotations'][i]['multiple_choice_answer'] not in ['yes', 'no']:
            continue
            
        adata_small['annotations'].append(adata['annotations'][i])
        qdata_small['questions'].append(qdata['questions'][i])
        
        split = np.random.choice(splits, p=(.8, .15, .05))
        adata_small['annotations'][-1]['split'] = split
        qdata_small['questions'][-1]['split'] = split
        
        a_type_counts[at] += 1
        
# Tests
assert len(qdata_small['questions']) == len(adata_small['annotations']) == 3*n, "Inconsitent Lengths."
a_type_counts = {'yes/no': 0, 'number': 0, 'other': 0}
for ann in adata_small['annotations']:
    a_type_counts[ann['answer_type']] += 1
assert a_type_counts['yes/no'] == a_type_counts['number'] == a_type_counts['other'] == n, "Inconsistent Answer Type Lengths."

print("Data Creation Looks good! Time Taken %.2f" %(time()-start_time))

Data Creation Looks good! Time Taken 2.33


Let's look at some examples to verify this is the same data. Calculating the statistics again.

#### Annotations Small Dataset

In [12]:
print("# Datapoints: ", len(adata_small['annotations']))
print("\nDatapoint keys: ", adata_small['annotations'][0].keys())
print("\n#1: ", adata_small['annotations'][0])
print("\n#2: ", adata_small['annotations'][1])
print("\n#3: ", adata_small['annotations'][2])

# Datapoints:  60000

Datapoint keys:  dict_keys(['question_type', 'multiple_choice_answer', 'answers', 'image_id', 'answer_type', 'question_id', 'split'])

#1:  {'question_type': 'what', 'multiple_choice_answer': 'tea', 'answers': [{'answer': 'brunch', 'answer_confidence': 'maybe', 'answer_id': 1}, {'answer': 'tea', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'tea time', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'brunch', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'breakfast', 'answer_confidence': 'maybe', 'answer_id': 5}, {'answer': 'tea', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'teatime', 'answer_confidence': 'yes', 'answer_id': 7}, {'answer': 'lunch', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'reception', 'answer_confidence': 'maybe', 'answer_id': 9}, {'answer': 'breakfast', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 228478, 'answer_type': 'other', 'question_id': 228478002, 'split': 'train'}

#2:  

#### Questions Small Dataset

In [13]:
print("# Datapoints: ", len(qdata_small['questions']))
print("\nDatapoint keys: ", qdata_small['questions'][0].keys())
print("\n#1: ", qdata_small['questions'][0])
print("\n#2: ", qdata_small['questions'][1])
print("\n#3: ", qdata_small['questions'][2])

# Datapoints:  60000

Datapoint keys:  dict_keys(['image_id', 'question', 'question_id', 'split'])

#1:  {'image_id': 228478, 'question': 'What English meal is this likely for?', 'question_id': 228478002, 'split': 'train'}

#2:  {'image_id': 540769, 'question': 'Is there a bell on the train?', 'question_id': 540769000, 'split': 'test'}

#3:  {'image_id': 111756, 'question': 'What color is his uniform?', 'question_id': 111756005, 'split': 'train'}


### Dataset Statistics Small Dataset

In [14]:
question_types = set()
multiple_choice_answers = set()
answer2count = defaultdict(int)
answer_types = set()
answertypes2count = defaultdict(int)
top_answers_per_type = defaultdict(lambda: defaultdict(int))
for ann in adata_small['annotations']:
    question_types.add(ann['question_type'])
    
    multiple_choice_answers.add(ann['multiple_choice_answer'])
    
    answer2count[ann['multiple_choice_answer']] += 1
    answer_types.add(ann['answer_type'])
    
    answertypes2count[ann['answer_type']] += 1
    top_answers_per_type[ann['answer_type']][ann['multiple_choice_answer']] += 1

#### Quesiton Types Small Dataset

In [15]:
print("# Unique Question Types: ", len(question_types))
print(question_types)

# Unique Question Types:  65
{'is that a', 'is this', 'does this', 'what color is', 'is', 'is the person', 'what is the name', 'what sport is', 'what does the', 'what animal is', 'how many people are in', 'what number is', 'what is', 'is this an', 'what is in the', 'what color is the', 'was', 'what is the man', 'are these', 'what room is', 'is it', 'is there a', 'what', 'is the man', 'is he', 'what is on the', 'can you', 'what kind of', 'is this a', 'none of the above', 'why is the', 'what brand', 'where is the', 'are there any', 'which', 'what is the color of the', 'what color are the', 'who is', 'what is the woman', 'do', 'what type of', 'are the', 'what are the', 'what color', 'is this person', 'what time', 'are there', 'are', 'what is the person', 'what is this', 'how many people are', 'is the', 'where are the', 'could', 'has', 'how many', 'why', 'is there', 'what are', 'are they', 'do you', 'does the', 'is the woman', 'how', 'what is the'}


#### Answer Types Small Dataset

In [16]:
print("Answer Types: ", answer_types)
print("Answer Type Counts: ", Counter(answertypes2count).most_common())
for t in list(answer_types):
    print("\nType '%s' Top 50 Answers %s" %(t, Counter(top_answers_per_type[t]).most_common(50)))

Answer Types:  {'other', 'number', 'yes/no'}
Answer Type Counts:  [('other', 20000), ('yes/no', 20000), ('number', 20000)]

Type 'other' Top 50 Answers [('white', 823), ('red', 494), ('black', 460), ('blue', 449), ('green', 355), ('brown', 331), ('yellow', 266), ('gray', 190), ('right', 154), ('frisbee', 152), ('nothing', 151), ('left', 144), ('baseball', 134), ('none', 132), ('orange', 130), ('wood', 127), ('tennis', 123), ('pink', 119), ('pizza', 118), ('kitchen', 113), ('bathroom', 106), ('cat', 90), ('water', 86), ('dog', 85), ('skiing', 84), ('grass', 84), ('surfing', 80), ('skateboarding', 78), ('horse', 75), ('black and white', 74), ('kite', 73), ('surfboard', 72), ('silver', 71), ('man', 69), ('living room', 66), ('woman', 65), ('giraffe', 64), ('table', 63), ('wii', 61), ('apple', 58), ('snow', 58), ('phone', 57), ('skateboard', 56), ('hat', 56), ('broccoli', 54), ('snowboarding', 53), ('eating', 53), ('cow', 52), ('standing', 51), ('sunny', 50)]

Type 'number' Top 50 Answers 

#### Answers Small Dataset

In [17]:
print("# Unique Answers: ", len(multiple_choice_answers))
print("\nSome Answers: ", list(np.random.choice(list(multiple_choice_answers), 100)))
print("\nTop 100 Common Answers: ", Counter(answer2count).most_common(100))

# Unique Answers:  5691

Some Answers:  ['38', 'lift', '6 5 4 3', 'happy 50th birthday', 'cutting board', '8 ft', 'cook', 'fresh oil', 'bakery', 'stars and hearts', 'street cleaner', 'ahc 442', 'colorado', 'owner', 'surfing', 'fashion show', 'mile', 'champion', 'headband', 'portable', 'luggage room', 'green and white', '3:10', 'rackets', '10:00 am', 'ducati', 'mocking', 'cemetery', 'grapefruit', 'fire department', 'movement', '2 people', 'hippie drum circle', 'fresh fruit', '7502', 'kite', 'relaxed', 'monday', "o'neill", 'on counter', '100% fatto mano', '365', 'cigar', 'brother', 'bob', '2:28', 'shaggy', 'kitty litter', 'carrot cake', 'horseback riding', 'sandwich and chips', '11:58', 'tennis dress', 'back left', '2 towels', 'hungry', 'behind head', 'james bond', '055', 'crouching', 'one sweet ride', 'fist', 'rainbow', '95', '0870 400 4000', 'boardwalk', '258', '592', '1126', 'bucket in shower', 'overpass', 'old fashioned', 'forsythia', "1940's", 'fast', 'tim hortons', 'jollibee', 'top

## Saving

In [2]:
import gzip

### Splitting

In [19]:
qdata_small_splits = {\
                      'train': {'questions': list()}, 
                      'valid': {'questions': list()}, 
                      'test': {'questions': list()}
                     }

adata_small_splits = {\
                      'train': {'annotations': list()}, 
                      'valid': {'annotations': list()}, 
                      'test': {'annotations': list()}
                     }

for i in range(len(qdata_small['questions'])):
    
    split = qdata_small['questions'][i]['split']
    assert split == adata_small['annotations'][i]['split'], "Inconsistent Splits."
    assert adata_small['annotations'][i]['question_id'] == qdata_small['questions'][i]['question_id'], "Inconsistent IDs."
    
    qdata_small_splits[split]['questions'].append(qdata_small['questions'][i])
    adata_small_splits[split]['annotations'].append(adata_small['annotations'][i])
    
        
print("Training Set Size: %i" %(len(qdata_small_splits['train']['questions'])))
print("\nValidation Set Size: %i" %(len(qdata_small_splits['valid']['questions'])))
print("\nTest Set Size: %i" %(len(qdata_small_splits['test']['questions'])))

Training Set Size: 48061

Validation Set Size: 8977

Test Set Size: 2962


### Write out the files

In [1]:
for split in ['train', 'valid', 'test']:
    
    with gzip.GzipFile('data/vqa_annotatons_' + split + '.gzip', 'w') as file:
        file.write(json.dumps(adata_small_splits[split]).encode('utf-8'))
        
    with gzip.GzipFile('data/vqa_questions_' + split + '.gzip', 'w') as file:
        file.write(json.dumps(qdata_small_splits[split]).encode('utf-8'))

NameError: name 'gzip' is not defined

Get list of all image ids

In [2]:
image_ids = set()
for q in qdata_small['questions']:
    image_ids.add(q['image_id'])

image_ids_json = {'image_ids': list(image_ids)}
with open('data/image_ids_vqa.json', 'w') as file:
with gzip.open('file.txt.gz', 'rb') as f:
    file_content = f.read()
    json.dump(image_ids_json, file)

IndentationError: expected an indented block (<ipython-input-2-ac0637ffca19>, line 7)

In [28]:
#read data
with gzip.open(DATA_PATH + 'vqa_questions_train.gzip', 'rb') as file:
    file_content = file.read().decode('utf-8')
    qdata_train = json.loads(file_content)

with gzip.GzipFile(DATA_PATH + 'vqa_annotatons_train.gzip', 'r') as file:
    adata_train = json.loads(file.read().decode('utf-8')) 
    
with gzip.open(DATA_PATH + 'vqa_questions_test.gzip', 'rb') as file:
    file_content = file.read().decode('utf-8')
    qdata_test = json.loads(file_content)
    
with gzip.GzipFile(DATA_PATH + 'vqa_annotatons_test.gzip', 'r') as file:
    adata_test = json.loads(file.read().decode('utf-8')) 
    


In [29]:
print(qdata_train['questions'][0])
print("#1: ", adata_train['annotations'][0]['question_type'])

question_types = set()
multiple_choice_answers = set()
answer2count = defaultdict(int)
answer_types = set()
answertypes2count = defaultdict(int)
top_answers_per_type = defaultdict(lambda: defaultdict(int))
for ann in adata_train['annotations']:
    question_types.add(ann['question_type'])
    
    multiple_choice_answers.add(ann['multiple_choice_answer'])
    
    answer2count[ann['multiple_choice_answer']] += 1
    answer_types.add(ann['answer_type'])
    
    answertypes2count[ann['answer_type']] += 1
    top_answers_per_type[ann['answer_type']][ann['multiple_choice_answer']] += 1

{'image_id': 228478, 'question': 'What English meal is this likely for?', 'question_id': 228478002, 'split': 'train'}
#1:  what


In [30]:
from collections import defaultdict
import time
import random
import torch
from torch.autograd import Variable
import torch.nn as nn

In [31]:
torch.manual_seed(1)

# Functions to read in the corpus
w2i = defaultdict(lambda: len(w2i))
question_types_dict = defaultdict(lambda: len(question_types_dict))
UNK = w2i["<unk>"]


def question_type_to_idx(question_types):
    for question_type in (question_types):
        question_types_dict[question_type]
    return question_types_dict


def read_data(train_data, train_answer_data):
    for idx in range(len(train_data)):
        question = train_data[idx]['question']
        question = question.lower().split("?",1)[0]
        question_type = train_answer_data[idx]['question_type']
        yield ([w2i[x] for x in question.split(" ")], question_types_dict[question_type])
        
# transering the tag(question type) to index
question_types_dict = question_type_to_idx(question_types)

#get the train questions, and then transfer to index,also add the tag index for every quesiton into the list
train = list(read_data(qdata_train['questions'], adata_train['annotations']))
w2i = defaultdict(lambda: UNK, w2i)
#get the test questions, and then transfer to index,also add the tag index for every quesiton into the list
dev = list(read_data(qdata_test['questions'], adata_test['annotations']))

nwords = len(w2i)
ntags = len(question_types_dict)



In [32]:
if not GENSIM:
    # The parameters for our BoW-model
    dtype = torch.FloatTensor  # enable CUDA here if you like
    w = Variable(torch.randn(nwords, ntags).type(dtype), requires_grad=True)
    b = Variable(torch.randn(ntags).type(dtype), requires_grad=True)


    # A function to calculate scores for one sentence
    def calc_scores(words):
        lookup_tensor = Variable(torch.LongTensor(words))
        embed = w[lookup_tensor]
        score = torch.sum(embed, 0) + b
        return score.view((1, -1))

    for ITER in range(10):

        # train
        random.shuffle(train)
        train_loss = 0.0
        start = time.time()
        for words, tag in train:

            # forward pass
            scores = calc_scores(words)
            target = Variable(torch.LongTensor([tag]))        
            loss = nn.CrossEntropyLoss()
            output = loss(scores, target)
            train_loss += output.data[0]        

            # backward pass (compute gradients)
            output.backward()

            # update weights with SGD
            lr = 0.01
            w.data -= lr * w.grad.data
            b.data -= lr * b.grad.data

            # clear gradients for next step
            w.grad.data.zero_()
            b.grad.data.zero_()

        print("iter %r: train loss/sent=%.4f, time=%.2fs" % 
              (ITER, train_loss/len(train), time.time()-start))

        # evaluate
        correct = 0.0
        for words, tag in dev:
            scores = calc_scores(words)
            #print(scores)
            predict = scores.data.numpy().argmax(axis=1)
            if predict == tag:
                correct += 1

        print("iter %r: test acc=%.4f" % 
              (ITER, correct/len(dev)))

In [33]:
print(question_types_dict)

defaultdict(<function <lambda> at 0x7fa4315386a8>, {'is it': 0, 'is the man': 1, 'is he': 2, 'what is': 3, 'is that a': 4, 'what animal is': 5, 'is the woman': 6, 'what brand': 7, 'is this a': 8, 'is there': 9, 'are the': 10, 'is this person': 11, 'what': 12, 'what is the woman': 13, 'what color are the': 14, 'what kind of': 15, 'do you': 16, 'what is on the': 17, 'do': 18, 'is the person': 19, 'what color is': 20, 'is there a': 21, 'was': 22, 'what is the name': 23, 'why': 24, 'who is': 25, 'is this an': 26, 'does this': 27, 'has': 28, 'how': 29, 'what are': 30, 'what is the color of the': 31, 'what is the person': 32, 'is the': 33, 'can you': 34, 'what is this': 35, 'is': 36, 'are they': 37, 'none of the above': 38, 'is this': 39, 'are there any': 40, 'what sport is': 41, 'what time': 42, 'are there': 43, 'what is the man': 44, 'could': 45, 'what are the': 46, 'what does the': 47, 'how many': 48, 'are these': 49, 'what is in the': 50, 'why is the': 51, 'what color': 52, 'how many peo

In [34]:
if not GENSIM:
    # evaluate
    i2w_dict = {}
    def i2w():
        for k, v in w2i.items():
            i2w_dict[v] = k

    def sent2i(sent):
        sent = sent.lower().split("?",1)[0]
        yield ([w2i[x] for x in sent.split(" ")])

    sent = 'how are you ?'
    test_sent = list(sent2i(sent))[0]
    print(test_sent)
    scores = calc_scores(test_sent)
    #print(scores)
    print(type(scores))
    predict = scores.data.numpy().argmax(axis=1)
    scores_numpy = scores.data.numpy()
    print(scores_numpy.shape)
    print(predict)


In [35]:
#read from image
path_to_h5_file = DATA_PATH + 'VQA_image_features.h5'
path_to_json_file = DATA_PATH + 'VQA_img_features2id.json'


img_features = np.asarray(h5py.File(path_to_h5_file, 'r')['img_features'])
with open(path_to_json_file, 'r') as f:
     visual_feat_mapping = json.load(f)['VQA_imgid2id']


h5_id = visual_feat_mapping[str(228478)]
img_feat = img_features[h5_id]

In [36]:
print(img_feat.shape)
print(img_features.shape)

(2048,)
(39423, 2048)


In [74]:
#if not GENSIM:
#softmax to cover the answer
ans_soft_max = []
for t in list(answer_types):
    print(t)
    top_answers = Counter(top_answers_per_type[t])
    #trying to covering at most 80% possible answers
    if (t == 'other'):
        print('123')
        covering_answer_index = 800#round((len(top_answers)*4/5))
    else :
        covering_answer_index = 100
    covering_answer_index = len(top_answers)
    for x in top_answers.most_common(covering_answer_index):
        ans_soft_max.append(x[0])
    #print(ans_soft_max)
    #print('--------------------------')


print(len(ans_soft_max))
print(len(Counter((ans_soft_max))))

number
other
123
yes/no
4949
4878


In [75]:
#if not GENSIM:
def answer_to_idx(ans_dict):
    ans_to_idx = defaultdict(lambda: len(ans_to_idx))
    for ans in (ans_dict):
        ans_to_idx[ans]
    return ans_to_idx

#ans_to_idx_dict is a dictionary convering all 80% possible answers, which are representing in index
ans_to_idx_dict = dict(answer_to_idx(ans_soft_max))
#print('example: answer: tea to index', ans_to_idx_dict['teaaaaaaaaaaaaa!!!!!!!!!!!!'])

#print('1' in ans_to_idx_dict)
#network parameter
print(len(Counter(ans_soft_max)))
n_output = len(ans_to_idx_dict)
n_input =  (img_features.shape[1]) + 65
n_hidden_size = 8
learning_rate = 0.005
input_size = len(adata_train['annotations'])

print('input_size', input_size)
print('output len: ', n_output)
print('input len ', n_input)
print(len(Counter(ans_soft_max)))
if 'computer repair' in ans_to_idx_dict:
    print('123')

4878
input_size 48061
output len:  4878
input len  2113
4878
123


In [76]:

#build network
'''
class Net(nn.Module):
    
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        if hidden_size == 0:
            self.fc1 = nn.Linear(input_size, num_classes)
            self.sigmoid = nn.Sigmoid()
            self.hidden_size = hidden_size
        else:
            self.fc1 = nn.Linear(input_size, hidden_size)
            #self.sigmoid = nn.Sigmoid()
            self.fc2 = nn.Linear(hidden_size, num_classes)
            #self.sigmoid = nn.Sigmoid()
        
    
    def forward(self, x):
        hidden_size = 10
        if hidden_size == 0:
            out = self.fc1(x)
            #out = self.sigmoid(out)
        else:
            out = self.fc1(x)
            out = nn.functional.relu(out)
            #out = self.sigmoid(out)
            out = self.fc2(out)
            #out = self.sigmoid(out)
        return out
'''

class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.out = torch.nn.Linear(n_hidden, n_output)   # output layer

    def forward(self, x):
        x = nn.functional.relu(self.hidden(x))      # activation function for hidden layer
        x = self.out(x)
        return x

In [None]:
model = Net(2348, n_hidden_size, n_output)

#optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
#loss_fn = torch.nn.MSELoss(size_average=False)
#loss_fn = torch.nn.MultiLabelSoftMarginLoss()


loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

temp_x = Variable()
temp_y = Variable()

for ITER in range(10000):
    
    train_loss = 0.0
    #loss = []
    #print('start!')
    for idx, adata in enumerate(adata_train['annotations'][0:100]):
        #preparing input
        #if idx % 100 == 0:
        #    print(idx)
        #get work vecotr
        question = qdata_train['questions'][idx]['question']
        #print(question)
        
        #question_word_vector = calc_scores(list(sent2i(question))[0]).data.numpy().reshape(65,)
        
        question_word_vector = None
        question_split = question.split(' ')
        for question_word in question_split:            
            if question_word not in w2v_model:                                
                question_word = question_word[:-1]
                if question_word not in w2v_model:
                    #print(question_word)
                    continue
            if question_word_vector is None:
                question_word_vector = np.array(w2v_model[question_word])
            else:
                question_word_vector += np.array(w2v_model[question_word])
            #print(question_word_vector.shape)
        
        #input('---------------')
        #get image vector
        h5_id = visual_feat_mapping[str(adata['image_id'])]
        img_feat = img_features[h5_id]
        #concatenate word vecotr and image vector
        
        img_word_vector = np.concatenate((question_word_vector, img_feat), axis=0)
        #print(img_word_vector.shape)
        if not(adata['multiple_choice_answer'] in ans_to_idx_dict):
            continue
        #output_vector = np.zeros((1,1))
        #output_vector[ans_to_idx_dict[adata['multiple_choice_answer']]] = 1
        output_vector = np.array([ans_to_idx_dict[adata['multiple_choice_answer']]])#.reshape(1,1)
        x = Variable(torch.from_numpy(img_word_vector).type(torch.FloatTensor))#.cuda()
        y = Variable(torch.from_numpy(output_vector))#.cuda()
        
        
        y_pred = model(x).view(1,n_output)
        if (ITER  % 100 == 0) or (ITER == 999):
            print(question,' image id:', (adata['image_id']))
            print(ans_soft_max[y_pred.data.numpy().argmax()], adata['multiple_choice_answer'])
        loss = loss_fn(y_pred, y)
            
        train_loss += loss.data[0]
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()   
        
        #optimizer.zero_grad()
        #loss.backward()
        #optimizer.step()       
        #loss.append()
    if (ITER  % 100 == 0):
        print('{:>5}'.format(ITER),' loss: ', train_loss)


    

What English meal is this likely for?  image id: 228478
passenger tea
What color is his uniform?  image id: 111756
kids blue
Which girl is wearing glasses?  image id: 376241
kids right
What is the person doing?  image id: 434045
marlins sunbathing
How does the weather appear in this photo?  image id: 167330
kids sunny
What kind of facility are the people standing in?  image id: 158225
food and drinks greenhouse
What shape is this?  image id: 113236
kids octagon
What color is the Frisbee in the man's hand?  image id: 277284
horse and carriage red
What is this person riding?  image id: 496740
kids motorcycle
What color are the frames of the glasses?  image id: 143959
kids brown
What is the dog looking out of?  image id: 482789
horse and carriage window
How many people in the shot?  image id: 62707
1250 12
What is this animal?  image id: 397976
white one giraffe
What is lined up on the counter behind the man?  image id: 328427
1250 wine bottles
What type of food is the man eating?  image 

Is this a propeller or jet plane?  image id: 578232
red and white jet
What is the woman cooking?  image id: 185777
us airways express souffle
How many trees can be seen?  image id: 141414
yes 8
Is the giraffe drinking water or eating?  image id: 101418
real neither
What animal is the tallest?  image id: 233566
living room giraffe
What type of room is this?  image id: 421661
2 bathroom
How many keyboards are shown?  image id: 369712
brown 1
What is the plane doing?  image id: 366770
right nothing
Is the tap running?  image id: 300470
no no
Is this variation one that would be considered a meat-lovers variation?  image id: 534900
no no
Do the elephants know the man is there?  image id: 485769
no no
How deep is the water the birds are in?  image id: 489942
1 shallow
What color is his bat?  image id: 418557
brown brown
Do both clock have identical times?  image id: 297415
yes yes
How many handicap parking spaces are visible?  image id: 111955
rackets 0
What is the fence made out of?  image 

How many keyboards are shown?  image id: 369712
1 1
What is the plane doing?  image id: 366770
right nothing
Is the tap running?  image id: 300470
no no
Is this variation one that would be considered a meat-lovers variation?  image id: 534900
no no
Do the elephants know the man is there?  image id: 485769
no no
How deep is the water the birds are in?  image id: 489942
green shallow
What color is his bat?  image id: 418557
brown brown
Do both clock have identical times?  image id: 297415
yes yes
How many handicap parking spaces are visible?  image id: 111955
brown 0
What is the fence made out of?  image id: 268185
trees metal
Is this chocolate cake?  image id: 186265
no no
Has she hit the ball yet?  image id: 318310
yes yes
Is this a room in a house?  image id: 433296
yes yes
Do the horses have red ears?  image id: 38097
no no
Are there clouds up ahead?  image id: 169360
no no
Is the man wearing glasses?  image id: 84352
yes yes
Is that at a park?  image id: 294758
yes yes
What sport is

Is there a foreign language written on the picture?  image id: 331524
yes yes
Is this a propeller or jet plane?  image id: 578232
straw jet
What is the woman cooking?  image id: 185777
us airways express souffle
How many trees can be seen?  image id: 141414
teddy bear 8
Is the giraffe drinking water or eating?  image id: 101418
real neither
What animal is the tallest?  image id: 233566
living room giraffe
What type of room is this?  image id: 421661
kitchen bathroom
How many keyboards are shown?  image id: 369712
1 1
What is the plane doing?  image id: 366770
right nothing
Is the tap running?  image id: 300470
no no
Is this variation one that would be considered a meat-lovers variation?  image id: 534900
no no
Do the elephants know the man is there?  image id: 485769
no no
How deep is the water the birds are in?  image id: 489942
shallow shallow
What color is his bat?  image id: 418557
brown brown
Do both clock have identical times?  image id: 297415
yes yes
How many handicap parking s

How many trees can be seen?  image id: 141414
8 8
Is the giraffe drinking water or eating?  image id: 101418
real neither
What animal is the tallest?  image id: 233566
living room giraffe
What type of room is this?  image id: 421661
kitchen bathroom
How many keyboards are shown?  image id: 369712
1 1
What is the plane doing?  image id: 366770
right nothing
Is the tap running?  image id: 300470
no no
Is this variation one that would be considered a meat-lovers variation?  image id: 534900
no no
Do the elephants know the man is there?  image id: 485769
no no
How deep is the water the birds are in?  image id: 489942
shallow shallow
What color is his bat?  image id: 418557
brown brown
Do both clock have identical times?  image id: 297415
yes yes
How many handicap parking spaces are visible?  image id: 111955
0 0
What is the fence made out of?  image id: 268185
walking metal
Is this chocolate cake?  image id: 186265
no no
Has she hit the ball yet?  image id: 318310
yes yes
Is this a room in

Who is the child dressed as?  image id: 277064
prom buzz lightyear
Is this a dual monitor?  image id: 187379
yes yes
Are there any flower pots on the ground?  image id: 247177
no no
How many kids are there?  image id: 460885
2 2
Is this an old western photo?  image id: 222361
no no
Is there a foreign language written on the picture?  image id: 331524
yes yes
Is this a propeller or jet plane?  image id: 578232
straw jet
What is the woman cooking?  image id: 185777
us airways express souffle
How many trees can be seen?  image id: 141414
8 8
Is the giraffe drinking water or eating?  image id: 101418
real neither
What animal is the tallest?  image id: 233566
living room giraffe
What type of room is this?  image id: 421661
kitchen bathroom
How many keyboards are shown?  image id: 369712
1 1
What is the plane doing?  image id: 366770
right nothing
Is the tap running?  image id: 300470
no no
Is this variation one that would be considered a meat-lovers variation?  image id: 534900
no no
Do the

Is the bus parked?  image id: 160559
no no
Of what airline is the closest plane in the background?  image id: 503028
verizon world
What season was this photo likely taken in?  image id: 387379
trees winter
Can you see the desktop of the computer?  image id: 369630
yes yes
Is there a stop sign?  image id: 475656
yes yes
Is the plane taking off?  image id: 347671
yes yes
What kind of creature is on the right?  image id: 32738
bathroom cat
Are the giraffes in the wild?  image id: 66987
no no
What is the boy reaching for?  image id: 984
flowers banana
What is the material on the ground?  image id: 116851
red and white brick
Is this a bat or golf club?  image id: 124578
real neither
Is the room busy?  image id: 416878
no no
Is there carrots on the plate?  image id: 369071
yes yes
Is there anything in this picture than can transfer data to another computer?  image id: 324977
yes yes
Is there a sandy beach in the horizon?  image id: 467154
no no
Are these types of planes currently used?  imag

Who is the child dressed as?  image id: 277064
prom buzz lightyear
Is this a dual monitor?  image id: 187379
yes yes
Are there any flower pots on the ground?  image id: 247177
no no
How many kids are there?  image id: 460885
2 2
Is this an old western photo?  image id: 222361
no no
Is there a foreign language written on the picture?  image id: 331524
yes yes
Is this a propeller or jet plane?  image id: 578232
straw jet
What is the woman cooking?  image id: 185777
us airways express souffle
How many trees can be seen?  image id: 141414
8 8
Is the giraffe drinking water or eating?  image id: 101418
real neither
What animal is the tallest?  image id: 233566
living room giraffe
What type of room is this?  image id: 421661
kitchen bathroom
How many keyboards are shown?  image id: 369712
1 1
What is the plane doing?  image id: 366770
right nothing
Is the tap running?  image id: 300470
no no
Is this variation one that would be considered a meat-lovers variation?  image id: 534900
no no
Do the

What English meal is this likely for?  image id: 228478
box tea
What color is his uniform?  image id: 111756
blue blue
Which girl is wearing glasses?  image id: 376241
right right
What is the person doing?  image id: 434045
ox sunbathing
How does the weather appear in this photo?  image id: 167330
food sunny
What kind of facility are the people standing in?  image id: 158225
sailing greenhouse
What shape is this?  image id: 113236
door octagon
What color is the Frisbee in the man's hand?  image id: 277284
red red
What is this person riding?  image id: 496740
cake motorcycle
What color are the frames of the glasses?  image id: 143959
brown brown
What is the dog looking out of?  image id: 482789
teddy bear window
How many people in the shot?  image id: 62707
12 12
What is this animal?  image id: 397976
living room giraffe
What is lined up on the counter behind the man?  image id: 328427
tuna wine bottles
What type of food is the man eating?  image id: 482913
orange pizza
Is there more me

How many trees can be seen?  image id: 141414
8 8
Is the giraffe drinking water or eating?  image id: 101418
real neither
What animal is the tallest?  image id: 233566
living room giraffe
What type of room is this?  image id: 421661
kitchen bathroom
How many keyboards are shown?  image id: 369712
1 1
What is the plane doing?  image id: 366770
right nothing
Is the tap running?  image id: 300470
no no
Is this variation one that would be considered a meat-lovers variation?  image id: 534900
no no
Do the elephants know the man is there?  image id: 485769
no no
How deep is the water the birds are in?  image id: 489942
shallow shallow
What color is his bat?  image id: 418557
brown brown
Do both clock have identical times?  image id: 297415
yes yes
How many handicap parking spaces are visible?  image id: 111955
0 0
What is the fence made out of?  image id: 268185
walking metal
Is this chocolate cake?  image id: 186265
no no
Has she hit the ball yet?  image id: 318310
yes yes
Is this a room in

Is the giraffe drinking water or eating?  image id: 101418
real neither
What animal is the tallest?  image id: 233566
living room giraffe
What type of room is this?  image id: 421661
kitchen bathroom
How many keyboards are shown?  image id: 369712
1 1
What is the plane doing?  image id: 366770
right nothing
Is the tap running?  image id: 300470
no no
Is this variation one that would be considered a meat-lovers variation?  image id: 534900
no no
Do the elephants know the man is there?  image id: 485769
no no
How deep is the water the birds are in?  image id: 489942
shallow shallow
What color is his bat?  image id: 418557
brown brown
Do both clock have identical times?  image id: 297415
yes yes
How many handicap parking spaces are visible?  image id: 111955
0 0
What is the fence made out of?  image id: 268185
walking metal
Is this chocolate cake?  image id: 186265
no no
Has she hit the ball yet?  image id: 318310
yes yes
Is this a room in a house?  image id: 433296
yes yes
Do the horses 

Is this a propeller or jet plane?  image id: 578232
straw jet
What is the woman cooking?  image id: 185777
us airways express souffle
How many trees can be seen?  image id: 141414
8 8
Is the giraffe drinking water or eating?  image id: 101418
real neither
What animal is the tallest?  image id: 233566
living room giraffe
What type of room is this?  image id: 421661
kitchen bathroom
How many keyboards are shown?  image id: 369712
1 1
What is the plane doing?  image id: 366770
right nothing
Is the tap running?  image id: 300470
no no
Is this variation one that would be considered a meat-lovers variation?  image id: 534900
no no
Do the elephants know the man is there?  image id: 485769
no no
How deep is the water the birds are in?  image id: 489942
shallow shallow
What color is his bat?  image id: 418557
brown brown
Do both clock have identical times?  image id: 297415
yes yes
How many handicap parking spaces are visible?  image id: 111955
0 0
What is the fence made out of?  image id: 2681

How many kids are there?  image id: 460885
2 2
Is this an old western photo?  image id: 222361
no no
Is there a foreign language written on the picture?  image id: 331524
yes yes
Is this a propeller or jet plane?  image id: 578232
straw jet
What is the woman cooking?  image id: 185777
us airways express souffle
How many trees can be seen?  image id: 141414
8 8
Is the giraffe drinking water or eating?  image id: 101418
real neither
What animal is the tallest?  image id: 233566
living room giraffe
What type of room is this?  image id: 421661
kitchen bathroom
How many keyboards are shown?  image id: 369712
1 1
What is the plane doing?  image id: 366770
right nothing
Is the tap running?  image id: 300470
no no
Is this variation one that would be considered a meat-lovers variation?  image id: 534900
no no
Do the elephants know the man is there?  image id: 485769
no no
How deep is the water the birds are in?  image id: 489942
shallow shallow
What color is his bat?  image id: 418557
brown bro

What English meal is this likely for?  image id: 228478
box tea
What color is his uniform?  image id: 111756
blue blue
Which girl is wearing glasses?  image id: 376241
right right
What is the person doing?  image id: 434045
ox sunbathing
How does the weather appear in this photo?  image id: 167330
food sunny
What kind of facility are the people standing in?  image id: 158225
sailing greenhouse
What shape is this?  image id: 113236
door octagon
What color is the Frisbee in the man's hand?  image id: 277284
red red
What is this person riding?  image id: 496740
cake motorcycle
What color are the frames of the glasses?  image id: 143959
brown brown
What is the dog looking out of?  image id: 482789
teddy bear window
How many people in the shot?  image id: 62707
12 12
What is this animal?  image id: 397976
living room giraffe
What is lined up on the counter behind the man?  image id: 328427
tuna wine bottles
What type of food is the man eating?  image id: 482913
orange pizza
Is there more me

Is this Fox Theater?  image id: 80883
no no
What color is the bench?  image id: 18555
green green
What color is the roof?  image id: 273840
blue blue
Why are the sheep in a graveyard??  image id: 83587
roses grazing
What is the child  sitting on?  image id: 436649
rackets coffee maker
Is this woman riding?  image id: 445091
no no
Is that a baby giraffe?  image id: 356201
yes yes
How many women compared to men are playing the game?  image id: 320432
2 2
Is anybody on their bikes?  image id: 421684
no no
Is this an overcast day?  image id: 107959
yes yes
Is there snow on the trees?  image id: 400094
no no
What animal is in front of the camera?  image id: 101094
eating elephant
What utensils are shown here?  image id: 33793
car fork
What color is her snowboard?  image id: 119157
blue blue
Is this a dog?  image id: 29091
no no
What color is the table?  image id: 273979
brown brown
Who is the child dressed as?  image id: 277064
prom buzz lightyear
Is this a dual monitor?  image id: 187379
y

In [None]:
idx = 5
question = qdata_train['questions'][idx]['question']
img_id =  qdata_train['questions'][idx]['image_id']
print(question)


with open(DATA_PATH + 'imgid2imginfo.json', 'r') as file:
    imgid2info = json.load(file)


from IPython.display import Image
from IPython.core.display import HTML 



#question_word_vector = calc_scores(list(sent2i(question))[0]).data.numpy().reshape(65,)

question_word_vector = None
question_split = question.split(' ')
for question_word in question_split:            
    if question_word not in w2v_model:                                
        question_word = question_word[:-1]
        if question_word not in w2v_model:
            #print(question_word)
            continue
    if question_word_vector is None:
        question_word_vector = np.array(w2v_model[question_word])
    else:
        question_word_vector += np.array(w2v_model[question_word])

        #get image vector
h5_id = visual_feat_mapping[str(adata_train['annotations'][idx]['image_id'])]
img_feat = img_features[h5_id]
        #concatenate word vecotr and image vector
img_word_vector = np.concatenate((question_word_vector, img_feat), axis=0)
x = Variable(torch.from_numpy(img_word_vector).type(torch.FloatTensor))#.cuda()

predict_y = model(x)
print(ans_soft_max[predict_y.data.numpy().argmax()])
print(predict_y.max())

Image(url= imgid2info[str(img_id)]['coco_url'])