# VQA Data Generation

## Introduction

The code in this file comes from: https://github.com/cvlab-tohoku/Dense-CoAttention-Network
Before running this notebook, please run preprocess/preprocess.ipynb to generate the following files:
- mscoco_train.json
- mscoco_val.json
- mscoco_trainval.json
- mscoco_testdev.json
- mscoco_test.json

In [17]:
import sys
sys.path.append("./")
from data_util import *
import json
import os
import pickle

## Data Map

In [18]:
# Root directory that holds the complementary-pair annotations and the GloVe
# file. It defaults to the location used when this notebook was written; set
# the VQA_DATA_ROOT environment variable to run on another machine without
# editing the notebook.
vqa_root = os.environ.get("VQA_DATA_ROOT", "/Users/david/desktop/CV_research/VQA")

# Split name -> JSON file. The five "mscoco_*" entries are bare file names that
# later cells join onto `data_path`; the two "*_comp_path" entries are full
# paths and are opened as-is.
data_map_vqa = {
    "train": "mscoco_train.json",
    "val": "mscoco_val.json",
    "trainval": "mscoco_trainval.json",
    "testdev": "mscoco_testdev.json",
    "test": "mscoco_test.json",
    "train_comp_path": vqa_root + "/pairs/v2_mscoco_train2014_complementary_pairs.json",
    "val_comp_path": vqa_root + "/pairs/v2_mscoco_val2014_complementary_pairs.json",
}

# Directory containing the mscoco_*.json files listed above.
data_path = './dataset'
# The values below are forwarded unchanged to process_dataset (from data_util).
# NOTE(review): presumably num_occurs is a minimum answer frequency and
# max_ques / max_ans are length/count caps -- confirm against data_util.
num_occurs = 8
glove_path = vqa_root + "/dataset/glove_840B.pt"
max_ques = 14
max_ans = None

## Preprocess Train and Val Datasets

In [3]:
# Build the answer/word vocabularies from the train split, then push the val
# split through the same filtering and encoding steps using those vocabularies.
print("Process train dataset...")
# `with` closes each JSON file as soon as it has been parsed instead of
# leaving the handle open for the lifetime of the kernel.
with open(os.path.join(data_path, data_map_vqa["train"]), "r") as file:
    train_set = json.load(file)
# NOTE(review): comp_train / comp_val are loaded but not referenced by any
# other cell visible in this notebook.
with open(data_map_vqa["train_comp_path"], "r") as file:
    comp_train = json.load(file)

ans2idx, idx2ans, word2idx, idx2word, train_set, max_len_ques, poss_answers = process_dataset(
    train_set, num_occurs, glove_path, max_ques, max_ans)

print("Process val dataset...")
with open(os.path.join(data_path, data_map_vqa["val"]), "r") as file:
    val_set = json.load(file)
with open(data_map_vqa["val_comp_path"], "r") as file:
    comp_val = json.load(file)

# The val split reuses ans2idx / word2idx produced from the train split above.
val_set = filter_answers(val_set, ans2idx)
val_set = process_text(val_set)
val_set = filter_unk_word(val_set, word2idx)
val_set = encode_ans(val_set, ans2idx)

Process train dataset...
Number of unique answers: 22531
Total number of answers: 443757
Top 2412 answers account for 92.117307%
Sample frequent answers:
('net', 33)
('pitcher', 92)
('orange', 1425)
('yes', 84978)
('white', 8916)
('skiing', 866)
('red', 5201)
('frisbee', 1641)
('brushing teeth', 124)
('no', 82516)
('black and white', 766)
('skateboard', 701)
('1', 12540)
('blue', 5455)
('green', 3750)
('motorcycle', 490)
('gray', 2113)
('2', 12215)
('purse', 84)
('skis', 292)
Tokenizing questions and answers...
[['what', 'is', 'this', 'photo', 'taken', 'looking', 'through', '?']]
[[(['net'], 1)]]
[['what', 'position', 'is', 'this', 'man', 'playing', '?']]
[[(['pitcher'], 1), (['catcher'], 0.3)]]
[['what', 'color', 'is', 'the', 'players', 'shirt', '?']]
[[(['orange'], 1)]]
[['is', 'this', 'man', 'a', 'professional', 'baseball', 'player', '?']]
[[(['yes'], 1), (['no'], 0.3)]]
[['what', 'color', 'is', 'the', 'snow', '?']]
[[(['white'], 1)]]
gloves type: <class 'dict'>9.15% done)	
Most fre

In [9]:
# Regroup the processed train samples by their `id` field and pickle the result.
dataset = train_set

# Create the output directory up front: without it the open() below raises
# FileNotFoundError, and only after the whole loop has already run.
os.makedirs("./VQA_pickles", exist_ok=True)

# str(sample id) -> list of {question id: {"question", "answer", "answer_id"}}
# NOTE(review): `id` is presumably the image id -- confirm against preprocess.ipynb.
train_dict = dict()
for data in dataset:
    ques_dict = {
        data['ques_id'][0]: {
            # Keep at most 14 tokens (same value as max_ques in the config cell).
            "question": " ".join(data["processed_ques"][0][0:14]),
            "answer": data["processed_ans"][0],
            "answer_id": data['ans_id'][0],
        }
    }
    # setdefault covers both the "first question for this id" and the
    # "append to existing list" cases in one step.
    train_dict.setdefault(str(data['id']), []).append(ques_dict)

with open("./VQA_pickles/train.pickle", "wb") as file:
    pickle.dump(train_dict, file)

In [10]:
# Regroup the processed val samples by their `id` field and pickle the result.
dataset = val_set

# Create the output directory up front: without it the open() below raises
# FileNotFoundError, and only after the whole loop has already run.
os.makedirs("./VQA_pickles", exist_ok=True)

# str(sample id) -> list of {question id: {"question", "answer", "answer_id"}}
# NOTE(review): `id` is presumably the image id -- confirm against preprocess.ipynb.
val_dict = dict()
for data in dataset:
    ques_dict = {
        data['ques_id'][0]: {
            # Keep at most 14 tokens (same value as max_ques in the config cell).
            "question": " ".join(data["processed_ques"][0][0:14]),
            "answer": data["processed_ans"][0],
            "answer_id": data['ans_id'][0],
        }
    }
    # setdefault covers both the "first question for this id" and the
    # "append to existing list" cases in one step.
    val_dict.setdefault(str(data['id']), []).append(ques_dict)

with open("./VQA_pickles/val.pickle", "wb") as file:
    pickle.dump(val_dict, file)

## Preprocess Trainval 

In [19]:
# Build vocabularies from the combined train+val split.
# Note: this rebinds ans2idx / idx2ans / word2idx / idx2word / max_len_ques /
# poss_answers, which the train-only cell also assigns; later cells that read
# these names see whichever of the two cells ran most recently.
print("Process trainval dataset...")
# `with` closes each JSON file as soon as it has been parsed instead of
# leaving the handle open for the lifetime of the kernel.
with open(os.path.join(data_path, data_map_vqa["trainval"]), "r") as file:
    trainval_set = json.load(file)
# Complementary pairs for trainval = train pairs followed by val pairs.
# NOTE(review): comp_trainval is not referenced by any other cell visible here.
with open(data_map_vqa["train_comp_path"], "r") as file:
    comp_trainval = json.load(file)
with open(data_map_vqa["val_comp_path"], "r") as file:
    comp_trainval.extend(json.load(file))

ans2idx, idx2ans, word2idx, idx2word, trainval_set, max_len_ques, poss_answers = \
    process_dataset(trainval_set, num_occurs, glove_path, max_ques, max_ans)

Process trainval dataset...
Number of unique answers: 29332
Total number of answers: 658111
Top 3133 answers account for 93.132921%
Sample frequent answers:
('net', 51)
('pitcher', 123)
('orange', 2211)
('yes', 125706)
('white', 13227)
('skiing', 1265)
('red', 7684)
('frisbee', 2397)
('brushing teeth', 170)
('no', 122598)
('black and white', 1220)
('skateboard', 1005)
('1', 18633)
('blue', 8188)
('green', 5736)
('motorcycle', 742)
('gray', 3213)
('2', 17960)
('purse', 117)
('skis', 410)
Tokenizing questions and answers...
[['what', 'is', 'this', 'photo', 'taken', 'looking', 'through', '?']]
[[(['net'], 1)]]
[['what', 'position', 'is', 'this', 'man', 'playing', '?']]
[[(['pitcher'], 1), (['catcher'], 0.3)]]
[['what', 'color', 'is', 'the', 'players', 'shirt', '?']]
[[(['orange'], 1)]]
[['is', 'this', 'man', 'a', 'professional', 'baseball', 'player', '?']]
[[(['yes'], 1), (['no'], 0.3)]]
[['what', 'color', 'is', 'the', 'snow', '?']]
[[(['white'], 1)]]
gloves type: <class 'dict'>8.77% done

In [20]:
# Regroup the processed trainval samples by their `id` field, then pickle both
# the grouped samples and the current idx2ans mapping.
dataset = trainval_set

# Create the output directory up front: without it the open() calls below raise
# FileNotFoundError, and only after the whole loop has already run.
os.makedirs("./VQA_pickles", exist_ok=True)

# str(sample id) -> list of {question id: {"question", "answer", "answer_id"}}
# NOTE(review): `id` is presumably the image id -- confirm against preprocess.ipynb.
trainval_dict = dict()
for data in dataset:
    ques_dict = {
        data['ques_id'][0]: {
            # Keep at most 14 tokens (same value as max_ques in the config cell).
            "question": " ".join(data["processed_ques"][0][0:14]),
            "answer": data["processed_ans"][0],
            "answer_id": data['ans_id'][0],
        }
    }
    # setdefault covers both the "first question for this id" and the
    # "append to existing list" cases in one step.
    trainval_dict.setdefault(str(data['id']), []).append(ques_dict)

with open("./VQA_pickles/trainval.pickle", "wb") as file:
    pickle.dump(trainval_dict, file)

# idx2ans is whatever the most recently executed process_dataset call returned.
with open("./VQA_pickles/idx2ans.pickle", "wb") as file:
    pickle.dump(idx2ans, file)

## Preprocess test

In [14]:
# Tokenize the two answer-less test splits and filter unknown words using
# word2idx (as bound by whichever vocabulary-building cell ran most recently).
print("Process testdev dataset...")
# `with` closes each JSON file as soon as it has been parsed instead of
# leaving the handle open for the lifetime of the kernel.
with open(os.path.join(data_path, data_map_vqa["testdev"]), "r") as file:
    testdev_set = json.load(file)

# without_ans=True: these splits are processed without answer fields.
testdev_set = process_text(testdev_set, without_ans=True)
testdev_set = filter_unk_word(testdev_set, word2idx, without_ans=True)

print("Process test dataset...")
with open(os.path.join(data_path, data_map_vqa["test"]), "r") as file:
    test_set = json.load(file)

test_set = process_text(test_set, without_ans=True)
test_set = filter_unk_word(test_set, word2idx, without_ans=True)

Process testdev dataset...
Tokenizing questions and answers...
[['what', 'credit', 'card', 'company', 'is', 'on', 'the', 'banner', 'in', 'the', 'background', '?']]
[['is', 'the', 'pitcher', 'wearing', 'a', 'hat', '?']]
[['is', 'the', 'ball', 'flying', 'towards', 'the', 'batter', '?']]
[['are', 'the', 'horses', 'playing', 'a', 'game', '?']]
[['what', 'is', 'the', 'color', 'of', 'water', 'in', 'the', 'image', '?']]
Process test dataset...4 (93.12% done)	
Tokenizing questions and answers...
[['is', 'the', 'ball', 'flying', 'towards', 'the', 'batter', '?']]
[['what', 'sport', 'is', 'this', '?']]
[['can', 'you', 'see', 'the', 'ball', '?']]
[['is', 'the', 'pitcher', 'wearing', 'a', 'hat', '?']]
[['will', 'he', 'catch', 'the', 'ball', 'in', 'time', '?']]
processing 440000/447793 (98.26% done)	

In [15]:
# Regroup the processed test-dev samples by their `id` field and pickle the
# result. Only the question text is stored for this split.
test_dev_data = testdev_set

# Create the output directory up front: without it the open() below raises
# FileNotFoundError, and only after the whole loop has already run.
os.makedirs("./VQA_pickles", exist_ok=True)

# str(sample id) -> list of {question id: {"question": text}}
# NOTE(review): `id` is presumably the image id -- confirm against preprocess.ipynb.
test_dev_dict = dict()
for data in test_dev_data:
    ques_dict = {
        # Keep at most 14 tokens (same value as max_ques in the config cell).
        data['ques_id'][0]: {"question": " ".join(data["processed_ques"][0][0:14])}
    }
    # setdefault covers both the "first question for this id" and the
    # "append to existing list" cases in one step.
    test_dev_dict.setdefault(str(data['id']), []).append(ques_dict)

with open("./VQA_pickles/test_dev.pickle", "wb") as file:
    pickle.dump(test_dev_dict, file)

In [16]:
# Regroup the processed test samples by their `id` field and pickle the result.
# Only the question text is stored for this split.
test_data = test_set

# Create the output directory up front: without it the open() below raises
# FileNotFoundError, and only after the whole loop has already run.
os.makedirs("./VQA_pickles", exist_ok=True)

# str(sample id) -> list of {question id: {"question": text}}
# NOTE(review): `id` is presumably the image id -- confirm against preprocess.ipynb.
test_dict = dict()
for data in test_data:
    ques_dict = {
        # Keep at most 14 tokens (same value as max_ques in the config cell).
        data['ques_id'][0]: {"question": " ".join(data["processed_ques"][0][0:14])}
    }
    # setdefault covers both the "first question for this id" and the
    # "append to existing list" cases in one step.
    test_dict.setdefault(str(data['id']), []).append(ques_dict)

with open("./VQA_pickles/test.pickle", "wb") as file:
    pickle.dump(test_dict, file)