In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

In [3]:
import json
import pickle

import nltk
from tqdm.autonotebook import tqdm
from PIL import Image



## Val

In [6]:
# Read the validation question file into memory.
with open(
    os.path.join('/storage1/datasets/CLEVR_Uni_v1.0/', 'questions', 'CLEVR_uni_val_questions.json'),
    mode='r',
) as f:
    val_questions = json.load(fp=f)

In [7]:
# Turn every answer into a string: the booleans True/False become
# 'yes'/'no', anything else is passed through str().
for entry in tqdm(val_questions['questions']):
    raw_answer = entry['answer']
    entry['answer'] = (
        'yes' if raw_answer is True
        else 'no' if raw_answer is False
        else str(raw_answer)
    )

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




In [10]:
# Write val_questions back to the file it was loaded from (overwrites it in place).
with open(
    os.path.join('/storage1/datasets/CLEVR_Uni_v1.0/', 'questions', 'CLEVR_uni_val_questions.json'),
    mode='w',
) as f:
    json.dump(val_questions, fp=f)

## Train

In [11]:
# Read the training question file into memory.
with open(
    os.path.join('/storage1/datasets/CLEVR_Uni_v1.0/', 'questions', 'CLEVR_uni_train_questions.json'),
    mode='r',
) as f:
    train_questions = json.load(fp=f)

In [12]:
# Turn every answer into a string: the booleans True/False become
# 'yes'/'no', anything else is passed through str().
for entry in tqdm(train_questions['questions']):
    raw_answer = entry['answer']
    entry['answer'] = (
        'yes' if raw_answer is True
        else 'no' if raw_answer is False
        else str(raw_answer)
    )

HBox(children=(IntProgress(value=0, max=699998), HTML(value='')))




In [13]:
# Write train_questions back to the file it was loaded from (overwrites it in place).
with open(
    os.path.join('/storage1/datasets/CLEVR_Uni_v1.0/', 'questions', 'CLEVR_uni_train_questions.json'),
    mode='w',
) as f:
    json.dump(train_questions, fp=f)

## Process

In [9]:
def process_question(root, split, word_dic=None, answer_dic=None):
    """Encode one CLEVR question split as integer ids and pickle the result.

    Reads ``<root>/questions/CLEVR_<split>_questions.json``, tokenises each
    question with ``nltk.word_tokenize``, maps every token and every answer to
    an integer id, and writes a list of
    ``(image_filename, question_token_ids, answer_id, question_family_index)``
    tuples to ``<root>/data/<split>.pkl``.

    Args:
        root: Dataset root directory containing ``questions/`` and ``data/``.
        split: Split name substituted into the input and output file names
            (e.g. ``'uni_train'``).
        word_dic: Optional existing token -> id mapping. It is extended in
            place when an unknown token is met.
        answer_dic: Optional existing answer -> id mapping. It is extended in
            place when an unknown answer is met.

    Returns:
        The ``(word_dic, answer_dic)`` tuple, including any entries added
        while processing this split.
    """
    if word_dic is None:
        word_dic = {}

    if answer_dic is None:
        answer_dic = {}

    with open(os.path.join(root, 'questions', 'CLEVR_{}_questions.json'.format(split))) as f:
        data = json.load(f)

    result = []
    # Continue numbering after the largest id already present so that entries
    # added to a pre-populated dictionary never reuse an existing id. With
    # empty dictionaries this still starts words at 1 and answers at 0.
    word_index = max(word_dic.values(), default=0) + 1
    answer_index = max(answer_dic.values(), default=-1) + 1

    for question in tqdm(data['questions']):
        question_token = []

        for word in nltk.word_tokenize(question['question']):
            if word not in word_dic:
                word_dic[word] = word_index
                word_index += 1

                print('New word', word)

            question_token.append(word_dic[word])

        answer_word = question['answer']

        if answer_word not in answer_dic:
            answer_dic[answer_word] = answer_index
            answer_index += 1

            print('New answer', answer_word)

        answer = answer_dic[answer_word]

        result.append((question['image_filename'], question_token, answer, question['question_family_index']))

    with open(os.path.join(root, 'data/{}.pkl'.format(split)), 'wb') as f:
        pickle.dump(result, f)

    return word_dic, answer_dic

In [10]:
root = '/storage1/datasets/CLEVR_Uni_v1.0/'
# Reuse the dictionaries from CLEVR_v1.0 instead of building new ones.
# NOTE(review): pickle.load executes arbitrary code from the file; only use a trusted dic.pkl.
with open(os.path.join(root, 'data', 'dic.pkl'), mode='rb') as f:
    dictionaries = pickle.load(f)
word_dic = dictionaries['word_dic']
answer_dic = dictionaries['answer_dic']

In [11]:
# Show the loaded answer vocabulary and its size. These are two separate
# statements on purpose: joining the print() calls with a comma builds a
# (None, None) tuple that the notebook then echoes as the cell result.
print(answer_dic)
print(len(answer_dic))

{'large': 4, 'green': 16, 'metal': 17, 'cube': 8, 'red': 22, '9': 26, 'gray': 7, '4': 19, '1': 11, '7': 25, 'small': 15, 'brown': 10, '6': 21, 'yellow': 12, 'no': 2, '10': 27, '8': 24, '5': 23, '0': 5, '2': 1, 'sphere': 6, 'purple': 13, 'cyan': 20, 'yes': 0, 'rubber': 3, 'cylinder': 14, '3': 18, 'blue': 9}
28


(None, None)

In [13]:
# Encode the training split; the trailing ';' keeps the notebook from echoing the returned dictionaries.
process_question(root, 'uni_train', word_dic=word_dic, answer_dic=answer_dic);

HBox(children=(IntProgress(value=0, max=699998), HTML(value='')))




In [12]:
# Encode the validation split; the trailing ';' keeps the notebook from echoing the returned dictionaries.
process_question(root, 'uni_val', word_dic=word_dic, answer_dic=answer_dic);

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




# Check question files

In [23]:
# Store both vocabularies together in <root>/data/dic.pkl (overwrites the existing file).
vocabularies = {'word_dic': word_dic, 'answer_dic': answer_dic}
with open(os.path.join(root, 'data/dic.pkl'), mode='wb') as f:
    pickle.dump(vocabularies, f)

In [24]:
# Display the answer -> index mapping currently held in answer_dic.
# NOTE(review): the saved output of this cell assigns different indices than the
# mapping printed right after dic.pkl was loaded (compare the index of 'yes') —
# looks like stale output from an earlier run; confirm by re-running from a
# fresh kernel.
answer_dic

{'1': 0,
 'no': 1,
 'cube': 2,
 'rubber': 3,
 'gray': 4,
 'large': 5,
 '0': 6,
 'small': 7,
 'yellow': 8,
 'cylinder': 9,
 'green': 10,
 'brown': 11,
 'metal': 12,
 'sphere': 13,
 'yes': 14,
 '2': 15,
 '3': 16,
 'red': 17,
 'blue': 18,
 'cyan': 19,
 'purple': 20,
 '4': 21,
 '5': 22,
 '6': 23,
 '7': 24,
 '8': 25,
 '9': 26,
 '10': 27}

In [25]:
# Re-read the training question file from disk to inspect its stored answers.
with open(
    os.path.join('/storage1/datasets/CLEVR_Uni_v1.0/', 'questions', 'CLEVR_uni_train_questions.json'),
    mode='r',
) as f:
    train_questions = json.load(fp=f)

In [26]:
# Distinct answer values present in the training questions.
{item['answer'] for item in train_questions['questions']}

{'0',
 '1',
 '10',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'blue',
 'brown',
 'cube',
 'cyan',
 'cylinder',
 'gray',
 'green',
 'large',
 'metal',
 'no',
 'purple',
 'red',
 'rubber',
 'small',
 'sphere',
 'yellow',
 'yes'}

In [27]:
# Re-read the validation question file from disk to inspect its stored answers.
with open(
    os.path.join('/storage1/datasets/CLEVR_Uni_v1.0/', 'questions', 'CLEVR_uni_val_questions.json'),
    mode='r',
) as f:
    val_questions = json.load(fp=f)

In [28]:
# Distinct answer values present in the validation questions.
{item['answer'] for item in val_questions['questions']}

{'0',
 '1',
 '10',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'blue',
 'brown',
 'cube',
 'cyan',
 'cylinder',
 'gray',
 'green',
 'large',
 'metal',
 'no',
 'purple',
 'red',
 'rubber',
 'small',
 'sphere',
 'yellow',
 'yes'}