In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
import json
import pickle
from os import path as osp

import numpy as np
from PIL import Image


In [3]:
import sys

# Prepend the project directories to the module search path so the imports in
# the following cell can resolve. Both paths are relative, so they are
# interpreted against the kernel's current working directory.
sys.path.insert(0, '../NSCL-PyTorch-Release/')
sys.path.insert(0, 'my_imp/')

In [45]:
from my_imp.data import gen_image_transform, gen_bbox_transform
from my_imp.copied.scene_annotation import annotate_objects
from my_imp.copied.program_translator import clevr_to_nsclseq, nsclseq_to_nsclqsseq
import nltk


from vocab import Vocab

In [17]:
from torch.utils.data import Dataset

In [32]:
# Size argument handed to both transform factories at the end of this cell.
img_size = 256

# Answer dictionary (one file, used for every split).
ans_dict_json = 'data/clevr_sample_mod/CLEVR_built_dictionaries.json'

# --- train split -----------------------------------------------------------
train_ds_root = 'data/clevr_sample_mod/train/'
image_root = 'data/clevr_sample_mod/train/images/'
vocab_json = 'data/clevr_sample_mod/train/vocab.json'
train_scenes_json = 'data/clevr_sample_mod/train/scenes.json'
train_questions_json = 'data/clevr_sample_mod/train/questions.json'

# --- val split -------------------------------------------------------------
val_ds_root = 'data/clevr_sample_mod/val/'
val_image_root = 'data/clevr_sample_mod/val/images/'
val_vocab_json = 'data/clevr_sample_mod/val/vocab.json'
val_scenes_json = 'data/clevr_sample_mod/val/scenes.json'
val_questions_json = 'data/clevr_sample_mod/val/questions.json'

# Per-split keyword arguments, keyed by split name.
dataset_args = dict(
    train=dict(
        scenes_json=train_scenes_json,
        questions_json=train_questions_json,
        image_root=image_root,
        vocab_json=vocab_json,
    ),
    val=dict(
        scenes_json=val_scenes_json,
        questions_json=val_questions_json,
        image_root=val_image_root,
        vocab_json=val_vocab_json,
    ),
)

# Transforms built once from the shared size setting.
image_transform = gen_image_transform(img_size)
bbox_transform = gen_bbox_transform(img_size)

In [54]:
class DatasetV2(Dataset):
    """Question dataset that pairs every question with its scene image and boxes.

    Scenes and questions are read from JSON, preprocessed once, and (when
    ``ds_root`` is given) pickled to ``scenes_cache.pkl`` /
    ``questions_cache.pkl`` inside ``ds_root`` so later constructions can skip
    the preprocessing.

    Args:
        image_root: Directory that is joined with each scene's
            ``image_filename``.
        vocab_json: Path handed to ``Vocab.from_json`` for the question vocab.
        ans_dict_json: Path handed to ``Vocab.from_json`` for the answer vocab.
        bbox_transform: Callable invoked as ``bbox_transform(image, objects)``
            while preparing scenes.
        scenes_json: Scenes JSON path; only read when no scenes cache exists.
        questions_json: Questions JSON path; only read when no questions cache
            exists.
        ds_root: Directory holding the pickle caches. When falsy, nothing is
            read from or written to a cache.
        image_transform: Optional callable applied to the RGB image in
            ``__getitem__``. When ``None`` the PIL image is returned as is.
    """

    def __init__(self,
                 image_root,
                 vocab_json,
                 ans_dict_json,
                 bbox_transform,
                 scenes_json='',
                 questions_json='',
                 ds_root=None,
                 image_transform=None,
                 ):
        super().__init__()

        self.image_root = image_root
        self.image_transform = image_transform
        self.bbox_transform = bbox_transform

        print('Loading vocab from: "{}".'.format(vocab_json))
        self.vocab = Vocab.from_json(vocab_json)
        print('Loading answers from: "{}"'.format(ans_dict_json))
        self.ans = Vocab.from_json(ans_dict_json)

        scenes_cache = osp.join(
            ds_root, 'scenes_cache.pkl') if ds_root else ''
        self._load_or_prepare(
            'scenes', scenes_json, scenes_cache, self.prepare_scenes)

        questions_cache = osp.join(
            ds_root, 'questions_cache.pkl') if ds_root else ''
        self._load_or_prepare(
            'questions', questions_json, questions_cache,
            self.prepare_questions)

    def _load_or_prepare(self, attr, json_path, cache_path, prepare):
        """Fill ``self.<attr>`` from the pickle cache, or from JSON + ``prepare``.

        Args:
            attr: Attribute name to populate (``'scenes'`` or ``'questions'``).
            json_path: JSON file read when the cache is missing.
            cache_path: Pickle cache path; an empty string disables caching.
            prepare: Zero-argument callable that preprocesses ``self.<attr>``
                in place (it may also rebind the attribute).
        """
        if cache_path and osp.exists(cache_path):
            print('==> using cached {}: {}'.format(attr, cache_path))
            # SECURITY: pickle.load executes arbitrary code from the file; only
            # point ds_root at directories whose cache files you trust.
            with open(cache_path, 'rb') as f:
                setattr(self, attr, pickle.load(f))
            return

        with open(json_path, 'r') as f:
            setattr(self, attr, json.load(f))
        prepare()

        # Only persist when a cache location was actually configured;
        # opening '' for writing would raise after all the work is done.
        if cache_path:
            with open(cache_path, 'wb') as f:
                # Re-read the attribute: prepare() may have rebound it.
                pickle.dump(getattr(self, attr), f)

    def prepare_scenes(self):
        """Preprocess ``self.scenes`` in place.

        Accepts a list of scene dicts or a dict of them (keys are converted to
        ``int``). Each scene gets transformed ``objects``, a ``scene_size``
        count and an ``image_filename`` joined with ``image_root``; the
        ``relationships``, ``objects_detection`` and ``directions`` entries are
        dropped.

        Raises:
            TypeError: If ``self.scenes`` is neither a list nor a dict.
        """
        print('Preparing scenes')
        if isinstance(self.scenes, list):
            scene_list = self.scenes
        elif isinstance(self.scenes, dict):
            # JSON object keys are always strings; index by int instead.
            self.scenes = {int(key): val for key, val in self.scenes.items()}
            scene_list = list(self.scenes.values())
        else:
            raise TypeError(
                f"Scenes type is '{type(self.scenes)}', expected 'list' or 'dict'")

        if not scene_list:
            # Nothing to prepare, and no first scene to take the image from.
            print()
            return

        # A single image (the first scene's) is passed to bbox_transform for
        # every scene. NOTE(review): presumably all images share one size --
        # confirm against gen_bbox_transform.
        dummy_fp = osp.join(self.image_root, scene_list[0]['image_filename'])
        with Image.open(dummy_fp) as raw_image:
            dummy_image = raw_image.convert('RGB')

        total = len(scene_list)
        for i, scene in enumerate(scene_list):
            print(f'\r{i + 1}/{total}', end='')
            objects = annotate_objects(scene)['objects']
            scene['objects'] = self.bbox_transform(dummy_image, objects)
            scene['scene_size'] = len(scene['objects'])
            scene['image_filename'] = osp.join(
                self.image_root, scene['image_filename'])

            # Drop fields that are not needed afterwards; tolerate scenes
            # that never had them.
            for unused_key in ('relationships', 'objects_detection', 'directions'):
                scene.pop(unused_key, None)
        print()

    def prepare_questions(self):
        """Preprocess ``self.questions`` in place.

        For each question: records ``program_size``, keeps the raw question
        and answer under ``*_raw``, replaces ``question`` with an ``int32``
        array of vocab ids and ``answer`` with its index in the answer vocab,
        stores the translated program under ``program_qsseq`` and the op of
        the last program step under ``question_type``, then drops ``program``
        and ``image_filename``.
        """
        print('\nPreparing questions')
        total = len(self.questions)
        for i, q in enumerate(self.questions):
            print(f'\r{i + 1}/{total}', end='')
            q['program_size'] = len(q['program'])
            q['question_raw'] = q['question']
            tokens = nltk.word_tokenize(q['question'].lower())
            q['question'] = np.array(
                self.vocab.map_sequence(tokens), dtype='int32')
            q['answer_raw'] = q['answer']
            q['answer'] = self.ans.word2idx[q['answer']]
            program_seq = clevr_to_nsclseq(q['program'])
            q['program_qsseq'] = nsclseq_to_nsclqsseq(program_seq)
            q['question_type'] = program_seq[-1]['op']

            del q['program']
            # Tolerate question files that do not carry an image filename.
            q.pop('image_filename', None)

        print()

    def __getitem__(self, index):
        """Return one sample: the image, all question fields, and ``objects``.

        The result has the key ``'image'`` first, then every key of the
        prepared question, then ``'objects'`` from the question's scene.
        """
        question = self.questions[index]
        scene = self.scenes[question['image_index']]

        # Context manager guarantees the file handle is released even if
        # decoding fails.
        with Image.open(scene['image_filename']) as raw_image:
            image = raw_image.convert('RGB')
        if self.image_transform is not None:
            image = self.image_transform(image)

        return {
            'image': image,
            **question,
            'objects': scene['objects'],
        }

    def _get_metainfo(self, index):
        """Return the prepared question at ``index`` together with its scene."""
        question = self.questions[index]
        return {
            'question': question,
            'scene': self.scenes[question['image_index']],
        }

    def __len__(self):
        """Number of questions in the dataset."""
        return len(self.questions)
        
# Training-split dataset; keyword arguments follow the constructor's order.
ds = DatasetV2(
    image_root=image_root,
    vocab_json=vocab_json,
    ans_dict_json=ans_dict_json,
    bbox_transform=bbox_transform,
    scenes_json=train_scenes_json,
    questions_json=train_questions_json,
    ds_root=train_ds_root,
    image_transform=image_transform,
)

Loading vocab from: "data/clevr_sample_mod/train/vocab.json".
Loading answers from: "data/clevr_sample_mod/CLEVR_built_dictionaries.json"
==> using cached scenes: data/clevr_sample_mod/train/scenes_cache.pkl
==> using cached questions: data/clevr_sample_mod/train/questions_cache.pkl


In [55]:
# Inspect the prepared question at index 0 together with its scene record.
ds._get_metainfo(0)

{'question': {'answer': 3,
  'image_index': 3,
  'question': array([109,  68,  58, 100, 104,  33,  99,  58, 105, 100,  86,  78, 100,
          28, 102,  12], dtype=int32),
  'question_family_index': 76,
  'question_index': 30,
  'split': 'train',
  'program_size': 8,
  'question_raw': 'What material is the tiny cube that is to the right of the blue thing?',
  'answer_raw': 'rubber',
  'program_qsseq': [{'op': 'scene', 'inputs': []},
   {'op': 'filter',
    'concept': ['blue'],
    'inputs': [0],
    'concept_idx': 0,
    'concept_values': [['blue'], ['small', 'cube']]},
   {'op': 'relate',
    'relational_concept': ['right'],
    'inputs': [1],
    'relational_concept_idx': 0,
    'relational_concept_values': [['right']]},
   {'op': 'filter',
    'concept': ['small', 'cube'],
    'inputs': [2],
    'concept_idx': 1,
    'concept_values': [['blue'], ['small', 'cube']]},
   {'op': 'query',
    'attribute': 'material',
    'inputs': [3],
    'attribute_idx': 0,
    'attribute_values': ['m

In [43]:
# Displays the whole scenes container.
# NOTE(review): the saved output below shows string keys ('3', '4'), whereas
# prepare_scenes converts dict keys to int -- presumably this output comes
# from an older run or a stale cache; re-run on a fresh kernel to confirm.
ds.scenes

{'3': {'image_index': 3,
  'objects': tensor([[254.4000,  81.6000, 284.0000, 115.2000],
          [231.2000,  69.6000, 261.6000,  88.0000],
          [ 80.8000, 122.4000, 120.8000, 164.8000],
          [ 60.8000,  67.2000, 118.4000, 132.8000],
          [153.6000, 144.0000, 226.4000, 216.0000],
          [207.2000,  86.4000, 264.8000, 160.8000],
          [183.2000,  39.2000, 223.2000,  81.6000],
          [ 96.0000, 144.8000, 132.0000, 186.4000],
          [312.0000, 101.6000, 349.6000, 136.0000],
          [147.2000,  57.6000, 193.6000, 103.2000]]),
  'image_filename': 'data/clevr_sample_mod/train/images/CLEVR_train_000003.png',
  'split': 'train',
  'scene_size': 10},
 '4': {'image_index': 4,
  'objects': tensor([[163.2000,  56.8000, 184.8000,  84.0000],
          [197.6000,  83.2000, 231.2000, 116.8000],
          [ 54.4000, 103.2000,  82.4000, 129.6000]]),
  'image_filename': 'data/clevr_sample_mod/train/images/CLEVR_train_000004.png',
  'split': 'train',
  'scene_size': 3},
 '5':

In [27]:
# Read the answer dictionary file directly so its raw contents can be inspected.
with open(ans_dict_json) as f:
    data = json.loads(f.read())

In [28]:
# Show the answer -> index mapping loaded from ans_dict_json.
data

{'yes': 0,
 '2': 1,
 'no': 2,
 'rubber': 3,
 'large': 4,
 '0': 5,
 'sphere': 6,
 'gray': 7,
 'cube': 8,
 'blue': 9,
 'brown': 10,
 '1': 11,
 'yellow': 12,
 'purple': 13,
 'cylinder': 14,
 'small': 15,
 'green': 16,
 'metal': 17,
 '3': 18,
 '4': 19,
 'cyan': 20,
 '6': 21,
 'red': 22,
 '5': 23,
 '8': 24,
 '7': 25,
 '9': 26,
 '10': 27}