In [6]:
import os

import json
import h5py
import pickle

import numpy as np

import torch
from torch.utils import data

In [3]:
dir_data = '/Users/sebamenabar/Documents/datasets/GQA/data/'

In [9]:
objects = h5py.File(os.path.join(dir_data, 'gqa_objects.h5'), 'r')
with open(os.path.join(dir_data, 'gqa_objects_merged_info.json'), 'r') as f:
    objects_info = json.load(f)
    
bboxes, features = objects['bboxes'], objects['features']

In [70]:
bboxes, features

(<HDF5 dataset "bboxes": shape (148855, 100, 4), type "<f4">,
 <HDF5 dataset "features": shape (148855, 100, 2048), type "<f4">)

In [15]:
img_name = '2370799'
img_info = objects_info[img_name]
h, w = img_info['height'], img_info['width']
idx = img_info['index']

In [18]:
img_info

{'width': 500, 'objectsNum': 24, 'height': 333, 'index': 94550}

In [17]:
bboxes[idx] / (w, h, w, h), features[idx]

(array([[0.30714624, 0.52350225, 0.41901828, 0.6913585 ],
        [0.02196447, 0.59035279, 0.999     , 0.99833336],
        [0.23879129, 0.25973763, 0.30886682, 0.32710385],
        [0.        , 0.40495108, 0.57442682, 0.99833336],
        [0.13181677, 0.37334655, 0.46121591, 0.69785898],
        [0.        , 0.        , 0.5654046 , 0.54376876],
        [0.5510199 , 0.        , 0.999     , 0.85366235],
        [0.09523612, 0.5220325 , 0.19586964, 0.68011539],
        [0.111952  , 0.47931201, 0.38251608, 0.66491007],
        [0.42950867, 0.42984036, 0.999     , 0.67337875],
        [0.84432758, 0.53210005, 0.91870337, 0.98770893],
        [0.48518463, 0.36124899, 0.5497226 , 0.52273436],
        [0.19949901, 0.28778523, 0.33975223, 0.55120511],
        [0.30184235, 0.        , 0.82712531, 0.25445041],
        [0.48387924, 0.43663589, 0.56033472, 0.57743116],
        [0.192427  , 0.28988709, 0.30061288, 0.47601085],
        [0.48078641, 0.32971061, 0.52877185, 0.43281628],
        [0.298

In [82]:
class GQADataset(data.Dataset):
    def __init__(self, data_dir, split='train', sample=False, use_feats='spatial'):

        self.use_feats = use_feats
        self.sample = sample
        if sample:
            sample = '_sample'
        else:
            sample = ''
        with open(os.path.join(data_dir, '{}{}.pkl'.format(split, sample)), 'rb') as f:
            self.data = pickle.load(f)
        with open(os.path.join(data_dir, f'gqa_{self.use_feats}_merged_info.json')) as f:
            self.info = json.load(f)
        self.features = h5py.File(os.path.join(data_dir, f'gqa_{self.use_feats}.h5'), 'r')
        
        if self.use_feats == 'spatial':
            self.features = self.data['features']
        elif self.use_feats == 'objects':
            self.features, self.bboxes = self.features['features'], self.features['bboxes']

    def __getitem__(self, index):
        imgid, question, answer, group, questionid = self.data[index]
        img_info = self.info[imgid]
        imgidx = img_info['index']
        
        if self.use_feats == 'spatial':
            img = torch.from_numpy(self.img[imgidx])
        elif self.use_feats == 'objects':
            h, w = img_info['height'], img_info['width']
            bboxes = self.bboxes[imgidx] / (w, h, w, h)
            img = self.features[idx]
            
            bboxes = bboxes[:img_info['objectsNum']]
            img = img[:img_info['objectsNum']]
                        
            print(img.dtype)
                
            img = torch.from_numpy(np.concatenate((img, bboxes), axis=1)).to(torch.float32)
            
            print(img)

        return img, question, len(question), answer, group, questionid, imgid

    def __len__(self):
        return len(self.data)

In [83]:
ds = GQADataset(dir_data, split='val', use_feats='objects')

In [84]:
ds[0][0].size(), ds[50][0]

float32
tensor([[0.0000, 0.0000, 6.0012,  ..., 0.4750, 0.9987, 0.9983],
        [0.0492, 0.8963, 1.6789,  ..., 0.5736, 0.9987, 0.9983],
        [0.0000, 0.0000, 0.0087,  ..., 0.0000, 0.8312, 0.6644],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.3709, 0.5507, 0.5526],
        [0.0000, 0.0000, 0.0000,  ..., 0.3092, 0.5869, 0.3735],
        [0.0000, 0.0000, 0.0000,  ..., 0.0341, 0.4152, 0.1459]])
float32
tensor([[0.0000, 0.0000, 6.0012,  ..., 0.3912, 0.9983, 0.9524],
        [0.0492, 0.8963, 1.6789,  ..., 0.0000, 0.9492, 0.5575],
        [0.0000, 0.0000, 0.0087,  ..., 0.2069, 0.6742, 0.4229],
        ...,
        [0.0000, 0.0000, 0.7556,  ..., 0.7152, 0.9687, 0.7497],
        [0.0000, 0.1647, 0.0826,  ..., 0.0092, 0.5005, 0.2082],
        [0.0000, 0.0000, 0.0102,  ..., 0.4943, 0.9082, 0.5449]])


(torch.Size([36, 2052]),
 tensor([[0.0000, 0.0000, 6.0012,  ..., 0.3912, 0.9983, 0.9524],
         [0.0492, 0.8963, 1.6789,  ..., 0.0000, 0.9492, 0.5575],
         [0.0000, 0.0000, 0.0087,  ..., 0.2069, 0.6742, 0.4229],
         ...,
         [0.0000, 0.0000, 0.7556,  ..., 0.7152, 0.9687, 0.7497],
         [0.0000, 0.1647, 0.0826,  ..., 0.0092, 0.5005, 0.2082],
         [0.0000, 0.0000, 0.0102,  ..., 0.4943, 0.9082, 0.5449]]))

In [65]:
torch.nn.utils.rnn.pad_sequence([ds[0][0], ds[50][0]], batch_first=True).size()

torch.Size([2, 36, 2052])

In [8]:
%load_ext autoreload
%autoreload 2

In [9]:
import os
import sys

sys.path.insert(0, 'code')

In [10]:
import json

import csv
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from easydict import EasyDict as edict

from mac import MACNetwork
from utils import load_vocab
from datasets import GQADataset, collate_fn_gqa, ClevrDataset, collate_fn

In [11]:
cfg = edict(
    DATASET=edict(
        DATA_DIR='/Users/sebamenabar/Documents/datasets/GQA/data',
    )
)

In [12]:
vocab = load_vocab(cfg)

In [13]:
pd.Series(vocab['question_token_to_idx'])

Do              1
you             2
see             3
any             4
trays           5
             ... 
buys         2892
sticking     2893
breaking     2894
pistachio    2895
<PAD>           0
Length: 2896, dtype: int64

In [96]:
tokens = list(vocab['question_token_to_idx'].keys())
print('Case sensitive:', len(set(tokens)))
print('Case insensitive:', len(set([t.lower() for t in tokens])) )

Case sensitive: 2896
Case insensitive: 2855


In [None]:
tok2id = pd.Series(vocab['question_token_to_idx'])
id2tok = pd.Series(index=tok2id.values, data=tok2id.index)

In [98]:
embs = nn.Embedding(num_embeddings=len(tokens), embedding_dim=300)

In [110]:
glove = pd.read_csv('/Users/sebamenabar/Documents/datasets/glove.840B.300d.txt', sep=" ",
                        index_col=0, header=None, quoting=csv.QUOTE_NONE)

In [None]:
glove = pd.read_msgpack('/Users/sebamenabar/Documents/datasets/glove.840B.300d.msg')

In [114]:
tok2id.index.str.lower()

Index(['do', 'you', 'see', 'any', 'trays', 'or', 'cds', '?', 'where', 'was',
       ...
       'slanted', 'trumpets', 'half-full', 'did', 'showers', 'buys',
       'sticking', 'breaking', 'pistachio', '<pad>'],
      dtype='object', length=2896)

In [120]:
set(tok2id.index.str.lower())

{'garlands',
 'pistachio',
 'leads',
 'bright',
 'shaped',
 'kiwi',
 'shelves',
 'flip',
 'slicing',
 'powerlines',
 'cast',
 'wildflowers',
 'chase',
 'made',
 'toolbox',
 'boys',
 'pen',
 'approaches',
 'fallen',
 'antennas',
 'sunglasses',
 'helicopter',
 'lie',
 'hilltop',
 'dream',
 'storage',
 'raspberry',
 'cookie',
 'near',
 'earphones',
 'jockeys',
 'charger',
 'shaking',
 'upwards',
 'frog',
 'athletes',
 'hairy',
 'artichokes',
 'open',
 'engineer',
 'moon',
 'buffet',
 'blank',
 'burgers',
 'metal',
 'try',
 'roll',
 'nike',
 'speaker',
 'pose',
 'toilet',
 'shirtless',
 'almond',
 'lit',
 'peacocks',
 'produce',
 'vendor',
 'keyboard',
 'dragon',
 'donuts',
 'pills',
 'name',
 'sliding',
 'avocadoes',
 'bunch',
 'mustard',
 'almonds',
 'patchy',
 'pens',
 'lime',
 'unlit',
 'dishes',
 'people',
 'decorates',
 'splashes',
 'tomato',
 'pea',
 'spoon',
 'dishwasher',
 'basket',
 'hardwood',
 'bending',
 'hydrant',
 'books',
 'anchovies',
 'showing',
 'berry',
 'shop',
 'scram

In [122]:
toks_in_vocab = glove[glove.index.isin(tok2id.index.str.lower())]
print('Vocabulary in glove:', f'{len(toks_in_vocab)}/{len(set(tok2id.index.str.lower()))}')

Vocabulary in glove: 2839/2855


In [123]:
toks_in_vocab = glove[glove.index.isin(tok2id.index)]

In [129]:
tok2id[toks_in_vocab.index].values

array([  74,   11,   29, ..., 2657, 1283, 1517])

In [130]:
toks_in_vocab

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
",",-0.082752,0.672040,-0.149870,-0.064983,0.056491,0.402280,0.002775,-0.331100,-0.306910,2.08170,...,-0.143310,0.018267,-0.186430,0.207090,-0.355980,0.053380,-0.050821,-0.191800,-0.378460,-0.065890
the,0.272040,-0.062030,-0.188400,0.023225,-0.018158,0.006719,-0.138770,0.177080,0.177090,2.58820,...,-0.428100,0.168990,0.225110,-0.285570,-0.102800,-0.018168,0.114070,0.130150,-0.183170,0.132300
and,-0.185670,0.066008,-0.252090,-0.117250,0.265130,0.064908,0.122910,-0.093979,0.024321,2.49260,...,-0.593960,-0.097729,0.200720,0.170550,-0.004736,-0.039709,0.324980,-0.023452,0.123020,0.331200
to,0.319240,0.063160,-0.278580,0.261200,0.079248,-0.214620,-0.104950,0.154950,-0.033530,2.48340,...,-0.129770,0.371300,0.188880,-0.004274,-0.106450,-0.258100,-0.044629,0.082745,0.097801,0.250450
of,0.060216,0.217990,-0.042490,-0.386180,-0.153880,0.034635,0.222430,0.217180,0.006848,2.43750,...,-0.424840,0.116060,0.004813,-0.396290,-0.268230,0.329200,-0.175970,0.117090,-0.166920,-0.094085
a,0.043798,0.024779,-0.209370,0.497450,0.360190,-0.375030,-0.052078,-0.605550,0.036744,2.20850,...,-0.103470,0.003363,0.217600,-0.204090,0.092415,0.080421,-0.061246,-0.300990,-0.145840,0.281880
in,0.089187,0.257920,0.262820,-0.029365,0.471870,-0.103890,-0.100130,0.081230,0.208830,2.57260,...,-0.097179,-0.054541,0.192290,-0.481280,-0.203040,0.193680,-0.325460,0.144210,-0.169000,0.265010
is,-0.084961,0.502000,0.002382,-0.167550,0.307210,-0.237620,0.160690,-0.367860,-0.058347,2.49900,...,0.087003,-0.078201,-0.069673,-0.169930,0.235980,0.275500,-0.067180,-0.215110,-0.263040,-0.006017
for,-0.172240,0.182340,-0.278470,-0.084666,0.254420,-0.194030,0.406860,0.150830,0.146700,2.02350,...,-0.200700,-0.413930,0.290530,-0.277880,-0.139430,-0.122380,-0.081083,-0.126680,-0.438560,0.387650
that,0.098520,0.250010,-0.270180,-0.231860,0.022378,0.045321,-0.052444,-0.058867,-0.031937,3.01210,...,-0.242670,0.119590,0.013652,-0.112430,-0.089702,-0.196140,-0.270970,-0.062639,0.244240,0.177790


In [225]:
glove_values = h5py.File('/Users/sebamenabar/Documents/datasets/glove.840B.300d.h5', 'r')['glove']
glove_idxs = pd.read_msgpack('/Users/sebamenabar/Documents/datasets/glove.840B.300d.tok2idx.msg')

def init_embeddings_from_pretrained(pretrained_idx, pretrained_values, token2id, vocab_size, emb_dim=300):
    toks_in_vocab = pretrained_idx[pretrained_idx.index.isin(token2id.index)]
    print('Vocabulary in glove:', f'{len(toks_in_vocab)}/{len(token2id)}')

    emb = nn.Embedding(num_embeddings=len(token2id), embedding_dim=emb_dim)
    emb.weight.requires_grad = False
    # Para los tokens que si estan en glove los preinicializo
    emb.weight[token2id[toks_in_vocab.index].values] = torch.from_numpy(pretrained_values[toks_in_vocab.values.tolist()])
    emb.weight.requires_grad = True

    return emb

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [227]:
emb = init_embeddings_from_pretrained(glove_idxs, glove_values, tok2id, len(tok2id))

Vocabulary in glove: 2880/2896


In [229]:
torch.clone(emb)

TypeError: clone(): argument 'input' (position 1) must be Tensor, not Embedding

In [4]:
import pandas as pd

In [5]:
glove_values = h5py.File('/Users/sebamenabar/Documents/datasets/glove.840B.300d.h5', 'r')['vocab']
glove_idxs = pd.read_msgpack('/Users/sebamenabar/Documents/datasets/glove.840B.300d.tok2idx.msg')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [233]:
glove_values.file.close()

In [1]:
import h5py
h5file = h5py.File('/Users/sebamenabar/Documents/datasets/glove.840B.300d.h5', "r+");
h5file['vocab'] = h5file['glove'];
h5file.close();

In [2]:
import h5py
h5file = h5py.File('/Users/sebamenabar/Documents/datasets/glove.840B.300d.h5', "r+");
del h5file['glove'];
h5file.close();

In [216]:
toks_in_vocab = glove_idxs[glove_idxs.index.isin(tok2id.index)]
toks_in_vocab

,                   0
the                 2
and                 3
to                  4
of                  5
               ...   
mooses         568340
vacua          685726
octopodes     1226685
lying-down    1641454
burritoes     2007540
Length: 2880, dtype: int64

In [193]:
emb = nn.Embedding(num_embeddings=len(tok2id), embedding_dim=300)
emb.weight.requires_grad = False

In [210]:
emb.weight[tok2id[toks_in_vocab.index].values] = torch.from_numpy(glove_values[toks_in_vocab.values.tolist()])

In [214]:
glove_values[toks_in_vocab.values.tolist()]

array([[-0.082752,  0.67204 , -0.14987 , ..., -0.1918  , -0.37846 ,
        -0.06589 ],
       [ 0.27204 , -0.06203 , -0.1884  , ...,  0.13015 , -0.18317 ,
         0.1323  ],
       [-0.18567 ,  0.066008, -0.25209 , ..., -0.023452,  0.12302 ,
         0.3312  ],
       ...,
       [ 0.41802 , -0.84775 ,  0.41584 , ...,  0.48558 , -0.15376 ,
        -0.18453 ],
       [ 0.43224 , -0.99616 , -0.51371 , ...,  0.64368 , -0.015651,
        -0.92468 ],
       [ 0.35219 , -0.48543 , -0.49622 , ...,  0.30279 , -0.12397 ,
        -0.18831 ]], dtype=float32)

In [213]:
emb.weight[tok2id[toks_in_vocab.index].values]

tensor([[-0.0828,  0.6720, -0.1499,  ..., -0.1918, -0.3785, -0.0659],
        [ 0.2720, -0.0620, -0.1884,  ...,  0.1302, -0.1832,  0.1323],
        [-0.1857,  0.0660, -0.2521,  ..., -0.0235,  0.1230,  0.3312],
        ...,
        [ 0.4180, -0.8478,  0.4158,  ...,  0.4856, -0.1538, -0.1845],
        [ 0.4322, -0.9962, -0.5137,  ...,  0.6437, -0.0157, -0.9247],
        [ 0.3522, -0.4854, -0.4962,  ...,  0.3028, -0.1240, -0.1883]])

In [208]:
glove_values[toks_in_vocab.values.tolist()].shape

(2880, 300)

In [195]:
tok2id[toks_in_vocab.index].values

array([  74,   11,   29, ..., 2657, 1283, 1517])

In [157]:
import h5py

In [158]:
len(glove)

2196017

In [163]:
with h5py.File('/Users/sebamenabar/Documents/datasets/glove.840B.300d.h5', 'w') as f:
    dataset = f.create_dataset('vocab', (len(glove), 300), dtype=np.float32)
    
    dataset[:] = glove.values.astype(np.float32)

In [169]:
idx2tok_glove = glove.reset_index()[0]

In [185]:
tok2idx_glove = pd.Series(index=idx2tok_glove.values, data=idx2tok_glove.index)

In [187]:
idx2tok_glove.to_msgpack('/Users/sebamenabar/Documents/datasets/glove.840B.300d.tok2idx.msg')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  """Entry point for launching an IPython kernel.


In [189]:
tok2idx_glove.to_msgpack('/Users/sebamenabar/Documents/datasets/glove.840B.300d.tok2idx.msg')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  """Entry point for launching an IPython kernel.


In [190]:
tok2idx_glove

,                       0
.                       1
the                     2
and                     3
to                      4
                   ...   
xtremecaffeine    2196012
yildirim          2196013
z/28              2196014
zipout            2196015
zulchzulu         2196016
Length: 2196017, dtype: int64

In [184]:
idx2tok_glove.

TypeError: 'int' object is not iterable

In [172]:
idx2tok_glove.to_msgpack('/Users/sebamenabar/Documents/datasets/idx2tok_glove.msg')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  """Entry point for launching an IPython kernel.


In [166]:
sglove.reset_index()[0]

0          ,
1          .
2        the
3        and
4         to
       ...  
95        We
96        no
97       any
98         >
99    people
Name: 0, Length: 100, dtype: object