In [1]:
import json
import os
import shutil
import subprocess

import h5py
import numpy as np

In [2]:
# Seed NumPy's global random state so the randomly drawn sample indices are repeatable.
np.random.seed(seed=0)

In [3]:
# Dataset splits to subsample; used as file-name prefixes ('<split>-orig.jsonl').
data_types = [
    'train',
    'val',
    'test',
]
# Embedding kinds; used in the 'bert_da_<answer_type>_<split>' HDF5 file names.
answer_types = [
    'answer',
    'rationale',
]

In [4]:
# Which subset to build: 'sample' | 'mini'
size_type = 'sample'

# Number of records to draw per split, in the same order as the split names
# (train, val, test).
if size_type == 'sample':
    data_sizes = [50000, 10000, 10000]
elif size_type == 'mini':
    data_sizes = [5000, 1000, 1000]
else:
    # Fail with the offending value so a typo in the config is obvious.
    raise ValueError(
        "Unknown size_type {!r}; expected 'sample' or 'mini'".format(size_type)
    )

In [5]:
# Replace model dataset with the newly created one.
# This flag is forwarded as the `replace` argument of create_data_samples and
# create_embedding_samples below.
replace = True

In [6]:
def create_data_samples(data_type, data_size, size_type, replace=False):
    """Draw a random subset of one annotation split and write it to disk.

    Reads every line of '<data_type>-orig.jsonl' (in the current working
    directory) as a JSON record, picks `data_size` distinct records uniformly
    at random using NumPy's global random state, and writes them, one JSON
    object per line, to '<data_type>-<size_type>.jsonl'.

    Args:
        data_type: Split name used as the file-name prefix.
        data_size: Number of records to keep; must not exceed the number of
            records in the original file.
        size_type: Label used in the output file name.
        replace: If True, additionally copy the sample over
            '<data_type>.jsonl' in the current directory and re-create, inside
            '../../data/', symlinks for both files that point to
            '../vcr1/vcr1annots/<file name>'.

    Returns:
        NumPy integer array with the positions (line numbers in the original
        file, zero-based) of the kept records, in the order they were written.

    Raises:
        ValueError: If `data_size` is larger than the number of records.
    """
    with open('{}-orig.jsonl'.format(data_type), 'r') as f_in:
        data = [json.loads(s) for s in f_in]

    if data_size > len(data):
        raise ValueError(
            'Cannot sample {} records from {}-orig.jsonl, which has only {}'.format(
                data_size, data_type, len(data)))

    # Sample WITHOUT replacement: np.random.choice defaults to replace=True,
    # which would put duplicate records into the subset.
    keep_ind = np.random.choice(len(data), size=data_size, replace=False)
    data_sample = [data[i] for i in keep_ind]

    with open('{}-{}.jsonl'.format(data_type, size_type), 'w') as f_out:
        for line in data_sample:
            f_out.write(json.dumps(line)+'\n')

    if replace:
        data_folder = '../../data/'
        source = '{}-{}.jsonl'.format(data_type, size_type)
        dest = '{}.jsonl'.format(data_type)
        # Overwrite the default split file with the freshly written sample.
        shutil.copyfile(source, dest)
        # Refresh the links in the data folder. lexists() is used so that a
        # dangling symlink left over from an earlier run is removed as well.
        for name in (source, dest):
            link_path = os.path.join(data_folder, name)
            if os.path.lexists(link_path):
                os.remove(link_path)
            # The target is stored as given, i.e. relative to the link's folder.
            os.symlink('../vcr1/vcr1annots/{}'.format(name), link_path)

    return keep_ind

In [7]:
# Sampled record positions per split; read later when the embeddings are subsampled.
indices = {}
for data_type, data_size in zip(data_types, data_sizes):
    print(data_type)
    indices[data_type] = create_data_samples(
        data_type=data_type,
        data_size=data_size,
        size_type=size_type,
        replace=replace,
    )

train
val
test


In [8]:
def create_embedding_samples(answer_type, data_type, size_type, replace=False):
    """Write the embedding groups that correspond to the sampled records.

    For every position in the notebook-level `indices[data_type]`, the group
    named after that position is read from
    '../../data/bert_da_<answer_type>_<data_type>_orig.h5' and each of its
    datasets is stored as float16 in
    '../../data/bert_da_<answer_type>_<data_type>_<size_type>.h5'. Groups in
    the output file are renumbered '0', '1', ... in sampling order.

    Groups are copied one at a time, so only a single group is held in memory.

    Args:
        answer_type: Embedding kind used in the file names.
        data_type: Split name; also the key looked up in `indices`.
        size_type: Label used in the output file name.
        replace: If True, also copy the new file over
            '../../data/bert_da_<answer_type>_<data_type>.h5'.
    """
    data_folder = '../../data/'
    orig_path = '{}bert_da_{}_{}_orig.h5'.format(data_folder, answer_type, data_type)
    sample_path = '{}bert_da_{}_{}_{}.h5'.format(data_folder, answer_type, data_type, size_type)

    # The input is opened first, so a missing source file fails before any
    # output file is created.
    with h5py.File(orig_path, 'r') as f_in, h5py.File(sample_path, 'w') as f_out:
        for new_ind, orig_ind in enumerate(indices[data_type]):
            group = f_out.create_group(str(new_ind))
            for k, v in f_in[str(orig_ind)].items():
                group.create_dataset(k, data=np.array(v, dtype=np.float16))

    if replace:
        dest_path = '{}bert_da_{}_{}.h5'.format(data_folder, answer_type, data_type)
        # shutil raises on failure, unlike the unchecked shell `cp` it replaces.
        shutil.copyfile(sample_path, dest_path)

In [9]:
# Build one subsampled embedding file per (answer type, split) pair,
# printing an indented progress line for each split.
for answer_type in answer_types:
    print(answer_type)
    for data_type in data_types:
        print('\t'+data_type)
        create_embedding_samples(
            answer_type=answer_type,
            data_type=data_type,
            size_type=size_type,
            replace=replace,
        )
    print()

answer
	train
	val
	test

rationale
	train
	val
	test

