In [1]:
import numpy as np
import h5py
import json
import subprocess
from collections import OrderedDict

In [2]:
# Seed NumPy's global RNG so that the np.random.choice call inside
# split_train_val selects the same movies on every fresh run.
np.random.seed(0)

In [3]:
# Passed to split_train_val, which treats values in [0, 1] as the fraction of
# distinct movies assigned to the training split.
train_size = 0.85
# Embedding variants handled by create_embedding_split; each name is inserted
# into the 'bert_da_<answer_type>_...' file names.
answer_types = ['answer', 'rationale']

In [4]:
# Suffix inserted into the output file names ('<split>-<size_type>.jsonl' and
# 'bert_da_<answer_type>_<split>_<size_type>.h5'), e.g. 'orig' | 'sample' | 'mini'.
# NOTE(review): with 'orig' the 'train' outputs get the same names as the input
# files read by split_train_val and create_embedding_split - confirm that
# overwriting those inputs is intended.
size_type = 'orig'

In [5]:
def split_train_val(train_size, size_type, input_path='train-orig.jsonl'):
    """Split a JSON-lines dataset into train/val partitions, grouped by movie.

    Every record must have a 'movie' key. Movies are sampled with the global
    NumPy RNG (``np.random.choice``), and all records of a movie end up in the
    same partition. Within a partition, movies keep the order of their first
    appearance in the input and records keep their input order.

    Each partition is written to '<split>-<size_type>.jsonl' in the current
    working directory. A warning is issued when one of these files is the
    input file itself, because the input is then replaced by a subset of its
    own records and a rerun would split that subset again.

    Parameters
    ----------
    train_size : float or int
        Values in [0, 1] are the fraction of distinct movies used for training
        (truncated towards zero). Any other value is the absolute number of
        training movies. The integers 0 and 1 fall inside the fractional range
        and are therefore read as 0% and 100%.
    size_type : str
        Suffix used in the output file names.
    input_path : str, optional
        JSON-lines file to split. Defaults to 'train-orig.jsonl'.

    Returns
    -------
    train : list of dict
        Records of the training partition.
    val : list of dict
        Records of the validation partition.
    indices : dict
        ``{'train': [...], 'val': [...]}`` with the zero-based line position of
        each returned record in the input file, aligned with ``train``/``val``.

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice`` when more training movies are
        requested than exist.
    """
    # Function-scope imports keep the notebook's import cell untouched.
    import os
    import warnings

    with open(input_path, 'r', encoding='utf-8') as f_in:
        data = [json.loads(s) for s in f_in]

    # movie -> positions of its records, in order of first appearance.
    data_grp_movies = OrderedDict()
    for i, record in enumerate(data):
        data_grp_movies.setdefault(record['movie'], []).append(i)

    movies = list(data_grp_movies.keys())

    if 0. <= train_size <= 1.:
        train_size = int(train_size*len(movies))

    train_movie_ind = np.random.choice(range(len(movies)), size=train_size, replace=False)
    val_movie_ind = np.setdiff1d(range(len(movies)), train_movie_ind)

    print(len(train_movie_ind), len(val_movie_ind))

    # A set gives O(1) membership tests; val is exactly the complement of
    # train, so a single pass over the movies fills both partitions.
    train_movie_set = set(train_movie_ind.tolist())
    train, train_ind = [], []
    val, val_ind = [], []
    for movie_pos, movie in enumerate(movies):
        record_positions = data_grp_movies[movie]
        if movie_pos in train_movie_set:
            target, target_ind = train, train_ind
        else:
            target, target_ind = val, val_ind
        target_ind.extend(record_positions)
        target.extend(data[pos] for pos in record_positions)

    indices = {'train': train_ind, 'val': val_ind}
    split_data_dict = {'train': train, 'val': val}

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)

    print(len(train_ind), len(val_ind))

    source_real = os.path.realpath(input_path)
    for data_type, records in split_data_dict.items():
        out_path = '{}-{}.jsonl'.format(data_type, size_type)
        if os.path.realpath(out_path) == source_real:
            warnings.warn(
                "split_train_val is overwriting its input file {!r} with the "
                "{!r} partition; rerunning will split the reduced file "
                "again".format(input_path, data_type),
                stacklevel=2,
            )
        with open(out_path, 'w', encoding='utf-8') as f_out:
            for line in records:
                f_out.write(json.dumps(line)+'\n')

    return train, val, indices

In [6]:
# Run the movie-level split. `indices` maps 'train'/'val' to the positions of
# the selected records in the input file and is reused by
# create_embedding_split to pick the matching embedding groups.
train, val, indices = split_train_val(train_size, size_type)

1653 292
183413 29510


In [6]:
def create_embedding_split(answer_type, size_type, indices):
    """Copy selected embedding groups into per-split HDF5 files.

    Reads '../../data/bert_da_<answer_type>_train_orig.h5'. For each split, the
    groups keyed by ``str(ind)`` for ``ind`` in ``indices[split]`` are loaded,
    with every dataset converted to a float16 array. The groups are then written
    to '../../data/bert_da_<answer_type>_<split>_<size_type>.h5' under the keys
    '0', '1', ... in the order given by ``indices[split]``.

    All selected groups are held in memory before anything is written. When
    ``size_type`` is 'orig', the 'train' output path is the same as the source
    path, so the source file is replaced.

    Parameters
    ----------
    answer_type : str
        Inserted into the source and output file names.
    size_type : str
        Suffix inserted into the output file names.
    indices : dict
        Maps 'train' and 'val' to iterables of group indices in the source file.
    """
    data_folder = '../../data/'
    split_names = ['train', 'val']
    group_items = {'train': [], 'val': []}

    # Phase 1: read every requested group from the source file.
    source_path = '{}bert_da_{}_train_orig.h5'.format(data_folder, answer_type)
    with h5py.File(source_path, 'r') as f:
        for data_type in split_names:
            print(data_type)
            loaded = group_items[data_type]
            for ind in indices[data_type]:
                source_group = f[str(ind)]
                loaded.append({
                    k: np.array(v, dtype=np.float16)
                    for k, v in source_group.items()
                })

    # Phase 2: write each split to its own file with consecutive group keys.
    for data_type in split_names:
        print(data_type)
        target_path = '{}bert_da_{}_{}_{}.h5'.format(data_folder, answer_type, data_type, size_type)
        with h5py.File(target_path, 'w') as f:
            for new_ind, arrays in enumerate(group_items[data_type]):
                group = f.create_group(str(new_ind))
                for k, v in arrays.items():
                    group.create_dataset(k, data=v)

In [8]:
# Build the train/val embedding files for every answer type, reusing the
# record indices produced by split_train_val. The empty print() separates the
# progress output of consecutive answer types.
for answer_type in answer_types:
    print(answer_type)
    create_embedding_split(answer_type, size_type, indices)
    print()

answer
train
val
train
val
rationale
train
val
train
val


In [None]:
# Globals for the spot-check cells that follow. data_folder repeats the value
# that is local to create_embedding_split, and answer_type is reset to 'answer'
# after the loop above left it at the last element of answer_types.
data_folder = '../../data/'
answer_type = 'answer'

In [None]:
# Load group '0' from each per-split file, converting every dataset to float16.
# NOTE(review): these file names end in '2.h5', but create_embedding_split
# writes names ending in '<size_type>.h5' - confirm where the '...2.h5' files
# come from before running this cell on a fresh kernel.
group_items1 = {'train': [], 'val': []}
for data_type in ['train', 'val']:
    with h5py.File('{}bert_da_{}_{}_{}2.h5'.format(data_folder, answer_type, data_type, size_type), 'r') as f:
        group_items1[data_type].append({k: np.array(v, dtype=np.float16) \
                                               for k, v in f[str(0)].items()})

In [None]:
# From the source embedding file, load the group keyed by the first index of
# each split, i.e. the group that create_embedding_split stores under key '0'
# of that split. Datasets are converted to float16.
group_items2 = {'train': [], 'val': []}
with h5py.File('{}bert_da_{}_train_orig.h5'.format(data_folder, answer_type), 'r') as f:
    for data_type in ['train', 'val']:
        group_items2[data_type].append({k: np.array(v, dtype=np.float16) \
                                       for k, v in f[str(indices[data_type][0])].items()})

In [None]:
# Spot check: for each split, every dataset in the first re-indexed group must
# equal the dataset with the same key in the corresponding source group.
# Only keys present in group_items1 are compared.
for data_type in ['train', 'val']:
    for k, v in group_items1[data_type][0].items():
        assert np.all(v == group_items2[data_type][0][k])