In [1]:
import os
from os.path import join

import numpy as np

In [2]:
# Seed NumPy's global RNG so the shuffle below is reproducible.
# `np.random.seed` is a function and has to be *called*; assigning to it
# would only overwrite the function and leave the generator unseeded.
np.random.seed(42)

In [3]:
# Root directory that holds the `corpora/` folder read and written below.
# The default is the original cluster location; set the GNN_SS_DATA_DIR
# environment variable to run the notebook against a different checkout.
data_dir = os.environ.get(
    'GNN_SS_DATA_DIR',
    '/nfs/team/nlp/users/rgupta/NMT/code/GNN-Semantic-Similarity/data/',
)

In [4]:
# Language codes of the parallel corpus; they are used as the file suffix.
langs = [
    'en',
    'de',
]
# Vocabulary sizes for which a segmented copy of the corpus exists on disk.
vocab_sizes = [500, 10_000, 50_000]
# Marker character that is turned back into a space when rebuilding words.
underscore = '▁'

In [5]:
def _pieces_to_words(line, marker):
    """Turn one segmented corpus line into its space-separated 'words' form.

    Every `marker` is replaced by a space, and the single leading space that
    results from a line-initial marker is dropped. Lines that do not start
    with the marker (for example an empty line that is just a newline) are
    returned without losing their first character, so each line keeps its
    line terminator and the parallel files stay aligned when written out.

    NOTE(review): the spaces that already separate the pieces are kept, so
    sub-word pieces remain separate tokens and word boundaries come out as
    double spaces — confirm this is the intended 'words' format.
    """
    text = line.replace(marker, ' ')
    return text[1:] if text.startswith(' ') else text


# Load every segmented corpus into memory, keyed as '<vocab size>|<lang>',
# plus one derived 'words|<lang>' entry per language.
corpora = {}
for lang in langs:
    for v in vocab_sizes:
        # The files contain the non-ASCII marker character, so decode them
        # explicitly instead of relying on the locale's default encoding.
        with open(join(data_dir, f'corpora/iwslt14-en-de-{v}-full.{lang}'), encoding='utf-8') as f:
            corpora[f'{v}|{lang}'] = np.array(f.readlines())
    # Build the 'words' view from the last-listed vocabulary size. This is
    # named explicitly rather than read from the finished inner loop's `v`.
    corpora[f'words|{lang}'] = np.array(
        [_pieces_to_words(line, underscore) for line in corpora[f'{vocab_sizes[-1]}|{lang}']]
    )

In [6]:
# One summary row per corpus: key, number of lines, and the first line.
for name, lines in corpora.items():
    print(f'{name:<10}|{len(lines):<10}|{lines[0]:<10}')

500|en    |166837    |▁It ▁can ▁be ▁a ▁very ▁com p l ic at ed ▁thing , ▁the ▁o ce an .

10000|en  |166837    |▁It ▁can ▁be ▁a ▁very ▁complicated ▁thing , ▁the ▁ocean .

50000|en  |166837    |▁It ▁can ▁be ▁a ▁very ▁complicated ▁thing , ▁the ▁ocean .

words|en  |166837    |It  can  be  a  very  complicated  thing ,  the  ocean .

500|de    |166837    |▁Das ▁M e er ▁kann ▁z ie m lich ▁k om p l iz iert ▁sein .

10000|de  |166837    |▁Das ▁Meer ▁kann ▁ziemlich ▁kompliziert ▁sein .

50000|de  |166837    |▁Das ▁Meer ▁kann ▁ziemlich ▁kompliziert ▁sein .

words|de  |166837    |Das  Meer  kann  ziemlich  kompliziert  sein .



In [7]:
# Fractions of the corpus assigned to the train / dev / test portions.
train_prop, dev_prop, test_prop = 0.9, 0.05, 0.05
# Number of lines, measured on the first vocab size / first language corpus.
data_size = len(
    corpora[f'{vocab_sizes[0]}|{langs[0]}']
)

In [8]:
# Random ordering of the line indices 0 .. data_size-1, drawn from NumPy's
# global random state. The same ordering is later used to index every corpus,
# so a given position refers to the same line in all of them.
permute = np.random.permutation(data_size)

In [9]:
# Display the first ten shuffled indices as a quick visual check.
permute[:10]

array([ 30702,  82337, 160452,  53966,  96283, 135867,  34459, 150766,
        26612, 108407])

In [10]:
# Cut points inside the shuffled index array: [0, train_thresh) is train,
# [train_thresh, dev_thresh) is dev, and everything after that is test.
train_thresh = int(train_prop * data_size)
dev_thresh = int((train_prop + dev_prop) * data_size)
# np.split slices the array at the two cut points, giving the three portions.
train_ids, dev_ids, test_ids = np.split(permute, [train_thresh, dev_thresh])

In [11]:
# Sizes of the train, dev and test index sets, in that order.
tuple(map(len, (train_ids, dev_ids, test_ids)))

(150153, 8342, 8342)

In [13]:
# Write the train / dev / test portion of every corpus (each vocab size plus
# the derived 'words' corpus) for every language. The same index arrays are
# used for all of them, so line i of a split refers to the same sentence in
# every output file.
for v in vocab_sizes + ['words']:
    for lang in langs:
        for split_ids, split in zip([train_ids, dev_ids, test_ids], ['train', 'dev', 'test']):
            # Encode explicitly: the corpora contain non-ASCII characters.
            with open(join(data_dir, f'corpora/iwslt14-en-de-{v}-{split}.{lang}'), 'w', encoding='utf-8') as f:
                # writelines() adds no separators. A line without a trailing
                # newline (e.g. the last line of the source file) would fuse
                # with the following one after shuffling, so terminate every
                # line before writing.
                f.writelines(
                    line if line.endswith('\n') else line + '\n'
                    for line in corpora[f'{v}|{lang}'][split_ids]
                )