Data source: Spanish–English sentence pairs (`spa-eng`) downloaded from http://www.manythings.org/anki/

In [1]:
import os, spacy, pickle
import numpy as np
from time import time
from torchtext import data
from torchtext import datasets

# Split data

In [43]:
# Split the raw sentence-pair file into a train file and a test file.
# Every input line ends up in exactly one of the two output files.
path = '/home/rhermoza/data/translation/spa-eng/spa.txt'
out_train = '/home/rhermoza/data/translation/spa-eng/spa_train.txt'
out_test = '/home/rhermoza/data/translation/spa-eng/spa_test.txt'
test = 0.1   # fraction of the lines held out for the test file
seed = 42    # fixed seed so that re-running the cell reproduces the same split

# Explicit UTF-8: the file holds accented Spanish text and must not depend on
# the locale's default encoding.
with open(path, encoding='utf-8') as f:
    # A final line without a trailing newline would otherwise be glued to the
    # following record once the lines are re-ordered below.
    lines = [ln if ln.endswith('\n') else ln + '\n' for ln in f]

total_lines = len(lines)
test_lines = int(total_lines * test)
train_lines = total_lines - test_lines

# Draw the held-out indices without replacement from a seeded generator.
# Slicing a permutation also works when the file is empty.
rng = np.random.RandomState(seed)
idxs_test = rng.permutation(total_lines)[:test_lines]

# Membership tests against a set are O(1); testing against the NumPy array
# rescans the whole array for every line.
held_out = set(idxs_test.tolist())
idxs_train = [i for i in range(total_lines) if i not in held_out]
assert len(idxs_train) == train_lines, 'train/test split sizes do not add up'

with open(out_train, 'w', encoding='utf-8') as f:
    f.writelines(lines[i] for i in idxs_train)

with open(out_test, 'w', encoding='utf-8') as f:
    f.writelines(lines[i] for i in idxs_test)

# Last expression of the cell, so the split sizes are actually displayed.
train_lines, test_lines

In [44]:
# List the data directory to confirm the train/test files exist and check their sizes.
!ls -lh /home/rhermoza/data/translation/spa-eng/

total 15M
-rw-rw-r-- 1 rhermoza rhermoza 763K Oct 16 20:24 spa_test.txt
-rw-rw-r-- 1 rhermoza rhermoza 6.7M Oct 16 20:24 spa_train.txt
-rw-r--r-- 1 rhermoza rhermoza 7.4M Sep 29 00:01 spa.txt


# Build dataset

In [51]:
# `path` is passed to TabularDataset.splits together with the '_train.txt' /
# '_test.txt' suffixes; `out_path` is where the pickled examples are written.
path, out_path = '/home/rhermoza/data/translation/spa-eng/spa', 'data/dataset.pkl'

# One spaCy pipeline per language, loaded in the order Spanish, English.
# Only their `.tokenizer` is used by the tokenize_* helpers.
spacy_es, spacy_en = map(spacy.load, ('es', 'en'))

def tokenize_es(text):
    """Return the token strings produced by the `spacy_es` tokenizer for `text`."""
    tokens = []
    for token in spacy_es.tokenizer(text):
        tokens.append(token.text)
    return tokens

def tokenize_en(text):
    """Return the token strings produced by the `spacy_en` tokenizer for `text`."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Both fields lower-case their text. EN additionally wraps every sentence in
# the '<sos>' / '<eos>' markers; ES gets no boundary tokens.
ES = data.Field(tokenize=tokenize_es, lower=True)
EN = data.Field(tokenize=tokenize_en, lower=True,
                init_token='<sos>', eos_token='<eos>')

t0 = time()
print('Building dataset...')
# Column order of the TSV files: English first, Spanish second.
# NOTE(review): `path` has no trailing separator, so this relies on the
# installed torchtext building the file names as path + suffix
# ('.../spa' + '_train.txt') -- confirm before upgrading torchtext.
ds_train, ds_test = data.TabularDataset.splits(path, train='_train.txt', test='_test.txt',
                                               format='tsv' , fields=[('en', EN), ('es', ES)])
# Vocabularies are built from the training split only.
ES.build_vocab(ds_train.es)
EN.build_vocab(ds_train.en)
print('Elapsed: %.2fs\n' % (time() - t0))
print(f'train size: {len(ds_train)}\ntest size: {len(ds_test)}')
print('examples:')
print(vars(ds_train[2000]))
print(vars(ds_train[5000]))
print('\nMost common on ES vocabulary:')
print(ES.vocab.freqs.most_common(10))
print('\nMost common on EN vocabulary:')
print(EN.vocab.freqs.most_common(10))

# Only the tokenized example lists are persisted; the Field objects and their
# vocabularies are not part of the pickle.
out = {'train': ds_train.examples, 'test': ds_test.examples}

# Create the output directory on a fresh checkout; without this, open() fails
# when 'data/' does not exist yet.
out_dir = os.path.dirname(out_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

with open(out_path, 'wb') as f:
    pickle.dump(out, f)

print('\nSaved on: ' + out_path)



    Only loading the 'es' tokenizer.



    Only loading the 'en' tokenizer.

Building dataset...
Elapsed: 9.43s

train size: 102620
test size: 11402
examples:
{'en': ['go', 'on', 'inside', '.'], 'es': ['ve', 'adentro', '.']}
{'en': ['it', "'s", 'not', 'funny', '.'], 'es': ['no', 'es', 'divertido', '.']}

Most common on ES vocabulary:
[('.', 88950), ('de', 19994), ('que', 18297), ('a', 17861), ('no', 17282), ('tom', 16907), ('la', 15808), ('el', 13731), ('?', 13438), ('¿', 13428)]

Most common on EN vocabulary:
[('.', 89390), ('i', 29382), ('the', 25015), ('to', 22940), ('you', 20325), ('tom', 17797), ('a', 15674), ('is', 13497), ('?', 13422), ("n't", 11755)]

Saved on: data/dataset.pkl


In [3]:
# List the output directory to check that the pickled dataset exists and see its size.
!ls -lh data

total 27M
-rw-rw-r-- 1 rhermoza rhermoza 27M Oct 16 14:55 dataset.pkl


# Load dataset

In [79]:
from iseq2seq.utils import load_anki_dataset

# Reload the pickled dataset through the project helper and print the same
# summary as the build step, so the two outputs can be compared by eye.
out_path = 'data/dataset.pkl'
ds_train, ds_test, ES, EN = load_anki_dataset(out_path)

print(f'train size: {len(ds_train)}\ntest size: {len(ds_test)}')

print('examples:')
for sample_idx in (2000, 5000):
    print(vars(ds_train[sample_idx]))

# Ten most frequent tokens of each vocabulary, Spanish first.
for header, field in (('\nMost common on ES vocabulary:', ES),
                      ('\nMost common on EN vocabulary:', EN)):
    print(header)
    print(field.vocab.freqs.most_common(10))



    Only loading the 'es' tokenizer.



    Only loading the 'en' tokenizer.

train size: 102620
test size: 11402
examples:
{'en': ['go', 'on', 'inside', '.'], 'es': ['ve', 'adentro', '.']}
{'en': ['it', "'s", 'not', 'funny', '.'], 'es': ['no', 'es', 'divertido', '.']}

Most common on ES vocabulary:
[('.', 88950), ('de', 19994), ('que', 18297), ('a', 17861), ('no', 17282), ('tom', 16907), ('la', 15808), ('el', 13731), ('?', 13438), ('¿', 13428)]

Most common on EN vocabulary:
[('.', 89390), ('i', 29382), ('the', 25015), ('to', 22940), ('you', 20325), ('tom', 17797), ('a', 15674), ('is', 13497), ('?', 13422), ("n't", 11755)]


# Testing iterator

In [140]:
def _length_pair_key(example):
    """Sort key that combines the Spanish and English lengths of `example`."""
    # presumably used by BucketIterator to batch sentences of similar length -- confirm
    return data.interleave_keys(len(example.es), len(example.en))


# Batches of 32 for training and 500 for testing, sharing the same sort key.
train_buckets = data.BucketIterator(dataset=ds_train, batch_size=32,
                                    sort_key=_length_pair_key)
test_buckets = data.BucketIterator(dataset=ds_test, batch_size=500,
                                   sort_key=_length_pair_key)

train_iter, test_iter = iter(train_buckets), iter(test_buckets)

# Pull one batch from each iterator (train first, then test).
x_train = next(train_iter)
x_test = next(test_iter)

# Displayed value: the size of each language tensor in both batches.
tuple(t.size() for t in (x_train.es, x_train.en, x_test.es, x_test.en))

(torch.Size([7, 32]),
 torch.Size([11, 32]),
 torch.Size([7, 500]),
 torch.Size([8, 500]))

In [141]:
# Invert the token -> index tables so that indices can be mapped back to tokens.
es_dict = dict((index, token) for token, index in ES.vocab.stoi.items())
en_dict = dict((index, token) for token, index in EN.vocab.stoi.items())

# Batch tensors as NumPy arrays; each column `[:, i]` is one sentence.
x_es = x_train.es.data.cpu().numpy()
x_en = x_train.en.data.cpu().numpy()

# For the first ten sentence pairs, show the raw indices and then the decoded
# tokens, Spanish on the left and English on the right.
for i in range(10):
    es_ids, en_ids = x_es[:, i], x_en[:, i]
    print(es_ids, en_ids, sep=' | ')
    es_text = ' '.join(es_dict[w] for w in es_ids)
    en_text = ' '.join(en_dict[w] for w in en_ids)
    print(es_text, en_text, sep=' | ')

[ 99   4  71   5   8 253   2] | [  2   8  25   7  49   7   6 266   4   3   1]
tienes que ir a la fiesta . | <sos> you have to go to the party . <eos> <pad>
[7531   23  245  149    8 2488    2] | [   2    5 2012   23  280   36    6 2677    4    3    1]
derramé mi café sobre la alfombra . | <sos> i spilled my coffee on the carpet . <eos> <pad>
[   5  155    6   15 1053  732    2] | [  2  20  54  13 719 462   7  24   4   3   1]
a mí no me suena estúpido . | <sos> that does n't sound stupid to me . <eos> <pad>
[   7   91    4   16 1040  354    2] | [  2   9 127  20  14  21 728 273   4   3   1]
tom dijo que se sentía cansado . | <sos> tom said that he was feeling tired . <eos> <pad>
[  99    4 1465   38  998 1586    2] | [   2    8   25    7  867   35  357 3162    4    3    1]
tienes que elegir tu propio destino . | <sos> you have to choose your own destiny . <eos> <pad>
[    7    13   131   732    32 15434     2] | [   2    9   11  104  462    7   39 1218    4    3    1]
tom es demasiado e