Dataset source: tab-delimited bilingual sentence pairs (English–Spanish) from http://www.manythings.org/anki/

In [1]:
import os, spacy, pickle
import numpy as np
from time import time
from torchtext import data
from torchtext import datasets

# Build dataset

In [2]:
# Location of the raw English/Spanish sentence-pair file. The original
# absolute path is kept as the default, but it can be overridden with the
# ANKI_SPA_PATH environment variable so the notebook runs on other machines
# without editing this cell.
path = os.environ.get('ANKI_SPA_PATH',
                      '/home/rhermoza/data/translation/spa-eng/spa.txt')
# Destination of the pickled list of examples written at the end of this cell.
out_path = 'data/dataset.pkl'
# spaCy models whose tokenizers back the tokenize_es / tokenize_en helpers.
spacy_es = spacy.load('es')
spacy_en = spacy.load('en')

def tokenize_es(text):
    """Tokenize a Spanish string with the spaCy tokenizer.

    Returns a list holding the text of every token, in order.
    """
    spanish_tokens = spacy_es.tokenizer(text)
    return [token.text for token in spanish_tokens]

def tokenize_en(text):
    """Tokenize an English string with the spaCy tokenizer.

    Returns a list holding the text of every token, in order.
    """
    english_tokens = spacy_en.tokenizer(text)
    return [token.text for token in english_tokens]

# Field definitions: both languages are lower-cased and tokenized with spaCy.
# Only the English field is wrapped with <sos>/<eos> markers (presumably
# because English is the decoding target -- confirm against the model code).
ES = data.Field(tokenize=tokenize_es, lower=True)
EN = data.Field(tokenize=tokenize_en, lower=True,
                init_token='<sos>', eos_token='<eos>')

t0 = time()
print('Building dataset...')
# The TSV columns are mapped positionally: first column -> 'en', second -> 'es'.
dataset = data.TabularDataset(path, format='tsv', fields=[('en', EN), ('es', ES)])
ES.build_vocab(dataset.es)
EN.build_vocab(dataset.en)
print('Elapsed: %.2fs\n' % (time() - t0))

# Quick sanity report: size, two sample pairs and the most frequent tokens.
print(f'dataset size: {len(dataset)}')
print('examples:')
print(vars(dataset[2000]))
print(vars(dataset[5000]))
print('\nMost common on ES vocabulary:')
print(ES.vocab.freqs.most_common(10))
print('\nMost common on EN vocabulary:')
print(EN.vocab.freqs.most_common(10))

# Make sure the output directory exists before writing; on a fresh checkout
# open() would otherwise raise FileNotFoundError after the expensive build.
out_dir = os.path.dirname(out_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Only the list of examples is pickled (not the Dataset object or the fields).
with open(out_path, 'wb') as f:
    pickle.dump(dataset.examples, f)

print('\nSaved on: ' + out_path)



    Only loading the 'es' tokenizer.



    Only loading the 'en' tokenizer.

Building dataset...
Elapsed: 10.17s

dataset size: 114022
examples:
{'en': ['we', 'like', 'him', '.'], 'es': ['él', 'nos', 'agrada', '.']}
{'en': ['his', 'head', 'ached', '.'], 'es': ['le', 'dolía', 'la', 'cabeza', '.']}

Most common on ES vocabulary:
[('.', 98820), ('de', 22274), ('que', 20415), ('a', 19901), ('no', 19194), ('tom', 18815), ('la', 17641), ('el', 15238), ('?', 14929), ('¿', 14920)]

Most common on EN vocabulary:
[('.', 99317), ('i', 32669), ('the', 27856), ('to', 25513), ('you', 22629), ('tom', 19805), ('a', 17490), ('is', 14982), ('?', 14910), ("n't", 13114)]

Saved on: data/dataset.pkl


In [3]:
# Shell escape: list the output directory to confirm the pickle was written
# and to see its size on disk.
!ls -lh data

total 27M
-rw-rw-r-- 1 rhermoza rhermoza 27M Oct 16 14:55 dataset.pkl


# Load dataset

In [4]:
from iseq2seq.utils import load_anki_dataset

# Reload the pickled examples and get back the dataset plus both fields.
out_path = 'data/dataset.pkl'
dataset, ES, EN = load_anki_dataset(out_path)

# Same sanity report as after building: size, two samples, top tokens.
print(f'dataset size: {len(dataset)}')
print('examples:')
for _sample_idx in (2000, 5000):
    print(vars(dataset[_sample_idx]))

for _tag, _field in (('ES', ES), ('EN', EN)):
    print(f'\nMost common on {_tag} vocabulary:')
    print(_field.vocab.freqs.most_common(10))



    Only loading the 'es' tokenizer.



    Only loading the 'en' tokenizer.

dataset size: 114022
examples:
{'en': ['we', 'like', 'him', '.'], 'es': ['él', 'nos', 'agrada', '.']}
{'en': ['his', 'head', 'ached', '.'], 'es': ['le', 'dolía', 'la', 'cabeza', '.']}

Most common on ES vocabulary:
[('.', 98820), ('de', 22274), ('que', 20415), ('a', 19901), ('no', 19194), ('tom', 18815), ('la', 17641), ('el', 15238), ('?', 14929), ('¿', 14920)]

Most common on EN vocabulary:
[('.', 99317), ('i', 32669), ('the', 27856), ('to', 25513), ('you', 22629), ('tom', 19805), ('a', 17490), ('is', 14982), ('?', 14910), ("n't", 13114)]


# Testing iterator

In [5]:
# Batches of 32 sentence pairs. The sort key combines the Spanish and English
# lengths of each example via interleave_keys.
train_buckets = data.BucketIterator(
    dataset,
    batch_size=32,
    sort_key=lambda example: data.interleave_keys(len(example.es),
                                                  len(example.en)),
)
train_iter = iter(train_buckets)
# Pull a single batch; it is inspected here and decoded in the next cell.
x = next(train_iter)
(x.es.size(), x.en.size())

(torch.Size([9, 32]), torch.Size([12, 32]))

In [6]:
# Reverse lookup tables: vocabulary index -> token string.
es_dict = {index: token for token, index in ES.vocab.stoi.items()}
en_dict = {index: token for token, index in EN.vocab.stoi.items()}
# Bring the batch tensors to host memory as NumPy arrays; each column holds
# one example (sequence position runs along the rows).
x_es = x.es.data.cpu().numpy()
x_en = x.en.data.cpu().numpy()

# Show the first 10 pairs of the batch: raw indices first, decoded text after.
for col in range(10):
    es_ids = x_es[:, col]
    en_ids = x_en[:, col]
    print(es_ids, end=' | ')
    print(en_ids)
    print(' '.join(es_dict[w] for w in es_ids), end=' | ')
    print(' '.join(en_dict[w] for w in en_ids))


[ 11  15  71 341   5 266   3  52  10] | [ 2 47  8 96 23 70 68 19 65 12  3  1]
¿ me puede ayudar a salir de aquí ? | <sos> can you help me get out of here ? <eos> <pad>
[    7    41     9  2180     3 13653    34  2681     2] | [   2    9   52    6 1478   19 3532   33 2653    4    3    1]
tom tiene el hábito de morderse las uñas . | <sos> tom has the habit of biting his nails . <eos> <pad>
[   9  260 4329   36  149   12    9 4599    2] | [   2    6  924 1213    6  149   36    6 3109    4    3    1]
el policía arrestó al hombre en el acto . | <sos> the policeman arrested the man on the spot . <eos> <pad>
[   7   41 2114   32 2688    4   24  685    2] | [   2    9   52  401 7001   20   14   18  248    4    3    1]
tom tiene dificultad para admitir que está equivocado . | <sos> tom has trouble admitting that he 's wrong . <eos> <pad>
[   9  167    4   18 1706    6   13 1772    2] | [  2   6 110  14  18 752  11  38  33   4   3   1]
el coche que él conduce no es suyo . | <sos> the car he 's d