Data source: tab-separated English–Spanish sentence pairs (`spa-eng/spa.txt`) from http://www.manythings.org/anki/

In [1]:
import os, spacy, pickle
import numpy as np
from time import time
from torchtext import data
from torchtext import datasets

# Split data

In [2]:
# Source corpus and destinations of the train/test split.
path = '/home/rhermoza/data/translation/spa-eng/spa.txt'
out_train = '/home/rhermoza/data/translation/spa-eng/spa_train.txt'
out_test = '/home/rhermoza/data/translation/spa-eng/spa_test.txt'
test = 0.1      # fraction of lines held out for the test split
split_seed = 0  # fixed seed so re-running the cell reproduces the same split

# Read explicitly as UTF-8: the corpus contains accented characters and the
# result must not depend on the platform's default encoding.
with open(path, encoding='utf-8') as f:
    lines = f.readlines()

# A final line without a trailing newline would be glued to the following
# record once lines are written in a different order, so normalise them all.
lines = [line if line.endswith('\n') else line + '\n' for line in lines]

total_lines = len(lines)
test_lines = int(total_lines * test)
train_lines = total_lines - test_lines
# Report the split sizes (a bare expression in the middle of a cell shows nothing).
print(f'train lines: {train_lines}, test lines: {test_lines}')

# Draw the test indices without replacement from a seeded generator.
idxs_test = np.random.RandomState(split_seed).choice(total_lines, test_lines, replace=False)
# Membership tests against a set are O(1); testing against the NumPy array
# directly scans it for every line, which is quadratic overall.
test_set = set(idxs_test.tolist())
idxs_train = [i for i in range(total_lines) if i not in test_set]

# Training lines keep their original file order; test lines follow the
# (random) order in which their indices were sampled.
with open(out_train, 'w', encoding='utf-8') as f:
    f.writelines(lines[i] for i in idxs_train)

with open(out_test, 'w', encoding='utf-8') as f:
    f.writelines(lines[i] for i in idxs_test)

In [3]:
!ls -lh /home/rhermoza/data/translation/spa-eng/

total 15M
-rw-rw-r-- 1 rhermoza rhermoza 752K Oct 28 21:23 spa_test.txt
-rw-rw-r-- 1 rhermoza rhermoza 6.7M Oct 28 21:23 spa_train.txt
-rw-r--r-- 1 rhermoza rhermoza 7.4M Sep 29 00:01 spa.txt


# Build dataset

In [5]:
# Directory containing the spa_train.txt / spa_test.txt files read below.
path = '/home/rhermoza/data/translation/spa-eng'
# Destination file for the pickled dataset examples.
out_path = 'data/dataset.pkl'
# spaCy pipelines whose tokenizers back tokenize_es / tokenize_en.
spacy_es = spacy.load('es')
spacy_en = spacy.load('en')

def tokenize_es(text):
    """Return the list of token strings that the Spanish spaCy tokenizer yields for `text`."""
    tokens = []
    for token in spacy_es.tokenizer(text):
        tokens.append(token.text)
    return tokens

def tokenize_en(text):
    """Return the list of token strings that the English spaCy tokenizer yields for `text`."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Spanish field: lower-cased tokens, no start/end markers.
ES = data.Field(
    tokenize=tokenize_es,
    lower=True,
)
# English field: lower-cased tokens wrapped in <sos> ... <eos> markers.
EN = data.Field(
    tokenize=tokenize_en,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

t0 = time()
print('Building dataset...')
# Each TSV row is parsed as (English, Spanish): the first column goes through
# the EN field and the second through the ES field.
ds_train, ds_test = data.TabularDataset.splits(
    path, train='spa_train.txt', test='spa_test.txt',
    format='tsv', fields=[('en', EN), ('es', ES)])
# Vocabularies are built from the training split only.
ES.build_vocab(ds_train.es)
EN.build_vocab(ds_train.en)
print('Elapsed: %.2fs\n' % (time() - t0))
print(f'train size: {len(ds_train)}\ntest size: {len(ds_test)}')
print('examples:')
print(vars(ds_train[2000]))
print(vars(ds_train[5000]))
print('\nMost common on ES vocabulary:')
print(ES.vocab.freqs.most_common(10))
print('\nMost common on EN vocabulary:')
print(EN.vocab.freqs.most_common(10))

# Only the parsed examples are persisted; the fields and vocabularies are not.
# NOTE(review): presumably the loader rebuilds them on load - confirm.
out = {'train': ds_train.examples, 'test': ds_test.examples}

# Create the output directory first: opening a file for writing inside a
# directory that does not exist raises FileNotFoundError.
out_dir = os.path.dirname(out_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

with open(out_path, 'wb') as f:
    pickle.dump(out, f)

print('\nSaved on: ' + out_path)



    Only loading the 'es' tokenizer.



    Only loading the 'en' tokenizer.

Building dataset...
Elapsed: 9.39s

train size: 102620
test size: 11402
examples:
{'en': ['he', 'loved', 'her', '.'], 'es': ['él', 'la', 'quería', '.']}
{'en': ['leave', 'us', 'alone', '.'], 'es': ['déjanos', 'solas', '.']}

Most common on ES vocabulary:
[('.', 88897), ('de', 20109), ('que', 18443), ('a', 17875), ('no', 17274), ('tom', 16932), ('la', 15942), ('el', 13728), ('?', 13482), ('¿', 13474)]

Most common on EN vocabulary:
[('.', 89361), ('i', 29478), ('the', 25108), ('to', 23010), ('you', 20509), ('tom', 17817), ('a', 15757), ('is', 13465), ('?', 13464), ("n't", 11828)]

Saved on: data/dataset.pkl


In [6]:
!ls -lh data

total 27M
-rw-rw-r-- 1 rhermoza rhermoza 27M Oct 28 21:24 dataset.pkl


# Load Dataset

In [1]:
from iseq2seq.utils import load_anki_dataset

# Reload the pickled dataset and print the same summary as the build step.
out_path = 'data/dataset.pkl'
ds_train, ds_test, ES, EN = load_anki_dataset(out_path)
print(f'train size: {len(ds_train)}\ntest size: {len(ds_test)}')

# Two fixed sample rows from the training split.
print('examples:')
for sample_idx in (2000, 5000):
    print(vars(ds_train[sample_idx]))

# Ten most frequent tokens of each vocabulary.
for heading, field in (('\nMost common on ES vocabulary:', ES),
                       ('\nMost common on EN vocabulary:', EN)):
    print(heading)
    print(field.vocab.freqs.most_common(10))



    Only loading the 'es' tokenizer.



    Only loading the 'en' tokenizer.

train size: 102620
test size: 11402
examples:
{'en': ['he', 'loved', 'her', '.'], 'es': ['él', 'la', 'quería', '.']}
{'en': ['leave', 'us', 'alone', '.'], 'es': ['déjanos', 'solas', '.']}

Most common on ES vocabulary:
[('.', 88897), ('de', 20109), ('que', 18443), ('a', 17875), ('no', 17274), ('tom', 16932), ('la', 15942), ('el', 13728), ('?', 13482), ('¿', 13474)]

Most common on EN vocabulary:
[('.', 89361), ('i', 29478), ('the', 25108), ('to', 23010), ('you', 20509), ('tom', 17817), ('a', 15757), ('is', 13465), ('?', 13464), ("n't", 11828)]


# Testing iterator

In [4]:
from torchtext import data

def _length_key(example):
    """Sort key built by data.interleave_keys from the Spanish and English token counts."""
    return data.interleave_keys(len(example.es), len(example.en))


# Bucketed batch iterators: batches of 32 for training, 500 for testing.
train_buckets = data.BucketIterator(ds_train, batch_size=32, sort_key=_length_key)
test_buckets = data.BucketIterator(ds_test, batch_size=500, sort_key=_length_key)

# Pull a single batch from each iterator to inspect its shape.
train_iter, test_iter = iter(train_buckets), iter(test_buckets)
x_train, x_test = next(train_iter), next(test_iter)
(
    x_train.es.size(),
    x_train.en.size(),
    x_test.es.size(),
    x_test.en.size(),
)

(torch.Size([7, 32]),
 torch.Size([11, 32]),
 torch.Size([7, 500]),
 torch.Size([13, 500]))

In [5]:
# Invert the token -> index vocabularies so indices can be decoded to tokens.
es_dict = dict((idx, tok) for tok, idx in ES.vocab.stoi.items())
en_dict = dict((idx, tok) for tok, idx in EN.vocab.stoi.items())
# Bring the batch to host memory as NumPy arrays; each column is one sentence.
x_es = x_train.es.data.cpu().numpy()
x_en = x_train.en.data.cpu().numpy()

# Show the first 10 sentence pairs, first as indices and then as tokens.
for i in range(10):
    es_ids, en_ids = x_es[:, i], x_en[:, i]
    print(es_ids, end=' | ')
    print(en_ids)
    es_words = ' '.join(es_dict[w] for w in es_ids)
    print(es_words, end=' | ')
    en_words = ' '.join(en_dict[w] for w in en_ids)
    print(en_words)

[ 369    9  237    3   42 1584    2] | [   2    5   45    6  230   19   26 1346    4    3    1]
conozco el nombre de este animal . | <sos> i know the name of this animal . <eos> <pad>
[  38 1717  811   73   32  153    2] | [   2   35 1700   52  339 1796    7   24    4    3    1]
tu amistad vale mucho para mí . | <sos> your friendship has great value to me . <eos> <pad>
[ 39 189   3 434 188  71   2] | [  2   5 101 190   7  81   8 181   4   3   1]
estoy feliz de verte otra vez . | <sos> i am happy to see you again . <eos> <pad>
[   7   24 7614  421 8246  278    2] | [   2    9   11 3387   10  313  835  654    4    3    1]
tom está enfrentando unos serios problemas . | <sos> tom is facing a few serious problems . <eos> <pad>
[  91    4   30 4760   12   47    2] | [  2   5  66   8  61 245  64  20   4   3   1]
creo que te equivocas en eso . | <sos> i think you 're wrong about that . <eos> <pad>
[  18   16 5991   36  160  431    2] | [   2   14   11 2480    7    6  134  357    4    3    1]
é