http://www.manythings.org/anki/

In [1]:
import os, spacy, pickle
import numpy as np
from time import time
from torchtext import data
from torchtext import datasets

import matplotlib.pyplot as plt
%matplotlib inline

# Split data

In [2]:
path = '/home/rhermoza/data/translation/spa-eng/spa.txt'
out_train = '/home/rhermoza/data/translation/spa-eng/spa_train.txt'
out_test = '/home/rhermoza/data/translation/spa-eng/spa_test.txt'
out_validation = '/home/rhermoza/data/translation/spa-eng/spa_validation.txt'
test = 0.05
validation = 0.05
max_length = 200

with open(path) as f:
    lines = f.readlines()

# filter lines
lines = [l for l in lines if len(l) <= max_length]

# Divide datasets
total_lines = len(lines)
test_lines = int(total_lines * test)
validation_lines = int(total_lines * validation)
train_lines = total_lines - test_lines - validation_lines
print(train_lines, test_lines, validation_lines)

tidxs = np.random.choice(total_lines, test_lines+validation_lines, replace=False)
idxs_train = [i for i in range(total_lines) if i not in tidxs]
idxs_test = tidxs[:test_lines]
idxs_validation = tidxs[test_lines:]

102417 5689 5689


In [3]:
with open(out_train, 'w') as f:
    for i in idxs_train:
        f.writelines(lines[i])
        
with open(out_test, 'w') as f:
    for i in idxs_test:
        f.writelines(lines[i])
        
with open(out_validation, 'w') as f:
    for i in idxs_validation:
        f.writelines(lines[i])

In [4]:
!ls -lh /home/rhermoza/data/translation/spa-eng/

total 15M
-rw-rw-r-- 1 rhermoza rhermoza 376K Nov  1 18:47 spa_test.txt
-rw-rw-r-- 1 rhermoza rhermoza 6.6M Nov  1 18:47 spa_train.txt
-rw-r--r-- 1 rhermoza rhermoza 7.4M Sep 29 00:01 spa.txt
-rw-rw-r-- 1 rhermoza rhermoza 378K Nov  1 18:47 spa_validation.txt


# Build dataset

In [5]:
path = '/home/rhermoza/data/translation/spa-eng'
out_path = 'data/dataset.pkl'
spacy_es = spacy.load('es')
spacy_en = spacy.load('en')

def tokenize_es(text):
    return [tok.text for tok in spacy_es.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

ES = data.Field(tokenize=tokenize_es, lower=True)
EN = data.Field(tokenize=tokenize_en, lower=True,
                init_token='<sos>', eos_token='<eos>')

t0 = time()
print('Building dataset...')
ds_train, ds_test, ds_val = data.TabularDataset.splits(path,
                                               train='spa_train.txt',
                                               test='spa_test.txt',
                                               validation='spa_validation.txt',
                                               format='tsv' , fields=[('en', EN), ('es', ES)])

ES.build_vocab(ds_train.es, min_freq=3)
EN.build_vocab(ds_train.en, min_freq=3)
print('Elapsed: %.2fs\n' % (time() - t0))
print(f'train size: {len(ds_train)}')
print(f'test size: {len(ds_test)}')
print(f'validation size: {len(ds_val)}')
print('examples:')
print(vars(ds_train[2000]))
print(vars(ds_train[5000]))
print('\nMost common on ES vocabulary:')
print(ES.vocab.freqs.most_common(10))
print('\nMost common on EN vocabulary:')
print(EN.vocab.freqs.most_common(10))

out = {'train': ds_train.examples, 'test': ds_test.examples, 'val': ds_val.examples}

with open(out_path, 'wb') as f:
    pickle.dump(out, f)
    
print('\nSaved on: ' + out_path)



    Only loading the 'es' tokenizer.



    Only loading the 'en' tokenizer.

Building dataset...
Elapsed: 9.11s

train size: 102417
test size: 5689
validation size: 5689
examples:
{'en': ['have', 'another', '.'], 'es': ['tomá', 'otro', '.']}
{'en': ['leave', 'it', 'to', 'me', '.'], 'es': ['déjamelo', 'a', 'mí', '.']}

Most common on ES vocabulary:
[('.', 88715), ('de', 19732), ('que', 18177), ('a', 17807), ('no', 17178), ('tom', 16931), ('la', 15682), ('el', 13664), ('?', 13410), ('¿', 13402)]

Most common on EN vocabulary:
[('.', 89157), ('i', 29296), ('the', 24826), ('to', 22929), ('you', 20279), ('tom', 17811), ('a', 15686), ('is', 13489), ('?', 13396), ("n't", 11702)]

Saved on: data/dataset.pkl


In [6]:
!ls -lh data

total 27M
-rw-rw-r-- 1 rhermoza rhermoza 27M Nov  1 18:47 dataset.pkl


# Load Dataset

In [1]:
from iseq2seq.utils import load_anki_dataset

out_path = 'data/dataset.pkl'
ds_train, ds_test, ds_val, ES, EN = load_anki_dataset(out_path)
print(f'train size: {len(ds_train)}')
print(f'test size: {len(ds_test)}')
print(f'validation size: {len(ds_val)}')
print('examples:')
print(vars(ds_train[2000]))
print(vars(ds_train[5000]))
print('\nMost common on ES vocabulary:')
print(ES.vocab.freqs.most_common(10))
print('\nMost common on EN vocabulary:')
print(EN.vocab.freqs.most_common(10))



    Only loading the 'es' tokenizer.



    Only loading the 'en' tokenizer.

train size: 102417
test size: 5689
validation size: 5689
examples:
{'en': ['have', 'another', '.'], 'es': ['tomá', 'otro', '.']}
{'en': ['leave', 'it', 'to', 'me', '.'], 'es': ['déjamelo', 'a', 'mí', '.']}

Most common on ES vocabulary:
[('.', 88715), ('de', 19732), ('que', 18177), ('a', 17807), ('no', 17178), ('tom', 16931), ('la', 15682), ('el', 13664), ('?', 13410), ('¿', 13402)]

Most common on EN vocabulary:
[('.', 89157), ('i', 29296), ('the', 24826), ('to', 22929), ('you', 20279), ('tom', 17811), ('a', 15686), ('is', 13489), ('?', 13396), ("n't", 11702)]


# Testing iterator

In [2]:
from torchtext import data

train_buckets = data.BucketIterator(ds_train, 32,
    sort_key=lambda x: data.interleave_keys(len(x.es), len(x.en)))
test_buckets = data.BucketIterator(ds_test, 500,
    sort_key=lambda x: data.interleave_keys(len(x.es), len(x.en)))
train_iter = iter(train_buckets)
test_iter = iter(test_buckets)
x_train = next(train_iter)
x_test = next(test_iter)
x_train.es.size(), x_train.en.size(), x_test.es.size(), x_test.en.size()

(torch.Size([7, 32]),
 torch.Size([8, 32]),
 torch.Size([7, 500]),
 torch.Size([12, 500]))

In [3]:
es_dict = {v:k for k,v in ES.vocab.stoi.items()}
en_dict = {v:k for k,v in EN.vocab.stoi.items()}
x_es = x_train.es.data.cpu().numpy()
x_en = x_train.en.data.cpu().numpy()

for i in range(10):
    print(x_es[:, i], end=' | ')
    print(x_en[:, i])
    print(' '.join([es_dict[w] for w in x_es[:, i]]), end=' | ')
    print(' '.join([en_dict[w] for w in x_en[:, i]]))

[  11  162   13   22  232 1006   10] | [  2  30  18  35 368 275  12   3]
¿ cuál es su comida favorita ? | <sos> what 's your favorite food ? <eos>
[  11   90   54  767   33 2693   10] | [   2   91   47    5   70 1524   12    3]
¿ dónde puedo conseguir las entradas ? | <sos> where can i get tickets ? <eos>
[  80 2686   14  701    3 9476    2] | [   2   29  196   10 3930  920    4    3]
nos dimos un baño de lodo . | <sos> we took a mud bath . <eos>
[ 24 271   3   7 306   0   2] | [   2    9   18  312   75 3325    4    3]
los padres de tom estaban advertí . | <sos> tom 's parents were devastated . <eos>
[ 28 579   5   7  21 611   2] | [  2   5 228   9  10 534   4   3]
le di a tom una oportunidad . | <sos> i gave tom a chance . <eos>
[813  18   7 942   8 115   2] | [   2    9 1444  120    6  319    4    3]
finalmente , tom contó la verdad . | <sos> tom eventually told the truth . <eos>
[  11   23   38 1643   12   51   10] | [  2  11  35 703  43 115  12   3]
¿ está tu esposo en casa ? | <so