In [1]:
import spacy
from torchtext import data, datasets

In [2]:
# spaCy pipelines; only their `.tokenizer` attribute is used by the
# tokenize_* helpers defined further down.
de = spacy.load("de")  # used by tokenize_de (source side)
en = spacy.load("en")  # used by tokenize_en (target side)

In [4]:
def tokenize_de(text):
    """Split `text` into a list of token strings.

    Runs the tokenizer of the module-level `de` spaCy pipeline over `text`
    and collects the `.text` of every token it yields, in order.
    """
    pieces = []
    for token in de.tokenizer(text):
        pieces.append(token.text)
    return pieces

In [5]:
def tokenize_en(text):
    """Split `text` into a list of token strings.

    Runs the tokenizer of the module-level `en` spaCy pipeline over `text`
    and collects the `.text` of every token it yields, in order.
    """
    pieces = []
    for token in en.tokenizer(text):
        pieces.append(token.text)
    return pieces

In [6]:
# Special tokens handed to the torchtext fields below.
begin_word = "<s>"       # init_token of the target field
end_word = "</s>"        # eos_token of the target field
blank_word = "<blank>"   # pad_token of both the source and target fields

In [7]:
# Source-side field: tokenized with tokenize_de and padded with blank_word.
# No init/eos tokens are configured for it.
source = data.Field(tokenize=tokenize_de, pad_token=blank_word)

# Target-side field: tokenized with tokenize_en, wrapped in begin_word /
# end_word markers, and padded with the same blank_word as the source.
target = data.Field(
    tokenize=tokenize_en,
    init_token=begin_word,
    eos_token=end_word,
    pad_token=blank_word,
)

In [8]:
# Upper bound applied to both sides of an example by the filter below.
max_length = 100

# Build the IWSLT de->en train/val/test splits (the first run downloads the
# archive into .data/). An example is kept only when neither its `src` nor
# its `trg` entry is longer than max_length; `src` is checked first and
# `trg` is only inspected if `src` passed.
train, val, test = datasets.IWSLT.splits(
    exts=('.de', '.en'),
    fields=(source, target),
    filter_pred=lambda example: not (
        len(vars(example)['src']) > max_length
        or len(vars(example)['trg']) > max_length
    ),
)

downloading de-en.tgz


de-en.tgz: 100%|██████████| 24.2M/24.2M [00:14<00:00, 1.71MB/s]


.data/iwslt/de-en/IWSLT16.TED.dev2010.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.dev2010.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2010.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2010.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2011.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2011.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2012.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2012.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2013.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2013.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2014.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2014.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TEDX.dev2012.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TEDX.dev2012.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TEDX.tst2013.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TEDX.tst2013.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TEDX.tst2014.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TEDX.tst2014.de-en.en.xml
.data/iwslt/de-en/train.tags.de-en.de
.data/iwslt/de-en/train.tags.de-en.en


In [9]:
# Passed as `min_freq` to both vocabularies (per torchtext, tokens seen
# fewer times than this in the given data are left out of the vocab).
minimum_frequency = 2

# Vocabularies are built from the training split only: the German side
# from train.src, the English side from train.trg.
source.build_vocab(
    train.src,
    min_freq=minimum_frequency,
)
target.build_vocab(
    train.trg,
    min_freq=minimum_frequency,
)