In [None]:
%%markdown 

To get the tokenizers working, first install `spacy`, then download the English and German language models from the command line:

```bash
python -m spacy download en_core_web_sm 
python -m spacy download de_core_news_sm
```

In [9]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
from torchtext.utils import download_from_url, extract_archive
import io

# Multi30k (task 1) raw parallel corpus; every split is listed as a
# (German, English) pair of gzip archives relative to url_base.
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

# Download each archive, extract it, and keep the first extracted path.
# The resulting lists keep the (German, English) order of the URL tuples.
train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

# Load the spaCy pipelines by their full model names: the short 'de'/'en'
# shortcuts were removed in spaCy 3, and these are exactly the models that
# `python -m spacy download ...` installs in the setup instructions.
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

100%|██████████| 637k/637k [00:00<00:00, 5.98MB/s]
100%|██████████| 569k/569k [00:00<00:00, 3.36MB/s]
100%|██████████| 24.7k/24.7k [00:00<00:00, 3.47MB/s]
100%|██████████| 21.6k/21.6k [00:00<00:00, 9.43MB/s]
100%|██████████| 22.9k/22.9k [00:00<00:00, 3.77MB/s]
100%|██████████| 21.1k/21.1k [00:00<00:00, 7.34MB/s]


OSError: [E050] Can't find model 'de_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [23]:
from torchtext.vocab import vocab

def build_vocab(filepath, tokenizer):
  """Build a torchtext vocabulary from the tokens found in a text file.

  Args:
    filepath: Path of a UTF-8 encoded text file, read line by line.
    tokenizer: Callable that turns one line (a string) into an iterable
      of tokens.

  Returns:
    The object produced by ``torchtext.vocab.vocab`` from the token
    counts, with the special tokens '<unk>', '<pad>', '<bos>' and '<eos>'
    registered and the index of '<unk>' set as its default index.
  """
  special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']

  # Count every token of every line in a single pass over the file.
  with io.open(filepath, encoding="utf8") as corpus:
    token_counts = Counter(token for line in corpus for token in tokenizer(line))

  vocabulary = vocab(token_counts, specials=special_tokens)
  # Use the index of '<unk>' as the vocabulary's default index.
  unk_index = vocabulary['<unk>']
  vocabulary.set_default_index(unk_index)
  return vocabulary

# One vocabulary per language, built from the training split only:
# train_filepaths[0] is paired with de_tokenizer, train_filepaths[1] with en_tokenizer.
de_vocab, en_vocab = map(build_vocab, train_filepaths, (de_tokenizer, en_tokenizer))

In [26]:
def data_process(filepaths):
  """Convert a pair of parallel text files into pairs of index tensors.

  The two files are read in lockstep, one line from each per step; reading
  stops as soon as either file runs out of lines. Each line is tokenized
  with the module-level ``de_tokenizer`` / ``en_tokenizer`` and every token
  is looked up in the module-level ``de_vocab`` / ``en_vocab``.

  Args:
    filepaths: Sequence whose element 0 is the path of the German file and
      whose element 1 is the path of the English file, both UTF-8 encoded.

  Returns:
    A list of ``(de_tensor, en_tensor)`` tuples of 1-D ``torch.long``
    tensors, one tuple per pair of lines.
  """
  data = []
  # Open both files in a `with` block so the handles are closed once the
  # corpus has been read, even if tokenization or lookup raises.
  with io.open(filepaths[0], encoding="utf8") as de_file, \
       io.open(filepaths[1], encoding="utf8") as en_file:
    for raw_de, raw_en in zip(de_file, en_file):
      de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],
                                dtype=torch.long)
      en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
                                dtype=torch.long)
      data.append((de_tensor_, en_tensor_))
  return data

# Tensorize the train, validation and test splits (in that order) with data_process.
train_data, val_data, test_data = map(
  data_process, (train_filepaths, val_filepaths, test_filepaths)
)