In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy

import random
import math
import time

In [5]:
# One seed shared by every random number generator used in this notebook.
SEED = 1234

# Request deterministic cuDNN behaviour, then seed PyTorch and Python's RNG.
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)
random.seed(SEED)

# spaCy pipelines whose tokenizers back tokenize_de / tokenize_en below.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    """
    Split a German string into tokens with the spaCy German tokenizer and
    return them as a list of strings in reversed order (last token first).
    """
    tokens = [token.text for token in spacy_de.tokenizer(text)]
    # Flip the sequence in place so the final token of the sentence comes first.
    tokens.reverse()
    return tokens

def tokenize_en(text):
    """
    Split an English string into tokens with the spaCy English tokenizer and
    return them as a list of strings, in their original order.
    """
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Source (German) and target (English) fields are configured identically apart
# from the tokenizer: lower-casing is enabled and '<sos>' / '<eos>' are set as
# the init and end-of-sequence tokens.
field_options = dict(init_token = '<sos>',
                     eos_token = '<eos>',
                     lower = True)

SRC = Field(tokenize = tokenize_de, **field_options)
TRG = Field(tokenize = tokenize_en, **field_options)

# Load the Multi30k splits, pairing the '.de' files with SRC and '.en' with TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts = ('.de', '.en'),
    fields = (SRC, TRG),
)

# Report how many examples each split contains.
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:03<00:00, 442kB/s] 


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 92.4kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 80.1kB/s]


Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [14]:
# Build both vocabularies from the training split only, with a minimum token
# frequency of 2.
for field in (SRC, TRG):
    field.build_vocab(train_data, min_freq = 2)

# Report the resulting vocabulary sizes.
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 7855
Unique tokens in target (en) vocabulary: 5893


In [15]:
BATCH_SIZE = 128

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# One iterator per split, all sharing the same batch size and device.
splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    splits,
    batch_size = BATCH_SIZE,
    device = device)

In [17]:
# Show the iterator object's repr as a quick check that it was created.
train_iterator

<torchtext.data.iterator.BucketIterator at 0x113349908>

In [27]:
# Walk through every training batch and print the size of its source and
# target tensors. The loop variables stay bound after the loop finishes, so
# `batch`, `src` and `trg` then refer to the last batch that was yielded.
for i, batch in enumerate(train_iterator):
    src, trg = batch.src, batch.trg
    print(f"src size: :{src.size()}")
    print(f"trg size: :{trg.size()}")

src size: :torch.Size([38, 128])
trg size: :torch.Size([36, 128])
src size: :torch.Size([27, 128])
trg size: :torch.Size([34, 128])
src size: :torch.Size([27, 128])
trg size: :torch.Size([34, 128])
src size: :torch.Size([46, 128])
trg size: :torch.Size([37, 128])
src size: :torch.Size([25, 128])
trg size: :torch.Size([28, 128])
src size: :torch.Size([23, 128])
trg size: :torch.Size([23, 128])
src size: :torch.Size([30, 128])
trg size: :torch.Size([34, 128])
src size: :torch.Size([33, 128])
trg size: :torch.Size([32, 128])
src size: :torch.Size([27, 128])
trg size: :torch.Size([27, 128])
src size: :torch.Size([37, 128])
trg size: :torch.Size([40, 128])
src size: :torch.Size([31, 128])
trg size: :torch.Size([31, 128])
src size: :torch.Size([29, 128])
trg size: :torch.Size([24, 128])
src size: :torch.Size([34, 128])
trg size: :torch.Size([40, 128])
src size: :torch.Size([28, 128])
trg size: :torch.Size([28, 128])
src size: :torch.Size([29, 128])
trg size: :torch.Size([31, 128])
src size: 

In [22]:
# Size of whatever `trg` currently holds in the kernel.
# NOTE(review): in the visible cells `trg` is only assigned inside the batch
# loop, so this presumably relies on that loop having been run first — confirm.
trg.size()

torch.Size([30, 128])

In [25]:
# Display the repr of whatever `batch` currently holds in the kernel.
# NOTE(review): in the visible cells `batch` is only bound as the batch-loop
# variable, so this presumably shows the last batch that loop produced — confirm.
batch


[torchtext.data.batch.Batch of size 128 from MULTI30K]
	[.src]:[torch.LongTensor of size 28x128]
	[.trg]:[torch.LongTensor of size 30x128]