In [1]:
# Source:
# https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

In [2]:

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy

import random
import math
import time

In [3]:
SEED = 1234

# Seed both PyTorch's and Python's random number generators with the same
# value so that repeated runs start from the same random state.
torch.manual_seed(SEED)
random.seed(SEED)

# Turn on cuDNN's deterministic flag. Background on reproducibility with CUDA:
# https://hyp.is/dOiOAAuzEeqcu2OxIZfR4w/pytorch.org/docs/stable/notes/randomness.html
torch.backends.cudnn.deterministic = True

In [4]:
# The spaCy language models must be installed once from the command line:
# conda activate pynlp_env
# cd /development/.../NLPStudy/data
# python -m spacy download en_core_web_sm
# python -m spacy download de_core_news_sm

# Then load the models by their full package names. The 'de' / 'en' shortcut
# links were removed in spaCy v3, whereas the full package names load on both
# spaCy v2 and v3.
spacyDE = spacy.load('de_core_news_sm')
spacyEN = spacy.load('en_core_web_sm')

In [None]:
# Create the tokenizer functions.
# NOTE: only the German tokens are reversed (this is done to make optimization
# easier); the English tokens keep their original order.

def tokenizeGerman(germanText: str):
    """
    Split German text into tokens with the spaCy German tokenizer and
    return the token strings in reverse order (last token first).

    :param germanText: the German text to tokenize
    :return: list of token strings, in reversed order
    """
    tokens = []
    for token in spacyDE.tokenizer(germanText):
        tokens.append(token.text)
    # Flip the sequence so the final token of the sentence comes first.
    tokens.reverse()
    return tokens

def tokenizeEnglish(englishText: str):
    """
    Split English text into tokens with the spaCy English tokenizer and
    return the token strings in their original order (no reversal).

    :param englishText: the English text to tokenize
    :return: list of token strings, in the order they appear in the text
    """
    tokens = []
    for token in spacyEN.tokenizer(englishText):
        tokens.append(token.text)
    return tokens