In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import time
from argparse import ArgumentParser
from pathlib import Path

import torch

from trext.datamodules import DeEnDataModule
#from trext.loggers import NeptuneLogger
from trext.models import (
    TransformerTranslator,
    TransformerEncoder,
    TransformerDecoder,
)
from trext.trainer import Trainer
from trext.utils import Editor, Vocabulary


args = dict(
    batch_size=64,
    decoder_dropout_p=0.5,
    decoder_hidden_dim=128,
    decoder_embedding_dim=128,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    encoder_dropout_p=0.5,
    encoder_hidden_dim=128,
    encoder_embedding_dim=128,
    max_epoch=10,
    verbose=True,
    version='0.1',
)

In [4]:
start_time = time.time()
print(f"Device is: {args['device']}")

print("Preparing datamodule...")
datamodule = DeEnDataModule(
    data_dir=Path('data/homework_machine_translation_de-en'),
    batch_size=args['batch_size'],
    num_workers=4,
)
datamodule.setup()
print(f"Datamodule is prepared ({time.time() - start_time} seconds)")

Device is: cuda
Preparing datamodule...
Datamodule is prepared (10.21592116355896 seconds)


In [None]:
encoder = TransformerEncoder(
    input_dim=len(datamodule.de_vocabulary),
    embedding_dim=args['encoder_embedding_dim'],
    encoder_hidden_dim=args['encoder_hidden_dim'],
    decoder_hidden_dim=args['decoder_hidden_dim'],
    dropout_p=args['encoder_dropout_p'],
)
decoder = TransformerDecoder(
    output_dim=len(datamodule.en_vocabulary),
    embedding_dim=args['decoder_embedding_dim'],
    encoder_hidden_dim=args['encoder_hidden_dim'],
    decoder_hidden_dim=args['decoder_hidden_dim'],
    dropout_p=args['decoder_dropout_p'],
    attention=attention,
)
translator = TransformerTranslator(
    encoder=encoder,
    decoder=decoder,
    learning_rate=3e-4,
    device=args['device'],
).to(args['device'])

In [None]:


trainer = Trainer(
    logger=None,
    max_epoch=args['max_epoch'],
    verbose=args['verbose'],
    version=args['version'],
)

print('Let\'s start training!')
trainer.fit(
    model=translator,
    datamodule=datamodule,
)

print('Predicts!')
predicts = trainer.predict(
    model=translator,
    datamodule=datamodule,
)

In [7]:
from torchtext.data import Field, BucketIterator

In [8]:
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings
    """
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]



SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)



In [13]:
from torchtext.datasets import Multi30k
import spacy

train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

ModuleNotFoundError: No module named 'spacy'

In [12]:
!python -m spacy download en
!python -m spacy download de

/home/s-kim/anaconda3/bin/python: No module named spacy
/home/s-kim/anaconda3/bin/python: No module named spacy


In [22]:
from torchtext.datasets import TranslationDataset

DATA_PATH = Path("data/homework_machine_translation_de-en")
SRC = Field(tokenize = lambda x: x.split(),
            tokenizer_language="de",
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

TRG = Field(tokenize = lambda x: x.split(),
            tokenizer_language="en",
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

train_data = TranslationDataset(str(DATA_PATH / 'train.de-en.'), ['de', 'en'], fields=(SRC, TRG))
valid_data = TranslationDataset(str(DATA_PATH / 'val.de-en.'), ['de', 'en'], fields=(SRC, TRG))
test_data = TranslationDataset(str(DATA_PATH / 'test1.de-en.'), ['de', 'de'], fields=(SRC, SRC))

SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [23]:
from torch.utils.data import DataLoader

a = BucketIterator(
    train_data,
    batch_size=64,
    sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.
)

In [24]:
for i, ai in enumerate(a):
    print(ai)


[torchtext.data.batch.Batch of size 64]
	[.src]:[torch.LongTensor of size 59x64]
	[.trg]:[torch.LongTensor of size 66x64]

[torchtext.data.batch.Batch of size 64]
	[.src]:[torch.LongTensor of size 69x64]
	[.trg]:[torch.LongTensor of size 74x64]

[torchtext.data.batch.Batch of size 64]
	[.src]:[torch.LongTensor of size 64x64]
	[.trg]:[torch.LongTensor of size 66x64]

[torchtext.data.batch.Batch of size 64]
	[.src]:[torch.LongTensor of size 76x64]
	[.trg]:[torch.LongTensor of size 82x64]

[torchtext.data.batch.Batch of size 64]
	[.src]:[torch.LongTensor of size 43x64]
	[.trg]:[torch.LongTensor of size 46x64]

[torchtext.data.batch.Batch of size 64]
	[.src]:[torch.LongTensor of size 49x64]
	[.trg]:[torch.LongTensor of size 53x64]

[torchtext.data.batch.Batch of size 64]
	[.src]:[torch.LongTensor of size 71x64]
	[.trg]:[torch.LongTensor of size 77x64]

[torchtext.data.batch.Batch of size 64]
	[.src]:[torch.LongTensor of size 58x64]
	[.trg]:[torch.LongTensor of size 58x64]

[torchtext.data