In [62]:
import torch
import torch.nn as nn
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from utils import save_checkpoint

In [2]:
# Notebook-wide accelerator flag. Other cells compare it against the literal
# 'gpu' and call .cuda() themselves, so the value is a flag rather than a
# torch device string.
if torch.cuda.is_available():
    device = 'gpu'
else:
    device = 'cpu'
device

'cpu'

In [3]:
# !python -m spacy download de_core_news_sm
# !python -m spacy download en_core_web_sm

In [4]:
# Load the German and English spaCy pipelines used by the tokenizer helpers.
spacy_ger, spacy_en = (
    spacy.load(model_name)
    for model_name in ('de_core_news_sm', 'en_core_web_sm')
)

In [5]:
def ger_tokenizer(text):
    """Split German ``text`` into a list of token strings via ``spacy_ger``."""
    tokens = []
    for token in spacy_ger.tokenizer(text):
        tokens.append(token.text)
    return tokens

def en_tokenizer(text):
    """Split English ``text`` into a list of token strings via ``spacy_en``."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

In [6]:
# Both languages share the same preprocessing: lower-casing plus explicit
# start/end-of-sentence markers. Only the tokenizer differs.
_shared_field_options = dict(lower=True, init_token='<sos>', eos_token='<eos>')
german = Field(tokenize=ger_tokenizer, **_shared_field_options)
english = Field(tokenize=en_tokenizer, **_shared_field_options)

In [7]:
# German (.de) is the source side and English (.en) the target side; the
# fields tuple follows the same order as the extensions.
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(german, english),
    train='train',
    validation='val',
    test='test',
)

In [9]:
# Build both vocabularies from the training split only, with identical limits.
for _field in (german, english):
    _field.build_vocab(train_data, max_size=10000, min_freq=2)

In [74]:
class Encoder(nn.Module):
    """Embeds a source token sequence and runs it through a multi-layer LSTM.

    Args:
        input_size: Number of rows in the embedding table (source vocab size).
        embedding_size: Width of each token embedding.
        hidden_size: LSTM hidden-state width.
        num_layers: Number of stacked LSTM layers.
        p: Dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(p=p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(
            num_embeddings=input_size, embedding_dim=embedding_size
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )

    def forward(self, x):
        """Return the LSTM's final ``(hidden, cell)`` state for token indices ``x``.

        The per-step LSTM outputs are discarded; only the final state is returned.
        """
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell

    
class Decoder(nn.Module):
    """Single-step LSTM decoder that maps one input token per sequence to vocabulary scores.

    Args:
        input_size: Number of rows in the embedding table (target vocab size).
        embedding_size: Width of each token embedding.
        hidden_size: LSTM hidden-state width.
        output_size: Number of output scores produced per step.
        num_layers: Number of stacked LSTM layers.
        p: Dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super(Decoder, self).__init__()
        self.dropout = nn.Dropout(p=p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(
            num_embeddings=input_size, embedding_dim=embedding_size
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step.

        Args:
            x: Token indices for the current step (assumed shape ``(batch,)``,
                as produced by ``target[t]`` / ``argmax(1)`` in ``Seq2Seq``).
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            ``(predictions, hidden, cell)`` where ``predictions`` has the
            leading length-1 step dimension squeezed away.
        """
        # Add a leading dimension so the LSTM sees a sequence of length one.
        step_input = x.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_input))
        rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        scores = self.fc(rnn_out).squeeze(0)
        return scores, hidden, cell

    
class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: Module whose ``forward(source)`` returns ``(hidden, cell)``.
        decoder: Module whose ``forward(x, hidden, cell)`` returns
            ``(predictions, hidden, cell)`` and which exposes its output
            projection as ``fc`` (an ``nn.Linear``).
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio = 0.5):
        """Decode ``target.shape[0] - 1`` steps conditioned on ``source``.

        Args:
            source: Source token indices, indexed ``(src_len, batch)``.
            target: Target token indices, indexed ``(trg_len, batch)``;
                ``target[0]`` is fed as the first decoder input.
            teacher_force_ratio: Probability, per step, of feeding the
                ground-truth token instead of the model's own best guess.

        Returns:
            Tensor of shape ``(trg_len, batch, vocab)`` holding the decoder
            scores for steps ``1..trg_len-1``; row 0 is left as zeros.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Read the vocabulary size from the decoder's own projection layer
        # rather than from a notebook global, so the module is self-contained.
        target_vocab_size = self.decoder.fc.out_features

        # Allocate directly on the input's device instead of creating the
        # buffer on CPU and moving it based on a global flag.
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )
        hidden, cell = self.encoder(source)
        x = target[0] # <sos> token

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            # Teacher forcing: sometimes feed the reference token, otherwise
            # feed back the model's own prediction.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs


In [75]:
# Optimisation schedule
num_epochs, learning_rate, batch_size = 100, 0.001, 64

# Checkpoint-restore switch (not read by any cell visible here)
load_model = False

# Sizes derived from the built vocabularies; the decoder reads and predicts
# English tokens, so its input and output sizes coincide.
input_size_encoder = len(german.vocab)
input_size_decoder = output_size = len(english.vocab)

# Network shape
encoder_embedding_size = decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout = dec_dropout = 0.5

# TensorBoard logging: `step` is the global step counter for the loss curve
step = 0
writer = SummaryWriter("runs/loss_plot")

In [76]:
# The notebook's `device` flag is 'gpu' or 'cpu', but torch only understands
# device strings such as 'cuda' and 'cpu'. Translate the flag before handing
# it to the iterators, otherwise the 'gpu' value is rejected on CUDA machines.
iterator_device = torch.device('cuda' if device == 'gpu' else device)

# Batches are sorted by source length within each batch (sort_key uses
# len(x.src)).
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=iterator_device,
)

In [88]:
# Build the source-side encoder and move it to the GPU when the flag says so.
encoder_net = Encoder(
    input_size=input_size_encoder,
    embedding_size=encoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    p=enc_dropout,
)
if device == 'gpu':
    encoder_net = encoder_net.cuda()
encoder_net

Encoder(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(7853, 300)
  (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
)

In [89]:
# Build the target-side decoder and move it to the GPU when the flag says so.
decoder_net = Decoder(
    input_size=input_size_decoder,
    embedding_size=decoder_embedding_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    p=dec_dropout,
)
if device == 'gpu':
    decoder_net = decoder_net.cuda()
decoder_net

Decoder(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(5893, 300)
  (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
  (fc): Linear(in_features=1024, out_features=5893, bias=True)
)

In [90]:
# Tie encoder and decoder together into the full translation model.
model = Seq2Seq(encoder=encoder_net, decoder=decoder_net)
if device == 'gpu':
    model = model.cuda()
model

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(7853, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(5893, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=5893, bias=True)
  )
)

In [80]:
# Adam over every parameter of the seq2seq model (encoder and decoder are
# registered as its submodules), using the learning rate from the config cell.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [81]:
# Look up the index of the '<pad>' token in the English vocabulary and pass it
# as ignore_index so that target positions equal to it do not contribute to
# the loss.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [82]:
# Sample lower-cased German sentence. It is not used by any cell visible here;
# presumably intended for the commented-out translation check in the training
# loop — TODO confirm.
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

In [91]:
for epoch in range(num_epochs):
    # Snapshot model and optimizer state at the start of every epoch, before
    # any of this epoch's updates (save_checkpoint comes from the local utils
    # module).
    checkpoint = {'state_dict': model.state_dict(),
                  'optimizer': optimizer.state_dict()}
    save_checkpoint(checkpoint)

    # TODO: switch to model.eval() here and translate `sentence` to monitor
    # qualitative progress between epochs.
    model.train()

    # Send batches to wherever the model's parameters live instead of relying
    # on the 'gpu' flag string, which is not a valid torch device name.
    model_device = next(model.parameters()).device

    for batch_idx, batch in enumerate(tqdm(train_iterator)):
        inp_data = batch.src.to(model_device)
        target = batch.trg.to(model_device)

        output = model(inp_data, target)
        # Row 0 of the model output is never written (decoding starts at
        # t=1), so drop it and the matching first target row, then flatten
        # both to (N, vocab) / (N,) for the loss.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back-propagate so gradients exist for clipping and the optimizer
        # step; without this the parameters are never updated.
        loss.backward()

        # Clip the global gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()
        # Log a plain Python float rather than the tensor itself.
        writer.add_scalar("Training Loss", loss.item(), global_step=step)
        step += 1


=> Saving checkpoint


  1%|▋                                                                                 | 4/454 [00:05<09:36,  1.28s/it]


KeyboardInterrupt: 