In [45]:
import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np
import spacy
import random
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
from torch.utils.tensorboard import SummaryWriter


In [2]:
!python -m spacy download de_core_news_sm
import en_core_web_sm
import de_core_news_sm

spacy_en = en_core_web_sm.load()
spacy_ger = de_core_news_sm.load()

Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 10.3MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp37-none-any.whl size=14907057 sha256=ae3722f5eb0fd251c4883a856f9914677b288bf29db1b2bfb43656da4e2bae33
  Stored in directory: /tmp/pip-ephem-wheel-cache-cgvmeutg/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


In [3]:
def tokenizer_ger(text):
  return [tok.text for tok in spacy_ger.tokenizer(text)]

def tokenizer_en(text):
  return [tok.text for tok in spacy_en.tokenizer(text)]

In [4]:
german = Field(tokenize=tokenizer_ger, lower=True, init_token="<sos>", eos_token="<eos>")
english = Field(tokenize=tokenizer_en, lower=True, init_token="<sos>", eos_token="<eos>")

In [5]:
train, val, test = Multi30k.splits(exts = ('.de', '.en'), fields=(german, english))
german.build_vocab(train, max_size = 10000, min_freq = 2)
english.build_vocab(train, max_size = 10000, min_freq = 2)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 970kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 273kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 268kB/s]


In [16]:
device = torch.device("cuda")
train_gen, val_gen, test_gen = BucketIterator.splits((train, val, test), sort_within_batch = True, sort_key = lambda x: len(x.src), batch_size=128,device=device)

In [47]:
class Encoder(nn.Module):
  def __init__(self, input_size, embd_dim, hid_size, num_layers, rate):
    super().__init__()
    self.hid_size = hid_size
    self.num_layers = num_layers
    self.drop = nn.Dropout(rate)
    self.emb = nn.Embedding(input_size, embd_dim)
    self.rnn = nn.LSTM(embd_dim, hid_size, num_layers, dropout = rate)

  def forward(self, x):
    #sent len x batch
    emb = self.drop(self.emb(x)) #sent len x batch x emb_sizetown 
    out, (hidden, cell) = self.rnn(emb)
    return hidden, cell

class Decoder(nn.Module):
  def __init__(self, input_size, emb_size, hid_size, out, num_layers, dropout):
    super().__init__()
    self.hidden_size = hid_size
    self.num_layers = num_layers
    self.dropout = nn.Dropout(dropout)
    self.embedding = nn.Embedding(input_size, emb_size)
    self.rnn = nn.LSTM(emb_size, hid_size, num_layers, dropout = dropout)
    self.fc = nn.Linear(hid_size, out)


  def forward(self, x,hidden, cell):
    #shape of x: (N) we want (1,N)
    x = x.unsqueeze(0)
    embed = self.dropout(self.embedding(x)) #(1, N, embed_size)
    out, (hidden, cell) = self.rnn(embed, (hidden, cell))
    preds = self.fc(out) #(1, N, len of vocab)
    preds = preds.squeeze(0)
    return preds, hidden, cell

class seq2seq(nn.Module): # combine
  def __init__(self, encoder, decoder):
    super().__init__()
    self.enc = encoder
    self.dec = decoder

  def forward(self, source, target, teacher = 0.5):
    batch = source.shape[1]
    target_len = target.shape[0]
    target_vocab_size = len(english.vocab)
    outputs = torch.zeros(target_len, batch, target_vocab_size)

    hidden, cell = self.enc(source)
    x = target[0] #start
    for t in range(1, target_len):
      out, hidden, cell = self.dec(x, hidden, cell)
      outputs[t] = out
      best_guess = out.argmax(1)
      x = target[t] if random.random() < teacher else best_guess
    return outputs

In [48]:
epochs = 20
lr = 0.001
batch_size = 64
load_model = False
device = torch.device("cuda")
input_size_enc = len(german.vocab)
input_size_dec = len(english.vocab)
out_size = len(english.vocab)
encoder_emb_size = 300
dec_emb_size = 300
hidden_size = 1024
num_layers = 2
enc_drop = 0.5
dec_drop = 0.5

In [49]:
encoder_net = Encoder(input_size_enc, encoder_emb_size, hidden_size, num_layers, enc_drop).to(device)
decoder_net = Decoder(input_size_dec, dec_emb_size, hidden_size, out=out_size, num_layers = num_layers, dropout = dec_drop).to(device)

In [57]:
pad_idx = english.vocab.stoi['<pad>']
crit = nn.CrossEntropyLoss(ignore_index=pad_idx)
model = seq2seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters())
for epoch in range(epochs):
  print(f"Epoch {epoch}/{epochs}")

  for batch_idx, batch in enumerate(train_gen):
    inp_data = batch.src.to(device)
    target = batch.trg.to(device)
    out = model(inp_data, target)
    out = out[1:].view(-1, out.shape[-1]).to(device)
    target = target[1:].view(-1)

    optimizer.zero_grad()
    loss = crit(out, target)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()

Epoch 0/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20


In [59]:
sentence = "ein boot mir mehreren"
model(sentence, "one boat with several")

AttributeError: ignored