# Imports

## Downloads

In [1]:
# run code then restart runtime
!python -m spacy download en_core_web_md
!python -m spacy download de_core_news_md
!pip install torchtext==0.6.0

Collecting en_core_web_md==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4MB)
[K     |████████████████████████████████| 96.4MB 66.8MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-cp36-none-any.whl size=98051304 sha256=114f267ea51565db4715506f1665a54b909b3ebcb200d26403ead35ba7a3979b
  Stored in directory: /tmp/pip-ephem-wheel-cache-kot3j__5/wheels/df/94/ad/f5cf59224cea6b5686ac4fd1ad19c8a07bc026e13c36502d81
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
Collecting de_core_news_md==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/

## Librairies

In [1]:
# Torch
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score

# Usual
import numpy as np
import spacy
import random
import sys

# Utils

In [2]:
def translate_sentence(model, sentence, german, english, device, max_length=50):

    # Load german tokenizer
    #spacy_ger = spacy.load("de_core_news_md")

    # tokenizing the sentences using spacy
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # print(tokens)

    # Adding sos and eos token
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # string to index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to Tensor of size (len(text_to_indices), 1) for pytorch
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        outputs_encoder,hidden, cell = model.encoder(sentence_tensor)

    outputs = [english.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        #disabling gradient calculation bc no use in inference
        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, outputs_encoder, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]


def evaluation_fn(data, model, german, english, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)
        prediction = prediction[:-1]  # remove <eos> token

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename):
    print("----------- Saving checkpoint -----------")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("----------- Saving checkpoint -----------")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

# Config

In [3]:
class Config:
  save_filename = "checkpoint.pth.tar"
  load_model = False
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  encoder_embedding_size = 300
  decoder_embedding_size = 300
  hidden_size = 1024
  num_layers = 4
  enc_dropout = 0.0
  dec_dropout = 0.0
  num_epochs = 30
  learning_rate = 1e-4
  batch_size = 32

# Model

## Encoder

In [4]:
class Encoder(nn.Module):
  def __init__(self, vocab_size, embedding_size, hidden_size, num_layers, pdropout):
    super(Encoder,self).__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.dropout = nn.Dropout(pdropout)
    self.embedding = nn.Embedding(vocab_size, embedding_size)

    self.rnn = nn.LSTM(embedding_size, hidden_size,  num_layers, bidirectional = True)

    self.fc_hidden = nn.Linear(hidden_size*2, hidden_size)
    self.fc_cell = nn.Linear(hidden_size*2, hidden_size)

  def forward(self, x):
    # x shape (seq_length, batch_size)
    # embedding shape (seq_length, batch_size, embedding_size)
    # hidden / cell shape (num_layers *num_directions, batch_size, hidden_size) => (2*num_layers, batch_size, hidden_size)
    # hidden / cell shape after linear layer: (1, batch_size, hidden_size*2)
    # encoder states shape: (seq_length, batch_size, hidden_size*num_layers)

    # Word Embedding => building word vectors using dataset
    embedding = self.dropout(self.embedding(x))
    
    # Using Bi-directional LSTMs on Word Vectors
    encoder_states, (hidden,cell) = self.rnn(embedding)

    # The hidden layer going forward is hidden[0:1] and the one going backward is hidden[1:2]
    # Use both the forward and backward cell / hidden layer and run it through a Linear layer
    # The decoder layer is not bidirectional



    #hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
    #cell = self.fc_cell(torch.cat((cell[0:1], cell[1:2]), dim=2))

    hidden = self.fc_hidden(torch.cat((hidden[0:4], hidden[4:8]), dim=2))
    cell = self.fc_cell(torch.cat((cell[0:4], cell[4:8]), dim=2))
    #encoder_states = torch.cat((encoder_states[:,:,0:1024], encoder_states[:,:,1024:]), dim=0)

    #print(encoder_states.shape)

    return encoder_states, hidden, cell
  


## Decoder

In [5]:
class Decoder(nn.Module):
  def __init__(self, vocab_size, embedding_size, hidden_size, output_size, num_layers, pdropout):
    super(Decoder, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.embedding = nn.Embedding(vocab_size, embedding_size)
    self.dropout = nn.Dropout(pdropout)

    self.rnn = nn.LSTM(hidden_size*2 + embedding_size, hidden_size,  num_layers)
    
    self.energy = nn.Linear(hidden_size*3, 1)
    self.softmax = nn.Softmax(dim=0)
    self.relu = nn.ReLU()
    self.fc = nn.Linear(hidden_size, output_size)
  
  def forward(self, x, encoder_states, hidden, cell):
    # Shape of x: (batch_size) as we are decoding one word at a time 
    # but we need (1, batch_size) so we unsqueeze the input
    # embedding shape (1, batch_size, embedding_size)

    #hidden_reshape shape (sequence_length, batch_size, hidden_size*2)
    # energy: (sequence_length, batch_size, 1)
    # attention shape (sequence_length, batch_size, 1)

    # context_vector shape: (1, batch_size, hidden_size*2)
    # rnn_input shape: (1, batch_size, hidden_size*2 + embedding_size)
    # outputs shape (1, batch_size, hidden_size)
    # predictions shape: (batch_size, hidden_size)

    x = x.unsqueeze(0)
    embedding = self.dropout(self.embedding(x))

    sequence_length = encoder_states.shape[0]
    hidden_reshaped = hidden.repeat(sequence_length, 1, 1)

    #print(encoder_states[:,:,1536:2048].shape)

    encoder_states = encoder_states.repeat(self.num_layers, 1, 1)

    #print(hidden_reshaped.shape, encoder_states.shape)

    energy = self.relu(self.energy(torch.cat((hidden_reshaped, encoder_states), dim=2)))
    attention = self.softmax(energy)

    # we want context_vector: (1, batch_size, hidden_size*2), i.e knl
    # attention: (sequence_length, batch_size, 1), snk
    # encoder_states: (sequence_length, batch_size, hidden_size*2), snl
    context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)

    rnn_input = torch.cat((context_vector, embedding), dim=2)
    outputs, (hidden,cell) = self.rnn(rnn_input, (hidden, cell))

    predictions = self.fc(outputs).squeeze(0)

    return predictions, hidden, cell

## seq2seq

In [6]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder):
    super(Seq2Seq, self).__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, teacher_force_ratio = 0.5):

    # source shape: (source_length, batch_size)
    # target shape: (target_length, batch_size)
    
    batch_size = source.shape[1]
    target_len =target.shape[0]
    target_vocab_size = len(english.vocab)
  
    outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(Config.device)
    encoder_states, hidden, cell = self.encoder(source)

    # Grab start token
    x = target[0]

    for t in range(1, target_len):
      output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
      outputs[t] = output

      best_guess = output.argmax(1)

      # if random < teacher_force_ratio then the target word is used
      # else, we use the output from the model
      # This allows the model to have similar inputs at both training and testing time
      # Testing time is similar to having teacher_force_ratio set to 0

      x = target[t] if random.random() < teacher_force_ratio else best_guess
    
    return outputs


# Training

In [7]:
spacy_ger = spacy.load('de_core_news_md')
spacy_en = spacy.load('en_core_web_md')

def tokenizer_ger(text):
  return [tok.text for tok in spacy_ger.tokenizer(text)]

def tokenizer_en(text):
  return [tok.text for tok in spacy_en.tokenizer(text)]

german = Field(tokenize = tokenizer_ger, lower = True, init_token='<sos>', eos_token='<eos>')

english = Field(tokenize = tokenizer_en, lower = True,init_token='<sos>', eos_token='<eos>')

In [8]:
train_data, validation_data, test_data = Multi30k.splits(exts=(".de", ".en"),fields=(german,english))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:03<00:00, 313kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 91.9kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 86.0kB/s]


In [9]:
def train_fn(train_data, validation_data, test_data):

  #building vocabulary (tokenizing my training data)
  german.build_vocab(train_data, max_size=10000, min_freq=2)
  english.build_vocab(train_data, max_size=10000, min_freq=2)

  input_size_encoder = len(german.vocab)
  input_size_decoder = len(english.vocab)
  output_size = len(english.vocab)

  train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
  (train_data, validation_data, test_data),
  batch_size = Config.batch_size,
  sort_within_batch = True,
  sort_key = lambda x: len(x.src),
  device=Config.device)

  encoder = Encoder(input_size_encoder, Config.encoder_embedding_size, Config.hidden_size, Config.num_layers, Config.enc_dropout).to(Config.device)
  decoder = Decoder(input_size_decoder, Config.decoder_embedding_size, Config.hidden_size, output_size, Config.num_layers, Config.dec_dropout).to(Config.device)
  model = Seq2Seq(encoder, decoder).to(Config.device)

  optimizer = optim.Adam(model.parameters(), lr=Config.learning_rate)
  steps_per_epoch = len(train_data)

  # adding padding but don't want to pay anything for it in the cost function
  pad_idx = english.vocab.stoi['<pad>']
  criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

  if Config.load_model:
    load_checkpoint(torch.load(Config.save_filename), model, optimizer)
  
  sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

  for epoch in range(Config.num_epochs):
    print(f'Epoch[{epoch} / {Config.num_epochs}]')

    checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
    save_checkpoint(checkpoint, Config.save_filename)

    model.eval()

    translated_sentence = translate_sentence(model, sentence, german, english, Config.device, max_length=50)

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    losses = []

    for batch_idx, batch in enumerate(train_iterator):
      inp_data = batch.src.to(Config.device)
      target = batch.trg.to(Config.device)
      
      output = model(inp_data, target)
      # output shape: (target_len, batch_size, output_dim)
      
      output = output[1:].reshape(-1, output.shape[2])
      target = target[1:].reshape(-1)

      optimizer.zero_grad()
      loss = criterion(output, target)
      losses.append(loss)
      loss.backward()

      #to avoid exploding gradients
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
      optimizer.step()

    print(f"Loss: {sum(losses)/len(losses)}")

    if epoch % 5 == 0:
      score = evaluation_fn(test_data[1:1000], model, german, english, Config.device)
      print(f"Bleu score {score*100:.2f}")

  score = evaluation_fn(test_data, model, german, english, Config.device)
  print(f"Bleu score {score*100:.2f}")

In [None]:
train_fn(train_data, validation_data, test_data)

Epoch[0 / 30]
----------- Saving checkpoint -----------
Translated example sentence: 
 ['rafting', 'adjacent', 'adjacent', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate', 'karate']
Loss: 4.812053680419922
Bleu score 4.24
Epoch[1 / 30]
----------- Saving checkpoint -----------
Translated example sentence: 
 ['a', 'young', 'is', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '<eos>']
Loss: 4.258140563964844
Epoch[2 / 30]
----------- Saving checkpoint -----------
Translated example sentence: 
 ['a', '<unk>', 'player', 'with', 'a', '<unk>', 'is', 'is', 'to', 'the', 'the', 'in', 'the