We apply encoder-decoder models with attention to the German-to-English machine translation problem using the Multi30k data.
The Multi30k data can be downloaded automatically; we also provide the data in the *multi30k_test_purpose* folder. For quick testing, you can use the `*.10de`/`*.10en` dataset, which is part of the multi30k_test_purpose data but much smaller.



The pre-processing part (data loading, tokenizing) and the Encoder are already implemented. Your task is to implement the attention mechanism in the Decoder.

In [None]:
!conda install -y -c conda-forge spacy  
!conda install -y -c pytorch torchtext

# Imports

Let's first import all the dependencies we will need for this exercise

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas
import spacy
from spacy.lang.en import English
from spacy.lang.de import German
import io
import random
from collections import Counter
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import math


  return torch._C._cuda_getDeviceCount() > 0


In [5]:
!~/miniconda3/envs/aml_attention_env/bin/python -m spacy download en_core_web_sm
!~/miniconda3/envs/aml_attention_env/bin/python -m spacy download de_core_news_sm

  return torch._C._cuda_getDeviceCount() > 0
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 3.0 MB/s eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
  return torch._C._cuda_getDeviceCount() > 0
Collecting de-core-news-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.3.0/de_core_news_sm-3.3.0-py3-none-any.whl (14.6 MB)
[K     |████████████████████████████████| 14.6 MB 2.8 MB/s eta 0:00:01��    | 12.5 MB 2.8 MB/s eta 0:00:01
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the packag

# Loading the Dataset and making it iterable


In [6]:
# One seed for every random number generator used in this notebook.
SEED = 1234

# Ask cuDNN for deterministic kernels, then seed PyTorch's and Python's
# generators with the same value.
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)
random.seed(SEED)

In [None]:
from torchtext.datasets import Multi30k

# The original host server keeps shutting down, so the download URLs are
# redirected to a mirror kept in a GitHub repository: one archive each for
# the training, validation and test data.
Multi30k.urls = [
    "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz",
    "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz",
    "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/mmt_task1_test2016.tar.gz"
]

# Directory passed to Multi30k.download as the download root
ROOT = './'

# Needs network access; after the first successful download this line can be disabled
Multi30k.download(ROOT)



In [17]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# spaCy pipelines whose tokenizers are used by tokenize_de / tokenize_en
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_de(text):
    """
    Split a German string into a list of token strings using the
    tokenizer of the module-level ``spacy_de`` pipeline.
    """
    german_doc = spacy_de.tokenizer(text)
    return [token.text for token in german_doc]

def tokenize_en(text):
    """
    Split an English string into a list of token strings using the
    tokenizer of the module-level ``spacy_en`` pipeline.
    """
    english_doc = spacy_en.tokenizer(text)
    return [token.text for token in english_doc]

# Source-side field: text is tokenized with tokenize_de, lower-cased, and
# configured with '<sos>' / '<eos>' as start and end tokens.
SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

# Target-side field: same settings, but tokenized with tokenize_en.
TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)


# partial of multi30k data, only for test purpose during implementation 
"""
ROOT = './'
Multi30k.download(ROOT)

train_data, valid_data, test_data = Multi30k.splits(path = ROOT + "/multi30k_test_purpose/", exts = ('.10de', '.10en'), 
                                                    fields = (SRC, TRG))

#At the end if you can train and evaluate with whole multi30k data set
"""

# Load the train / validation / test splits from ./multi30k; the '.de' files
# are read through SRC and the '.en' files through TRG.
train_data, valid_data, test_data = Multi30k.splits(path="./multi30k", exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))


# Build the vocabularies, which map strings to token ids and vice versa.
# Both are built from the training split only, with a minimum token frequency of 2.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)


BATCH_SIZE = 128

# Create one batch iterator per split; batches are placed on `device`.
# For this exercise only the training data / train_iterator is used.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)





# Neural machine translation with attention 

# Encoder Model


In [None]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder that also produces the decoder's first hidden state.

    Args:
        input_size: number of rows of the source embedding table.
        emb_dim: width of the source embeddings.
        encoder_hidden_dim: hidden size of each GRU direction.
        decoder_hid_dim: width of the hidden vector handed to the decoder.
        dropout: dropout probability applied to the embedded input.
    """

    def __init__(self, input_size, emb_dim, encoder_hidden_dim, decoder_hid_dim, dropout):
        super().__init__()
        self.input_dim = input_size
        self.encoder_hidden_dim = encoder_hidden_dim
        self.decoder_hid_dim = decoder_hid_dim

        # Sub-modules are created in the same order as before so that a
        # seeded parameter initialisation gives the same values.
        self.embedding = nn.Embedding(input_size, emb_dim)
        # bidirectional GRU: its outputs are 2 * encoder_hidden_dim wide
        self.rnn = nn.GRU(emb_dim, encoder_hidden_dim, bidirectional=True)
        self.fc = nn.Linear(2 * encoder_hidden_dim, decoder_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_source):
        """Encode a batch of source token ids.

        Args:
            input_source: integer tensor of token ids; the GRU is not
                batch-first, so the layout is (src_len, batch).

        Returns:
            A pair ``(output, hidden)``: the GRU outputs for every source
            position, and a (batch, decoder_hid_dim) tensor used as the
            initial decoder hidden state.
        """
        embedded_source = self.dropout(self.embedding(input_source))
        output, final_states = self.rnn(embedded_source)

        # The last two entries of the final hidden state (one per direction)
        # are concatenated and squashed through a linear layer + tanh.
        second_to_last, last = final_states[-2], final_states[-1]
        both_directions = torch.cat((second_to_last, last), dim=1)
        hidden = torch.tanh(self.fc(both_directions))

        return output, hidden

# Decoder with attention

#### Below you will find the Decoder class. Complete the attention part accordingly.
 

### Reminder:
At step $t$, given all hidden states $h_{t'}$ of the encoder RNN and the previous hidden state $s_{t-1}$ of the decoder,

* the attention weights are calculated
* the context vector is computed: the attention-weighted sum of the encoder hidden states, which is passed to the decoder


context vector: $c_t = \sum_{t'=1}^{T} \alpha (s_{t-1}, h_{t'}) h_{t'}$ 


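attention weights: $\alpha (s_{t-1}, h_{t'})$ is the softmax over $t'$ of the attention scores $f_{att} (h_{t'}, s_{t-1})$, with

$f_{att} (h_{t'}, s_{t-1}) = v_{a}^{\top} \, energy_{t'}$  

$energy_{t'} = \tanh (attn([h_{t'}, s_{t-1}]))$  
where $v_{a}$ is a learnable parameter and $attn$ is a linear layer applied to the concatenation of $h_{t'}$ and $s_{t-1}$. 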






In [None]:
class Decoder(nn.Module):
    """One-step GRU decoder with additive attention over the encoder outputs.

    Args:
        decoder_output_dim: size of the target vocabulary (embedding rows and
            number of output logits).
        emb_dim: width of the target embeddings.
        encoder_hid_dim: hidden size of ONE encoder direction; encoder outputs
            are expected to be ``2 * encoder_hid_dim`` wide.
        decoder_hid_dim: hidden size of the decoder GRU.
        dropout: dropout probability applied to the embedded input token.
    """

    def __init__(self, decoder_output_dim, emb_dim, encoder_hid_dim, decoder_hid_dim, dropout):
        super().__init__()
        self.emb_dim = emb_dim
        self.encoder_hid_dim = encoder_hid_dim
        self.decoder_hid_dim = decoder_hid_dim
        self.decoder_output_dim = decoder_output_dim

        self.embedding = nn.Embedding(decoder_output_dim, emb_dim)

        # attention layer: scores the concatenation [s_{t-1}, h_{t'}]
        self.attention = nn.Linear((encoder_hid_dim * 2) + decoder_hid_dim, decoder_hid_dim)
        # learnable vector v_a that reduces each energy vector to a scalar score
        self.v = nn.Parameter(torch.rand(decoder_hid_dim))

        # the GRU consumes the embedded token together with the context vector
        self.rnn = nn.GRU((encoder_hid_dim * 2) + emb_dim, decoder_hid_dim)

        # the output layer sees the GRU output, the context vector and the embedding
        self.out = nn.Linear((encoder_hid_dim * 2) + decoder_hid_dim + emb_dim, decoder_output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Decode a single target position.

        Args:
            input: (batch,) tensor with the current input token ids.
            hidden: (batch, decoder_hid_dim) previous decoder hidden state.
            encoder_outputs: (src_len, batch, 2 * encoder_hid_dim) encoder
                outputs for every source position.

        Returns:
            A triple ``(output, hidden, attention)``: the
            (batch, decoder_output_dim) logits, the new
            (batch, decoder_hid_dim) hidden state and the (batch, src_len)
            attention weights, which sum to 1 over the source positions.
        """
        # add a length-1 time axis: (batch,) -> (1, batch)
        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))  # (1, batch, emb_dim)

        src_sen_len = encoder_outputs.shape[0]

        # ------------------ attention (reference solution) ------------------
        # The previous decoder state is a single vector per sentence while the
        # encoder provides one vector per source position, so the decoder
        # state is repeated src_sen_len times.
        repeated_hidden = hidden.unsqueeze(1).repeat(1, src_sen_len, 1)  # (batch, src_len, dec_hid)
        encoder_states = encoder_outputs.permute(1, 0, 2)                # (batch, src_len, 2 * enc_hid)

        # energy = tanh(attn([s_{t-1}, h_{t'}]))
        energy = torch.tanh(
            self.attention(torch.cat((repeated_hidden, encoder_states), dim=2))
        )                                                                # (batch, src_len, dec_hid)

        # f_att = v_a^T energy: one unnormalised score per source position
        attention = torch.matmul(energy, self.v)                         # (batch, src_len)

        # normalise the scores over the source positions
        att = F.softmax(attention, dim=1).unsqueeze(1)                   # (batch, 1, src_len)

        # context vector: attention-weighted sum of the encoder states
        context_v = torch.bmm(att, encoder_states).permute(1, 0, 2)      # (1, batch, 2 * enc_hid)
        # ---------------------------------------------------------------------

        rnn_input = torch.cat((embedded, context_v), dim=2)

        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        # single layer, single direction, single time step: the GRU output
        # and its final hidden state must be the same tensor values
        assert (output == hidden).all()

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        context_v = context_v.squeeze(0)

        output = self.out(torch.cat((output, context_v, embedded), dim=1))
        return output, hidden.squeeze(0), att.squeeze(1)
        

# Building Model

In [None]:
class builModel(nn.Module):
    """Encoder-decoder wrapper that runs the decoder one target step at a time.

    Args:
        encoder: module called as ``encoder(src)``, returning
            ``(encoder_outputs, hidden)``.
        decoder: module called as ``decoder(token, hidden, encoder_outputs)``,
            returning ``(logits, hidden, attention)``; it must expose
            ``decoder_output_dim``.
        pad_idx, sos_idx, eos_idx: special token ids (stored on the module;
            ``sos_idx`` and ``eos_idx`` are used during inference).
        device: device on which the result tensors are allocated.
    """

    def __init__(self, encoder, decoder, pad_idx, sos_idx, eos_idx, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx, self.sos_idx, self.eos_idx = pad_idx, sos_idx, eos_idx
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Decode ``src`` and collect the logits and attention of every step.

        Args:
            src: source token ids, laid out as (src_len, batch).
            trg: target token ids laid out as (trg_len, batch), or ``None``
                for inference. Inference decodes at most 100 positions and
                stops early at ``<eos>``; the stop test calls ``.item()``, so
                it needs a batch of exactly one sentence.
            teacher_forcing_ratio: probability of feeding the reference token
                instead of the model's own prediction; must be 0 when
                ``trg`` is ``None``.

        Returns:
            ``(outputs, attentions)``. Row 0 of both tensors is never written
            and stays zero. During inference both are cut off just before the
            step that produced ``<eos>``.
        """
        inference = trg is None
        if inference:
            assert teacher_forcing_ratio == 0, "Must be zero during inference"
            # placeholder target filled with <sos>; only its first row and
            # its length are actually used
            trg = torch.full((100, src.shape[1]), self.sos_idx,
                             dtype=torch.long, device=src.device)

        n_steps = trg.shape[0]
        n_sentences = src.shape[1]
        vocab_size = self.decoder.decoder_output_dim

        # result buffers for the decoder logits and the attention weights
        outputs = torch.zeros(n_steps, n_sentences, vocab_size, device=self.device)
        attentions = torch.zeros(n_steps, n_sentences, src.shape[0], device=self.device)

        # the encoder's hidden state initialises the decoder
        encoder_outputs, hidden = self.encoder(src)

        # decoding starts from the first target row (the <sos> tokens)
        next_token = trg[0, :]

        for step in range(1, n_steps):
            logits, hidden, attention_score = self.decoder(next_token, hidden, encoder_outputs)
            outputs[step] = logits
            attentions[step] = attention_score

            # the random draw happens on every step, also during inference
            use_reference = random.random() < teacher_forcing_ratio
            _, predicted = logits.max(1)
            next_token = trg[step] if use_reference else predicted

            if inference and next_token.item() == self.eos_idx:
                return outputs[:step], attentions[:step]

        return outputs, attentions

# Model initialization

In [None]:
# Vocabulary sizes and layer widths
INPUT_DIM = len(SRC.vocab)
DECODER_OUTPUT_DIM = len(TRG.vocab)
ENCODER_EMB_DIM = 256
DECODER_EMB_DIM = 256
ENCODER_HID_DIM = 512
DECODER_HID_DIM = 512
ENCODER_DROPOUT = 0.5
DECODER_DROPOUT = 0.5

# Special token ids. The loss is computed on TARGET token ids, so the padding
# index it ignores is looked up in the target vocabulary.
PAD_IDX = SRC.vocab.stoi['<pad>']
TRG_PAD_IDX = TRG.vocab.stoi['<pad>']
SOS_IDX = TRG.vocab.stoi['<sos>']
EOS_IDX = TRG.vocab.stoi['<eos>']



encoder = Encoder(INPUT_DIM, ENCODER_EMB_DIM, ENCODER_HID_DIM, DECODER_HID_DIM, ENCODER_DROPOUT)
decoder = Decoder(DECODER_OUTPUT_DIM, DECODER_EMB_DIM, ENCODER_HID_DIM, DECODER_HID_DIM, DECODER_DROPOUT)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = builModel(encoder, decoder, PAD_IDX, SOS_IDX, EOS_IDX, device).to(device)


# The learning rate is now actually handed to the optimizer; 0.001 is also
# Adam's default, so the training behaviour is unchanged.
lr = 0.001
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr = lr)

# Number of training epochs and the gradient-norm clipping threshold
N_EPOCHS = 10
CLIP = 1

def init_weights(m):
    """Re-initialise every parameter of ``m`` in place.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution with mean 0 and standard deviation 0.01; all other
    parameters are set to 0.
    """
    for param_name, tensor in m.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)
            
# nn.Module.apply calls init_weights on every sub-module and on the model
# itself; init_weights in turn walks all parameters of the module it is given.
model.apply(init_weights)

# Training the models

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: called as ``model(src, trg)``; must return ``(output, attention)``.
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: optimizer updating the parameters of ``model``.
        criterion: loss taking flattened logits and flattened target ids.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        source = batch.src
        target = batch.trg

        optimizer.zero_grad()

        # the model also returns the attention weights, which the loss does not need
        logits, _ = model(source, target)

        # skip position 0 and flatten the time and batch axes for the criterion
        vocab_size = logits.shape[-1]
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)

        loss = criterion(flat_logits, flat_target)
        loss.backward()

        # clip the gradient norm before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)



# Train for N_EPOCHS epochs. Whenever an epoch's mean training loss is the
# lowest seen so far, the model weights are written to 'exercise4_model.pt'.
best_train_loss = math.inf

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)

    if train_loss < best_train_loss:
        best_train_loss = train_loss
        torch.save(model.state_dict(), 'exercise4_model.pt')

    # perplexity is the exponential of the mean loss
    train_ppl = math.exp(train_loss)
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {train_ppl:7.3f}')

# Translation results

### first load the model

In [None]:
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load('exercise4_model.pt', map_location=device))

In [None]:
# read one sentence pair from the training data (a fixed index is used; a random index can be used instead)

In [None]:
# Index of the training example to translate; the commented-out line draws
# a random index instead of the fixed one.
#random_i = random.randint(0,len(train_data.examples)-1)
random_i = 4
# Join the example's 'src' and 'trg' entries with spaces to get plain sentences
src_sentence = ' '.join(vars(train_data.examples[random_i])['src'])
trg_reference = ' '.join(vars(train_data.examples[random_i])['trg'])
print(f'src = {src_sentence}')
print(f'trg = {trg_reference}')

In [None]:
## Complete the function below, which returns the translation of the given sentence using the given model

In [None]:
def translate_sentence(model, sentence):
    """Translate one German sentence with ``model``.

    Relies on the module-level ``tokenize_de``, ``SRC``, ``TRG`` and
    ``device`` objects.

    Args:
        model: seq2seq model called as ``model(src, None, 0)`` for inference;
            it must return ``(logits, attention)``, each with a leading
            placeholder row for the ``<sos>`` position.
        sentence: raw German text.

    Returns:
        ``(translation, attention)``: the list of predicted target tokens and
        the matching attention rows, both without the placeholder row.
    """
    model.eval()

    # tokenize, lowercase, and wrap the sentence in start/end tokens
    tokenized = ['<sos>'] + [t.lower() for t in tokenize_de(sentence)] + ['<eos>']
    # map the tokens to their ids in the source vocabulary
    numericalized = [SRC.vocab.stoi[t] for t in tokenized]
    # column tensor of shape (src_len, 1): a batch holding a single sentence
    tensor = torch.LongTensor(numericalized).unsqueeze(1).to(device)

    # inference only: no target sentence, no teacher forcing, no gradients
    with torch.no_grad():
        output_tensor_logits, attention = model(tensor, None, 0)

    # highest-scoring target token id at every decoded position
    translation_tensor = torch.argmax(output_tensor_logits.squeeze(1), 1)

    # convert the ids to strings; tolist() yields plain ints for the lookup
    translation = [TRG.vocab.itos[t] for t in translation_tensor.tolist()]
    # row 0 is the placeholder for <sos> and carries no prediction
    translation, attention = translation[1:], attention[1:]
    return translation, attention

In [None]:
# Translate src_sentence and print the prediction next to the reference translation
translation, attention = translate_sentence(model, src_sentence)

print(src_sentence)
print("-------------")
print(f'predicted trg = {translation}')
print()
print(f'reference trg = {trg_reference}')