#### Neural Machine Translation by Jointly Learning to Align and Translate

Till now I have looked at 2 types of seq2seq architecture.
1. A simple seq2seq model
2. A seq2seq model in which the last hidden state of the encoder, i.e. the context vector, is fed as input to every time step of the decoder

Today I'll look at using attention for these seq2seq tasks.

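Attention works by calculating an attention vector, a, whose length equals the length of the source sequence. Each element of a lies between 0 and 1 and the whole vector sums to 1, because it is the output of a softmax.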

#### Preparing Data


In [1]:
!pip install torch==1.4

Collecting torch==1.4
[?25l  Downloading https://files.pythonhosted.org/packages/24/19/4804aea17cd136f1705a5e98a00618cb8f6ccc375ad8bfa437408e09d058/torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl (753.4MB)
[K     |████████████████████████████████| 753.4MB 21kB/s 
[31mERROR: torchvision 0.6.1+cu101 has requirement torch==1.5.1, but you'll have torch 1.4.0 which is incompatible.[0m
[?25hInstalling collected packages: torch
  Found existing installation: torch 1.5.1+cu101
    Uninstalling torch-1.5.1+cu101:
      Successfully uninstalled torch-1.5.1+cu101
Successfully installed torch-1.4.0


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time
print(torch.__version__)

1.5.1+cu101


In [4]:
# Use one fixed seed for every random number generator touched by this notebook
# (Python's `random`, NumPy, PyTorch on CPU and on CUDA) so that runs start from
# the same random state.
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [5]:
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [6]:
!python -m spacy download de

Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 1.1MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907056 sha256=29bdd3b1b9b6a4d494c2690e939b0647e8ad66a8517ae8c0000d66f20cfb23c5
  Stored in directory: /tmp/pip-ephem-wheel-cache-no3qj7mq/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/

In [7]:
# Load the German and English spaCy models through the 'de' / 'en' shortcut names.
# The tokenize_de / tokenize_en helpers use only the `.tokenizer` of each model.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [8]:
def tokenize_de(text):
  """Tokenize German `text` with the spaCy tokenizer and return the token strings as a list."""
  doc = spacy_de.tokenizer(text)
  return [piece.text for piece in doc]

def tokenize_en(text):
  """Tokenize English `text` with the spaCy tokenizer and return the token strings as a list."""
  doc = spacy_en.tokenizer(text)
  return [piece.text for piece in doc]

In [9]:
# Source (German) and target (English) fields share the same preprocessing:
# lower-casing plus '<sos>' / '<eos>' markers. Only the tokenizer differs.
SRC, TRG = (
    Field(tokenize = tokenizer,
          init_token = '<sos>',
          eos_token = '<eos>',
          lower = True)
    for tokenizer in (tokenize_de, tokenize_en)
)

In [10]:
# Load the Multi30k train / validation / test splits. The '.de' files are processed
# with the SRC field and the '.en' files with the TRG field (same order in both tuples).
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

training.tar.gz:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:00<00:00, 5.35MB/s]
validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 1.39MB/s]

downloading validation.tar.gz
downloading mmt_task1_test2016.tar.gz



mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 1.41MB/s]


In [11]:
# Build both vocabularies from the training split only, so validation / test
# tokens never influence the vocabulary.
# min_freq = 2: presumably tokens seen fewer than 2 times are left out of the
# vocabulary (treated as unknown) -- confirm against the torchtext docs.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [12]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
device

device(type='cuda')

In [13]:
# Number of sentence pairs per batch, shared by all three iterators.
BATCH_SIZE = 128

# Create one iterator per split; `device` is passed so batches are created on
# the device selected earlier.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)

#### Encoder

In [16]:
class Encoder(nn.Module):
  """Bidirectional GRU encoder that also produces the decoder's initial hidden state.

  Args:
    input_dim: size of the source vocabulary (number of rows in the embedding table).
    emb_dim: dimensionality of the token embeddings.
    enc_hid_dim: hidden size of each GRU direction.
    dec_hid_dim: hidden size of the decoder; the concatenated final encoder
      states are projected to this size.
    dropout: dropout probability applied to the embedded source tokens.
  """

  def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, emb_dim)

    self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)

    # Projects [forward ; backward] final states down to the decoder hidden size.
    self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    """Encode a batch of source sentences.

    Args:
      src: token indices, shape [src_len, batch_size].

    Returns:
      A tuple (outputs, hidden):
        outputs: per-token encoder states, shape [src_len, batch_size, enc_hid_dim * 2].
        hidden: initial decoder hidden state, shape [batch_size, dec_hid_dim].
    """

    #src = [src_len, batch_size]

    embedded = self.dropout(self.embedding(src))

    #embedded = [src_len, batch_size, emb_dim]

    outputs, hidden = self.rnn(embedded)

    #outputs = [src_len, batch_size, n_directions * hid_dim]
    #hidden = [n_layers * n_directions, batch_size, hid_dim]

    #hidden is stacked = [forward 1, backward 1, forward 2, backward 2, ....]
    #outputs are always from the last layer

    #hidden[-2,:,:] is the last of the forward RNN
    #hidden[-1,:,:] is the last of the backward RNN

    #initial decoder hidden is built from the final hidden states of the forward
    #and backward RNNs: concatenate them, project with fc and squash with tanh
    final_states = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
    hidden = torch.tanh(self.fc(final_states))

    #outputs = [src_len, batch_size, enc_hid_dim * 2]
    #hidden = [batch_size, dec_hid_dim]

    return outputs, hidden

#### Attention

In [15]:
class Attention(nn.Module):
  """Additive attention over the encoder outputs.

  Scores every source position against the current decoder hidden state and
  returns a probability distribution over the source positions.

  Args:
    enc_hid_dim: hidden size of each encoder direction (encoder outputs have
      enc_hid_dim * 2 features).
    dec_hid_dim: hidden size of the decoder.
  """

  def __init__(self, enc_hid_dim, dec_hid_dim):
    super().__init__()

    self.attention = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)

    # Reduces each energy vector to a single unnormalised score.
    self.v = nn.Linear(dec_hid_dim, 1, bias= False)

  def forward(self, hidden, encoder_outputs):
    """Compute attention weights for one decoding step.

    Args:
      hidden: decoder hidden state, shape [batch_size, dec_hid_dim].
      encoder_outputs: encoder states, shape [src_len, batch_size, enc_hid_dim * 2].

    Returns:
      Attention weights of shape [batch_size, src_len]; each row sums to 1.
    """

    #hidden = [batch_size, dec_hid_dim]
    #encoder_outputs = [src_len, batch_size, enc_hid_dim * 2]

    src_len = encoder_outputs.shape[0]

    # Repeat the decoder hidden state once per source position so it can be
    # concatenated with every encoder output.
    hidden = hidden.unsqueeze(1).repeat(1,src_len,1)
    encoder_outputs = encoder_outputs.permute(1,0,2)

    #hidden = [batch_size,src_len, dec_hid_dim]
    #encoder_outputs = [batch_size, src_len, enc_hid_dim * 2]

    energy = torch.tanh(self.attention(torch.cat((hidden,encoder_outputs),dim = 2)))

    #energy = [batch_size, src_len, dec_hid_dim]

    attention = self.v(energy).squeeze(2)

    #attention = [batch_size, src_len]

    # Normalise over the source positions (dim 1).
    return F.softmax(attention, dim = 1)

#### Decoder