# "NMT by Jointly Learning to Align and Translate" Implementation

original paper: https://arxiv.org/abs/1409.0473

references
* architecture picture: https://arxiv.org/pdf/1703.03906.pdf
* tutorial: https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation-batched.ipynb
* data source: http://www.statmt.org/wmt14/translation-task.html

In [1]:
# torch
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, Iterator, BucketIterator, TabularDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
# others
import unicodedata
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [2]:
# Whether a CUDA-capable GPU can be used in this session.
USE_CUDA = torch.cuda.is_available()
# torch.cuda.current_device() raises when CUDA is unavailable, so only query
# it on GPU machines. -1 is the legacy torchtext convention for "run on CPU"
# (NOTE(review): confirm against the installed torchtext version).
DEVICE = torch.cuda.current_device() if USE_CUDA else -1

The text-normalization helpers below are taken from the tutorial referenced above.

In [3]:
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    """Return ``s`` with combining marks (accents) removed.

    The string is NFD-decomposed so that accented letters split into a base
    character plus combining marks; every character whose Unicode category
    is ``Mn`` is then dropped. Characters that have no such decomposition
    are left untouched.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    """Normalize a raw sentence into space-separated lowercase tokens.

    The text is lowercased, stripped and passed through
    ``unicode_to_ascii``; then three regex passes are applied in order:
    the marks ``, . ! ?`` are surrounded by spaces, every run of other
    non-letter characters becomes a single space, and whitespace runs are
    collapsed. The result has no leading or trailing whitespace.
    """
    text = unicode_to_ascii(s.lower().strip())
    # Order matters: punctuation is padded first, then everything else that
    # is not a letter or kept punctuation is blanked, then spaces collapse.
    substitutions = (
        (r"([,.!?])", r" \1 "),
        (r"[^a-zA-Z,.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [4]:
def read_files(path, max_len=None, min_len=None, n_sentences=None):
    """Read a tab-separated parallel corpus into tokenized sentence pairs.

    Each line is expected to hold the source sentence, a tab, and the target
    sentence. Both sides are normalized with ``normalize_string`` and split
    on whitespace. A pair is kept only when the token counts of *both* sides
    lie within ``[min_len, max_len]``.

    Args:
        path: path of the UTF-8 encoded corpus file.
        max_len: maximum number of tokens per sentence; ``None`` means no
            upper bound.
        min_len: minimum number of tokens per sentence; ``None`` means no
            lower bound.
        n_sentences: keep only the first ``n_sentences`` accepted pairs;
            ``None`` (or 0) keeps every accepted pair.

    Returns:
        ``(source, target)``: two equally long lists of token lists.
    """
    # The None defaults used to reach the comparisons below and raise
    # TypeError; map them to "unbounded" instead.
    lower = 0 if min_len is None else min_len
    upper = float('inf') if max_len is None else max_len

    source = []
    target = []
    with open(path, 'r', encoding='utf-8') as file:
        # Iterate lazily rather than loading the whole corpus via readlines().
        for line in file:
            # Stop as soon as enough pairs were collected; this yields the
            # same result as truncating afterwards, without the extra work.
            if n_sentences and len(source) >= n_sentences:
                break
            columns = line.rstrip('\n').split('\t')
            # Skip blank/malformed lines, and ignore any extra columns
            # after the first two instead of failing on unpacking.
            if len(columns) < 2:
                continue
            normed_so = normalize_string(columns[0]).split()
            normed_ta = normalize_string(columns[1]).split()
            if lower <= len(normed_so) <= upper and \
               lower <= len(normed_ta) <= upper:
                source.append(normed_so)
                target.append(normed_ta)
    return source, target

In [5]:
def write_file(path, source, target):
    """Write tokenized sentence pairs to ``path`` as tab-separated lines.

    ``source`` and ``target`` are iterated in lockstep; each pair becomes
    one line made of the space-joined source tokens, a tab, and the
    space-joined target tokens. The file is written as UTF-8 and any
    existing content is overwritten.
    """
    with open(path, 'w', encoding='utf-8') as out:
        for src_tokens, tgt_tokens in zip(source, target):
            src_text = ' '.join(src_tokens)
            tgt_text = ' '.join(tgt_tokens)
            out.write(src_text + '\t' + tgt_text + '\n')

In [6]:
# Count the lines of the raw corpus. The context manager guarantees the file
# handle is closed again, and the lines are counted lazily instead of being
# loaded into memory all at once.
with open('./data/en_fa/fra.txt', 'r', encoding='utf-8') as corpus:
    corpus_len = sum(1 for _ in corpus)
print(corpus_len)

154883


In [7]:
# Sentence pairs are kept only when both sides have between MIN_LEN and
# MAX_LEN tokens (passed to read_files below).
MAX_LEN = 25
MIN_LEN = 3
# Upper bound on the number of sentence pairs taken from the corpus.
N_SENTENCES = 50000
# Mini-batch size used by the training iterator.
BATCH_SIZE = 32

In [8]:
# Load the raw corpus, normalize it and keep the length-filtered token lists.
source, target = read_files('./data/en_fa/fra.txt', max_len=MAX_LEN, min_len=MIN_LEN, n_sentences=N_SENTENCES)

In [9]:
# Persist the filtered pairs as a TSV file; it is read back with TabularDataset below.
write_file('./data/en_fa/eng-fra.txt', source, target)

In [10]:
# Source-side field: whitespace tokenization, <s>/</s> boundary tokens, and
# include_lengths=True so that each batch also carries the sequence lengths
# (they are needed later by pack_padded_sequence).
SOURCE = Field(tokenize=str.split, use_vocab=True, init_token="<s>", eos_token="</s>", lower=True, 
               include_lengths=True, batch_first=True)
# Target-side field: same preprocessing, but no lengths are returned.
TARGET = Field(tokenize=str.split, use_vocab=True, init_token="<s>", eos_token="</s>", lower=True, 
               batch_first=True)

In [11]:
# Read the TSV written above: the first column is exposed as `inputs`
# (processed by SOURCE), the second as `targets` (processed by TARGET).
train_data = TabularDataset(path='./data/en_fa/eng-fra.txt', format='tsv',
                           fields=[('inputs',SOURCE), ('targets',TARGET)])

In [12]:
# Build each field's vocabulary from the training data.
SOURCE.build_vocab(train_data)
TARGET.build_vocab(train_data)

In [13]:
# Batches group examples of similar source length; sort_within_batch=True
# additionally orders every batch by sort_key, which pack_padded_sequence
# relies on (NOTE(review): presumably in decreasing length order - confirm
# for the installed torchtext version). repeat=False makes the iterator stop
# after one pass over the data.
train_loader = BucketIterator(train_data, batch_size=BATCH_SIZE, device=DEVICE,
                              sort_key=lambda x: len(x.inputs), sort_within_batch=True, repeat=False) 

In [14]:
# Grab a single batch to experiment with in the cells below.
for batch in train_loader:
    break

## architecture

![](./pics/encoder_decoder_att.png)

In [15]:
V_so = len(SOURCE.vocab)  # source vocabulary size
V_ta = len(TARGET.vocab)  # target vocabulary size
E = 10  # embedding size for the toy experiment below
H = 5  # GRU hidden size (per direction) for the toy experiment below

In [16]:
V_so, V_ta

(5902, 10240)

In [34]:
# Toy modules for inspecting tensor shapes: an embedding table and a 3-layer
# bidirectional GRU, both moved to the GPU (these calls require CUDA).
embed = nn.Embedding(V_so, E).cuda()
rnn = nn.GRU(E, H, 3, batch_first=True, bidirectional=True).cuda()

In [35]:
# SOURCE was declared with include_lengths=True, so batch.inputs is a
# (padded token-id tensor, lengths) pair.
inputs, lengths = batch.inputs

In [36]:
inputs.size()

torch.Size([32, 8])

In [37]:
embeded = embed(inputs)

In [38]:
embeded[0].size()

torch.Size([8, 10])

In [39]:
# Pack the padded batch using the per-sequence lengths so that the GRU does
# not process padding positions.
packed = pack_padded_sequence(embeded, lengths.tolist(), batch_first=True)

In [40]:
packed.data.size()

torch.Size([256, 10])

In [41]:
output, hidden = rnn(packed)

In [42]:
output

PackedSequence(data=tensor([[ 0.3438, -0.2094,  0.0553,  ..., -0.0368,  0.3782, -0.0662],
        [ 0.3376, -0.2095,  0.0582,  ..., -0.0385,  0.4570, -0.0491],
        [ 0.3617, -0.2198,  0.0587,  ..., -0.1037,  0.3921, -0.0623],
        ...,
        [ 0.6185, -0.3715,  0.4607,  ...,  0.1606,  0.2110, -0.0422],
        [ 0.6751, -0.3095,  0.4269,  ...,  0.1482,  0.2391, -0.0927],
        [ 0.6642, -0.3478,  0.4251,  ...,  0.1590,  0.2273, -0.0820]], device='cuda:0'), batch_sizes=tensor([ 32,  32,  32,  32,  32,  32,  32,  32]))

In [43]:
# Unpack the GRU output back into a padded, batch-first tensor plus the
# original sequence lengths.
outputs, output_lengths = pad_packed_sequence(output, batch_first=True)

In [44]:
outputs.size()

torch.Size([32, 8, 10])

In [45]:
hidden.size()

torch.Size([6, 32, 5])

In [46]:
outputs[-1]

tensor([[ 0.3612, -0.2010,  0.0522,  0.1332,  0.1297,  0.1936,  0.1296,
         -0.0584,  0.1640, -0.1198],
        [ 0.5923, -0.3004,  0.1281,  0.2247,  0.1404,  0.2127,  0.1010,
         -0.0364,  0.1256, -0.1407],
        [ 0.6283, -0.3849,  0.1932,  0.2269,  0.2364,  0.2150,  0.1087,
          0.0181,  0.0999, -0.0342],
        [ 0.7000, -0.4010,  0.2364,  0.2917,  0.1795,  0.2200,  0.0970,
          0.0169,  0.1096, -0.1239],
        [ 0.7212, -0.4147,  0.2818,  0.3169,  0.1630,  0.2033,  0.0865,
          0.0474,  0.2006, -0.1365],
        [ 0.7277, -0.4069,  0.3254,  0.3664,  0.1024,  0.1463,  0.0473,
          0.0985,  0.2934, -0.1801],
        [ 0.7296, -0.3676,  0.3751,  0.4147,  0.0294,  0.1270,  0.0302,
          0.1750,  0.3167, -0.2126],
        [ 0.6642, -0.3478,  0.4251,  0.4276,  0.0675,  0.0753, -0.0093,
          0.1590,  0.2273, -0.0820]], device='cuda:0')

In [53]:
# The last two entries of `hidden` are the final forward and backward states
# of the top GRU layer (per the nn.GRU h_n layout); concatenating them along
# dim 1 gives one (B, 2*H) vector per sequence.
torch.cat([h for h in hidden[-2:]], 1).size()

torch.Size([32, 10])

## Encoder

In [None]:
class Encoder(nn.Module):
    """GRU encoder over padded, batch-first batches of token ids.

    Args:
        vocab_size: size of the source vocabulary (denoted K).
        embed_size: dimensionality of the word embeddings (denoted m).
        hidden_size: GRU hidden size per direction (denoted n).
        num_layers: number of stacked GRU layers.
        bidrec: if True the GRU is bidirectional.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, bidrec=True):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidrec = bidrec
        # Number of directions the GRU runs in.
        self.n_direct = 2 if bidrec else 1

        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, bidirectional=bidrec)

    def forward(self, inputs, lengths=None):
        """Encode a batch of source sentences.

        Args:
            inputs: LongTensor of token ids, shape (B, T_x).
            lengths: true length of every sequence in the batch, as a tensor
                or a list of ints. When given, the batch is packed so the
                GRU ignores padding; pack_padded_sequence by default expects
                the batch sorted by decreasing length. When ``None``, the
                padded batch is fed to the GRU as is.

        Returns:
            output: (B, T, n_direct * n) top-layer states for every time
                step, where T is the longest length in the batch when
                ``lengths`` is given and T_x otherwise.
            hidden: (1, B, n_direct * n) final top-layer state, with the
                directions concatenated along the feature axis.
        """
        # embeded: (B, T_x, m)
        embeded = self.embed(inputs)

        if lengths is None:
            # output: (B, T_x, n_direct * n)
            # hidden: (num_layers * n_direct, B, n)
            output, hidden = self.gru(embeded)
        else:
            if torch.is_tensor(lengths):
                lengths = lengths.tolist()
            packed = pack_padded_sequence(embeded, lengths, batch_first=True)
            # hidden: (num_layers * n_direct, B, n)
            output, hidden = self.gru(packed)
            # output: (B, T, n_direct * n), zero-filled at padded positions
            output, _ = pad_packed_sequence(output, batch_first=True)

        # hidden is ordered layer by layer, so its last n_direct entries are
        # the final states of the top layer (forward, then backward).
        # Concatenate them per sequence and restore a leading axis.
        hidden = torch.cat(list(hidden[-self.n_direct:]), 1).unsqueeze(0)

        return output, hidden

## Decoder