In [33]:
from pathlib import Path
import re
import time
from zipfile import ZipFile
import unicodedata

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

import torch
import torch.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'torch.__version__: {torch.__version__}')
print(f'torch device: {device}')


torch.__version__: 1.6.0
torch device: cpu


## Dataset: Phrase pairs (2 languages)

Download [manythings.org/anki/spa-eng.zip](http://www.manythings.org/anki/spa-eng.zip) 

In [34]:
dataset_path = Path('../data/spa.txt')
if not dataset_path.exists():
    with ZipFile('../data/spa-eng.zip', 'r') as zipobj:
       # Get a list of all archived file names from the zip
       # filenames = zipobj.namelist()
       zipobj.extract('spa.txt')

    # df = pd.read_csv(filename)

In [35]:
lines = dataset_path.open(encoding='UTF-8').read().strip().split('\n')  

In [36]:
num_examples = 100  # toy problem

In [37]:
# 100 rows = toy problem
df = pd.read_csv(dataset_path, sep='\t', nrows=100, header=None, usecols=range(2)) 
df.columns = 'english spanish'.split()
df.tail()

Unnamed: 0,english,spanish
95,No way!,¡De eso nada!
96,No way!,¡Ni cagando!
97,No way!,¡Mangos!
98,No way!,¡Minga!
99,No way!,¡Ni en pedo!


In [38]:
def preprocess_sentence(s):
    """ Tokenize with simple multilingual tokenizer plus add <start> and <end> tokens
    
    Adds space between a word and the punctuation following it, so token
    >>> preprocess(" Hola!   ¿Que tal?   ")
    "Hola ! ¿ Que tal ?"
    
    Reference:
        https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    """
    s = re.sub(r'([?.!,¿""\'])', r' \1 ', s)
    s = re.sub(r'[ ]+', ' ', s)
    # replace everything with space except (a-z, A-Z, "-", ".", "?", "!", ",")
    s = re.sub(r"[^-a-zA-Z?.!,¿]+", " ", s)
    s = s.strip()
    # adding a start and an end token to the sentence so RNN will work on variable length text
    return '<start> ' + s + ' <stop>'

In [39]:
for c in df.columns:
    df[c] = df[c].apply(lambda s: unicodedata.normalize('NFD', s))
    df[c] = df[c].apply(lambda s: preprocess_sentence(s))
df.sample(5)

Unnamed: 0,english,spanish
35,<start> I ran . <stop>,<start> Corri . <stop>
36,<start> I ran . <stop>,<start> Corri a . <stop>
50,<start> Attack ! <stop>,<start> Ataque ! <stop>
89,<start> Listen . <stop>,<start> Escuchen . <stop>
53,<start> Get up . <stop>,<start> Levanta . <stop>


In [55]:
class LanguageIndex():
    """ Create vocabulary mapping and index (inverse mapping)
    
    >>> langindex = LanguageIndex(df['english'])
    >>> langindex.word2idx.items()[:3]
    {"papa": 5, ...
    >>> langindex.idx2word.items()[:3]
    {5: "papa"}
    """
    def __init__(self, phrases):
        """ `phrases` is a list of phrases in one language """
        self.word2idx = {}
        self.vocab = []
        self.idx2word = self.vocab  # this can just be a list
        self.create_index(phrases)
        
    def create_index(self, phrases):
        self.vocab = set('<start> <end> <pad>'.split())
        for phrase in phrases:
            self.vocab.update(set(phrase.split()))
        self.vocab = sorted(self.vocab)

        self.idx2word = self.vocab
        self.word2idx = dict(zip(self.vocab, range(len(self.vocab))))
    
    def __getitem__(self, tok):
        return self.word2idx.get(tok) or self.vocab[tok]

In [56]:
# index language using the class above
targetlang = "english"
sourcelang = "spanish"
inp_index = LanguageIndex(phrases=df[sourcelang].values)
targ_index = LanguageIndex(df[targetlang].values)
# Vectorize the input and target languages
input_tensors = [[inp_index.word2idx[s] for s in es.split(' ')]  for es in df[sourcelang].values.tolist()]
target_tensors = [[targ_index.word2idx[s] for s in eng.split(' ')]  for eng in df[targetlang].values.tolist()]
pd.DataFrame(input_tensors[:5])

Unnamed: 0,0,1,2,3,4
0,5,75,2,6,
1,5,76,2,6,
2,5,73,2,6,
3,5,72,123,2,6.0
4,5,42,2,6,


In [57]:
pd.DataFrame(target_tensors[:5])

Unnamed: 0,0,1,2,3
0,4,10,1,5
1,4,10,1,5
2,4,10,1,5
3,4,10,1,5
4,4,15,1,5


In [58]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [59]:
# calculate the max_length of input and output tensor
max_length_inp = max(len(t) for t in input_tensors)
max_length_tar = max(len(t) for t in target_tensors)
max_length_inp, max_length_tar

(7, 6)

In [60]:
def pad_seq(s, max_len, pad_tok_idx):
    padded = pad_tok_idx * np.ones(max_len, dtype=np.int64)  # FIXME: int16 should be pleanty
    s_len = min(max_len, len(s))
    padded[:s_len] = s[:s_len]
    return padded

In [66]:
# inplace padding
input_tensors = [pad_seq(x, max_length_inp, inp_index['<pad>']) for x in input_tensors]
target_tensors = [pad_seq(x, max_length_tar, targ_index['<pad>']) for x in target_tensors]
len(target_tensor)

100

In [67]:
# Creating training and validation sets using an 80-20 split
# input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
input_tensor_train = input_tensor_val = input_tensors
target_tensor_train = target_tensor_val = target_tensors
# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(100, 100, 100, 100)

In [68]:
from torch.utils.data import Dataset, DataLoader

In [69]:
class TranslationDataset(Dataset):
    """ Convert each vector to torch.tensor type and wrap with Dataloader() """
    def __init__(self, X, y):
        self.data = X
        self.target = y
        # FIXME: vectorize with torch.tensor
        self.length = [ np.sum(1 - np.equal(x, 0)) for x in X]
        
    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        x_len = self.length[index]
        return x,y,x_len
    
    def __len__(self):
        return len(self.data)

In [70]:
train_dataset = TranslationDataset(input_tensor_train, target_tensor_train)

BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 16
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 32
units = 32
vocab_inp_size = len(inp_index.word2idx)
vocab_tar_size = len(targ_index.word2idx)

dataset = DataLoader(train_dataset,
                     batch_size=BATCH_SIZE, 
                     drop_last=True,
                     shuffle=True)

In [71]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim, self.enc_units)
        
    def forward(self, x, lens, device=device):
        # x: batch_size, max_length 
        
        # x: batch_size, max_length, embedding_dim
        x = self.embedding(x) 
                
        # x transformed = max_len X batch_size X embedding_dim
        # x = x.permute(1,0,2)
        x = pack_padded_sequence(x, lens) # unpad
    
        self.hidden = self.initialize_hidden_state(device)
        
        # output: max_length, batch_size, enc_units
        # self.hidden: 1, batch_size, enc_units
        output, self.hidden = self.gru(x, self.hidden) # gru returns hidden state of all timesteps as well as hidden state at last timestep
        
        # pad the sequence to the max length in the batch
        output, _ = pad_packed_sequence(output)
        
        return output, self.hidden

    def initialize_hidden_state(self, device=device):
        return torch.zeros((1, self.batch_sz, self.enc_units)).to(device)

In [72]:
def sort_batch(X, y, lengths):
    """ Sort batch function to be able to use with pad_packed_sequence """
    lengths, indx = lengths.sort(dim=0, descending=True)
    X = X[indx]
    y = y[indx]
    return X.transpose(0,1), y, lengths # transpose (batch x seq) to (seq x batch)

In [88]:
print('Training set pairs (source then target):...')
for rownum, (inp, targ) in enumerate(zip(input_tensors, target_tensors)):
    print()
    print(' '.join([targ_index.idx2word[i] for i in targ]))
    print(' '.join([inp_index.idx2word[i] for i in inp]))
    if rownum > 5:
        break   

Training set pairs (source then target):...

<start> Go . <stop> <pad> <pad>
<start> Ve . <stop> <pad> <pad> <pad>

<start> Go . <stop> <pad> <pad>
<start> Vete . <stop> <pad> <pad> <pad>

<start> Go . <stop> <pad> <pad>
<start> Vaya . <stop> <pad> <pad> <pad>

<start> Go . <stop> <pad> <pad>
<start> Va yase . <stop> <pad> <pad>

<start> Hi . <stop> <pad> <pad>
<start> Hola . <stop> <pad> <pad> <pad>

<start> Run ! <stop> <pad> <pad>
<start> Corre ! <stop> <pad> <pad> <pad>

<start> Run ! <stop> <pad> <pad>
<start> Corran ! <stop> <pad> <pad> <pad>


## Testing the Encoder
Before proceeding with training, we should always try to test out model behavior such as the size of outputs just to make that things are going as expected. In PyTorch this can be done easily since everything comes in eager execution by default.

### Test Encoder

In [73]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
encoder.to(device)

# obtain one sample from the data iterator
it = iter(dataset)
inp_batch, out_batch, inp_batch_len = next(it)

# sort the batch first to be able to use with pac_pack_sequence
inp_batch_sorted, out_batch_sorted, lengths = sort_batch(inp_batch, out_batch, inp_batch_len)

enc_output, enc_hidden = encoder(inp_batch_sorted.to(device), lengths, device=device)

print('Encoder output tensor should be size (max_length, batch_size, num_enc_units):')
print(enc_output.size()) 
print(inp_batch.size())
print(out_batch.size())
print(f'inp_batch_sorted.size() seems wrong: {inp_batch_sorted.size()}')
print(f'out_batch_sorted.size() seems correct: {out_batch_sorted.size()}')
print(f'BATCH_SIZE: {BATCH_SIZE}')

Encoder output tensor should be size (max_length, batch_size, num_enc_units):
torch.Size([7, 16, 32])
torch.Size([16, 7])
torch.Size([16, 6])
inp_batch_sorted.size() seems wrong: torch.Size([7, 16])
out_batch_sorted.size() seems correct: torch.Size([16, 6])
BATCH_SIZE: 16


### Decoder

Here, we'll implement an encoder-decoder model with attention which you can read about in the TensorFlow [Neural Machine Translation (seq2seq) tutorial](https://github.com/tensorflow/nmt). This example uses a more recent set of APIs. This notebook implements the [attention equations](https://github.com/tensorflow/nmt#background-on-the-attention-mechanism) from the seq2seq tutorial. The following diagram shows that each input word is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence.

<img src="https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg" width="500" alt="attention mechanism">

The input is put through an encoder model which gives us the encoder output of shape *(batch_size, max_length, hidden_size)* and the encoder hidden state of shape *(batch_size, hidden_size)*. 

Here are the equations that are implemented:

<img src="https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg" alt="attention equation 0" width="800">
<img src="https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg" alt="attention equation 1" width="800">

We're using *Bahdanau attention*. Lets decide on notation before writing the simplified form:

* FC = Fully connected (dense) layer
* EO = Encoder output
* H = hidden state
* X = input to the decoder

And the pseudo-code:

* `score = FC(tanh(FC(EO) + FC(H)))`
* `attention weights = softmax(score, axis = 1)`. Softmax by default is applied on the last axis but here we want to apply it on the *1st axis*, since the shape of score is *(batch_size, max_length, 1)*. `Max_length` is the length of our input. Since we are trying to assign a weight to each input, softmax should be applied on that axis.
* `context vector = sum(attention weights * EO, axis = 1)`. Same reason as above for choosing axis as 1.
* `embedding output` = The input to the decoder X is passed through an embedding layer.
* `merged vector = concat(embedding output, context vector)`
* This merged vector is then given to the GRU
  
The shapes of all the vectors at each step have been specified in the comments in the code:

In [74]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, dec_units, enc_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim + self.enc_units, 
                          self.dec_units,
                          batch_first=True)
        self.fc = nn.Linear(self.enc_units, self.vocab_size)
        
        # used for attention
        self.W1 = nn.Linear(self.enc_units, self.dec_units)
        self.W2 = nn.Linear(self.enc_units, self.dec_units)
        self.V = nn.Linear(self.enc_units, 1)
    
    def forward(self, x, hidden, enc_output):
        # enc_output original: (max_length, batch_size, enc_units)
        # enc_output converted == (batch_size, max_length, hidden_size)
        enc_output = enc_output.permute(1,0,2)
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        hidden_with_time_axis = hidden.permute(1, 0, 2)
        
        # score: (batch_size, max_length, hidden_size) # Bahdanaus's
        # we get 1 at the last axis because we are applying tanh(FC(EO) + FC(H)) to self.V
        # It doesn't matter which FC we pick for each of the inputs
        score = torch.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))
        
        #score = torch.tanh(self.W2(hidden_with_time_axis) + self.W1(enc_output))
          
        # attention_weights shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        attention_weights = torch.softmax(self.V(score), dim=1)
        
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * enc_output
        context_vector = torch.sum(context_vector, dim=1)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        # takes case of the right portion of the model above (illustrated in red)
        x = self.embedding(x)
        
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        #x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # ? Looks like attention vector in diagram of source
        x = torch.cat((context_vector.unsqueeze(1), x), -1)
        
        # passing the concatenated vector to the GRU
        # output: (batch_size, 1, hidden_size)
        output, state = self.gru(x)
        
        
        # output shape == (batch_size * 1, hidden_size)
        output =  output.view(-1, output.size(2))
        
        # output shape == (batch_size * 1, vocab)
        x = self.fc(output)
        
        return x, state, attention_weights
    
    def initialize_hidden_state(self):
        return torch.zeros((1, self.batch_sz, self.dec_units))

## Testing the Decoder
Similarily, try to test the decoder.

In [75]:
# obtain one sample from the data iterator
it = iter(dataset)
x, y, x_len = next(it)

print("Input: ", x.shape)
print("Output: ", y.shape)

# sort the batch first to be able to use with pac_pack_sequence
xs, ys, lens = sort_batch(x, y, x_len)

enc_output, enc_hidden = encoder(xs.to(device), lens, device=device)
print("Encoder Output: ", enc_output.shape) # batch_size X max_length X enc_units
print("Encoder Hidden: ", enc_hidden.shape) # batch_size X enc_units (corresponds to the last state)

decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)
decoder = decoder.to(device)

#print(enc_hidden.squeeze(0).shape)

dec_hidden = enc_hidden#.squeeze(0)
dec_input = torch.tensor([[targ_index.word2idx['<start>']]] * BATCH_SIZE)
print("Decoder Input: ", dec_input.shape)
print("--------")

for t in range(1, y.size(1)):
    # enc_hidden: 1, batch_size, enc_units
    # output: max_length, batch_size, enc_units
    predictions, dec_hidden, _ = decoder(dec_input.to(device), 
                                         dec_hidden.to(device), 
                                         enc_output.to(device))
    
    print("Prediction: ", predictions.shape)
    print("Decoder Hidden: ", dec_hidden.shape)
    
    #loss += loss_function(y[:, t].to(device), predictions.to(device))
    
    dec_input = y[:, t].unsqueeze(1)
    print(dec_input.shape)
    break

Input:  torch.Size([16, 7])
Output:  torch.Size([16, 6])
Encoder Output:  torch.Size([7, 16, 32])
Encoder Hidden:  torch.Size([1, 16, 32])
Decoder Input:  torch.Size([16, 1])
--------
Prediction:  torch.Size([16, 56])
Decoder Hidden:  torch.Size([1, 16, 32])
torch.Size([16, 1])


In [79]:
criterion = nn.CrossEntropyLoss()

def loss_function(real, pred):
    """ Only consider non-zero inputs in the loss; mask needed """
    #mask = 1 - np.equal(real, 0) # assign 0 to all above 0 and 1 to all 0s
    #print(mask)
    mask = real.ge(1).type(torch.cuda.FloatTensor if device is 'gpu' else torch.FloatTensor)
    
    loss_ = criterion(pred, real) * mask 
    return torch.mean(loss_)

In [80]:
## TODO: Combine the encoder and decoder into one class
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)

encoder.to(device)
decoder.to(device)

optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), 
                       lr=0.001)

## Training
Now we start the training. We are only using 10 epochs but you can expand this to keep trainining the model for a longer period of time. Note that in this case we are teacher forcing during training. Find a more detailed explanation in the official TensorFlow [implementation](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb) of this notebook provided by the TensorFlow team. 

- Pass the input through the encoder which return encoder output and the encoder hidden state.
- The encoder output, encoder hidden state and the decoder input (which is the start token) is passed to the decoder.
- The decoder returns the predictions and the decoder hidden state.
- The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.
- Use teacher forcing to decide the next input to the decoder.
- Teacher forcing is the technique where the target word is passed as the next input to the decoder.
- The final step is to calculate the gradients and apply it to the optimizer and backpropagate.

In [90]:
EPOCHS = 300

for epoch in range(EPOCHS):
    start = time.time()
    
    encoder.train()
    decoder.train()
    
    total_loss = 0
    
    for (batch, (inp, targ, inp_len)) in enumerate(dataset):
        loss = 0
        
        xs, ys, lens = sort_batch(inp, targ, inp_len)
        enc_output, enc_hidden = encoder(xs.to(device), lens, device=device)
        dec_hidden = enc_hidden
        
        # use teacher forcing - feeding the target as the next input (via dec_input)
        dec_input = torch.tensor([[targ_index.word2idx['<start>']]] * BATCH_SIZE)
        
        # run code below for every timestep in the ys batch
        for t in range(1, ys.size(1)):
            predictions, dec_hidden, _ = decoder(dec_input.to(device), 
                                         dec_hidden.to(device), 
                                         enc_output.to(device))
            loss += loss_function(ys[:, t].to(device), predictions.to(device))
            #loss += loss_
            dec_input = ys[:, t].unsqueeze(1)
            
        
        batch_loss = (loss / int(ys.size(1)))
        total_loss += batch_loss
        
        optimizer.zero_grad()
        
        loss.backward()

        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        if not batch % 100:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.detach().item()))
        
        
    ### TODO: Save checkpoint for model
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / N_BATCH))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.3814
Epoch 1 Loss 0.3371
Time taken for 1 epoch 0.08265995979309082 sec

Epoch 2 Batch 0 Loss 0.3344
Epoch 2 Loss 0.3316
Time taken for 1 epoch 0.06778597831726074 sec

Epoch 3 Batch 0 Loss 0.4564
Epoch 3 Loss 0.3316
Time taken for 1 epoch 0.06458878517150879 sec

Epoch 4 Batch 0 Loss 0.2913
Epoch 4 Loss 0.3303
Time taken for 1 epoch 0.06866931915283203 sec

Epoch 5 Batch 0 Loss 0.3079
Epoch 5 Loss 0.3202
Time taken for 1 epoch 0.06520199775695801 sec

Epoch 6 Batch 0 Loss 0.3100
Epoch 6 Loss 0.3212
Time taken for 1 epoch 0.0650491714477539 sec

Epoch 7 Batch 0 Loss 0.3310
Epoch 7 Loss 0.3212
Time taken for 1 epoch 0.06708288192749023 sec

Epoch 8 Batch 0 Loss 0.2792
Epoch 8 Loss 0.3120
Time taken for 1 epoch 0.06485843658447266 sec

Epoch 9 Batch 0 Loss 0.2983
Epoch 9 Loss 0.3054
Time taken for 1 epoch 0.06516551971435547 sec

Epoch 10 Batch 0 Loss 0.3141
Epoch 10 Loss 0.3094
Time taken for 1 epoch 0.0668485164642334 sec

Epoch 11 Batch 0 Loss 0.2720
Epoch 11 Lo

Epoch 87 Loss 0.1053
Time taken for 1 epoch 0.0688626766204834 sec

Epoch 88 Batch 0 Loss 0.1078
Epoch 88 Loss 0.1075
Time taken for 1 epoch 0.06318998336791992 sec

Epoch 89 Batch 0 Loss 0.1333
Epoch 89 Loss 0.1007
Time taken for 1 epoch 0.06417131423950195 sec

Epoch 90 Batch 0 Loss 0.1211
Epoch 90 Loss 0.1005
Time taken for 1 epoch 0.06448531150817871 sec

Epoch 91 Batch 0 Loss 0.0783
Epoch 91 Loss 0.1030
Time taken for 1 epoch 0.06408834457397461 sec

Epoch 92 Batch 0 Loss 0.1020
Epoch 92 Loss 0.1014
Time taken for 1 epoch 0.06280398368835449 sec

Epoch 93 Batch 0 Loss 0.0855
Epoch 93 Loss 0.0998
Time taken for 1 epoch 0.06311249732971191 sec

Epoch 94 Batch 0 Loss 0.0702
Epoch 94 Loss 0.0960
Time taken for 1 epoch 0.06409311294555664 sec

Epoch 95 Batch 0 Loss 0.1335
Epoch 95 Loss 0.0964
Time taken for 1 epoch 0.06370830535888672 sec

Epoch 96 Batch 0 Loss 0.0896
Epoch 96 Loss 0.0962
Time taken for 1 epoch 0.06327104568481445 sec

Epoch 97 Batch 0 Loss 0.1609
Epoch 97 Loss 0.0938


Epoch 171 Batch 0 Loss 0.0290
Epoch 171 Loss 0.0413
Time taken for 1 epoch 0.06818175315856934 sec

Epoch 172 Batch 0 Loss 0.0626
Epoch 172 Loss 0.0385
Time taken for 1 epoch 0.06525278091430664 sec

Epoch 173 Batch 0 Loss 0.0296
Epoch 173 Loss 0.0390
Time taken for 1 epoch 0.0653834342956543 sec

Epoch 174 Batch 0 Loss 0.0404
Epoch 174 Loss 0.0409
Time taken for 1 epoch 0.06767797470092773 sec

Epoch 175 Batch 0 Loss 0.0255
Epoch 175 Loss 0.0407
Time taken for 1 epoch 0.07842254638671875 sec

Epoch 176 Batch 0 Loss 0.0301
Epoch 176 Loss 0.0394
Time taken for 1 epoch 0.07813739776611328 sec

Epoch 177 Batch 0 Loss 0.0353
Epoch 177 Loss 0.0392
Time taken for 1 epoch 0.06966280937194824 sec

Epoch 178 Batch 0 Loss 0.0324
Epoch 178 Loss 0.0401
Time taken for 1 epoch 0.06923413276672363 sec

Epoch 179 Batch 0 Loss 0.0499
Epoch 179 Loss 0.0401
Time taken for 1 epoch 0.06337594985961914 sec

Epoch 180 Batch 0 Loss 0.0404
Epoch 180 Loss 0.0381
Time taken for 1 epoch 0.06393599510192871 sec

E

Epoch 254 Loss 0.0225
Time taken for 1 epoch 0.08187651634216309 sec

Epoch 255 Batch 0 Loss 0.0253
Epoch 255 Loss 0.0217
Time taken for 1 epoch 0.06709003448486328 sec

Epoch 256 Batch 0 Loss 0.0182
Epoch 256 Loss 0.0222
Time taken for 1 epoch 0.06466889381408691 sec

Epoch 257 Batch 0 Loss 0.0221
Epoch 257 Loss 0.0217
Time taken for 1 epoch 0.06440401077270508 sec

Epoch 258 Batch 0 Loss 0.0113
Epoch 258 Loss 0.0218
Time taken for 1 epoch 0.06566095352172852 sec

Epoch 259 Batch 0 Loss 0.0261
Epoch 259 Loss 0.0215
Time taken for 1 epoch 0.06453943252563477 sec

Epoch 260 Batch 0 Loss 0.0109
Epoch 260 Loss 0.0215
Time taken for 1 epoch 0.06446719169616699 sec

Epoch 261 Batch 0 Loss 0.0139
Epoch 261 Loss 0.0214
Time taken for 1 epoch 0.06526613235473633 sec

Epoch 262 Batch 0 Loss 0.0157
Epoch 262 Loss 0.0214
Time taken for 1 epoch 0.06854724884033203 sec

Epoch 263 Batch 0 Loss 0.0221
Epoch 263 Loss 0.0201
Time taken for 1 epoch 0.06593203544616699 sec

Epoch 264 Batch 0 Loss 0.0137


In [52]:
for (inp, targ, inp_len) in dataset:
    break
print(inp)
print(targ)
print(inp_len)
#xs, ys, lens = sort_batch(inp, targ, inp_len)
#enc_output, enc_hidden = encoder(xs.to(device), lens, device=device)
#dec_hidden = enc_hidden


tensor([[   5, 5697, 5449,  ...,    0,    0,    0],
        [   5, 3873,   69,  ...,    0,    0,    0],
        [   5, 5697, 3078,  ...,    0,    0,    0],
        ...,
        [   5, 9334, 7298,  ...,    0,    0,    0],
        [   5, 3267, 4244,  ...,    0,    0,    0],
        [   5, 3267, 6040,  ...,    0,    0,    0]])
tensor([[   5, 2124, 2311, 2551, 2269, 4530,    3,    4,    0,    0,    0],
        [   5, 2124, 2549,   23, 4138,    1,    4,    0,    0,    0,    0],
        [   5, 1221, 2269, 1187, 2603,    3,    4,    0,    0,    0,    0],
        [   5, 4288, 4552, 1824,    3,    4,    0,    0,    0,    0,    0],
        [   5, 4805, 3942, 4144,    3,    4,    0,    0,    0,    0,    0],
        [   5, 4698, 2520,    6,    4,    0,    0,    0,    0,    0,    0],
        [   5, 4013, 3804,    3,    4,    0,    0,    0,    0,    0,    0],
        [   5, 2124, 3918, 4278, 3314,    3,    4,    0,    0,    0,    0],
        [   5, 2124,   59, 4362,    3,    4,    0,    0,    0,    

In [69]:
def spanish_to_english(english_sentence):
    english_tensor = [inp_index.word2idx[w] for w in english_sentence.split()]
    inp_len = len(english_tensor)
    # xs, ys, lens = sort_batch(inp, targ, inp_len)
    enc_output, enc_hidden = encoder(torch.tensor([english_tensor]*64).to(device), [inp_len]*64, device=device)
    
    # encode(english_sentence)
    return enc_output

In [70]:
spanish_to_english('<start> como estas usted ? <end>')

RuntimeError: Expected hidden size (1, 6, 1024), got [1, 64, 1024]

## Final Words
Notice that we only trained the model and that's it. In fact, this notebook is in experimental phase, so there could also be some bugs or something I missed during the process of converting code or training. Please comment your concerns here or submit it as an issue in the [GitHub version](https://github.com/omarsar/pytorch_neural_machine_translation_attention) of this notebook. I will appreciate it!

We didn't evaluate the model or analyzed it. To encourage you to practice what you have learned in the notebook, I will suggest that you try to convert the TensorFlow code used in the [original notebook](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb) and complete this notebook. I believe the code should be straightforward, the hard part was already done in this notebook. If you manage to complete it, please submit a PR on the GitHub version of this notebook. I will gladly accept your PR. Thanks for reading and hope this notebook was useful. Keep tuned for notebooks like this on my Twitter ([omarsar0](https://twitter.com/omarsar0)). 

## References

### Seq2Seq:
  - Sutskever et al. (2014) - [Sequence to Sequence Learning with Neural Networks](Sequence to Sequence Learning with Neural Networks)
  - [Sequence to sequence model: Introduction and concepts](https://towardsdatascience.com/sequence-to-sequence-model-introduction-and-concepts-44d9b41cd42d)
  - [Blog on seq2seq](https://guillaumegenthial.github.io/sequence-to-sequence.html)
  - [Bahdanau et al. (2016) NMT jointly learning to align and translate](https://arxiv.org/pdf/1409.0473.pdf)
  - [Attention is all you need](https://arxiv.org/pdf/1706.03762.pdf)