
# **Neural machine translation with attention**


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


This notebook trains a sequence to sequence (seq2seq) model for spoken sentence to sign sentence translation. This is an advanced example that assumes some knowledge of sequence to sequence models.


In [2]:
from IPython.display import HTML
from subprocess import getoutput
s = getoutput('nvidia-smi')
print(s)
if 'K80' in s:
    gpu = 'K80'
elif 'T4' in s:
    gpu = 'T4'
elif 'P100' in s:
    gpu = 'P100'
else:
    gpu='DONT PROCEED'
display(HTML(f"<h1>{gpu}</h1>"))

/bin/sh: 1: nvidia-smi: not found


In [3]:
!pip install chart-studio

Collecting chart-studio
  Downloading chart_studio-1.1.0-py3-none-any.whl (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting retrying>=1.3.3 (from chart-studio)
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: retrying, chart-studio
Successfully installed chart-studio-1.1.0 retrying-1.3.4


In [4]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import pandas as pd
import unicodedata
import re
import numpy as np
import os
import io
import time
import string
from string import digits

import chart_studio.plotly
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
#%plotly.offline.init_notebook_mode(connected=True)
import plotly.graph_objs as go

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


##**Download and prepare the dataset**


1. Add a start and end token to each sentence.
2. Clean the sentences by removing special characters.
3. Create a word index and reverse word index (dictionaries mapping from word → id and id → word).
4. Pad each sentence to a maximum length.

In [9]:
#file_path = '/content/ISL Corpus sign glosses.csv'

# import csv

# csv_file_path = '/content/ISL Corpus sign glosses.csv'
txt_file_path = 'example.txt'

# with open(csv_file_path, 'r') as csv_file:
#     # Assuming the CSV file has a header
#     csv_reader = csv.reader(csv_file)
#     header = next(csv_reader)

#     with open(txt_file_path, 'w') as txt_file:
#         # Write header to the text file
#         txt_file.write('\t'.join(header) + '\n')

#         # Write data to the text file
#         for row in csv_reader:
#             txt_file.write('\t'.join(row) + '\n')

# print(f'Conversion complete. Text file saved at: {txt_file_path}')


In [10]:
lines = open(txt_file_path, encoding='UTF-8').read().strip().split('\n')
lines

['are you free today\tYOU FREE TODAY',
 'are you hiding something\tYOU HIDE SOMETHING',
 'bring water for me\tBRING WATER ME',
 'can i help you\tI HELP YOU',
 'can you repeat that please\tYOU REPEAT PLEASE',
 'comb your hair\tCOMB YOU HAIR',
 'congratulations\tCONGRATULATIIONS',
 'could you please talk slower\tYOU PLEASE TALK SLOWER',
 'do me a favour\tDO ME FAVOUR',
 'do not abuse him\tDONOT ABUSE HIM',
 'do not be stubborn\tDONOT BE STUBBORN',
 'do not hurt me \tDONOT HURT ME',
 'do not make me angry\tDO NOT MAKE ME ANGRY',
 'do not take it to the heart\tDO NOT TAKE IT HEART',
 'do not worry\tDO NOT WORRY',
 'do you need something\tDO YOU NEED SOMETHING',
 'go and sleep\tGO SLEEP',
 'had your food\tYOUR FOOD',
 'he came by train\tHE CAME TRAIN',
 'He is going into the room\tHE GO  INTO  ROOM',
 'he is on the way\tHE ON WAY',
 'he she is my friend\tHE SHE MY FRIEND',
 'he would be coming today\tHE COMING TODAY',
 'help me\tHELP ME',
 'hi how are you\tHI HOW YOU',
 'how are things\tHOW

In [11]:
print("total number of records: ",len(lines))

total number of records:  101


##**Clean and Preprocess the text**

1. Convert to lower case
2. Convert special characters
3. Remove Digits
4. Remove spaces
5. Add start and end tags to each sentence

In [12]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
                 if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
  w = unicode_to_ascii(w.lower().strip())

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  print(w)
  return w


In [13]:
en_sentence = u"are you free today"
deu_sentence = u"you free today"
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(deu_sentence).encode('utf-8'))

<start> are you free today <end>
<start> are you free today <end>
<start> you free today <end>
b'<start> you free today <end>'


##**Generate pairs of cleaned English and sign  sentences with start and end added**

In [14]:
# Generate pairs of cleaned English and Deutch sentences
sent_pairs = []
#due to memory constraints we'd not be using the whole data (227080 sentences)
for line in lines[:102]:
    sent_pair = []
    sentence = line.rstrip().split('\t')[0]
    target = line.rstrip().split('\t')[1]
    sentence = preprocess_sentence(sentence)
    sent_pair.append(sentence)
    target = preprocess_sentence(target)
    sent_pair.append(target)
    sent_pairs.append(sent_pair)
sent_pairs[:102]

<start> are you free today <end>
<start> you free today <end>
<start> are you hiding something <end>
<start> you hide something <end>
<start> bring water for me <end>
<start> bring water me <end>
<start> can i help you <end>
<start> i help you <end>
<start> can you repeat that please <end>
<start> you repeat please <end>
<start> comb your hair <end>
<start> comb you hair <end>
<start> congratulations <end>
<start> congratulatiions <end>
<start> could you please talk slower <end>
<start> you please talk slower <end>
<start> do me a favour <end>
<start> do me favour <end>
<start> do not abuse him <end>
<start> donot abuse him <end>
<start> do not be stubborn <end>
<start> donot be stubborn <end>
<start> do not hurt me <end>
<start> donot hurt me <end>
<start> do not make me angry <end>
<start> do not make me angry <end>
<start> do not take it to the heart <end>
<start> do not take it heart <end>
<start> do not worry <end>
<start> do not worry <end>
<start> do you need something <end>
<st

[['<start> are you free today <end>', '<start> you free today <end>'],
 ['<start> are you hiding something <end>',
  '<start> you hide something <end>'],
 ['<start> bring water for me <end>', '<start> bring water me <end>'],
 ['<start> can i help you <end>', '<start> i help you <end>'],
 ['<start> can you repeat that please <end>',
  '<start> you repeat please <end>'],
 ['<start> comb your hair <end>', '<start> comb you hair <end>'],
 ['<start> congratulations <end>', '<start> congratulatiions <end>'],
 ['<start> could you please talk slower <end>',
  '<start> you please talk slower <end>'],
 ['<start> do me a favour <end>', '<start> do me favour <end>'],
 ['<start> do not abuse him <end>', '<start> donot abuse him <end>'],
 ['<start> do not be stubborn <end>', '<start> donot be stubborn <end>'],
 ['<start> do not hurt me <end>', '<start> donot hurt me <end>'],
 ['<start> do not make me angry <end>', '<start> do not make me angry <end>'],
 ['<start> do not take it to the heart <end>',


In [15]:
lines

['are you free today\tYOU FREE TODAY',
 'are you hiding something\tYOU HIDE SOMETHING',
 'bring water for me\tBRING WATER ME',
 'can i help you\tI HELP YOU',
 'can you repeat that please\tYOU REPEAT PLEASE',
 'comb your hair\tCOMB YOU HAIR',
 'congratulations\tCONGRATULATIIONS',
 'could you please talk slower\tYOU PLEASE TALK SLOWER',
 'do me a favour\tDO ME FAVOUR',
 'do not abuse him\tDONOT ABUSE HIM',
 'do not be stubborn\tDONOT BE STUBBORN',
 'do not hurt me \tDONOT HURT ME',
 'do not make me angry\tDO NOT MAKE ME ANGRY',
 'do not take it to the heart\tDO NOT TAKE IT HEART',
 'do not worry\tDO NOT WORRY',
 'do you need something\tDO YOU NEED SOMETHING',
 'go and sleep\tGO SLEEP',
 'had your food\tYOUR FOOD',
 'he came by train\tHE CAME TRAIN',
 'He is going into the room\tHE GO  INTO  ROOM',
 'he is on the way\tHE ON WAY',
 'he she is my friend\tHE SHE MY FRIEND',
 'he would be coming today\tHE COMING TODAY',
 'help me\tHELP ME',
 'hi how are you\tHI HOW YOU',
 'how are things\tHOW

##**Create a class to map every word to an index and vice-versa for any given vocabulary.**

In [16]:
# This class creates a word -> index mapping (e.g,. "dad" -> 5) and vice-versa
# (e.g., 5 -> "dad") for each language,
class LanguageIndex():
    def __init__(self, lang):
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()

        self.create_index()

    def create_index(self):
        for phrase in self.lang:
            self.vocab.update(phrase.split(' '))

        self.vocab = sorted(self.vocab)

        self.word2idx['<pad>'] = 0
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1

        for word, index in self.word2idx.items():
            self.idx2word[index] = word

In [17]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [18]:
sent_pairs

[['<start> are you free today <end>', '<start> you free today <end>'],
 ['<start> are you hiding something <end>',
  '<start> you hide something <end>'],
 ['<start> bring water for me <end>', '<start> bring water me <end>'],
 ['<start> can i help you <end>', '<start> i help you <end>'],
 ['<start> can you repeat that please <end>',
  '<start> you repeat please <end>'],
 ['<start> comb your hair <end>', '<start> comb you hair <end>'],
 ['<start> congratulations <end>', '<start> congratulatiions <end>'],
 ['<start> could you please talk slower <end>',
  '<start> you please talk slower <end>'],
 ['<start> do me a favour <end>', '<start> do me favour <end>'],
 ['<start> do not abuse him <end>', '<start> donot abuse him <end>'],
 ['<start> do not be stubborn <end>', '<start> donot be stubborn <end>'],
 ['<start> do not hurt me <end>', '<start> donot hurt me <end>'],
 ['<start> do not make me angry <end>', '<start> do not make me angry <end>'],
 ['<start> do not take it to the heart <end>',


##**Tokenization and Padding**

In [19]:
def load_dataset(pairs, num_examples):
    # pairs => already created cleaned input, output pairs

    # index language using the class defined above
    # Combine input and target sentences into a single list
    all_sentences = [en for en, de in pairs] + [de for en, de in pairs]

    # Create a single LanguageIndex instance for both languages
    lang = LanguageIndex(all_sentences)

    # Vectorize the input and target languages
    # spoken sentence and sign sentence
    input_tensor = [[lang.word2idx[s] for s in en.split(' ')] for en, de in pairs]
    target_tensor = [[lang.word2idx[s] for s in de.split(' ')] for en, de in pairs]
    print(input_tensor)
    print(target_tensor)
    # Calculate max_length of input and output tensor
    # Here, we'll set those to the longest sentence in the dataset
    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

    # Padding the input and output tensor to the maximum length
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor,
                                                                maxlen=max_length_inp,
                                                                padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor,
                                                                  maxlen=max_length_tar,
                                                                  padding='post')

    return input_tensor, target_tensor, lang, max_length_inp, max_length_tar

In [20]:
input_tensor, target_tensor, lang, max_length_inp, max_length_targ = load_dataset(sent_pairs, len(lines))

[[4, 18, 192, 60, 166, 3], [4, 18, 192, 80, 146, 3], [4, 25, 176, 59, 99, 3], [4, 28, 85, 77, 192, 3], [4, 28, 192, 130, 157, 125, 3], [4, 36, 193, 70, 3], [4, 39, 3], [4, 40, 192, 125, 154, 140, 3], [4, 48, 99, 5, 54, 3], [4, 48, 111, 7, 81, 3], [4, 48, 111, 20, 150, 3], [4, 48, 111, 84, 99, 3], [4, 48, 111, 98, 99, 14, 3], [4, 48, 111, 153, 89, 165, 158, 76, 3], [4, 48, 111, 189, 3], [4, 48, 192, 107, 146, 3], [4, 64, 13, 139, 3], [4, 69, 193, 58, 3], [4, 74, 27, 26, 167, 3], [4, 74, 88, 65, 87, 158, 131, 3], [4, 74, 88, 117, 158, 177, 3], [4, 74, 135, 88, 105, 61, 3], [4, 74, 190, 20, 37, 166, 3], [4, 77, 99, 3], [4, 78, 82, 18, 192, 3], [4, 82, 18, 160, 3], [4, 82, 28, 85, 77, 192, 3], [4, 82, 28, 85, 168, 192, 3], [4, 82, 43, 192, 3], [4, 82, 116, 18, 192, 3], [4, 85, 12, 9, 3], [4, 85, 12, 8, 114, 157, 3], [4, 85, 12, 42, 3], [4, 85, 12, 55, 24, 3], [4, 85, 12, 55, 34, 3], [4, 85, 12, 57, 2, 156, 192, 137, 3], [4, 85, 12, 83, 3], [4, 85, 12, 86, 46, 181, 165, 48, 3], [4, 85, 12, 

##**Creating training and validation sets using an 80-20 split**

In [21]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.1, random_state = 101)

# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(90, 90, 11, 11)

In [22]:
input_tensor_train


array([[  4, 155,  99, 169,   3,   0,   0,   0,   0,   0],
       [  4,  85,  12, 164,   3,   0,   0,   0,   0,   0],
       [  4,  64,  13, 139,   3,   0,   0,   0,   0,   0],
       [  4,  85,  53,   5,  96,   3,   0,   0,   0,   0],
       [  4,  85,  12,   8, 114, 157,   3,   0,   0,   0],
       [  4, 181,  73, 192, 124,  59, 193,  30,   3,   0],
       [  4,  48, 111, 189,   3,   0,   0,   0,   0,   0],
       [  4,  82,  18, 160,   3,   0,   0,   0,   0,   0],
       [  4,  85,  67,  84,   3,   0,   0,   0,   0,   0],
       [  4,  74, 135,  88, 105,  61,   3,   0,   0,   0],
       [  4,  28,  85,  77, 192,   3,   0,   0,   0,   0],
       [  4, 181,  48, 192, 161,   3,   0,   0,   0,   0],
       [  4,  85, 175, 149,  26, 143, 118,   3,   0,   0],
       [  4,  85,  48, 111, 100,  89,   3,   0,   0,   0],
       [  4,  85,  48, 111,  95,  89,   3,   0,   0,   0],
       [  4, 179, 158, 136,   3,   0,   0,   0,   0,   0],
       [  4,  89,  49, 111,  98,  15,  45, 165,  99,   3

In [23]:
target_tensor_train

array([[  4, 155,  99, 169,   3,   0,   0,   0,   0,   0],
       [  4,  85, 164,   3,   0,   0,   0,   0,   0,   0],
       [  4,  64, 139,   3,   0,   0,   0,   0,   0,   0],
       [  4,  85,  53,   5,  96,   3,   0,   0,   0,   0],
       [  4,  85,   8, 157,   3,   0,   0,   0,   0,   0],
       [  4, 181,  73, 192, 123, 193,  30,   3,   0,   0],
       [  4,  48, 111, 189,   3,   0,   0,   0,   0,   0],
       [  4,  82, 160,   3,   0,   0,   0,   0,   0,   0],
       [  4,  85,  67,  84,   3,   0,   0,   0,   0,   0],
       [  4,  74, 135, 105,  61,   3,   0,   0,   0,   0],
       [  4,  85,  77, 192,   3,   0,   0,   0,   0,   0],
       [  4, 181,  48, 192, 161,   3,   0,   0,   0,   0],
       [  4,  85, 149,  26, 145,   3,   0,   0,   0,   0],
       [  4,  85,  48, 111, 100,  89,   3,   0,   0,   0],
       [  4,  85,  48, 111,  95,  89,   3,   0,   0,   0],
       [  4, 179, 136,   3,   0,   0,   0,   0,   0,   0],
       [  4,  89,  48, 111,  98,  15,  45, 165,  99,   3

##**Create a tf.data dataset**

In [95]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 12
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 190
units =   64
vocab_inp_size = len(lang.word2idx)
vocab_tar_size = len(lang.word2idx)
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [96]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([12, 10]), TensorShape([12, 10]))

##**Define the encoder and decoder network**

In [97]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_activation='sigmoid',
                                   recurrent_initializer='glorot_uniform',
                                       dropout=0.4)

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [98]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units)', sample_output.shape)
print('Encoder Hidden state shape: (batch size, units)', sample_hidden.shape)

Encoder output shape: (batch size, sequence length, units) (12, 10, 64)
Encoder Hidden state shape: (batch size, units) (12, 64)


In [99]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform',
                                       dropout=0.4)
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, hidden, enc_output):

        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying tanh(FC(EO) + FC(H)) to self.V
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size * 1, vocab)
        x = self.fc(output)

        return x, state, attention_weights

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

In [100]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print('Decoder output shape: (batch_size, vocab size)', sample_decoder_output.shape)

Decoder output shape: (batch_size, vocab size) (12, 195)


## **Define the optimizer and the loss function**

In [101]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')


def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

## **Checkpoints (Object-based saving)**

In [102]:
checkpoint_dir = '/content/drive/MyDrive/training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## **Training**

1. Pass the *input* through the *encoder* which return *encoder output* and the *encoder hidden state*.
2. The encoder output, encoder hidden state and the decoder input (which is the *start token*) is passed to the decoder.
3. The decoder returns the *predictions* and the *decoder hidden state*.
4. The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.
5. Use *teacher forcing* to decide the next input to the decoder.
6. *Teacher forcing* is the technique where the *target word* is passed as the *next input* to the decoder.
7. The final step is to calculate the gradients and apply it to the optimizer and backpropagate.

In [103]:
EPOCHS = 100

for epoch in range(EPOCHS):
    start = time.time()

    hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset):
        loss = 0

        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, hidden)

            dec_hidden = enc_hidden

            dec_input = tf.expand_dims([lang.word2idx['<start>']] * BATCH_SIZE, 1)

            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
                loss += loss_function(targ[:, t], predictions)

                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)

        batch_loss = (loss / int(targ.shape[1]))

        total_loss += batch_loss

        variables = encoder.variables + decoder.variables

        gradients = tape.gradient(loss, variables)

        optimizer.apply_gradients(zip(gradients, variables))

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every epoch
    checkpoint.save(file_prefix=checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / N_BATCH))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


Epoch 1 Batch 0 Loss 2.2863
Epoch 1 Loss 2.4235
Time taken for 1 epoch 2.710221767425537 sec

Epoch 2 Batch 0 Loss 2.0462
Epoch 2 Loss 2.4581
Time taken for 1 epoch 2.6727254390716553 sec

Epoch 3 Batch 0 Loss 2.4429
Epoch 3 Loss 2.3690
Time taken for 1 epoch 5.188555479049683 sec

Epoch 4 Batch 0 Loss 2.6507
Epoch 4 Loss 2.1018
Time taken for 1 epoch 2.661336660385132 sec

Epoch 5 Batch 0 Loss 1.9962
Epoch 5 Loss 1.9634
Time taken for 1 epoch 2.197526216506958 sec

Epoch 6 Batch 0 Loss 2.0301
Epoch 6 Loss 1.8708
Time taken for 1 epoch 2.6781883239746094 sec

Epoch 7 Batch 0 Loss 1.8038
Epoch 7 Loss 1.8857
Time taken for 1 epoch 2.682539939880371 sec

Epoch 8 Batch 0 Loss 1.8232
Epoch 8 Loss 1.8475
Time taken for 1 epoch 2.6559391021728516 sec

Epoch 9 Batch 0 Loss 1.7753
Epoch 9 Loss 1.8255
Time taken for 1 epoch 2.188988447189331 sec

Epoch 10 Batch 0 Loss 1.7699
Epoch 10 Loss 1.7965
Time taken for 1 epoch 2.2081196308135986 sec

Epoch 11 Batch 0 Loss 1.9457
Epoch 11 Loss 1.7721
Time

##**Inference and Testing**
* The evaluate function is similar to the training loop, except we don't use *teacher forcing* here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.
* Stop predicting when the model predicts the *end token*.
* And store the *attention weights for every time step*.

Note: The encoder output is calculated only once for one input.

In [104]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7b36fd251120>

In [105]:
def evaluate(inputs, encoder, decoder, lang, max_length_inp, max_length_targ):

    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = ''
    for i in inputs[0]:
        if i == 0:
            break
        sentence = sentence + lang.idx2word[i] + ' '
    sentence = sentence[:-1]

    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([lang.word2idx['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += lang.idx2word[predicted_id] + ' '

        if lang.idx2word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [204]:
def predict_random_val_sentence():
    actual_sent = ''
    k = np.random.randint(len(input_tensor_train))
    random_input = input_tensor_train[k]
    random_output = target_tensor_train[k]
    random_input = np.expand_dims(random_input, 0)
    result, sentence, attention_plot = evaluate(random_input, encoder, decoder, lang, max_length_inp, max_length_targ)
    candidate_translation = result[:-6]  # Exclude start and end tokens
    print('Input: {}'.format(sentence[8:-6]))
    print('Predicted translation: {}'.format(candidate_translation))
    for i in random_output:
        if i == 0:
            break
        actual_sent = actual_sent + lang.idx2word[i] + ' '
    actual_sent = actual_sent[8:-7]
    print('Actual translation: {}'.format(actual_sent))
    reference_translation = actual_sent
    attention_plot = attention_plot[:len(result.split(' '))-2, 1:len(sentence.split(' '))-1]
    sentence, result = sentence.split(' '), result.split(' ')
    sentence = sentence[1:-1]
    result = result[:-2]

    # Use plotly to generate the heatmap
    trace = go.Heatmap(z=attention_plot, x=sentence, y=result, colorscale='greens')
    data = [trace]
    iplot(data)

    return candidate_translation, reference_translation, random_input


In [275]:

candidate_translation, reference_translation, random_input = predict_random_val_sentence()



Input: are you free today
Predicted translation: you free today 
Actual translation: you free today


In [195]:

from nltk.translate.bleu_score import sentence_bleu
score = sentence_bleu([reference_translation], candidate_translation, weights=(0.25, 0.25))
print(score)

0.800737402916808


In [190]:
import nltk
from nltk.translate.bleu_score import corpus_bleu

def predict_and_print_validation_set(input_tensor_val, target_tensor_val, encoder, decoder, lang, max_length_inp, max_length_targ):
    predicted_translations = []
    reference_translations = []

    # Predict translations for the entire validation set
    for i in range(len(input_tensor_val)):
        random_input = np.expand_dims(input_tensor_val[i], 0)
        result, _, _ = evaluate(random_input, encoder, decoder, lang, max_length_inp, max_length_targ)
        candidate_translation = result[:-6]  # Exclude start and end tokens
        predicted_translations.append(candidate_translation)

        actual_sent = ''
        random_output = target_tensor_val[i]
        for idx in random_output:
            if idx == 0:
                break
            actual_sent = actual_sent + lang.idx2word[idx] + ' '
        reference_translation = actual_sent[8:-7]  # Exclude start and end tokens
        reference_translations.append(reference_translation)

    # Print predicted and reference translations
    for predicted, reference in zip(predicted_translations, reference_translations):
        print("Predicted: ", predicted)
        print("Reference: ", reference)
        print()

    # Calculate BLEU score
    bleu_score = corpus_bleu([[ref.split()] for ref in reference_translations], [pred.split() for pred in predicted_translations], weights=(0.25, 0.25))
    print("BLEU Score:", bleu_score)

# Usage:
predict_and_print_validation_set(input_tensor_val, target_tensor_val, encoder, decoder, lang, max_length_inp, max_length_targ)


Predicted:  i do not make any difference to hear that 
Reference:  i in dilemma what to do

Predicted:  do you need something 
Reference:  you hide something

Predicted:  you 
Reference:  speak softly

Predicted:  what you 
Reference:  what you do

Predicted:  you from 
Reference:  which college school you from

Predicted:  you cry 
Reference:  my name xxxxxxxx

Predicted:  what you do not like you do not lik
Reference:  what do you want become

Predicted:  i trust you 
Reference:  i need water

Predicted:  comb you disappointed 
Reference:  what your phone number

Predicted:  i like you 
Reference:  i cry

Predicted:  you 
Reference:  congratulatiions

BLEU Score: 0.3933172938363534
