<a href="https://colab.research.google.com/github/pranaya-mathur/Deep-Learning-Projects/blob/master/Translation_with_TF_2_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[Neural Machine Translation With Attention Mechanism](https://machinetalk.org/2019/03/29/neural-machine-translation-with-attention-mechanism/)

In [1]:
import tensorflow as tf
print(tf.__version__)

2.2.0-rc2


In [0]:
import numpy as np
import unicodedata
import re

raw_data = (
    ('What a ridiculous concept!', 'Quel concept ridicule !'),
    ('Your idea is not entirely crazy.', "Votre idée n'est pas complètement folle."),
    ("A man's worth lies in what he is.", "La valeur d'un homme réside dans ce qu'il est."),
    ('What he did is very wrong.', "Ce qu'il a fait est très mal."),
    ("All three of you need to do that.", "Vous avez besoin de faire cela, tous les trois."),
    ("Are you giving me another chance?", "Me donnez-vous une autre chance ?"),
    ("Both Tom and Mary work as models.", "Tom et Mary travaillent tous les deux comme mannequins."),
    ("Can I have a few minutes, please?", "Puis-je avoir quelques minutes, je vous prie ?"),
    ("Could you close the door, please?", "Pourriez-vous fermer la porte, s'il vous plaît ?"),
    ("Did you plant pumpkins this year?", "Cette année, avez-vous planté des citrouilles ?"),
    ("Do you ever study in the library?", "Est-ce que vous étudiez à la bibliothèque des fois ?"),
    ("Don't be deceived by appearances.", "Ne vous laissez pas abuser par les apparences."),
    ("Excuse me. Can you speak English?", "Je vous prie de m'excuser ! Savez-vous parler anglais ?"),
    ("Few people know the true meaning.", "Peu de gens savent ce que cela veut réellement dire."),
    ("Germany produced many scientists.", "L'Allemagne a produit beaucoup de scientifiques."),
    ("Guess whose birthday it is today.", "Devine de qui c'est l'anniversaire, aujourd'hui !"),
    ("He acted like he owned the place.", "Il s'est comporté comme s'il possédait l'endroit."),
    ("Honesty will pay in the long run.", "L'honnêteté paye à la longue."),
    ("How do we know this isn't a trap?", "Comment savez-vous qu'il ne s'agit pas d'un piège ?"),
    ("I can't believe you're giving up.", "Je n'arrive pas à croire que vous abandonniez."),
)

In [0]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')


def normalize_string(s):
    s = unicode_to_ascii(s)
    s = re.sub(r'([!.?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    s = re.sub(r'\s+', r' ', s)
    return s

In [0]:
raw_data_en, raw_data_fr = list(zip(*raw_data))
raw_data_en, raw_data_fr = list(raw_data_en), list(raw_data_fr)

raw_data_en = [normalize_string(data) for data in raw_data_en]
raw_data_fr_in = ['<start> ' + normalize_string(data) for data in raw_data_fr]
raw_data_fr_out = [normalize_string(data) + ' <end>' for data in raw_data_fr]

In [5]:
raw_data_en[:5]

['What a ridiculous concept !',
 'Your idea is not entirely crazy .',
 'A man s worth lies in what he is .',
 'What he did is very wrong .',
 'All three of you need to do that .']

In [6]:
raw_data_fr_in[:5]

['<start> Quel concept ridicule !',
 '<start> Votre idee n est pas completement folle .',
 '<start> La valeur d un homme reside dans ce qu il est .',
 '<start> Ce qu il a fait est tres mal .',
 '<start> Vous avez besoin de faire cela tous les trois .']

In [7]:
raw_data_fr_out[:5]

['Quel concept ridicule ! <end>',
 'Votre idee n est pas completement folle . <end>',
 'La valeur d un homme reside dans ce qu il est . <end>',
 'Ce qu il a fait est tres mal . <end>',
 'Vous avez besoin de faire cela tous les trois . <end>']

![alt text](https://machinetalk.org/wp-content/uploads/2019/04/input.png)

In [8]:
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
en_tokenizer.fit_on_texts(raw_data_en)
print(en_tokenizer.word_index)

{'.': 1, 'you': 2, '?': 3, 'the': 4, 'a': 5, 'is': 6, 'he': 7, 'what': 8, 'in': 9, 'do': 10, 'can': 11, 't': 12, 'did': 13, 'giving': 14, 'me': 15, 'i': 16, 'few': 17, 'please': 18, 'this': 19, 'know': 20, 'ridiculous': 21, 'concept': 22, '!': 23, 'your': 24, 'idea': 25, 'not': 26, 'entirely': 27, 'crazy': 28, 'man': 29, 's': 30, 'worth': 31, 'lies': 32, 'very': 33, 'wrong': 34, 'all': 35, 'three': 36, 'of': 37, 'need': 38, 'to': 39, 'that': 40, 'are': 41, 'another': 42, 'chance': 43, 'both': 44, 'tom': 45, 'and': 46, 'mary': 47, 'work': 48, 'as': 49, 'models': 50, 'have': 51, 'minutes': 52, 'could': 53, 'close': 54, 'door': 55, 'plant': 56, 'pumpkins': 57, 'year': 58, 'ever': 59, 'study': 60, 'library': 61, 'don': 62, 'be': 63, 'deceived': 64, 'by': 65, 'appearances': 66, 'excuse': 67, 'speak': 68, 'english': 69, 'people': 70, 'true': 71, 'meaning': 72, 'germany': 73, 'produced': 74, 'many': 75, 'scientists': 76, 'guess': 77, 'whose': 78, 'birthday': 79, 'it': 80, 'today': 81, 'acted'

In [0]:
data_en = en_tokenizer.texts_to_sequences(raw_data_en)
data_en = tf.keras.preprocessing.sequence.pad_sequences(data_en,
                                                        padding='post')

In [10]:
print(data_en[:3])

[[ 8  5 21 22 23  0  0  0  0  0]
 [24 25  6 26 27 28  1  0  0  0]
 [ 5 29 30 31 32  9  8  7  6  1]]


In [0]:
fr_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')

# ATTENTION: always finish with fit_on_texts before moving on
fr_tokenizer.fit_on_texts(raw_data_fr_in)
fr_tokenizer.fit_on_texts(raw_data_fr_out)

data_fr_in = fr_tokenizer.texts_to_sequences(raw_data_fr_in)
data_fr_in = tf.keras.preprocessing.sequence.pad_sequences(data_fr_in,
                                                           padding='post')

data_fr_out = fr_tokenizer.texts_to_sequences(raw_data_fr_out)
data_fr_out = tf.keras.preprocessing.sequence.pad_sequences(data_fr_out,
                                                            padding='post')

In [12]:
print(data_fr_in[:3])
print("+++++++++++++")
print(data_fr_out[:3])

[[ 3 31 32 33 16  0  0  0  0  0  0  0  0  0]
 [ 3 34 35 20  6 10 36 37  2  0  0  0  0  0]
 [ 3 11 38 21 22 39 40 41 12 17  7  6  2  0]]
+++++++++++++
[[31 32 33 16  4  0  0  0  0  0  0  0  0  0]
 [34 35 20  6 10 36 37  2  4  0  0  0  0  0]
 [11 38 21 22 39 40 41 12 17  7  6  2  4  0]]


In [0]:
dataset = tf.data.Dataset.from_tensor_slices(
    (data_en, data_fr_in, data_fr_out))
dataset = dataset.shuffle(20).batch(5)

In [14]:
for i in dataset:
  print(i)

(<tf.Tensor: shape=(5, 10), dtype=int32, numpy=
array([[ 8,  5, 21, 22, 23,  0,  0,  0,  0,  0],
       [62, 12, 63, 64, 65, 66,  1,  0,  0,  0],
       [10,  2, 59, 60,  9,  4, 61,  3,  0,  0],
       [44, 45, 46, 47, 48, 49, 50,  1,  0,  0],
       [17, 70, 20,  4, 71, 72,  1,  0,  0,  0]], dtype=int32)>, <tf.Tensor: shape=(5, 14), dtype=int32, numpy=
array([[ 3, 31, 32, 33, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3, 29,  1, 74, 10, 75, 76, 18, 77,  2,  0,  0,  0,  0],
       [ 3,  6, 12, 19,  1, 71,  8, 11, 72, 28, 73,  5,  0,  0],
       [ 3, 53, 54, 55, 56, 25, 18, 57, 26, 58,  2,  0,  0,  0],
       [ 3, 82,  9, 83, 84, 12, 19, 24, 85, 86, 87,  2,  0,  0]],
      dtype=int32)>, <tf.Tensor: shape=(5, 14), dtype=int32, numpy=
array([[31, 32, 33, 16,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [29,  1, 74, 10, 75, 76, 18, 77,  2,  4,  0,  0,  0,  0],
       [ 6, 12, 19,  1, 71,  8, 11, 72, 28, 73,  5,  4,  0,  0],
       [53, 54, 55, 56, 25, 18, 57, 26, 58,  2,  4,  0,

In [0]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, lstm_size):
        super(Encoder, self).__init__()
        self.lstm_size = lstm_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.lstm = tf.keras.layers.LSTM(
            lstm_size, return_sequences=True, return_state=True)

    def call(self, sequence, states):
        embed = self.embedding(sequence)
        output, state_h, state_c = self.lstm(embed, initial_state=states)

        return output, state_h, state_c

    def init_states(self, batch_size):
        return (tf.zeros([batch_size, self.lstm_size]),
                tf.zeros([batch_size, self.lstm_size]))

![alt text](https://machinetalk.org/wp-content/uploads/2019/03/data_shapes-1.png)

In [0]:
# Creating Decoder

class Decoder(tf.keras.Model):
  def __init__(self,vocab_size,embedding_size,lstm_size):
    super(Decoder,self).__init__()

    self.lstm_size = lstm_size
    self.embedding = tf.keras.layers.Embedding(vocab_size,embedding_size)
    self.lstm = tf.keras.layers.LSTM(lstm_size,return_sequences=True,return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)


  def call(self,sequence,state):

    embed = self.embedding(sequence)
    lstm_out, state_h, state_c = self.lstm(embed,state)

    logits = self.dense(lstm_out)

    return logits, state_h, state_c

![alt text](https://machinetalk.org/wp-content/uploads/2019/03/data_shapes-2.png)

In [30]:
EMBEDDING_SIZE = 32
LSTM_SIZE = 64

en_vocab_size = len(en_tokenizer.word_index) + 1
encoder = Encoder(en_vocab_size, EMBEDDING_SIZE, LSTM_SIZE)

fr_vocab_size = len(fr_tokenizer.word_index) + 1
decoder = Decoder(fr_vocab_size, EMBEDDING_SIZE, LSTM_SIZE)

source_input = tf.constant([[1, 3, 5, 7, 2, 0, 0, 0]])
initial_state = encoder.init_states(1)
encoder_output, en_state_h, en_state_c = encoder(source_input, initial_state)

target_input = tf.constant([[1, 4, 6, 9, 2, 0, 0]])
decoder_output, de_state_h, de_state_c = decoder(target_input, (en_state_h, en_state_c))

print('Source sequences', source_input.shape)
print('Encoder outputs', encoder_output.shape)
print('Encoder state_h', en_state_h.shape)
print('Encoder state_c', en_state_c.shape)

print('\nDestination vocab size', fr_vocab_size)
print('Destination sequences', target_input.shape)
print('Decoder outputs', decoder_output.shape)
print('Decoder state_h', de_state_h.shape)
print('Decoder state_c', de_state_c.shape)


Source sequences (1, 8)
Encoder outputs (1, 8, 64)
Encoder state_h (1, 64)
Encoder state_c (1, 64)

Destination vocab size 110
Destination sequences (1, 7)
Decoder outputs (1, 7, 110)
Decoder state_h (1, 64)
Decoder state_c (1, 64)


In [0]:
def loss_func(targets, logits):
    crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)
    mask = tf.math.logical_not(tf.math.equal(targets, 0))
    mask = tf.cast(mask, dtype=tf.int64)
    loss = crossentropy(targets, logits, sample_weight=mask)

    return loss

In [0]:
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function
def train_step(source_seq, target_seq_in, target_seq_out, en_initial_states):
    with tf.GradientTape() as tape:
        en_outputs = encoder(source_seq, en_initial_states)
        en_states = en_outputs[1:]
        de_states = en_states

        de_outputs = decoder(target_seq_in, de_states)
        logits = de_outputs[0]
        loss = loss_func(target_seq_out, logits)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

In [0]:
def predict():
    test_source_text = raw_data_en[np.random.choice(len(raw_data_en))]
    print(test_source_text)
    test_source_seq = en_tokenizer.texts_to_sequences([test_source_text])
    print(test_source_seq)

    en_initial_states = encoder.init_states(1)
    en_outputs = encoder(tf.constant(test_source_seq), en_initial_states)

    de_input = tf.constant([[fr_tokenizer.word_index['<start>']]])
    de_state_h, de_state_c = en_outputs[1:]
    out_words = []

    while True:
        de_output, de_state_h, de_state_c = decoder(
            de_input, (de_state_h, de_state_c))
        de_input = tf.argmax(de_output, -1)
        out_words.append(fr_tokenizer.index_word[de_input.numpy()[0][0]])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
            break

    print(' '.join(out_words))

In [38]:
NUM_EPOCHS = 250
BATCH_SIZE = 5

for e in range(NUM_EPOCHS):
    en_initial_states = encoder.init_states(BATCH_SIZE)

    for batch, (source_seq, target_seq_in, target_seq_out) in enumerate(dataset.take(-1)):
        loss = train_step(source_seq, target_seq_in,
                          target_seq_out, en_initial_states)

    print('Epoch {} Loss {:.4f}'.format(e + 1, loss.numpy()))
    
    try:
        predict()
    except Exception:
      continue

Epoch 1 Loss 3.1537
He acted like he owned the place .
[[7, 82, 83, 7, 84, 4, 85, 1]]
vous . prie . prie prie . prie quelques quelques fermer je vous <end>
Epoch 2 Loss 2.9474
He acted like he owned the place .
[[7, 82, 83, 7, 84, 4, 85, 1]]
vous . <end>
Epoch 3 Loss 3.6079
Guess whose birthday it is today .
[[77, 78, 79, 80, 6, 81, 1]]
vous vous . <end>
Epoch 4 Loss 3.7928
Guess whose birthday it is today .
[[77, 78, 79, 80, 6, 81, 1]]
vous vous . <end>
Epoch 5 Loss 3.4363
What a ridiculous concept !
[[8, 5, 21, 22, 23]]
vous vous . <end>
Epoch 6 Loss 3.0563
Don t be deceived by appearances .
[[62, 12, 63, 64, 65, 66, 1]]
vous vous vous . <end>
Epoch 7 Loss 3.2204
Do you ever study in the library ?
[[10, 2, 59, 60, 9, 4, 61, 3]]
vous vous vous . . <end>
Epoch 8 Loss 3.4500
Could you close the door please ?
[[53, 2, 54, 4, 55, 18, 3]]
vous vous vous . . <end>
Epoch 9 Loss 3.4032
Do you ever study in the library ?
[[10, 2, 59, 60, 9, 4, 61, 3]]
vous vous vous . <end>
Epoch 10 Loss 3.106