https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/text/nmt_with_attention.ipynb#scrollTo=sC9ArXSsVfqn

In [1]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import re
import os
import time
import unicodedata
import pandas as pd

In [3]:
!wget http://www.manythings.org/anki/fra-eng.zip
!mkdir data
!unzip /content/fra-eng.zip -d /content/data/

--2020-03-02 13:26:23--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 104.24.108.196, 2606:4700:3033::6818:6dc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5939832 (5.7M) [application/zip]
Saving to: ‘fra-eng.zip’


2020-03-02 13:26:24 (27.9 MB/s) - ‘fra-eng.zip’ saved [5939832/5939832]

Archive:  /content/fra-eng.zip
  inflating: /content/data/_about.txt  
  inflating: /content/data/fra.txt   


In [4]:
!head -n 5 /content/data/fra.txt

Go.	Va !	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)
Hi.	Salut !	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)
Hi.	Salut.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)
Run!	Cours !	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)
Run!	Courez !	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)


In [0]:
data_df = pd.read_csv('/content/data/fra.txt', sep='\t', header=None)

In [6]:
data_df.head()

Unnamed: 0,0,1,2
0,Go.,Va !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Hi.,Salut !,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Hi.,Salut.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
3,Run!,Cours !,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
4,Run!,Courez !,CC-BY 2.0 (France) Attribution: tatoeba.org #9...


In [7]:
data_df = data_df.loc[:, :1]
data_df.head()

Unnamed: 0,0,1
0,Go.,Va !
1,Hi.,Salut !
2,Hi.,Salut.
3,Run!,Cours !
4,Run!,Courez !


In [0]:
def unicode_to_ascii(s):
  """Remove combining marks (accents) from ``s``.

  The string is first decomposed with NFD, which splits an accented letter
  into its base letter followed by combining marks. Every character whose
  Unicode category is 'Mn' is then dropped. Characters that have no
  decomposition pass through unchanged, so the result is not guaranteed to
  be pure ASCII despite the function name.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)

In [0]:
def preprocess_sentence(w):
  """Normalise one raw sentence and wrap it in start/end markers.

  The sentence is lower-cased, stripped of accents via ``unicode_to_ascii``,
  its punctuation is split off into separate tokens, and every character
  other than letters and ``? . ! ,`` is replaced by a space.

  NOTE: a later cell defines ``preprocess_sentence`` again and that
  definition is the one in effect for the rest of the notebook. This version
  is kept consistent with it (apart from the extra inverted question mark
  handled there) so that running either cell yields usable tokens.

  Args:
    w: raw sentence string.

  Returns:
    The cleaned sentence as ``'<start> ... <end>'``.
  """
  w = unicode_to_ascii(w.lower().strip())

  # pad ? . ! , with spaces so each one becomes its own whitespace-separated token
  w = re.sub(r"([?.!,])", r" \1 ", w)
  # collapse runs of spaces and double quotes into a single space
  w = re.sub(r'[" "]+', " ", w)
  # anything that is not a letter or one of ? . ! , becomes a space
  w = re.sub(r"[^a-zA-Z?.!,]+", " ", w)
  w = w.strip()

  # The markers must be separated from the sentence by a space: the sentences
  # are later split on whitespace, and '<start>go' would otherwise be one token.
  w = '<start> ' + w + ' <end>'
  return w

In [0]:

def preprocess_sentence(w):
  """Clean a raw sentence and surround it with ``<start>`` / ``<end>``.

  Steps, in order:
    1. lower-case, trim, and strip accents with ``unicode_to_ascii``;
    2. put a space on both sides of ``? . ! , ¿`` so punctuation stands alone,
       eg: "he is a boy." => "he is a boy ."
       (see https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation);
    3. squeeze runs of spaces / double quotes down to one space;
    4. replace everything except a-z, A-Z and ``? . ! , ¿`` with a space;
    5. trim and add the start and end tokens, so the model knows when to
       start and stop predicting.
  """
  text = unicode_to_ascii(w.lower().strip())

  # (pattern, replacement) pairs, applied top to bottom
  substitutions = (
      (r"([?.!,¿])", r" \1 "),
      (r'[" "]+', " "),
      (r"[^a-zA-Z?.!,¿]+", " "),
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)

  return '<start> ' + text.strip() + ' <end>'

In [11]:
en_sentence = u"May I borrow this book?"

print(preprocess_sentence(en_sentence))

<start> may i borrow this book ? <end>


In [0]:
data_df['eng_preprocess'] = data_df[0].apply(preprocess_sentence)
data_df['fra_preprocess'] = data_df[1].apply(preprocess_sentence)

In [13]:
data_df.head()

Unnamed: 0,0,1,eng_preprocess,fra_preprocess
0,Go.,Va !,<start> go . <end>,<start> va ! <end>
1,Hi.,Salut !,<start> hi . <end>,<start> salut ! <end>
2,Hi.,Salut.,<start> hi . <end>,<start> salut . <end>
3,Run!,Cours !,<start> run ! <end>,<start> cours ! <end>
4,Run!,Courez !,<start> run ! <end>,<start> courez ! <end>


In [14]:
data_df.tail(1)[['eng_preprocess', 'fra_preprocess']].values

array([['<start> it may be impossible to get a completely error free corpus due to the nature of this kind of collaborative effort . however , if we encourage members to contribute sentences in their own languages rather than experiment in languages they are learning , we might be able to minimize errors . <end>',
        '<start> il est peut etre impossible d obtenir un corpus completement denue de fautes , etant donnee la nature de ce type d entreprise collaborative . cependant , si nous encourageons les membres a produire des phrases dans leurs propres langues plutot que d experimenter dans les langues qu ils apprennent , nous pourrions etre en mesure de reduire les erreurs . <end>']],
      dtype=object)

In [0]:
# Snapshot the preprocessed sentences of each language as immutable tuples.
en = tuple(data_df['eng_preprocess'].tolist())
fr = tuple(data_df['fra_preprocess'].tolist())

In [0]:
def max_length(tensor):
  """Return the length of the longest element of ``tensor``.

  ``tensor`` is any iterable of sized items (for a 2-D array this is the row
  width). An empty iterable raises ``ValueError`` from ``max``.
  """
  return max(map(len, tensor))

In [0]:
def tokenize(lang):
  """Fit a Keras tokenizer on ``lang`` and encode the same texts.

  Args:
    lang: sequence of already-preprocessed sentence strings.

  Returns:
    ``(tokenizer, padded)`` where ``tokenizer`` is the fitted
    ``Tokenizer`` and ``padded`` is the result of ``pad_sequences`` on the
    encoded sentences, padded at the end (``padding='post'``).
  """
  keras_pre = tf.keras.preprocessing

  # filters='' turns off the tokenizer's own character filtering; the
  # sentences were cleaned earlier and must keep punctuation and <start>/<end>.
  lang_tokenizer = keras_pre.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  sequences = lang_tokenizer.texts_to_sequences(lang)
  padded = keras_pre.sequence.pad_sequences(sequences, padding='post')
  return lang_tokenizer, padded

In [0]:
def load_dataset(df, eng_process, fr_process, num_examples=None):
  """Tokenize the English and French columns of ``df``.

  Args:
    df: DataFrame holding the preprocessed sentences.
    eng_process: name of the column with preprocessed English sentences.
    fr_process: name of the column with preprocessed French sentences.
    num_examples: if given, only the first ``num_examples`` rows are used
      (handy for quick experiments); ``None`` uses every row.

  Returns:
    ``(eng_token, eng_tensor, fr_token, fr_tensor)`` - for each language the
    fitted tokenizer and the padded id matrix produced by ``tokenize``.
  """
  if num_examples is not None:
    # Rebinding the local name leaves the caller's frame untouched.
    df = df.iloc[:num_examples]

  eng_token, eng_tensor = tokenize(tuple(df[eng_process].tolist()))
  fr_token, fr_tensor = tokenize(tuple(df[fr_process].tolist()))
  return eng_token, eng_tensor, fr_token, fr_tensor

In [0]:
eng_token, eng_tensor, fr_token, fr_tensor = load_dataset(df=data_df, eng_process='eng_preprocess', fr_process='fra_preprocess')

In [20]:
max_input_eng_length = max_length(eng_tensor)
print(max_input_eng_length)

max_output_fr_length = max_length(fr_tensor)
print(max_output_fr_length)

54
65


In [0]:
# Hold out 20% of the sentence pairs for validation. random_state pins the
# shuffle so the split, and every sample shown below, is the same on each run.
eng_train, eng_val, fr_train, fr_val = train_test_split(eng_tensor, fr_tensor, test_size=0.2, random_state=42)

In [22]:
# Show length
print(len(eng_train), len(fr_train), len(eng_val), len(fr_val))

139584 139584 34897 34897


In [0]:
def convert(tokenizer, tensor):
  """Print every non-zero id in ``tensor`` beside its word.

  Ids equal to 0 are skipped; each remaining id is looked up in
  ``tokenizer.index_word`` and printed as ``"<id> ---- > <word>"``.
  """
  real_ids = (t for t in tensor if t != 0)
  for token_id in real_ids:
    print(f"{token_id} ---- > {tokenizer.index_word[token_id]}")

In [24]:
print ("Input Language; index to word mapping")
convert(eng_token, eng_train[0])
print ()
print ("Target Language; index to word mapping")
convert(fr_token, fr_train[0])

Input Language; index to word mapping
1 ---- > <start>
5 ---- > you
133 ---- > must
324 ---- > pay
790 ---- > attention
6 ---- > to
53 ---- > him
3 ---- > .
2 ---- > <end>

Target Language; index to word mapping
1 ---- > <start>
16 ---- > tu
131 ---- > dois
80 ---- > lui
1186 ---- > preter
443 ---- > attention
3 ---- > .
2 ---- > <end>


## Google Translate Result
- French

  - tu serais impressionne de voir combien de temps tom a pris pour se preparer ce matin.

- English 
  - you would be amazed how long tom took to prepare this morning.

In [0]:
# Shuffle buffer as large as the training set, so shuffling covers all examples.
BUFFER_SIZE = len(eng_train)
BATCH_SIZE = 64
# Number of full batches per epoch (integer division; the partial batch is dropped below).
steps_per_epoch = len(eng_train)//BATCH_SIZE
embedding_dim = 256
# Size passed to the encoder/decoder GRUs further down.
units = 1024
# +1 because word_index starts at 1; id 0 is the value used for padding.
vocab_inp_size = len(eng_token.word_index)+1
vocab_tar_size = len(fr_token.word_index)+1

# Pair up (english, french) rows, shuffle them, then group into fixed-size batches.
dataset = tf.data.Dataset.from_tensor_slices((eng_train, fr_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True keeps every batch at exactly BATCH_SIZE rows.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [26]:
print(BUFFER_SIZE)
print(steps_per_epoch)

139584
2181


In [27]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 54]), TensorShape([64, 65]))

In [28]:
example_input_batch

<tf.Tensor: shape=(64, 54), dtype=int32, numpy=
array([[  1,  95,  62, ...,   0,   0,   0],
       [  1,  27,  15, ...,   0,   0,   0],
       [  1,   4,  23, ...,   0,   0,   0],
       ...,
       [  1,  24, 561, ...,   0,   0,   0],
       [  1,  27,  77, ...,   0,   0,   0],
       [  1, 406,   5, ...,   0,   0,   0]], dtype=int32)>

## Build Model

In [0]:
class Encoder(tf.keras.Model):
  """Embedding layer followed by a single GRU.

  ``call`` returns both the per-step GRU outputs and the final GRU state.
  The shape of each intermediate tensor is printed as it is produced.
  """

  def __init__(self, vocab_size, emb_dim, enc_units, batch_sz):
    super().__init__()
    # Hyper-parameters are kept on the instance for later inspection.
    self.vocab_size = vocab_size
    self.emb_dim = emb_dim
    self.enc_units = enc_units
    self.batch_sz = batch_sz

    self.embedding = tf.keras.layers.Embedding(vocab_size, emb_dim)
    self.gru = tf.keras.layers.GRU(enc_units, return_sequences=True, return_state=True)

  def initialize_hidden_state(self):
    """Return an all-zero initial state of shape (batch_sz, enc_units)."""
    return tf.zeros((self.batch_sz, self.enc_units))

  def call(self, x, hidden):
    """Encode token ids ``x`` starting from GRU state ``hidden``.

    Returns ``(output, state)``: the GRU output for every time step and the
    GRU's final state.
    """
    embedded = self.embedding(x)
    print(embedded.shape)
    sequence_out, final_state = self.gru(embedded, initial_state=hidden)
    print(sequence_out.shape)
    print(final_state.shape)
    return sequence_out, final_state

In [30]:
vocab_inp_size

13860

In [0]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

In [0]:
sample_hidden = encoder.initialize_hidden_state()

In [33]:
sample_hidden

<tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [34]:
example_input_batch.shape

TensorShape([64, 54])

In [35]:
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

(64, 54, 256)
(64, 54, 1024)
(64, 1024)


In [36]:
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 54, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [0]:
class Attention(tf.keras.layers.Layer):
  """Additive attention over a sequence of RNN outputs.

  For each time step a scalar score ``V(tanh(W1(hidden) + W2(output_t)))`` is
  computed, the scores are normalised with a softmax over the time axis, and
  the context vector is the weighted sum of the RNN outputs.

  Args:
    units: width of the two projection layers ``W1`` and ``W2``.
  """

  def __init__(self, units):
    super(Attention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, rnn_hidden, rnn_output):
    """Compute the context vector and attention weights.

    Args:
      rnn_hidden: query state, shape (batch, hidden_size).
      rnn_output: per-step RNN outputs, shape (batch, seq_len, hidden_size).

    Returns:
      ``(context_vector, att_weight)`` with shapes (batch, hidden_size) and
      (batch, seq_len, 1).
    """
    # Add a time axis so the query broadcasts against every step: (batch, 1, hidden_size)
    hidden = tf.expand_dims(rnn_hidden, axis=1)
    print(hidden.shape)

    score = self.W1(hidden) + self.W2(rnn_output)  # (batch, seq_len, units)
    print(score.shape)
    score = tf.nn.tanh(score)
    print(score.shape)
    score = self.V(score)  # (batch, seq_len, 1)
    print(score.shape)

    # Normalise over the time axis so the weights of one sentence sum to 1.
    att_weight = tf.nn.softmax(score, axis=1)
    print(att_weight.shape)

    # Weight the outputs with the *normalised* attention weights. Using the raw
    # scores here would bypass the softmax and return weights that are
    # unrelated to the context vector actually used.
    context_vector = att_weight * rnn_output
    print(context_vector.shape)
    context_vector = tf.reduce_sum(context_vector, axis=1)  # (batch, hidden_size)
    print(context_vector.shape)
    return context_vector, att_weight

In [38]:
attention = Attention(10)

att_vector, att_weight = attention(sample_hidden, sample_output)

(64, 1, 1024)
(64, 54, 10)
(64, 54, 10)
(64, 54, 1)
(64, 54, 1)
(64, 54, 1024)
(64, 1024)


In [39]:
att_vector.numpy().shape

(64, 1024)

In [40]:
print("Attention result shape: (batch size, units) {}".format(att_vector.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(att_weight.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 54, 1)


In [0]:
class Decoder(tf.keras.Model):
  """Single-step GRU decoder with attention over the encoder outputs.

  Each call consumes one target token per batch row, attends over
  ``enc_output`` using ``hidden`` as the query, and returns logits over the
  target vocabulary. The shape of each intermediate tensor is printed.

  Args:
    vocab_size: size of the target vocabulary (embedding rows / logit width).
    embed_dim: embedding width.
    dec_units: GRU width, also used for the attention projections.
    batch_sz: batch size (stored only).
  """

  def __init__(self, vocab_size, embed_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    # Store the constructor argument, not the module-level input-vocabulary size.
    self.vocab_size = vocab_size
    self.embed_dim = embed_dim
    self.dec_units = dec_units
    self.batch_sz = batch_sz

    self.embedding = tf.keras.layers.Embedding(vocab_size, embed_dim)
    self.gru = tf.keras.layers.GRU(dec_units, return_sequences=True, return_state=True)
    self.fc = tf.keras.layers.Dense(vocab_size)

    self.attention = Attention(self.dec_units)

  def call(self, x, hidden, enc_output):
    """Run one decoding step.

    Args:
      x: current input token ids, shape (batch, 1).
      hidden: state used as the attention query, shape (batch, units).
      enc_output: encoder outputs, shape (batch, seq_len, encoder_units).

    Returns:
      ``(logits, state, attention_weight)`` with shapes (batch, vocab_size),
      (batch, dec_units) and (batch, seq_len, 1).
    """
    # context_vector shape = (batch, encoder_units)
    context_vector, attention_weight = self.attention(hidden, enc_output)
    print('Decoder')
    print(context_vector.shape)
    print(attention_weight.shape)

    x = self.embedding(x)  # x shape = (batch, 1, emb_dim)
    print(x.shape)

    # Concatenate the context vector with the embedded token along the feature axis:
    # x shape becomes (batch, 1, encoder_units + emb_dim)
    x = tf.concat([tf.expand_dims(context_vector, axis=1), x], axis=-1)
    print(x.shape)

    # NOTE(review): `hidden` is only used as the attention query; the GRU is
    # called without initial_state. This matches the tutorial this notebook
    # follows - confirm that it is intended.
    output, state = self.gru(x)  # output shape = (batch, 1, dec_units)
    print(output.shape)
    print(state.shape)

    # Drop the length-1 time axis: (batch * 1, dec_units)
    output = tf.reshape(output, (-1, output.shape[2]))
    print(output.shape)
    x = self.fc(output)  # (batch, vocab_size)
    print(x.shape)
    return x, state, attention_weight


In [0]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [43]:
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), # the input is sequence
                                      sample_hidden, sample_output)

(64, 1, 1024)
(64, 54, 1024)
(64, 54, 1024)
(64, 54, 1)
(64, 54, 1)
(64, 54, 1024)
(64, 1024)
Decoder
(64, 1024)
(64, 54, 1)
(64, 1, 256)
(64, 1, 1280)
(64, 1, 1024)
(64, 1024)
(64, 1024)
(64, 22791)


In [44]:
tf.random.uniform((BATCH_SIZE, 1)).shape

TensorShape([64, 1])

# 0302 Start

In [0]:
optimizer = tf.keras.optimizers.Adam()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy between ``real`` ids and ``pred`` logits, ignoring padding.

  ``loss_object`` yields one loss value per position (``reduction='none'``).
  Positions whose target id is 0 are multiplied by zero before the result is
  averaged with ``tf.reduce_mean``, so they add nothing to the sum but are
  still counted in the mean's denominator.
  """
  per_position = loss_object(real, pred)
  keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_position.dtype)
  return tf.reduce_mean(per_position * keep)

In [0]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one optimisation step on a batch and return its averaged loss.

  Args:
    inp: batch of input (English) token ids.
    targ: batch of target (French) token ids; column 0 is the <start> token.
    enc_hidden: initial hidden state for the encoder.

  Returns:
    The summed per-step loss divided by the target sequence length.
  """
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    # The decoder's first attention query is the encoder's final state.
    dec_hidden = enc_hidden

    # First decoder input: the <start> id of the *target* tokenizer, one per row.
    # (The target-language tokenizer in this notebook is `fr_token`.)
    dec_input = tf.expand_dims([fr_token.word_index['<start>']] * BATCH_SIZE, 1) # [1,1,1 ... 1,1,1]

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  # Gradients are taken of the summed loss, not of the averaged batch_loss.
  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [47]:
dataset

<BatchDataset shapes: ((64, 54), (64, 65)), types: (tf.int32, tf.int32)>

In [0]:
EPOCHS = 10

# Checkpoint objects used by the periodic save inside the loop. They are
# created here so the cell does not rely on names defined nowhere else.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

for epoch in range(EPOCHS):
  start = time.time()

  # Every batch of the epoch starts from the same all-zero encoder state.
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [46]:
fr_token.word_index

{'<start>': 1,
 '<end>': 2,
 '.': 3,
 'je': 4,
 'a': 5,
 'de': 6,
 '?': 7,
 'pas': 8,
 'est': 9,
 'vous': 10,
 'que': 11,
 'il': 12,
 'la': 13,
 'ne': 14,
 'le': 15,
 'tu': 16,
 'j': 17,
 'ce': 18,
 'n': 19,
 'tom': 20,
 'un': 21,
 'l': 22,
 'ai': 23,
 ',': 24,
 'nous': 25,
 'en': 26,
 'd': 27,
 'une': 28,
 'les': 29,
 'suis': 30,
 'me': 31,
 'c': 32,
 'pour': 33,
 'elle': 34,
 'qu': 35,
 'faire': 36,
 'ca': 37,
 '!': 38,
 's': 39,
 'dans': 40,
 'plus': 41,
 'y': 42,
 'des': 43,
 'm': 44,
 'du': 45,
 'qui': 46,
 'moi': 47,
 'tout': 48,
 'veux': 49,
 'te': 50,
 't': 51,
 'etre': 52,
 'fait': 53,
 'etait': 54,
 'avec': 55,
 'etes': 56,
 'mon': 57,
 'au': 58,
 'se': 59,
 'si': 60,
 'as': 61,
 'et': 62,
 'avez': 63,
 'sur': 64,
 'sont': 65,
 'cette': 66,
 'ils': 67,
 'es': 68,
 'son': 69,
 'pense': 70,
 'ou': 71,
 'tres': 72,
 'peux': 73,
 'votre': 74,
 'cela': 75,
 'temps': 76,
 'pourquoi': 77,
 'ete': 78,
 'dit': 79,
 'lui': 80,
 'ici': 81,
 'ma': 82,
 'sais': 83,
 'chose': 84,
 'jamais'