In [32]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, LSTM, Dense
import matplotlib.pyplot as plt

In [None]:
VOCAB_SIZE = 20000  # max_tokens of both TextVectorization layers; also the Embedding / output Dense size
ENGLISH_SEQUENCE_LENGTH = 64  # output_sequence_length of the English vectorizer (encoder input length)
FRENCH_SEQUENCE_LENGTH = 64  # output_sequence_length of the French vectorizer (decoder input/target length)
EMBEDDING_DIM = 300  # width of the token embeddings in encoder and decoder
BATCH_SIZE=8  # samples per batch when batching the vectorized dataset
HIDDEN_UNITS = 256  # LSTM units and attention projection width

## Data Preparation

In [None]:
!wget https://www.manythings.org/anki/fra-eng.zip

--2023-07-18 13:30:17--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7420323 (7.1M) [application/zip]
Saving to: ‘fra-eng.zip’


2023-07-18 13:30:17 (20.1 MB/s) - ‘fra-eng.zip’ saved [7420323/7420323]



In [None]:
!unzip "/content/fra-eng.zip" -d "/content/dataset/"

Archive:  /content/fra-eng.zip
  inflating: /content/dataset/_about.txt  
  inflating: /content/dataset/fra.txt  


In [None]:
!wc -l /content/dataset/fra.txt

217975 /content/dataset/fra.txt


In [None]:
!head -10000 /content/dataset/fra.txt > /content/dataset/fra_10000.txt

## Preprocessing

In [None]:
text_dataset = tf.data.TextLineDataset("/content/dataset/fra_10000.txt")
text_dataset

<TextLineDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [None]:
english_vectorize_layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=ENGLISH_SEQUENCE_LENGTH
)

In [None]:
french_vectorize_layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=FRENCH_SEQUENCE_LENGTH
)

In [None]:
def selector(input_text):
  """Turn one tab-separated line into teacher-forcing inputs and a target.

  Args:
    input_text: scalar string tensor; field 0 is English, field 1 is French.

  Returns:
    A pair `(inputs, target)`:
      inputs['input_1'] - English sentence (encoder input)
      inputs['input_2'] - French sentence prefixed with '[start] ' (decoder input)
      target            - French sentence suffixed with ' [end]'
    Each value is a length-1 slice of the split fields, not a scalar.
  """
  fields = tf.strings.split(input_text, '\t')
  english = fields[0:1]
  french = fields[1:2]

  model_inputs = {'input_1': english, 'input_2': '[start] ' + french}
  target = french + ' [end]'
  return model_inputs, target

In [None]:
split_dataset = text_dataset.map(selector)

In [None]:
for i in split_dataset.take(3):
  print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[start] Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! [end]'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[start] Marche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. [end]'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[start] En route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ! [end]'], dtype=object)>)


In [None]:
def separator(input_text):
  """Split one tab-separated line into (english, '[start] ' + french + ' [end]').

  Both returned values are length-1 slices of the split fields. The French side
  carries both markers so a vectorizer adapted on it sees each of them.
  """
  fields = tf.strings.split(input_text, '\t')
  english = fields[0:1]
  french_with_markers = '[start] ' + fields[1:2] + ' [end]'
  return english, french_with_markers

In [None]:
init_dataset = text_dataset.map(separator)

In [None]:
for i in init_dataset.take(3):
  print(i)

(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[start] Va ! [end]'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[start] Marche. [end]'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[start] En route ! [end]'], dtype=object)>)


In [None]:
english_training_data = init_dataset.map(lambda x,y : x) # input x,y and output x # only for english
english_vectorize_layer.adapt(english_training_data)

In [None]:
# check the vectorize layer
english_vectorize_layer.get_vocabulary()[10]

'we'

In [None]:
french_training_data = init_dataset.map(lambda x,y : y) # input x,y and output y # only the French side (with [start]/[end] markers)
french_vectorize_layer.adapt(french_training_data)

In [None]:
def vectorizer(inputs, output):
  """Replace the raw strings of one `selector` element with integer token ids.

  Args:
    inputs: dict with 'input_1' (English text) and 'input_2' (French text).
    output: French target text.

  Returns:
    `({'input_1': ids, 'input_2': ids}, target_ids)`, using the English
    vectorizer for 'input_1' and the French vectorizer for the other two.
  """
  encoder_tokens = english_vectorize_layer(inputs['input_1'])
  decoder_tokens = french_vectorize_layer(inputs['input_2'])
  target_tokens = french_vectorize_layer(output)
  return {'input_1': encoder_tokens, 'input_2': decoder_tokens}, target_tokens

In [None]:
dataset = split_dataset.map(vectorizer)

In [None]:
# check if each inputs are mapped with the adapted vectorizer
for i in dataset.take(3):
  print(i)

({'input_1': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>, 'input_2': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[ 2, 39,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>}, <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[39,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0

### Batch/Split

In [None]:
# Shuffle ONCE with a fixed order before splitting. With the default
# reshuffle_each_iteration=True the order changes on every pass over the data,
# so take()/skip() would select different samples each epoch and validation
# samples would leak into training.
NUM_SAMPLES = 10000 # number of lines kept by the `head -10000` cell above
NUM_BATCHES=int(NUM_SAMPLES/BATCH_SIZE)
NUM_TRAIN_BATCHES = int(0.9*NUM_BATCHES)

dataset = (
    dataset
    .shuffle(NUM_SAMPLES, reshuffle_each_iteration=False) # full-size buffer -> uniform shuffle
    .unbatch() # drop the leading dimension of size 1 added by the [0:1] slicing
    .batch(BATCH_SIZE)
)

# Only the training split is re-shuffled (at batch level) every epoch.
train_dataset = (
    dataset
    .take(NUM_TRAIN_BATCHES)
    .shuffle(NUM_TRAIN_BATCHES)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
val_dataset = dataset.skip(NUM_TRAIN_BATCHES).prefetch(buffer_size=tf.data.AUTOTUNE)

## Encoder-Decoder with Bahdanau Attention

- Problem with a plain RNN encoder-decoder: the decoder depends on a single fixed-size context vector
- "Neural Machine Translation by Jointly Learning to Align and Translate" -> Bahdanau Attention paper
- https://wikidocs.net/73161
- Attention mechanism: every output step is linked to all input steps through a vector of attention weights
- Bahdanau attention = additive attention

In [None]:
class Encoder(tf.keras.Model):
  """Embeds source token ids and runs them through an LSTM.

  Returns the LSTM output of every time step (`return_sequences=True`),
  i.e. a tensor of shape (bs, seq_len, units).
  """

  def __init__(self, vocab_size, embedding_dim, units):
    super().__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.units = units

  def build(self, input_shape):
    # Sub-layers are created when the model is built rather than in __init__.
    self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim) # (bs, seq_len, embed_dim)
    self.lstm = LSTM(units=self.units, return_sequences=True) # (bs, seq_len, hs)

  def call(self, input):
    # token ids -> embeddings -> per-step hidden states
    return self.lstm(self.embedding(input))

In [None]:
# Smoke-test the encoder on a dummy batch of token ids.
# Sizes come from the config cell instead of being redefined / hard-coded here.
encoder = Encoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS)
enc_out = encoder(tf.zeros([BATCH_SIZE, ENGLISH_SEQUENCE_LENGTH]))
enc_out.shape # (bs, seq_len, hs)

TensorShape([8, 64, 256])

In [None]:
class BahdanauAttention(Layer):
  """Additive attention: score = w_a(tanh(w_b(decoder_state) + w_c(encoder_states)))."""

  def __init__(self, units):
    super().__init__()
    self.units = units

  def build(self, input_shape):
    self.w_a = Dense(1)           # collapses each position to a scalar score
    self.w_b = Dense(self.units)  # projects the decoder state
    self.w_c = Dense(self.units)  # projects the encoder states

  def call(self, prev_dec_state, enc_state): # (bs, hs), (bs, seq_len, hs)
    """Return the context vector (bs, hs): attention-weighted sum of `enc_state`."""
    projected_query = self.w_b(tf.expand_dims(prev_dec_state, -2)) # (bs, 1, hs)
    projected_keys = self.w_c(enc_state) # (bs, seq_len, hs)
    scores = self.w_a(tf.nn.tanh(projected_query + projected_keys)) # (bs, seq_len, 1)

    # Normalise over the source positions (axis 1), then take the weighted sum.
    attention_weights = tf.nn.softmax(scores, axis=1) # (bs, seq_len, 1)
    return tf.reduce_sum(attention_weights * enc_state, axis=1) # (bs, hs)

In [None]:
# Smoke-test attention with a zero decoder state; sizes come from the config cell.
attention = BahdanauAttention(HIDDEN_UNITS)
context_vector = attention(tf.zeros([BATCH_SIZE, HIDDEN_UNITS]), enc_out)
context_vector.shape

TensorShape([8, 256])

In [None]:
class Decoder(tf.keras.Model):
  """Teacher-forced LSTM decoder with Bahdanau attention over the encoder states.

  At every target step the previous decoder hidden state queries the encoder
  states, the resulting context vector is concatenated with the embedding of
  the (shifted) target token, and the LSTM advances by one step.
  """

  def __init__(self, vocab_size, embedding_dim, units, sequence_length):
    super(Decoder, self).__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.units = units
    self.sequence_length = sequence_length

  def build(self, input_shape):
    self.embedding = Embedding(self.vocab_size, self.embedding_dim)
    self.lstm = LSTM(self.units, return_sequences=True, return_state=True)
    self.attention = BahdanauAttention(self.units)
    self.dense = Dense(self.vocab_size, activation="softmax")

  def call(self, enc_state, prev_dec_state, shifted_target): ## teacher forcing
    """Decode `sequence_length` steps with teacher forcing.

    Args:
      enc_state: encoder outputs, (bs, seq_len, hs).
      prev_dec_state: initial decoder hidden state, (bs, units) or (1, units);
        a leading dimension of 1 is broadcast over the batch.
      shifted_target: target token ids starting with the start marker, (bs, seq_len).

    Returns:
      Per-step vocabulary probabilities, (bs, seq_len, vocab_size).
    """
    target_embeddings = self.embedding(shifted_target) # (bs, seq_len, embed_dim)

    # Broadcast the initial hidden state to the real batch size so it can be
    # fed to the LSTM; the cell state starts at zero.
    prev_dec_state = tf.convert_to_tensor(prev_dec_state)
    batch_size = tf.shape(enc_state)[0]
    hidden_state = prev_dec_state + tf.zeros([batch_size, self.units], dtype=prev_dec_state.dtype)
    hidden_state = tf.ensure_shape(hidden_state, [None, self.units])
    cell_state = tf.zeros_like(hidden_state)

    step_outputs = []
    for t in range(self.sequence_length):
      # Attention is queried with the CURRENT hidden state, which changes every step.
      context_vector = self.attention(hidden_state, enc_state) # (bs, hs)

      # Bahdanau concatenates context vector and word embedding
      dec_input = tf.concat([context_vector, target_embeddings[:, t]], axis=-1) # (bs, hs + embed_dim)

      # Carry the recurrent state across steps. Without `initial_state` the LSTM
      # restarts from zeros on every call and never sees earlier target tokens.
      step_output, hidden_state, cell_state = self.lstm(
          tf.expand_dims(dec_input, 1), # lstm accepts 3 dim # (bs, 1, hs + embed_dim)
          initial_state=[hidden_state, cell_state],
      )
      step_outputs.append(step_output[:, 0]) # (bs, hs)

    outputs = tf.stack(step_outputs, axis=1) # (bs, seq_len, hs)
    return self.dense(outputs) # (bs, seq_len, vocab_size)

In [None]:
tf.zeros([8, 1, 256])[:, 0]

<tf.Tensor: shape=(8, 256), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [None]:
# Smoke-test the decoder on dummy inputs; sizes come from the config cell.
decoder = Decoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS, FRENCH_SEQUENCE_LENGTH)
decoder(
    enc_out,
    tf.zeros([BATCH_SIZE, HIDDEN_UNITS]),
    tf.zeros([BATCH_SIZE, FRENCH_SEQUENCE_LENGTH]),
).shape

TensorShape([8, 64, 20000])

## Training

In [34]:
## ENCODER
# The Input names 'input_1' / 'input_2' match the dict keys produced by
# `selector` / `vectorizer`, which is how the dataset features are routed.
# NOTE(review): `input` shadows the Python builtin for the rest of the kernel
# session; consider renaming once no other cell relies on this name.
input = tf.keras.Input(shape=(ENGLISH_SEQUENCE_LENGTH,), dtype="int64", name="input_1")
encoder = Encoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS)
encoder_output = encoder(input)

## DECODER
shifted_target = tf.keras.Input(shape=(FRENCH_SEQUENCE_LENGTH,), dtype="int64", name="input_2")
decoder = Decoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS, FRENCH_SEQUENCE_LENGTH)
# The initial decoder state is all zeros with a batch dimension of 1.
decoder_output = decoder(encoder_output, tf.zeros([1,HIDDEN_UNITS]), shifted_target)

## OUTPUT
# Functional model: (english ids, shifted french ids) -> per-step vocabulary probabilities.
bahdanau = tf.keras.Model([input, shifted_target], decoder_output)
bahdanau.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 64)]         0           []                               
                                                                                                  
 encoder_2 (Encoder)            (None, 64, 256)      6570368     ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, 64)]         0           []                               
                                                                                                  
 decoder_2 (Decoder)            (None, 64, 20000)    12104353    ['encoder_2[0][0]',              
                                                                  'input_2[0][0]']            

In [36]:
class BLEU(tf.keras.metrics.Metric):
  """Running mean of the clipped unigram precision (BLEU-1 without brevity penalty).

  For every sample, each predicted token counts as a match at most as many
  times as it occurs in the reference. Token id 0 is treated as padding:
  both sequences are cut at their first 0.

  The state lives in Keras weights, so it accumulates over all batches of an
  epoch and is reset by Keras between epochs. All computation uses tensor ops,
  so the metric also works in graph mode.

  TODO: add the brevity penalty and higher-order n-grams for a full BLEU score.
  """

  def __init__(self, name='bleu_score', **kwargs):
    # Forward `name` so the metric is reported under it.
    super(BLEU, self).__init__(name=name, **kwargs)
    self.total = self.add_weight(name='total', initializer='zeros') # sum of per-sample precisions
    self.count = self.add_weight(name='count', initializer='zeros') # number of samples seen

  def update_state(self, y_true, y_pred, sample_weight=None):
    """Accumulate the precision of one batch.

    Args:
      y_true: reference token ids, (bs, seq_len).
      y_pred: per-token vocabulary scores, (bs, seq_len, vocab_size).
      sample_weight: accepted for API compatibility; ignored.
    """
    pred_ids = tf.argmax(y_pred, axis=-1, output_type=tf.int64) # (bs, seq_len)
    true_ids = tf.cast(y_true, tf.int64) # (bs, seq_len)

    # A position is valid while no padding id (0) has been seen yet.
    pred_valid = tf.cast(
        tf.math.cumprod(tf.cast(tf.not_equal(pred_ids, 0), tf.int32), axis=-1), tf.bool)
    true_valid = tf.cast(
        tf.math.cumprod(tf.cast(tf.not_equal(true_ids, 0), tf.int32), axis=-1), tf.bool)

    # ref_count[b, i]: how often predicted token i occurs in the valid reference.
    pred_vs_true = tf.logical_and(
        tf.equal(pred_ids[:, :, tf.newaxis], true_ids[:, tf.newaxis, :]),
        true_valid[:, tf.newaxis, :])
    ref_count = tf.reduce_sum(tf.cast(pred_vs_true, tf.float32), axis=-1)

    # earlier_same[b, i]: how often the same token was already predicted before i.
    positions = tf.range(tf.shape(pred_ids)[1])
    is_earlier = tf.less(positions[tf.newaxis, :], positions[:, tf.newaxis]) # [i, k] = k < i
    pred_vs_pred = tf.logical_and(
        tf.equal(pred_ids[:, :, tf.newaxis], pred_ids[:, tf.newaxis, :]),
        is_earlier[tf.newaxis, :, :])
    earlier_same = tf.reduce_sum(tf.cast(pred_vs_pred, tf.float32), axis=-1)

    # Clipping: the n-th occurrence of a token matches only if the reference
    # contains that token at least n times.
    matched = tf.logical_and(pred_valid, tf.less(earlier_same, ref_count))
    num_matches = tf.reduce_sum(tf.cast(matched, tf.float32), axis=-1) # (bs,)
    num_pred = tf.reduce_sum(tf.cast(pred_valid, tf.float32), axis=-1) # (bs,)

    # Empty predictions score 0 instead of dividing by zero.
    precision = tf.math.divide_no_nan(num_matches, num_pred)

    self.total.assign_add(tf.reduce_sum(precision))
    self.count.assign_add(tf.cast(tf.shape(precision)[0], tf.float32))

  def result(self):
    """Mean precision over all samples seen since the last reset."""
    return tf.math.divide_no_nan(self.total, self.count)

In [41]:
bahdanau.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(5e-4),
)

In [42]:
history = bahdanau.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
)

Epoch 1/3


KeyboardInterrupt: ignored