In [0]:
import numpy as np
import math
import re
import os
import time
from google.colab import drive


In [0]:
try:
  %tensorflow_version 2.x
except:
  pass

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds



In [105]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
with open ("/content/drive/My Drive/transformer/data/europarl-v7.fr-en.en",
           mode = "r",
           encoding = "utf-8") as f:
  europarl_en = f.read()

with open ("/content/drive/My Drive/transformer/data/europarl-v7.fr-en.fr",
           mode = "r",
           encoding = "utf-8") as f:
  europarl_fr = f.read()


with open ("/content/drive/My Drive/transformer/data/nonbreaking_prefix.en",
           mode = "r",
           encoding = "utf-8") as f:
  non_breaking_prefix_en = f.read()    


with open ("/content/drive/My Drive/transformer/data/nonbreaking_prefix.fr",
           mode = "r",
           encoding = "utf-8") as f:
  non_breaking_prefix_fr = f.read()  

In [107]:
europarl_en[:100]

'Resumption of the session\nI declare resumed the session of the European Parliament adjourned on Frid'

In [108]:
non_breaking_prefix_en[:5]

'a\nb\nc'

In [109]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [0]:
non_breaking_prefix_en = non_breaking_prefix_en.split("\n")
non_breaking_prefix_en = [' ' + pref + '.' for pref in non_breaking_prefix_en]
non_breaking_prefix_fr = non_breaking_prefix_fr.split("\n")
non_breaking_prefix_fr = [' ' + pref + '.' for pref in non_breaking_prefix_fr]


In [111]:
non_breaking_prefix_en[:5]

[' a.', ' b.', ' c.', ' d.', ' e.']

In [0]:
corpus_en = europarl_en  #Here we distinguish the above NBPrefixes from the fullstops by adding a ### after the prefix's dot, and removing the .###
for prefix in non_breaking_prefix_en:
  corpus_en = corpus_en.replace(prefix, prefix + "###")
corpus_en = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])", ".###" , corpus_en)  #In regex, whatever is in () are not removed, only occurences are searched, and | is for or
corpus_en = re.sub(r"\.###", '', corpus_en)
corpus_en = re.sub(r"  +", ' ', corpus_en)
corpus_en = corpus_en.split("\n")    #To make the huge list into a normal one with spaces


corpus_fr = europarl_fr
for prefix in non_breaking_prefix_fr:
  corpus_fr = corpus_fr.replace(prefix, prefix + "###")
corpus_fr = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])", ".###" , corpus_fr)
corpus_fr = re.sub(r"\.###", '', corpus_fr)
corpus_fr = re.sub(r"  +", ' ', corpus_fr)
corpus_fr = corpus_fr.split("\n")  

In [0]:
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    corpus_en, target_vocab_size=2**13)
tokenizer_fr = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    corpus_fr, target_vocab_size=2**13)

In [0]:
VOCAB_SIZE_EN = tokenizer_en.vocab_size + 2 #2 extra for the start and end token
VOCAB_SIZE_FR = tokenizer_fr.vocab_size + 2

In [0]:
#Prepare the inputs and target op for training like - <start> text <end>
inputs = [[VOCAB_SIZE_EN - 2] + tokenizer_en.encode(sentence) + [VOCAB_SIZE_EN - 1]
          for sentence in corpus_en]

outputs = [[VOCAB_SIZE_FR - 2] + tokenizer_fr.encode(sentence) + [VOCAB_SIZE_FR - 1] 
           for sentence in corpus_fr]          

In [0]:
#Removing extra long sentences to prevent delay in preprocessing of the corpus
MAXLEN = 20

idx_remove = [count for count, sentence in enumerate(inputs)
              if len(sentence) > MAXLEN]  #If we pass the entire corpus at once, RAM may exhaust, hence pass one by one

for idx in reversed(idx_remove): #if no reversal, one deletion will push the index forward by 1, we will delete the wrong words
  del inputs[idx]   #Since the ip and op text need to correspond to each other, we need to truncate the same idx
  del outputs[idx]    

idx_remove = [count for count, sentence in enumerate(outputs)
              if len(sentence) > MAXLEN]

for idx in reversed(idx_remove):
  del inputs[idx]
  del outputs[idx]                          

In [0]:
#Padding
#0 is a value not used by the tokenizer, hence safe to pad
#The algo used by the transformer (MASKING) will anyway not touch padded data

inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                       value = 0,
                                                       padding = 'post',
                                                       maxlen = MAXLEN)

outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs,
                                                        value = 0,
                                                        padding = 'post',
                                                        maxlen = MAXLEN)

In [0]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000 #For shuffling acc to BufSize and then batching

#Cache stores dataset in local storage, faster loading
dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) #This starts requesting data before the execution of stmt is initialized

Positional encoding formulae:

$PE_{(pos,2i)} =\sin(pos/10000^{2i/dmodel})$

$PE_{(pos,2i+1)} =\cos(pos/10000^{2i/dmodel})$

In [0]:
class PositionalEncoding(layers.Layer):
  def __init__(self):
    super(PositionalEncoding, self).__init__()  #This will initialize the Layer class properties and complete inheritance

  def get_angles(self, pos, i, d_model):
    #pos refers to the index positions of the sequence, to be encoded
    #embedding size of model is dmodel, specific iterable denoted by i
    # pos: (seq_length, 1) and #i : (1, d_model)

    angles = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model)) #// takes care of even and odd formula
    return pos * angles #shape is seq_length, d_model

  def call(self, inputs):
      #to return inputs + encoding
      seq_length = inputs.shape.as_list()[-2]
      d_model = inputs.shape.as_list()[-1]
      angles = self.get_angles(np.arange(seq_length)[:, np.newaxis], 
                               np.arange(d_model)[np.newaxis, :],
                               d_model)   #adding new dims to get # pos: (seq_length, 1) and #i : (1, d_model)
      angles[:, 0::2] = np.sin(angles[:, 0::2]) #from 0, step 2 - even
      angles[:, 1::2] = np.cos(angles[:, 1::2]) #from 1, step 2 - odd
      pos_encoding = angles[np.newaxis, ...] #This is for a batch dimension
      return inputs + tf.cast(pos_encoding, tf.float32)  #This is to cast as a tensor object


$Attention(Q, K, V ) = \text{softmax}\left(\dfrac{QK^T}{\sqrt{d_k}}\right)V $

In [0]:
def scaled_dot_product_attention(queries, keys, values, mask):
  product = tf.matmul(queries, keys, transpose_b = True)
  keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)  #The last axis corresponds to the embedding dimension
  scaled_product = product / tf.math.sqrt(keys_dim)

  if mask is not None:
    scaled_product += (mask * -1e9)  #if the value of the scaled product is zero, it will not have any effect on final ans (padded zeros) * -infinite gives 0 after softmax (NO CONTRIBUTION)

  attention = tf.matmul(tf.nn.softmax(scaled_product, axis = -1), values)  #Softmax applied across the last dimension which corr to weights of V, has to sum up to 1 (final op matrix)

  return attention

### Multi-head attention sublayer

In [0]:
class MultiHeadAttention(layers.Layer):

  def __init__(self, nb_proj):
    super(MultiHeadAttention, self).__init__()
    self.nb_proj = nb_proj

  def build(self, input_shape): #Input shape has all required dims, build is a function called when the MHA layer is called the first time, as we need d_model for the linear layers which is not accessible from init, as the layer class attributes are called, so build is used
    self.d_model = input_shape[-1]
    assert self.d_model % self.nb_proj == 0 #make sure it is divisible into subspaces

    self.d_proj = self.d_model // self.nb_proj

    self.query_lin = layers.Dense(units = self.d_model)
    self.key_lin = layers.Dense(units = self.d_model)
    self.value_lin = layers.Dense(units = self.d_model)

    self.final_lin = layers.Dense(units = self.d_model)



  def split_proj(self, inputs, batch_size): # inputs: (batch_size, seq_length, d_model)
    shape = (batch_size,
             -1,
             self.nb_proj,
             self.d_proj)
    split_inputs = tf.reshape(inputs, shape = shape)  # (batch_size, seq_length, nb_proj, d_proj)
    return tf.transpose(split_inputs, perm = [0, 2, 1, 3]) # (batch_size, nb_proj, seq_length, d_proj)


  def call(self, queries, keys, values, mask):

    batch_size = tf.shape(queries)[0]
    queries = self.query_lin(queries) 
    keys = self.key_lin(keys)
    values = self.value_lin(values) 

    queries = self.split_proj(queries, batch_size)
    keys = self.split_proj(keys, batch_size)
    values = self.split_proj(values, batch_size)

    attention = scaled_dot_product_attention(queries, keys, values, mask)

    #Recombine back into original space dims

    attention = tf.transpose(attention, perm = [0, 2, 1, 3])

    concat_attention = tf.reshape(attention,
                                  shape = (batch_size, -1, self.d_model))
    
    outputs = self.final_lin(concat_attention)

    return outputs
    

## Encoder Layer

In [0]:
class EncoderLayer(layers.Layer):
    
    def __init__(self, FFN_units, nb_proj, dropout_rate):
      super(EncoderLayer, self).__init__()
      self.FFN_units = FFN_units
      self.nb_proj = nb_proj
      self.dropout_rate = dropout_rate

    def build(self, input_shape):  
      self.d_model = input_shape[-1]

      self.multi_head_attention = MultiHeadAttention(self.nb_proj)
      self.dropout_1 = layers.Dropout(rate = self.dropout_rate)
      self.normal_1 = layers.LayerNormalization(epsilon = 1e-6)
      self.dense_1 = layers.Dense(units = self.FFN_units, activation = 'relu')
      self.dense_2 = layers.Dense(units = self.d_model)
      self.dropout_2 = layers.Dropout(rate = self.dropout_rate)
      self.normal_2 = layers.LayerNormalization(epsilon = 1e-6)

    def call(self, inputs, mask, training):
      attention = self.multi_head_attention(inputs, inputs, inputs, mask)
      attention = self.dropout_1(attention, training = training)
      attention = self.normal_1(attention + inputs) #Residual connection to ease backprop

      outputs = self.dense_1(attention)
      outputs = self.dense_2(outputs)
      outputs = self.dropout_2(outputs, training = training)
      outputs = self.normal_2(outputs + attention)

      return outputs

      #How to build this : first define the call structure as per the process
      #then go back to build and define each layer one by one



## Encoder

In [0]:
class Encoder(layers.Layer):

  def __init__(self, 
               nb_layers,
               FFN_units,
               nb_proj,
               dropout_rate,
               vocab_size,
               d_model,
               name = "encoder"
               ):
    super(Encoder, self).__init__(name = name)
    self.nb_layers = nb_layers
    self.d_model = d_model

    self.embedding = layers.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding()
    self.dropout = layers.Dropout(rate = dropout_rate)
    self.enc_layers = [EncoderLayer(FFN_units, nb_proj, dropout_rate)
                      for _ in range(nb_layers)]
  

  def call(self, inputs, mask, training):
    outputs = self.embedding(inputs)
    outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    outputs = self.pos_encoding(outputs)
    outputs = self.dropout(outputs, training)

    for i in range(self.nb_layers):
      outputs = self.enc_layers[i](outputs, mask, training)


    return outputs  

## Decoder Layer

In [0]:
class DecoderLayer(layers.Layer):

  def __init__(self, FFN_units, nb_proj, dropout_rate):
    super(DecoderLayer, self).__init__()
    self.FFN_units = FFN_units
    self.nb_proj = nb_proj
    self.dropout_rate = dropout_rate


  def build(self, input_shape):
    self.d_model = input_shape[-1]

    #Self multi-head attention
    self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
    self.dropout_1 = layers.Dropout(rate = self.dropout_rate)
    self.normal_1 = layers.LayerNormalization(epsilon = 1e-6)

    # Multi head attention combined with encoder output    
    self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
    self.dropout_2 = layers.Dropout(rate = self.dropout_rate)
    self.normal_2 = layers.LayerNormalization(epsilon = 1e-6)  

    #FFN
    self.dense_1 = layers.Dense(units = self.FFN_units, activation = 'relu')
    self.dense_2 = layers.Dense(units = self.d_model) 
    self.dropout_3 = layers.Dropout(rate = self.dropout_rate)
    self.normal_3 = layers.LayerNormalization(epsilon = 1e-6)


  def call(self, inputs, enc_outputs, mask_1, mask_2, training):  #bool train is to apply dropout or not
    attention = self.multi_head_attention_1(inputs,
                                            inputs,
                                            inputs,  
                                            mask_1)  #ip, ip, ip are queries, keys and values respectively
    attention = self.dropout_1(attention, training)
    attention = self.normal_1(attention + inputs)

    attention_2 = self.multi_head_attention_2(attention,
                                              enc_outputs, 
                                              enc_outputs,
                                              mask_2)
    attention_2 = self.dropout_2(attention_2, training)
    attention_2 = self.normal_2(attention_2 + attention)

    outputs = self.dense_1(attention_2)
    outputs = self.dense_2(outputs)
    outputs = self.dropout_3(outputs, training)
    outputs = self.normal_3(outputs + attention_2)

    return outputs


## Decoder

In [0]:
class Decoder(layers.Layer):

  def __init__(self, 
               nb_layers,
               FFN_units,
               nb_proj,
               dropout_rate,
               vocab_size,
               d_model,
               name = "decoder"):
    super(Decoder, self).__init__(name = name)
    self.d_model = d_model
    self.nb_layers = nb_layers

    self.embedding = layers.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding()
    self.dropout = layers.Dropout(rate = dropout_rate)

    self.dec_layers = [DecoderLayer(FFN_units,
                                    nb_proj,
                                    dropout_rate)
                      for _ in range(nb_layers)]

    
  def call(self, inputs, enc_outputs, mask_1, mask_2, training):
    outputs = self.embedding(inputs)
    outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    outputs = self.pos_encoding(outputs)
    outputs = self.dropout(outputs, training)

    for i in range(self.nb_layers):
      outputs = self.dec_layers[i](outputs,
                                   enc_outputs,
                                   mask_1,
                                   mask_2,
                                   training)

      return outputs  

## Transformer

In [0]:
#Here we also define the 2 masks used

class Transformer(tf.keras.Model):

  def __init__(self,
               vocab_size_enc,
               vocab_size_dec,
               d_model,
               nb_layers,
               FFN_units,
               nb_proj,
               dropout_rate,
               name = "transformer"):
    super(Transformer, self).__init__(name = name)

    self.encoder = Encoder(nb_layers,
                           FFN_units,
                           nb_proj,
                           dropout_rate,
                           vocab_size_enc,
                           d_model)
    
    self.decoder = Decoder(nb_layers,
                           FFN_units,
                           nb_proj,
                           dropout_rate,
                           vocab_size_dec,
                           d_model)    
    
    self.last_linear = layers.Dense(units = vocab_size_dec, name = "lin_output")

  def create_padding_mask(self, seq):  #Seq here is tokenized, each word is a num, its not embedded yet

    mask = tf.cast(tf.math.equal(seq, 0), tf.float32) #check for zeros for the padding token and place the zero for the mask there
    return mask[:, tf.newaxis, tf.newaxis, :]   #first newaxis is for mask to be applied on each projection (dim = nb_proj) in the attention layer
                                                #second newaxis is a broadcasting dimension, if mask is a row vector , the dim before it is empty, hence it broadcasts and we get the required masking matrix

  def create_lookahead_mask(self, seq): #This mask is to prevent access of a future word j by word i, during train we provide the full word to the trans, but to predict a word n, it should not look beyond word n

    seq_len = tf.shape(seq)[1]
    lookahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) #the -1 is the arg position for i < j, not used, and the 0 arg is for i > j, which is filled with 1 to get a UTMatrix (left)                                              
                                                                                 #we convert this to a strict UT Right matrix so that word i does not have access to a future word j   
    return lookahead_mask #tf.max will intelligently broadcast dims to the 4dim tensor

  def call(self, enc_inputs, dec_inputs, training):
    enc_mask = self.create_padding_mask(enc_inputs)
    dec_mask_1 = tf.maximum(
        self.create_padding_mask(dec_inputs),
        self.create_lookahead_mask(dec_inputs)
    )

    #NOTE - a mask applied is a value of 1, We need to block the word i from seeing the future word j, so we use a URT matrix, where each 1 in the j > i part shows that the value is NOT CONSIDERED
    #there are 3 matrices, Q, K and V
    #Q is a matrix with the french translation in training phase
    # here K = V, but the product between K and Q is a similarity norm, to find the dependance on output french words on the input
    #This product acts as weights after softmax, for the V matrix, which is to simply reorganize all the text in V (english) for a better fit for the french translation
    #This product is like a GLOBAL matrix which provides attention to the required input parts

    dec_mask_2 = self.create_padding_mask(enc_inputs)  #For this mask, it is a mask applied to check the similarity between the french word matrix Q and english word matrix to group together the english sentence in such a way that the meaningful english words, wrt the french context word, is re-organized for a better fit
                                                       #The shape of the output - a zero is applied by the mask wherever matrix A (english K) is zero due to the padding, and hence if n zeros, last n terms in Q are not touched (decoder_dims), hence enc_input matrix decides this mask
    enc_outputs = self.encoder(enc_inputs, enc_mask, training)
    dec_outputs = self.decoder(dec_inputs,
                               enc_outputs,
                               dec_mask_1,
                               dec_mask_2,
                               training)
    
    outputs = self.last_linear(dec_outputs)

    return outputs



# Training

In [0]:
tf.keras.backend.clear_session()

# Hyper-parameters
D_MODEL = 128 # 512  These second values are those used in the Google Research paper
NB_LAYERS = 4 # 6 - Reduce all to ease computation
FFN_UNITS = 512 # 2048
NB_PROJ = 8 # 8
DROPOUT_RATE = 0.1 # 0.1

transformer = Transformer(vocab_size_enc=VOCAB_SIZE_EN,
                          vocab_size_dec=VOCAB_SIZE_FR,
                          d_model=D_MODEL,
                          nb_layers=NB_LAYERS,
                          FFN_units=FFN_UNITS,
                          nb_proj=NB_PROJ,
                          dropout_rate=DROPOUT_RATE)

In [0]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True,
                                                            reduction = "none") #indicating a softmax prediction and not to sum losses and reduce, keep each loss value instead

def loss_function(target, pred):
  mask = tf.math.logical_not(tf.math.equal(target, 0))  
  loss_ = loss_object(target, pred)   #This is to mask out all the padded zeros, to not evaluate those in the loss function
  mask = tf.cast(mask, dtype = loss_.dtype)
  loss_ *= mask  #calc each term in the loss function and mask out the padded terms

  return tf.reduce_mean(loss_)

train_loss = tf.keras.metrics.Mean(name = "train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")

In [0]:
#This is the custom learning rate to be defined

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

  def __init__(self, d_model, warmup_steps = 4000):
    super(CustomSchedule, self).__init__()

    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)  

learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                     beta_1 = 0.9,
                                     beta_2 = 0.98,
                                     epsilon = 1e-9)

In [133]:
checkpoint_path = "./drive/My Drive/transformer/ckpt/"

ckpt = tf.train.Checkpoint(transformer = transformer,
                           optimizer = optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep = 5)

if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print("Latest Checkpoint restored!")

Latest Checkpoint restored!


In [134]:
EPOCHS = 8
for epoch in range(EPOCHS):
  print("Start of epocch {}".format(epoch + 1))
  start = time.time()

  train_loss.reset_states()
  train_accuracy.reset_states()

  for (batch, (enc_inputs, targets)) in enumerate(dataset):
    dec_inputs = targets[:, :-1]
    dec_outputs_real = targets[:, 1:]
    with tf.GradientTape() as tape:
      predictions = transformer(enc_inputs, dec_inputs, True)
      loss = loss_function(dec_outputs_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(dec_outputs_real, predictions)

    if batch % 50 == 0:
      print("Epoch {} Batch {} Loss {:.4f} Accuracy {:.4}".format(
          epoch + 1, batch, train_loss.result(), train_accuracy.result()
      )) 

  ckpt_save_path = ckpt_manager.save()
  print("Saving checkpoint for epoch {} at {}".format(epoch+1,
                                                        ckpt_save_path))
  print("Time taken for 1 epoch: {} secs\n".format(time.time() - start))   




Start of epocch 1
Epoch 1 Batch 0 Loss 2.8019 Accuracy 0.2508
Epoch 1 Batch 50 Loss 2.2186 Accuracy 0.2947
Epoch 1 Batch 100 Loss 2.0657 Accuracy 0.3154
Epoch 1 Batch 150 Loss 1.9729 Accuracy 0.3268
Epoch 1 Batch 200 Loss 1.9109 Accuracy 0.3377
Epoch 1 Batch 250 Loss 1.8662 Accuracy 0.3438
Epoch 1 Batch 300 Loss 1.8287 Accuracy 0.3496
Epoch 1 Batch 350 Loss 1.7995 Accuracy 0.3545
Epoch 1 Batch 400 Loss 1.7659 Accuracy 0.3587
Epoch 1 Batch 450 Loss 1.7400 Accuracy 0.3622
Epoch 1 Batch 500 Loss 1.7191 Accuracy 0.3655
Epoch 1 Batch 550 Loss 1.7007 Accuracy 0.3681
Epoch 1 Batch 600 Loss 1.6860 Accuracy 0.3708
Epoch 1 Batch 650 Loss 1.6687 Accuracy 0.3735
Epoch 1 Batch 700 Loss 1.6549 Accuracy 0.376
Epoch 1 Batch 750 Loss 1.6429 Accuracy 0.3781
Epoch 1 Batch 800 Loss 1.6300 Accuracy 0.3801
Epoch 1 Batch 850 Loss 1.6189 Accuracy 0.382
Epoch 1 Batch 900 Loss 1.6080 Accuracy 0.3837
Epoch 1 Batch 950 Loss 1.5983 Accuracy 0.3849
Epoch 1 Batch 1000 Loss 1.5893 Accuracy 0.3861
Epoch 1 Batch 1050 L

In [0]:
def evaluate(input_sentence):
  input_sentence = \
        [VOCAB_SIZE_EN - 2] + tokenizer_en.encode(input_sentence) + [VOCAB_SIZE_EN - 1]

  enc_input = tf.expand_dims(input_sentence, axis = 0) #Batch dims

  output = tf.expand_dims([VOCAB_SIZE_FR - 2], axis = 0)  #we have only start tag now in predictions, enable batchDims

  for _ in range(MAXLEN):
    predictions = transformer(enc_input, output, False)  #Note here that the output is dec op which is also dec ip, but the dec ip has a start tag and text is shifted right, but dec op wll be shifted left rel to the dec ip and will have the extra final predicted word

    prediction = predictions[:, -1:, :]   #size of predictions is (1, seq_length, vocab_size_fr), taking the last element from seq which is at each stage the predicted word, each word is a array of preds of size vocab_size_fr

    predicted_id = tf.cast(tf.argmax(prediction, axis = -1), tf.int32)  #argmax along vocab_size arg to pull out highest of softmax preds

    if predicted_id == VOCAB_SIZE_FR - 1:
      return tf.squeeze(output, axis = 0)

    output = tf.concat([output, predicted_id], axis = -1)

  return tf.squeeze(output, axis = 0)   


In [0]:
def translate(sentence):
  output = evaluate(sentence).numpy()  #Numpy array is easier to handle than tensor

  predicted_ans = tokenizer_fr.decode(
      [i for i in output if i < VOCAB_SIZE_FR - 2]    #Do npt decode the start and end tag
  )

  print("Input : {}".format(sentence))
  print("Output: {}".format(predicted_ans))

In [137]:
translate("Yesterday")

Input : Yesterday
Output: Hier, Hier


In [138]:
translate("This is a really powerful tool!")

Input : This is a really powerful tool!
Output: C'est lÃ  un instrument de la plus puissant!


In [139]:
translate("Good morning, how are you doing?")

Input : Good morning, how are you doing?
Output: Que faisons-nous ce matin, comment faisons-nous ?
