## Install Packages and Import Libraries

In [None]:
!pip install -q --upgrade tensorflow

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m91.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m101.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tf-keras 2.15.1 requires tensorflow<2.16,>=2.15, but you have tensorflow 2.16.1 which is inc

In [None]:
# check tensorflow version
import tensorflow as tf
print(tf.__version__)

2.16.1


In [None]:
import tensorflow as tf
import keras
from google.colab import userdata
import pandas as pd
import os
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
import random
import gc

## Download Data and Preprocess It

Download [English-French Translation Dataset](https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset) from Kaggle

In [None]:
os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')

In [None]:
!kaggle datasets download -d dhruvildave/en-fr-translation-dataset

Downloading en-fr-translation-dataset.zip to /content
100% 2.54G/2.54G [02:32<00:00, 17.7MB/s]
100% 2.54G/2.54G [02:32<00:00, 17.9MB/s]


In [None]:
!unzip "/content/en-fr-translation-dataset.zip"

Archive:  /content/en-fr-translation-dataset.zip
  inflating: en-fr.csv               


Take a look at the extracted `en-fr.csv` file.

## Run Rest of Code

In [None]:
# Hyperparameters and run configuration used by the cells below.
BATCH_SIZE = 32  # mini-batch size for the tf.data pipelines — TODO confirm every `.batch(...)` call uses it
NUM_LAYERS = 4  # number of stacked layers in the encoder and in the decoder
D_MODEL = 256  # embedding / model width (d_model)
DFF = 256  # hidden width of the feed-forward blocks
NUM_HEADS = 4  # attention heads per multi-head attention layer
DROPOUT_RATE = 0.5  # dropout rate handed to the Transformer
TRAINING_EXMAPLES = 1800000  # number of rows sampled from the full CSV before cleaning
WARMUP_STEPS = 35000  # default warm-up length of the learning-rate schedule

In [None]:
file_path = "/content/en-fr.csv"
df = pd.read_csv(file_path)

In [None]:
df = df.sample(n=TRAINING_EXMAPLES, random_state=42)

In [None]:
df.head()

Unnamed: 0,en,fr
17765420,Operating in the rugged Arctic always poses un...,Les conditions rudes de l’Arctique ont toujour...
7566010,Building a Knowledge Culture,Création d'une culture du savoir
13382656,More than 360 current and former Cadets of the...,Plus de 360 anciens et actuels de divers campu...
19904187,Like the Château Saint-Louis it was a 2-storey...,"Comme le château Saint-Louis, il s'agit d'un é..."
12978711,• L'administration québécoise trailed the Queb...,Pour les entreprises dans le secteur privé don...


Since the `en-fr.csv` file alone already has far more examples than needed, a random subset of it is used to create a dataset, which is then divided into `train_dataset` and `val_dataset`. In this notebook, `TRAINING_EXMAPLES` (1,800,000) examples were sampled from `en-fr.csv`.

You have to create a `tf.data.Dataset` in which each example has the following structure:
`((enc_input, dec_input), dec_output)` where
```
enc_input: (None, n_input)
dec_input: (None, n_output)
dec_output: (None, n_output)
```



The output shape of the decoder in the transformer model is `(None, n_output, vocab_size)`, so you have to use `SparseCategoricalCrossentropy` when calculating the loss.

Below are the steps to create `train_dataset` and `val_dataset` using loaded `df`.

1. Add [START] and [END] tokens to all English and French sentences, and remove any row in which the number of tokens is greater than 64.

In [None]:
def clean_dataset(df):
  """Strip unwanted characters from every column and drop over-long sentence pairs.

  Every cell is converted to `str`; newlines and tabs are deleted, and every
  character that is not a letter, digit, whitespace or '.' is removed.
  Letters are matched Unicode-aware, so accented French characters
  (e.g. é, à, ç) are preserved instead of being deleted. Rows whose 'en' or
  'fr' text has more than 64 whitespace-separated tokens are then dropped.

  The input frame is not modified; a cleaned copy is returned.

  Inputs:
    df: DataFrame containing 'en' and 'fr' columns

  Outputs:
    new DataFrame with cleaned text, keeping only rows with at most 64 tokens
    in both 'en' and 'fr'
  """
  max_tokens = 64
  # `\w` is Unicode-aware for str patterns, so accented letters survive.
  # `_` is listed separately because `\w` would otherwise keep it.
  unwanted = re.compile(r'[\n\t]|[^\w\s.]|_')

  # Work on a copy so the caller's frame is left untouched.
  cleaned = df.copy()
  for column in cleaned.columns:
    cleaned[column] = cleaned[column].apply(lambda x: unwanted.sub('', str(x)))

  # Keep only pairs in which both sentences fit the token budget.
  short_en = cleaned['en'].apply(lambda x: len(x.split()) <= max_tokens)
  short_fr = cleaned['fr'].apply(lambda x: len(x.split()) <= max_tokens)
  return cleaned[short_en & short_fr]

In [None]:
# Wrap every sentence in start/end-of-sequence markers.
# NOTE: this updates `df` in place, so running the cell twice adds the markers twice.
df.loc[:, 'en'] = '[START] ' + df['en'] + ' [END]'
df.loc[:, 'fr'] = '[START] ' + df['fr'] + ' [END]'

In [None]:
cleaned_df = clean_dataset(df)

In [None]:
len(cleaned_df)

1700562

In [None]:
cleaned_df.head()

Unnamed: 0,en,fr
17765420,START Operating in the rugged Arctic always po...,START Les conditions rudes de lArctique ont to...
7566010,START Building a Knowledge Culture END,START Cration dune culture du savoir END
13382656,START More than 360 current and former Cadets ...,START Plus de 360 anciens et actuels de divers...
19904187,START Like the Chteau SaintLouis it was a 2sto...,START Comme le chteau SaintLouis il sagit dun ...
12978711,START Ladministration qubcoise trailed the Qu...,START Pour les entreprises dans le secteur pri...


In [None]:
train_df, val_df = train_test_split(cleaned_df, test_size=0.15, random_state=42)

In [None]:
# delete memory in df ad cleaned_df
del df
del cleaned_df
gc.collect()

8

2. Create two instances of the `Tokenizer`: one for English and another for French. Update their word vocabularies by passing all English and French sentences from train_df to the `fit_on_texts` method.

In [None]:
en_tokenizer = Tokenizer(num_words=10000)
en_tokenizer.fit_on_texts(train_df['en'])

In [None]:
fr_tokenizer = Tokenizer(num_words=10000)
fr_tokenizer.fit_on_texts(train_df['fr'])

In [None]:
print(len(en_tokenizer.word_index))
print(len(fr_tokenizer.word_index))

485848
498672


Here, you will notice that `len(en_tokenizer.word_index)` and `len(fr_tokenizer.word_index)` are larger than the value you set for the `num_words` parameter of the `Tokenizer` constructor. This discrepancy occurs because the `num_words` limitation is applied in the `texts_to_sequences` method, not during the initial fitting with `fit_on_texts`.

The `word_index` and `index_word` dictionaries are ordered by the frequency of each word in the corpus (most frequent first). When `texts_to_sequences` is called, only words whose index is below `num_words` are used; all other words are left out of the resulting sequence.

In [None]:
max_input_length = 64
max_output_length = 64

def encode_texts(tokenizer, texts, max_len):
  """Convert sentences to fixed-length sequences of integer token ids.

  Inputs:
    tokenizer: fitted Tokenizer used to map words to ids
    texts: iterable of sentences
    max_len: length every sequence is padded or truncated to

  Outputs:
    integer array of shape (len(texts), max_len), zero-padded at the end
  """
  sequences = tokenizer.texts_to_sequences(texts)
  # pad_sequences truncates from the front by default, which would cut the
  # beginning (including the start marker) off an over-long sequence.
  # Truncate at the end instead so every sequence keeps its beginning.
  padded_sequences = pad_sequences(
      sequences, maxlen=max_len, padding='post', truncating='post')
  return padded_sequences

def prepare_dataset(df):
  """Build a tf.data.Dataset of ((enc_input, dec_input), dec_output) examples.

  The French side is encoded one token longer than `max_output_length` so
  that the decoder input (all but the last token) and the decoder target
  (all but the first token) both have `max_output_length` tokens.

  Inputs:
    df: DataFrame with 'en' and 'fr' columns

  Outputs:
    unbatched tf.data.Dataset
  """
  english_ids = encode_texts(en_tokenizer, df['en'], max_input_length)
  french_ids = encode_texts(fr_tokenizer, df['fr'], max_output_length+1)

  # Teacher forcing: the target is the decoder input shifted left by one.
  features = (english_ids, french_ids[:, :-1])
  labels = french_ids[:, 1:]

  return tf.data.Dataset.from_tensor_slices((features, labels))

In [None]:
# Only the training split is shuffled; both splits are batched with the
# configured BATCH_SIZE (instead of a duplicated literal) and prefetched.
train_dataset = (
    prepare_dataset(train_df)
    .shuffle(10000)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
val_dataset = (
    prepare_dataset(val_df)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
for example in val_dataset.take(1):
  inputs, output = example
  enc_input, dec_input = inputs
  print(enc_input.shape)
  print(dec_input.shape)
  print(output.shape)

(32, 64)
(32, 64)
(32, 64)


## Build Transformer Model

### Create Embedding Block

In [None]:
def positional_encoding(length, embedding_dim):
  """Build the sinusoidal positional-encoding table.

  The sines fill the first half of the feature axis and the cosines the
  second half (the two halves are concatenated, not interleaved).

  Inputs:
    length: number of positions to encode
    embedding_dim: d_model of the embedding layer

  Outputs:
    float32 tensor of shape (length, embedding_dim) for an even embedding_dim
  """
  pos = np.arange(length)[:, np.newaxis]  # (length, 1)
  half_idx = np.arange(embedding_dim/2)[np.newaxis, :]  # (1, embedding_dim/2)

  # Per-column angular rate: 1 / 10000^(2i / embedding_dim)
  angle_rates = 1 / (10000 ** (2*half_idx/embedding_dim))
  angles = pos * angle_rates  # (length, embedding_dim/2)

  table = np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
  return tf.cast(table, dtype=tf.float32)

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
  """ Custom Keras layer which encapsulates an Embedding layer and a positional encoding.

  Attributes:
    d_model (int): embedding width of the Embedding Block
    embedding (tf.keras.layers.Embedding): Embedding layer, which takes (None, n) and
      outputs (None, n, d_model); token id 0 is treated as padding (mask_zero=True)
    pos_encoding (tf.Tensor): positional encoding table with shape (max_length, d_model)
  """
  def __init__(self, vocab_size, d_model, max_length=2048):
    """
    Inputs:
      vocab_size: number of rows of the embedding table
      d_model: embedding width
      max_length: number of positions in the precomputed positional-encoding table
    """
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    self.pos_encoding = positional_encoding(length=max_length, embedding_dim=d_model) # (max_length, d_model)

  def compute_mask(self, *args, **kwargs):
    """ Report the Embedding layer's padding mask as this layer's output mask.

    The scaling and addition in `call` produce new tensors, so the mask that
    the inner Embedding layer attached to its output would otherwise not
    reach the attention layers. Delegating to the Embedding layer keeps
    padding positions (token id 0) masked downstream.
    """
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    """ Add positional encoding to the output got by inputting x to the Embedding Layer

    Inputs:
      x: (None, n_input)

    Outputs:
      x: (None, n_input, d_model)
    """
    # x: (None, seq)
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # x: (None, seq, embedding_dim)

    # This factor sets the relative scale of the embedding and positional encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))

    # Slice the table to the sequence length and add a broadcastable batch axis.
    pos_encoding = self.pos_encoding[tf.newaxis, :length, :] # (1, seq, embedding_dim)
    x = x + pos_encoding
    return x

### Create MultiHeadAttention Blocks

In [None]:
class BaseAttention(tf.keras.layers.Layer):
  """ Shared building blocks for the attention sub-layers.

  Creates a MultiHeadAttention layer configured from the given keyword
  arguments, plus the LayerNormalization and Add layers that the subclasses
  use for the residual connection. Subclasses provide `call`.
  """
  def __init__(self, **kwargs):
    super().__init__()
    # All keyword arguments are forwarded unchanged to MultiHeadAttention.
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

In [None]:
class CrossAttention(BaseAttention):
  """ Second MHA block in the decoder. It takes Q from the previous Add&Norm layer
  in the decoder and K, V from the encoder.

  Inputs:
    x: (None, n_target, embedding_dim). Decoder-side sequence, inputted as Q.
    context: The output of the encoder block, has shape of (None, n_input, embedding_dim).
      Inputted as K and V.

  Outputs:
    x: (None, n_target, embedding_dim)
  """
  def call(self, x, context):
    # The attention scores are never used, so only the attention output is requested.
    attn_output = self.mha(
        query=x,
        key=context,
        value=context
    )

    # Residual connection followed by layer normalization.
    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x

In [None]:
class CausalSelfAttention(BaseAttention):
  """ First MHA block in the decoder: self-attention with a causal (look-ahead) mask.

  Methods:
    call
      Inputs:
        x: (None, n_target, embedding_dim), used as Q, K and V

      Outputs:
        x: (None, n_target, embedding_dim)
  """
  def call(self, x):
    attended = self.mha(query=x, value=x, key=x, use_causal_mask=True)
    # Residual connection followed by layer normalization.
    return self.layernorm(self.add([x, attended]))

In [None]:
class GlobalSelfAttention(BaseAttention):
  """ MHA block in the encoder: unrestricted self-attention over the input sequence.

  Inputs:
    x: (None, n_input, d_model), used as Q, K and V of the MHA block

  Outputs:
    x: (None, n_input, d_model)
  """
  def call(self, x):
    attended = self.mha(query=x, value=x, key=x)
    # Residual connection followed by layer normalization.
    return self.layernorm(self.add([x, attended]))

### Create FeedForward Blocks

In [None]:
class FeedForward(tf.keras.layers.Layer):
  """ Feed-forward block: Dense(dff, relu) -> Dense(d_model) -> Dropout,
  wrapped in a residual connection and followed by layer normalization.

  Used once inside every encoder layer and once inside every decoder layer.
  """
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()

    expand = tf.keras.layers.Dense(dff, activation='relu')  # (None, n, d_model) -> (None, n, dff)
    project = tf.keras.layers.Dense(d_model)                 # (None, n, dff) -> (None, n, d_model)
    drop = tf.keras.layers.Dropout(dropout_rate)
    self.seq = tf.keras.Sequential([expand, project, drop])

    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    # Residual connection around the dense stack, then layer normalization.
    return self.layer_norm(self.add([x, self.seq(x)]))

### Create Encoder

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  """ One encoder block: global self-attention followed by a feed-forward block.

  Inputs of __init__:
    d_model: model width, also used as key_dim of the attention layer
    num_heads: number of attention heads
    dff: hidden width of the feed-forward block
    dropout_rate: dropout rate used by both the attention layer and the feed-forward block
  """
  def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Forward dropout_rate so the feed-forward block honours the configured
    # rate instead of silently falling back to FeedForward's own default.
    self.ffn = FeedForward(d_model, dff, dropout_rate)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [None]:
class Encoder(tf.keras.layers.Layer):
  """ Encoder stack: positional embedding, dropout, then `num_layers` EncoderLayer blocks. """
  def __init__(self, num_layers, d_model, num_heads,dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)

    layer_kwargs = dict(d_model=d_model, num_heads=num_heads,
                        dff=dff, dropout_rate=dropout_rate)
    self.enc_layers = [EncoderLayer(**layer_kwargs) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # x: (None, n_input) token ids
    hidden = self.dropout(self.pos_embedding(x))  # (None, n_input, embedding_dim)

    # Pass the sequence through every encoder layer in order.
    for layer in self.enc_layers:
      hidden = layer(hidden)

    return hidden  # (None, n_input, embedding_dim)

### Create Decoder

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
  """ One decoder block: causal self-attention, cross-attention over the encoder
  output, then a feed-forward block.

  Inputs of __init__:
    d_model: model width, also used as key_dim of the attention layers
    num_heads: number of attention heads
    dff: hidden width of the feed-forward block
    dropout_rate: dropout rate used by the attention layers and the feed-forward block
  """
  def __init__(self,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):

    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Forward dropout_rate so the feed-forward block honours the configured
    # rate instead of silently falling back to FeedForward's own default.
    self.ffn = FeedForward(d_model, dff, dropout_rate)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)
    x = self.ffn(x)
    return x

In [None]:
class Decoder(tf.keras.layers.Layer):
  """ Decoder stack: positional embedding, dropout, then `num_layers` DecoderLayer blocks. """
  def __init__(self, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    layer_kwargs = dict(d_model=d_model, num_heads=num_heads,
                        dff=dff, dropout_rate=dropout_rate)
    self.dec_layers = [DecoderLayer(**layer_kwargs) for _ in range(num_layers)]

  def call(self, x, context):
    hidden = self.dropout(self.pos_embedding(x))

    # Every decoder layer attends to the same encoder output.
    for layer in self.dec_layers:
      hidden = layer(hidden, context)

    # The shape of the result is (batch_size, target_seq_len, d_model).
    return hidden

### Define the model

In [None]:
class Transformer(tf.keras.Model):
  """ Encoder-decoder Transformer that maps (source ids, target ids) to per-token logits.

  The same `vocab_size` is used for the encoder embedding, the decoder
  embedding and the final projection.
  """
  def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    # The encoder and decoder stacks share the same hyperparameters.
    stack_kwargs = dict(num_layers=num_layers, d_model=d_model,
                        num_heads=num_heads, dff=dff,
                        vocab_size=vocab_size,
                        dropout_rate=dropout_rate)

    self.encoder = Encoder(**stack_kwargs)
    self.decoder = Decoder(**stack_kwargs)

    self.final_layer = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs):
    # Keras `.fit` passes all model inputs through the first argument, so the
    # encoder and decoder token ids arrive together as one tuple.
    source_ids, target_ids = inputs

    memory = self.encoder(source_ids)  # (batch_size, context_len, d_model)
    decoded = self.decoder(target_ids, memory)  # (batch_size, target_len, d_model)

    # Project every position to vocabulary logits.
    logits = self.final_layer(decoded)  # (batch_size, target_len, target_vocab_size)

    try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
      del logits._keras_mask
    except AttributeError:
      # No mask was attached; nothing to remove.
      pass

    # Only the logits are returned.
    return logits

In [None]:
num_layers = NUM_LAYERS
d_model = D_MODEL
dff = DFF
num_heads = NUM_HEADS
dropout_rate = DROPOUT_RATE

In [None]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    vocab_size=10000,
    dropout_rate=dropout_rate)

In [None]:
# Smoke-test the forward pass with random, non-padding integer token ids.
# (Random floats in [0, 1) would all be cast to token id 0, i.e. padding.)
sample_enc_input = np.random.randint(1, 10000, size=(32, 64))
sample_dec_input = np.random.randint(1, 10000, size=(32, 64))
output = transformer((sample_enc_input, sample_dec_input))
output.shape

TensorShape([32, 64, 10000])

In [None]:
transformer.summary()

### Compile the model

custom learning rate scheduler

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """ Learning-rate schedule with warm-up.

  lr(step) = rsqrt(d_model) * min(rsqrt(step), step * warmup_steps ** -1.5)
  """
  def __init__(self, d_model, warmup_steps=WARMUP_STEPS):
    super().__init__()

    # Stored as float32 so it can be used directly in the tensor arithmetic below.
    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    decay_term = tf.math.rsqrt(step)
    warmup_term = step * (self.warmup_steps ** -1.5)

    # The smaller of the two terms is used as the step-dependent factor.
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(decay_term, warmup_term)

In [None]:
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1 = 0.9, beta_2 = 0.98, epsilon = 1e-9)

custom loss

In [None]:
def masked_loss(label, pred):
  """
  Sparse categorical cross-entropy averaged over non-padding positions.

  The per-token loss is computed from logits, positions whose label is 0 are
  zeroed out, and the sum is divided by the number of non-zero labels.

  Inputs:
    label: (None, sequence_length)
    pred: (None, sequence_length, vocab_size)
  """
  is_token = label != 0
  per_token = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')(label, pred)

  weights = tf.cast(is_token, dtype=per_token.dtype)
  per_token *= weights

  return tf.reduce_sum(per_token)/tf.reduce_sum(weights)

custom accuracy

In [None]:
def masked_accuracy(label, pred):
  """
  Fraction of non-padding positions (label != 0) whose arg-max prediction equals the label.

  Inputs:
    label: (None, sequence_length)
    pred: (None, sequence_length, vocab_size)
  """
  predicted_ids = tf.argmax(pred, axis=2)
  label = tf.cast(label, predicted_ids.dtype)

  is_token = label != 0
  # A hit only counts where the label is a real token.
  hits = (label == predicted_ids) & is_token

  hits = tf.cast(hits, dtype=tf.float32)
  is_token = tf.cast(is_token, dtype=tf.float32)
  return tf.reduce_sum(hits)/tf.reduce_sum(is_token)

In [None]:
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

## Training & Inferencing

Train

In [None]:
history = transformer.fit(train_dataset, epochs=5, validation_data=val_dataset)

Epoch 1/5
[1m45172/45172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m863s[0m 18ms/step - loss: 5.1763 - masked_accuracy: 0.2481 - val_loss: 2.7951 - val_masked_accuracy: 0.4892
Epoch 2/5
[1m45172/45172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m807s[0m 18ms/step - loss: 2.6992 - masked_accuracy: 0.5038 - val_loss: 2.5375 - val_masked_accuracy: 0.5276
Epoch 3/5
[1m45172/45172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m784s[0m 17ms/step - loss: 2.4848 - masked_accuracy: 0.5349 - val_loss: 2.4555 - val_masked_accuracy: 0.5398
Epoch 4/5
[1m45172/45172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m794s[0m 18ms/step - loss: 2.3891 - masked_accuracy: 0.5492 - val_loss: 2.4116 - val_masked_accuracy: 0.5473
Epoch 5/5
[1m45172/45172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m797s[0m 18ms/step - loss: 2.3269 - masked_accuracy: 0.5584 - val_loss: 2.3813 - val_masked_accuracy: 0.5516


Inference

In [None]:
def extract_random_example():
  """ Pick one random row of val_df.

  Inputs: None

  Outputs:
    tuple of (English sentence, ground-truth French translation)
  """
  # One uniformly chosen row position in [0, len(val_df) - 1].
  row = val_df.iloc[random.randrange(len(val_df))]

  return (row['en'], row['fr'])

In [None]:
def inference(example):
  """ For given example, prints English sentence, ground truth French translation and predicted translation.

  The translation is produced by greedy decoding: starting from the start
  marker, the most likely next token is appended one position at a time until
  the end marker is predicted or the maximum output length is reached.

  Inputs:
    example: tuple of english sentence and original french translation

  Outputs:
    None
  """
  en, fr = example
  end_token_index = fr_tokenizer.word_index['end']

  # Encode the source sentence with the same helper and length as the training data.
  enc_input = encode_texts(en_tokenizer, [en], max_input_length)

  # The decoder input starts as the start marker followed by padding;
  # the padding is filled in one token per iteration.
  dec_input = encode_texts(fr_tokenizer, ['[START]'], max_output_length)

  for i in range(max_output_length - 1):
    pred = transformer((enc_input, dec_input), training=False)

    # The logits at position i predict the token for position i + 1.
    next_token = int(tf.argmax(pred[0, i], axis=-1))

    # if model generated end token, stop the generation.
    if next_token == end_token_index:
      break

    dec_input[0][i+1] = next_token

  # Skip position 0 so the start marker is not printed as part of the translation.
  pred = fr_tokenizer.sequences_to_texts(dec_input[:, 1:])

  print(f"English Sentence: {en}\n")
  print(f"True French Sentence: {fr}\n")
  print(f"Predicted French Sentence: {pred[0]}")

In [None]:
example = extract_random_example()
inference(example)

Englsih Sentence: START Departmental officials have subsequently advised that action has been taken to address this concern. END

True French Sentence: START Les fonctionnaires du Ministre ont par la suite fait savoir que des mesures avaient t prises pour rgler ce problme. END

Predicted French Sentence: start les responsables ministriels ont ensuite inform que des mesures ont t prises pour rgler cette question
