In [None]:
import tensorflow as tf
import numpy as np, cv2, io, os, re, string, time, datetime
import seaborn as sns, sklearn
import matplotlib.pyplot as plt
from keras.layers import (TextVectorization, Embedding)
from keras.layers import MultiHeadAttention, LayerNormalization
from keras.layers import Dense

In [None]:
#@ Downloading datasets:
# -nc (no-clobber) skips the download when fra-eng.zip is already present, so
# re-running this cell does not leave numbered duplicates (fra-eng.zip.1, ...).
!wget -nc https://www.manythings.org/anki/fra-eng.zip

--2025-01-12 16:18:14--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip’


2025-01-12 16:18:16 (6.34 MB/s) - ‘fra-eng.zip’ saved [7943074/7943074]



In [None]:
# -o overwrites already-extracted files without asking, so a re-run does not stop
# at unzip's interactive "replace?" prompt; -d names the extraction directory.
!unzip -o '/content/fra-eng.zip' -d '/content/dataset'

Archive:  /content/fra-eng.zip
  inflating: /content/dataset/_about.txt  
  inflating: /content/dataset/fra.txt  


#### Data Preprocessing

In [None]:
# Read the extracted corpus file as a dataset of lines.
text_dataset=tf.data.TextLineDataset('/content/dataset/fra.txt') #each line of the file becomes one scalar string element

In [None]:
# Peek at the first three raw lines of the corpus.
for raw_line in text_dataset.take(3):
  print(raw_line)

tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)
tf.Tensor(b'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)', shape=(), dtype=string)


In [None]:
#@ Setting up the Parameters:
# Vocabulary cap handed to both TextVectorization layers (20000 chosen for efficiency).
VOCAB_SIZE = 20000
# Padded/truncated length, in tokens, of the input (English) sequences.
ENGLISH_SEQUENCE_LENGTH = 32
# Padded/truncated length, in tokens, of the output (French) sequences.
FRENCH_SEQUENCE_LENGTH = 32
# Width of the vector that represents each token (as per paper).
EMBEDDINGS_DIM = 512
# Number of examples grouped into one training batch.
BATCH_SIZE = 128

In [None]:
#@ Text vectorizers: one per language, configured identically apart from the
#@ output length constant each one reads.

# English sentences -> fixed-length rows of integer token ids.
english_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize='lower_and_strip_punctuation',
    output_mode='int',  # each token is mapped to its integer index
    output_sequence_length=ENGLISH_SEQUENCE_LENGTH,
)

# French sentences -> fixed-length rows of integer token ids.
french_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize='lower_and_strip_punctuation',
    output_mode='int',
    output_sequence_length=FRENCH_SEQUENCE_LENGTH,
)

In [None]:
def seperator(input_text):
  """Split one raw corpus line into encoder input, decoder input and target.

  The line is split on tab characters. Field 0 is used as the English
  sentence and field 1 as the French sentence; any further fields are ignored.

  Args:
    input_text: a string (or scalar string tensor) holding one tab-separated line.

  Returns:
    A tuple ``(inputs, target)``:
      inputs['input_1']: the English sentence, shape (1,).
      inputs['input_2']: the French sentence prefixed with 'starttoken ', shape (1,).
      target: the French sentence suffixed with ' endtoken', shape (1,).
  """
  split_text=tf.strings.split(input_text, '\t')
  # Slicing (rather than indexing) keeps a leading axis of size 1 on each field.
  english=split_text[0:1]
  french=split_text[1:2]
  return {
      'input_1':english,
      # The trailing space matters: without it the marker fuses with the first
      # French word ('starttokenVa') and never becomes a token of its own.
      'input_2':'starttoken ' + french
      }, french+' endtoken'

In [None]:
# Quick check of seperator on a hand-written, tab-separated pair.
text='hello\tprijal'
seperator(text)


({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'hello'], dtype=object)>,
  'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenprijal'], dtype=object)>},
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'prijal endtoken'], dtype=object)>)

In [None]:
#@ Initializing dataset:
# Apply seperator to every raw line; each element becomes
# ({'input_1': ..., 'input_2': ...}, target), all still strings.
init_dataset=text_dataset.map(seperator)

In [None]:
# Show the first three (inputs, target) string elements.
for example in init_dataset.take(3):
  print(example)


({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenVa !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenMarche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenEn route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ! endtoken'], dtype=object)>)


### Vocab Creation

In [None]:
# English vocabulary: learned from the encoder inputs.
english_training_data=init_dataset.map(lambda x, y:x['input_1'])
english_vectorize_layer.adapt(english_training_data)

# French vocabulary: the same layer later vectorizes both the decoder inputs and
# the targets. The targets only carry 'endtoken', so the start marker is
# prepended here to make sure 'starttoken' receives an id of its own instead of
# falling back to the out-of-vocabulary index.
french_training_data=init_dataset.map(lambda x, y:'starttoken ' + y)
french_vectorize_layer.adapt(french_training_data)


In [None]:
#@ Grouping and Vectorization for training:
def vectorizer(inputs, output):
  """Convert one (inputs, target) element of strings into integer token ids.

  Args:
    inputs: dict with string tensors under 'input_1' (English) and
      'input_2' (French decoder input).
    output: string tensor holding the French target.

  Returns:
    ``({'input_1': english_ids, 'input_2': french_ids}, target_ids)`` where the
    English field went through english_vectorize_layer and both French fields
    went through french_vectorize_layer.
  """
  encoder_ids=english_vectorize_layer(inputs['input_1'])
  decoder_ids=french_vectorize_layer(inputs['input_2'])
  target_ids=french_vectorize_layer(output)
  return {'input_1':encoder_ids, 'input_2':decoder_ids}, target_ids

In [None]:
# Display the dataset object to inspect its element_spec (still string tensors at this stage).
init_dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'input_2': TensorSpec(shape=(None,), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.string, name=None))>

In [None]:
# Run vectorizer over every element so each string field becomes integer token ids.
dataset=init_dataset.map(vectorizer)

In [None]:
# The string dataset is unchanged by the mapping above; print three elements of it.
for example in init_dataset.take(3):
  print(example)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenVa !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenMarche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenEn route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ! endtoken'], dtype=object)>)


In [None]:
# Print one vectorized element to check the integer encoding.
for vectorized_example in dataset.take(1):
  print(vectorized_example)

In [None]:
# Shuffle within a 2048-element buffer, drop the leading size-1 axis of every
# element (unbatch), regroup into batches of BATCH_SIZE and prefetch.
# reshuffle_each_iteration=False keeps the order identical on every pass over
# the dataset. The train/validation split below is carved out with take()/skip(),
# and with the default (reshuffle on every iteration) examples would wander
# between the two splits from one pass to the next.
dataset=(
    dataset
    .shuffle(2048, reshuffle_each_iteration=False)
    .unbatch()
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Number of batches used to place the train/validation boundary.
# NOTE(review): 200000 is hard-coded, presumably an approximate line count of
# fra.txt — confirm against the actual file, since it is not measured here.
NUM_BATCHES=int(200000/BATCH_SIZE)

In [None]:
#@ Training and validation split:
# The first 90% of NUM_BATCHES go to training; everything after that boundary
# is used for validation.
NUM_TRAIN_BATCHES = int(0.9*NUM_BATCHES)
train_dataset = dataset.take(NUM_TRAIN_BATCHES)
val_dataset = dataset.skip(NUM_TRAIN_BATCHES)

### Model Architecture

In [None]:
#@ Positional Encoding:
def PositionalEncoding(d_model, SEQUENCE_LENGTH):
  """Build the sinusoidal positional-encoding table.

  For position ``pos`` and dimension pair index ``k``:

      PE[pos, 2k]   = sin(pos / 10000**(2k / d_model))
      PE[pos, 2k+1] = cos(pos / 10000**(2k / d_model))

  Both members of a sin/cos pair share the same frequency, so the exponent is
  derived from ``dim // 2`` rather than from the raw dimension index.

  Args:
    d_model: size of the embedding dimension.
    SEQUENCE_LENGTH: number of positions to encode.

  Returns:
    A float32 tensor of shape (1, SEQUENCE_LENGTH, d_model); the leading axis
    lets it broadcast over a batch when added to token embeddings.
  """
  positions=np.arange(SEQUENCE_LENGTH, dtype=np.float64)[:, np.newaxis]  # (SEQUENCE_LENGTH, 1)
  dims=np.arange(d_model)[np.newaxis, :]                                 # (1, d_model)
  exponents=(2*(dims//2))/d_model                                        # 2k/d_model for dims 2k and 2k+1
  angles=positions/np.power(10000.0, exponents)                          # (SEQUENCE_LENGTH, d_model)

  table=np.zeros((SEQUENCE_LENGTH, d_model))
  table[:, 0::2]=np.sin(angles[:, 0::2])  # even dimensions use sine
  table[:, 1::2]=np.cos(angles[:, 1::2])  # odd dimensions use cosine
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)


In [None]:
# Sanity check: print the table for d_model=512 and 32 positions
# (the same values as EMBEDDINGS_DIM and the sequence-length constants above).
print(PositionalEncoding(512, 32))

### Input Embeddings

In [None]:
from keras.layers import Layer
class Embeddings(Layer):
  """Token embedding with the positional encoding added on top.

  Args:
    sequence_length: number of positions passed to PositionalEncoding.
    vocab_size: number of rows of the token embedding table.
    embedding_dim: width of every embedding vector.
  """
  def __init__(self, sequence_length, vocab_size, embedding_dim):
    super().__init__()
    self.sequence_length=sequence_length
    self.vocab_size=vocab_size
    self.embedding_dim=embedding_dim
    self.token_embeddings=Embedding(input_dim=vocab_size, output_dim=embedding_dim)

  def call(self, inputs):
    """Look up the token embeddings and add the positional encoding."""
    token_vectors=self.token_embeddings(inputs)
    position_vectors=PositionalEncoding(self.embedding_dim, self.sequence_length)
    return token_vectors + position_vectors

  def compute_mask(self, inputs, mask=None):
    """Return a boolean mask that is False wherever the token id equals 0."""
    is_real_token=tf.math.not_equal(inputs, 0)
    return is_real_token


### Custom Attention Layer

- Self attention layer

In [None]:
class CustomSelfAttention(Layer):
  """Scaled dot-product attention for a single head.

  Computes ``softmax(Q K^T / sqrt(model_size)) V`` with an optional mask.

  Args:
    model_size: value whose square root divides the raw attention scores.
  """
  def __init__(self, model_size):
    super(CustomSelfAttention, self).__init__()
    self.model_size=model_size

  def call(self, query, key, value, masking=None):
    """Attend from ``query`` to ``key``/``value``.

    Args:
      query, key, value: tensors whose last two axes are (positions, features)
        — assumed (batch, T_q, d), (batch, T_k, d), (batch, T_k, d_v); TODO confirm with callers.
      masking: optional tensor broadcastable to the score shape
        (..., T_q, T_k); 1 marks allowed pairs and 0 marks blocked pairs.
        When None, no masking is applied.

    Returns:
      The attention-weighted sum of ``value`` for every query position.
    """
    score=tf.matmul(query, key, transpose_b=True)  # (..., T_q, T_k)

    score/= tf.math.sqrt(tf.cast(self.model_size, dtype=tf.float32))

    if masking is None:
      attention_weights=tf.nn.softmax(score, axis=-1)
    else:
      masking=tf.cast(masking, dtype=tf.float32)
      # Push blocked positions towards -inf so they get ~0 probability.
      score -= (1.0-masking)* 1e10
      # Normalise over the key axis (the last one) so that each query's weights
      # sum to 1; the final multiply zeroes rows that are masked out completely.
      attention_weights=tf.nn.softmax(score, axis=-1) * masking

    head_output=tf.matmul(attention_weights, value)

    return head_output

### Multi-headed Attention
- Multi-head attention allows the model to focus on different parts of the input sequence simultaneously and to combine these perspectives into a single, comprehensive representation.

- For example: "Harry saw a man with binoculars". This sentence has two possible meanings: either Harry used binoculars to see a man, or Harry saw a man who had binoculars. Both readings can be correct, so the transformer has to be able to represent both of them. A single self-attention head fails to capture this, which is why multi-head attention is used.

In [None]:
class MultiHeadAttention(Layer):
  """Multi-head attention built from per-head Dense projections.

  Each head projects query/key/value to ``key_dim // n_heads`` features, runs
  CustomSelfAttention on them, and the concatenated heads are projected back
  to ``key_dim`` features.

  NOTE: this class reuses the name of ``keras.layers.MultiHeadAttention``
  imported at the top of the notebook and replaces it from this cell onwards.

  Args:
    n_heads: number of attention heads.
    key_dim: total feature width of the layer output.
  """
  def __init__(self, n_heads, key_dim):
    super(MultiHeadAttention, self).__init__()

    self.n_heads=n_heads
    head_dim=key_dim//n_heads
    self.dense_q=[Dense(head_dim) for _ in range(n_heads)]
    self.dense_k=[Dense(head_dim) for _ in range(n_heads)]
    self.dense_v=[Dense(head_dim) for _ in range(n_heads)]
    self.dense_o=Dense(key_dim)
    # Scores are scaled by sqrt(d_k), where d_k is the width of the keys that the
    # head actually sees, i.e. the per-head width, not the full key_dim.
    self.attention=CustomSelfAttention(head_dim)

  def call(self, query, key, value, attention_mask=None):
    """Run every head and merge the results.

    Args:
      query: tensor assumed to be (batch, T_q, features) — TODO confirm with callers.
      key, value: tensors assumed to be (batch, T_k, features).
      attention_mask: optional mask of shape (batch, T_q, T_k) with 1 for
        allowed pairs and 0 for blocked pairs. When None, every query may
        attend to every key.

    Returns:
      Tensor with ``key_dim`` features on the last axis.
    """
    if attention_mask is None:
      mask_shape=tf.stack([tf.shape(query)[0], tf.shape(query)[1], tf.shape(key)[1]])
      attention_mask=tf.ones(mask_shape, dtype=tf.float32)

    heads=[
        self.attention(self.dense_q[i](query), self.dense_k[i](key),
                       self.dense_v[i](value), attention_mask)
        for i in range(self.n_heads)
    ]

    heads=tf.concat(heads, axis=2)  # join the heads along the feature axis
    heads=self.dense_o(heads)
    return heads



## Encoder

In [None]:
class Encoder(Layer):
  """One transformer encoder block: self-attention then a feed-forward
  projection, each followed by a residual connection and layer normalisation.

  Args:
    embeddings_dim: feature width of the block's input and output.
    dense_dim: hidden width of the feed-forward projection.
    n_heads: number of attention heads.
  """
  def __init__(self, embeddings_dim, dense_dim, n_heads):
    super(Encoder, self).__init__()
    self.embeddings_dim=embeddings_dim
    self.dense_dim=dense_dim
    self.n_heads=n_heads
    self.attention=MultiHeadAttention(n_heads=n_heads, key_dim=embeddings_dim)

    # Position-wise feed-forward projection applied after attention.
    self.dense_projection=tf.keras.Sequential([
        Dense(self.dense_dim, activation='relu'),
        Dense(self.embeddings_dim)
    ])

    self.layernorm_1=LayerNormalization()
    self.layernorm_2=LayerNormalization()
    self.supports_masking=True

  def call(self, inputs, mask=None):
    """Encode ``inputs``.

    Args:
      inputs: tensor assumed to be (batch, T, embeddings_dim) — TODO confirm with callers.
      mask: optional (batch, T) padding mask, truthy for real tokens. When
        None, every position is treated as a real token.

    Returns:
      Tensor with the same shape as ``inputs``.
    """
    if mask is None:
      mask=tf.ones(tf.shape(inputs)[:2], dtype='int32')

    # (batch, T) -> (batch, 1, T) -> (batch, T, T): every query row carries the
    # same key-padding pattern.
    mask=tf.cast(mask[:, tf.newaxis, :], dtype='int32')
    T=tf.shape(mask)[2]
    padding_mask=tf.repeat(mask, T, axis=1)

    attention_output=self.attention(query=inputs, value=inputs, key=inputs, attention_mask=padding_mask)

    projection_input=self.layernorm_1(inputs + attention_output)
    projection_output=self.dense_projection(projection_input)
    return self.layernorm_2(projection_input + projection_output)

### Decoder

In [None]:
class Decoder(Layer):
  """One transformer decoder block: causal self-attention, cross-attention
  over the encoder output and a feed-forward projection, each followed by a
  residual connection and layer normalisation.

  Args:
    embeddings_dim: feature width of the block's input and output.
    n_heads: number of attention heads.
    latent_dim: hidden width of the feed-forward projection.
  """
  def __init__(self, embeddings_dim, n_heads, latent_dim):
    super(Decoder, self).__init__()
    self.embeddings_dim=embeddings_dim
    self.n_heads=n_heads
    self.latent_dim=latent_dim

    self.attention_1=MultiHeadAttention(n_heads=n_heads, key_dim=embeddings_dim) # self-attention
    self.attention_2=MultiHeadAttention(n_heads=n_heads, key_dim=embeddings_dim) # cross attention

    self.dense_projection=tf.keras.Sequential(
        [Dense(latent_dim, activation='relu'), Dense(embeddings_dim)]
    ) # feed-forward layer

    self.layernorm_1=LayerNormalization()
    self.layernorm_2=LayerNormalization()
    self.layernorm_3=LayerNormalization()
    self.supports_masking=True

  def call(self, inputs, encoder_output, encoder_mask, mask=None):
    """Decode ``inputs`` while attending to ``encoder_output``.

    Args:
      inputs: decoder-side tensor, assumed (batch, T_dec, embeddings_dim) — TODO confirm with callers.
      encoder_output: encoder-side tensor, assumed (batch, T_enc, embeddings_dim).
      encoder_mask: (batch, T_enc) padding mask of the encoder sequence, or
        None to attend to every encoder position.
      mask: optional (batch, T_dec) padding mask of the decoder sequence.

    Returns:
      Tensor with the same shape as ``inputs``.
    """
    batch_size=tf.shape(inputs)[0]
    target_len=tf.shape(inputs)[1]
    source_len=tf.shape(encoder_output)[1]

    # Lower-triangular (batch, T_dec, T_dec) mask: position t may only look at
    # positions <= t, so the decoder cannot peek at future target tokens.
    causal_mask=tf.linalg.band_part(
        tf.ones(tf.stack([batch_size, target_len, target_len]), dtype=tf.int32), -1, 0)
    if mask is not None:
      decoder_padding=tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)  # (batch, 1, T_dec)
      self_attention_mask=tf.minimum(causal_mask, decoder_padding)     # causal AND not padding
    else:
      self_attention_mask=causal_mask

    # Cross-attention: every decoder position may see every non-padded encoder position.
    if encoder_mask is not None:
      encoder_padding=tf.cast(encoder_mask[:, tf.newaxis, :], dtype=tf.int32)  # (batch, 1, T_enc)
      cross_attention_mask=tf.repeat(encoder_padding, target_len, axis=1)      # (batch, T_dec, T_enc)
    else:
      cross_attention_mask=tf.ones(tf.stack([batch_size, target_len, source_len]), dtype=tf.int32)

    attention_output_1=self.attention_1(query=inputs, key=inputs, value=inputs,
                                        attention_mask=self_attention_mask)
    output_1=self.layernorm_1(inputs + attention_output_1)

    attention_output_2=self.attention_2(query=output_1, key=encoder_output, value=encoder_output,
                                        attention_mask=cross_attention_mask)
    output_2=self.layernorm_2(output_1 + attention_output_2)

    projection_output=self.dense_projection(output_2)

    return self.layernorm_3(output_2 + projection_output)

