In [1]:
import tensorflow as tf### models
import numpy as np### math computations
import matplotlib.pyplot as plt### plotting bar chart
import sklearn### machine learning library
import cv2## image processing
from sklearn.metrics import confusion_matrix, roc_curve### metrics
import seaborn as sns### visualizations
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from keras.models import Model
from keras.layers import Layer
from keras.layers import (Dense,Flatten,SimpleRNN,InputLayer,Conv1D,Bidirectional,GRU,LSTM,BatchNormalization,Dropout,Input, Embedding,TextVectorization)
from keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from keras.optimizers import Adam
from keras.layers import MultiHeadAttention, LayerNormalization
from google.colab import drive
from google.colab import files
from tensorboard.plugins import projector

In [2]:
#@ Downloading datasets:
!wget https://www.manythings.org/anki/fra-eng.zip

--2025-01-22 15:47:18--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip’


2025-01-22 15:47:21 (3.94 MB/s) - ‘fra-eng.zip’ saved [7943074/7943074]



In [3]:
!unzip '/content/fra-eng.zip' -d '/content/dataset' # -d flag specifies directories

Archive:  /content/fra-eng.zip
  inflating: /content/dataset/_about.txt  
  inflating: /content/dataset/fra.txt  


#### Data Preprocessing

In [4]:
# Every line of the file becomes one scalar string element of the dataset.
text_dataset = tf.data.TextLineDataset(filenames='/content/dataset/fra.txt')

In [5]:
# Peek at the first three raw lines of the file.
for raw_line in text_dataset.take(3):
  print(raw_line)

tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)
tf.Tensor(b'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)', shape=(), dtype=string)


In [6]:
#@ Setting up the Parameters:
VOCAB_SIZE = 20000              # cap on the number of unique tokens kept per vocabulary
ENGLISH_SEQUENCE_LENGTH = 32    # length (in tokens) of every vectorized input sentence
FRENCH_SEQUENCE_LENGTH = 32     # length (in tokens) of every vectorized output sentence
EMBEDDINGS_DIM = 512            # size of the vector representing each token (as per paper)
BATCH_SIZE = 128                # number of sentence pairs per training batch

In [7]:
#@ for english word:
# Lower-cases the text, strips punctuation and maps every token to an integer
# index, producing ENGLISH_SEQUENCE_LENGTH ids per sentence.
english_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_sequence_length=ENGLISH_SEQUENCE_LENGTH,
    output_mode='int',
    standardize='lower_and_strip_punctuation',
)



In [8]:
# Same configuration as the English layer, sized for the French side.
french_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_sequence_length=FRENCH_SEQUENCE_LENGTH,
    output_mode='int',
    standardize='lower_and_strip_punctuation',
)

In [9]:
def selector(input_text):
  """Turn one raw tab-separated line into model inputs and a target.

  Args:
    input_text: scalar string holding the tab-separated fields of one line.

  Returns:
    A tuple `(inputs, target)`. `inputs` is a dict with the first field under
    'input_1' and the second field prefixed with 'starttoken ' under
    'input_2'; `target` is the second field suffixed with ' endtoken'.
    Each value is a length-1 string tensor.
  """
  fields = tf.strings.split(input_text, '\t')
  english = fields[0:1]
  french = fields[1:2]
  model_inputs = {'input_1': english, 'input_2': 'starttoken ' + french}
  return model_inputs, french + ' endtoken'

In [10]:
#@ Initializing dataset:
# Map every raw line to ({'input_1': ..., 'input_2': ...}, target) with selector.
split_dataset = text_dataset.map(selector)

In [11]:
def separator(input_text):
  """Split one raw line into an (english, french) pair of string tensors.

  Args:
    input_text: scalar string holding the tab-separated fields of one line.

  Returns:
    A tuple of two length-1 string tensors: the first field unchanged, and
    the second field wrapped as 'starttoken <text> endtoken'.
  """
  fields = tf.strings.split(input_text, '\t')
  english, french = fields[0:1], fields[1:2]
  return english, 'starttoken ' + french + ' endtoken'

In [12]:
# Map every raw line to an (english, 'starttoken ... endtoken' french) pair with separator.
init_dataset = text_dataset.map(separator)

In [13]:
# Show the first three (english, french) string pairs.
for sentence_pair in init_dataset.take(3):
  print(sentence_pair)


(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va ! endtoken'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche. endtoken'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken En route ! endtoken'], dtype=object)>)


### Vocab Creation

In [14]:
# Each vectorization layer learns its vocabulary from its own side of the pairs.
english_training_data = init_dataset.map(lambda english, french: english)  # keep only the English text
english_vectorize_layer.adapt(english_training_data)  # build the English vocabulary

french_training_data = init_dataset.map(lambda english, french: french)  # keep only the French text
french_vectorize_layer.adapt(french_training_data)  # build the French vocabulary


KeyboardInterrupt: 

In [None]:

def vectorizer(inputs, output):
  """Convert the string features of one example into integer token ids.

  Args:
    inputs: dict with string tensors under 'input_1' and 'input_2'.
    output: string tensor holding the target sentence.

  Returns:
    A tuple `(inputs, target)` with the same structure as the arguments:
    'input_1' goes through `english_vectorize_layer`, while 'input_2' and
    the target go through `french_vectorize_layer`.
  """
  encoder_ids = english_vectorize_layer(inputs['input_1'])
  decoder_ids = french_vectorize_layer(inputs['input_2'])
  target_ids = french_vectorize_layer(output)
  return {'input_1': encoder_ids, 'input_2': decoder_ids}, target_ids


In [None]:
# Bare expression: displays the dataset object as the cell output.
split_dataset

In [None]:
# Replace the strings of every example in split_dataset with token ids via vectorizer.
dataset=split_dataset.map(vectorizer)

In [None]:
# Print three string pairs again, for comparison with the vectorized example below.
for sentence_pair in init_dataset.take(3):
  print(sentence_pair)

In [None]:
# Print one vectorized example: the inputs dict followed by the target ids.
for example in dataset.take(1):
  print(example)

In [None]:
# Shuffle with a 2048-element buffer, drop the leading axis of every example with
# unbatch(), regroup into batches of BATCH_SIZE and prefetch the next batches.
# NOTE(review): shuffle() reshuffles on every iteration by default, so a take()/skip()
# split applied to this dataset may not select the same examples on each pass — confirm.
dataset=dataset.shuffle(2048).unbatch().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# 200000 is a hard-coded example count.
# NOTE(review): presumably an estimate of the number of sentence pairs in fra.txt — confirm.
NUM_BATCHES = 200000 // BATCH_SIZE

In [None]:
#@ Training and testing split
# The first 90% of NUM_BATCHES batches train the model; every later batch validates it.
NUM_TRAIN_BATCHES = int(0.9*NUM_BATCHES)
train_dataset = dataset.take(NUM_TRAIN_BATCHES)
val_dataset = dataset.skip(NUM_TRAIN_BATCHES)

### Model Architecture

In [None]:
def positional_encoding(model_size, SEQUENCE_LENGTH): # d_model
  """Build the sinusoidal positional-encoding table.

  Even feature indices `i` hold `sin(pos / 10000**(i / model_size))` and odd
  indices hold `cos(pos / 10000**((i - 1) / model_size))`, so each sin/cos
  pair shares one frequency.

  Args:
    model_size: number of features per position.
    SEQUENCE_LENGTH: number of positions to encode.

  Returns:
    A float32 tensor of shape (1, SEQUENCE_LENGTH, model_size).
  """
  positions = np.arange(SEQUENCE_LENGTH, dtype=np.float64)[:, np.newaxis]
  feature_index = np.arange(model_size)
  # An odd index reuses the exponent of the even index just before it.
  paired_index = feature_index - (feature_index % 2)
  # Computed once for the whole table instead of once per (position, feature).
  angles = positions / np.power(10000.0, paired_index / model_size)

  table = np.empty((SEQUENCE_LENGTH, model_size), dtype=np.float64)
  table[:, 0::2] = np.sin(angles[:, 0::2]) # even features
  table[:, 1::2] = np.cos(angles[:, 1::2]) # odd features
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)


### Input Embeddings

In [None]:
from keras.layers import Lambda
class Embeddings(Layer):
  """Token embeddings summed with a fixed sinusoidal positional encoding.

  Args:
    sequence_length: number of positions the positional table covers.
    vocab_size: number of rows in the token embedding matrix.
    embedding_dim: size of every token / position vector.
  """
  def __init__(self, sequence_length, vocab_size, embedding_dim):
    super(Embeddings, self).__init__()
    self.token_embeddings = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
    self.sequence_length = sequence_length
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    # The table depends only on the constructor arguments, so it is built once
    # here instead of being recomputed on every call.
    self.embedded_positions = positional_encoding(embedding_dim, sequence_length)

  def call(self, inputs):
    """Return token embeddings plus positional encodings for `inputs` (token ids)."""
    embedded_tokens = self.token_embeddings(inputs)
    # Slice the table to the actual input length so inputs shorter than
    # `sequence_length` are accepted as well.
    length = tf.shape(inputs)[1]
    return embedded_tokens + self.embedded_positions[:, :length, :] # final output for inputs

  def compute_mask(self, inputs, mask=None):
    """Return a boolean mask that is False wherever `inputs` equals 0 (the pad id)."""
    return Lambda(lambda x: tf.math.not_equal(x, 0))(inputs)

### Custom Attention Layer

- Self attention layer

In [None]:

class CustomSelfAttention(Layer):
  """Scaled dot-product attention with a 0/1 mask.

  Args:
    model_size: value whose square root divides the raw attention scores.
  """
  def __init__(self, model_size):
    super(CustomSelfAttention, self).__init__()
    self.model_size = model_size

  def call(self, query, key, value, masking):
    """Attend from `query` over `key`/`value`.

    Positions where `masking` is 0 receive a large negative score before the
    softmax and their weights are zeroed again afterwards.
    """
    scale = tf.math.sqrt(tf.cast(self.model_size, dtype=tf.float32))
    logits = tf.matmul(query, key, transpose_b=True) / scale

    keep = tf.cast(masking, dtype=tf.float32)
    logits = logits - (1.0 - keep) * 1e10

    weights = tf.nn.softmax(logits, axis=-1) * keep
    return tf.matmul(weights, value)

### Multi-headed Attention
- Multi-head attention allows the model to focus on different parts of the input sequence simultaneously and to combine these perspectives into a comprehensive representation.

- For example: "Harry saw a man with binoculars". This sentence has two possible meanings: either Harry used binoculars to see a man, or Harry saw a man who had binoculars. Both readings can be correct, so the transformer has to capture both of them. A single self-attention head struggles to do this, which is why multi-head attention is used.

In [None]:


class CustomMultiHeadAttention(Layer):
  """Multi-head attention built from per-head Dense projections.

  Each head projects query, key and value to `key_dim // num_heads` features,
  runs `CustomSelfAttention` on them, and the concatenated heads go through a
  final Dense layer of width `key_dim`.

  Args:
    num_heads: number of attention heads.
    key_dim: total width shared by all heads and width of the output.
  """
  def __init__(self, num_heads, key_dim):
    super(CustomMultiHeadAttention, self).__init__()

    self.num_heads = num_heads
    head_dim = key_dim // num_heads
    self.dense_q = [Dense(head_dim) for _ in range(num_heads)]
    self.dense_k = [Dense(head_dim) for _ in range(num_heads)]
    self.dense_v = [Dense(head_dim) for _ in range(num_heads)]
    self.dense_o = Dense(key_dim)
    # Scores are dot products of head_dim-wide vectors, so the attention is
    # scaled by the per-head width rather than the full key_dim.
    self.attention = CustomSelfAttention(head_dim)

  def call(self, query, key, value, attention_mask):
    """Return the attention output for `query` over `key`/`value` under `attention_mask`."""
    heads = [
        # `self.attention` is the attribute created in __init__.
        self.attention(self.dense_q[i](query), self.dense_k[i](key),
                       self.dense_v[i](value), attention_mask)
        for i in range(self.num_heads)
    ]
    heads = tf.concat(heads, axis=-1) # concatenating all heads on the feature axis
    return self.dense_o(heads) # final linear layer over the concatenated heads



## Encoder

In [None]:


class TransformerEncoder(Layer):
  """One encoder layer: self-attention and a feed-forward block, each followed
  by a residual connection and layer normalization.

  Args:
    embedding_dims: width of the inputs and outputs of the layer.
    dense_dims: hidden width of the feed-forward block.
    num_heads: number of attention heads.
  """
  def __init__(self, embedding_dims, dense_dims, num_heads):
    super(TransformerEncoder, self).__init__()
    self.embedding_dims = embedding_dims
    self.dense_dims = dense_dims
    self.num_heads = num_heads
    self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dims)

    self.dense_proj = tf.keras.Sequential([
        Dense(self.dense_dims, activation="relu"),
        Dense(self.embedding_dims),
    ])
    self.layernorm_1 = LayerNormalization()
    self.layernorm_2 = LayerNormalization()
    self.supports_masking = True

  def call(self, inputs, mask=None):
    """Encode `inputs`; `mask` (batch, length), when given, hides padded keys."""
    # Stays None when no mask is supplied, so the attention layer then attends
    # to every position instead of hitting an unbound local variable.
    padding_mask = None
    if mask is not None:
      mask = tf.cast(mask[:, tf.newaxis, :], dtype='int32')
      T = tf.shape(mask)[2]
      # Repeat the key mask for every query row: (batch, T, T).
      padding_mask = tf.repeat(mask, T, axis=1)

    attention_output = self.attention(query=inputs, value=inputs, key=inputs, attention_mask=padding_mask)

    proj_input = self.layernorm_1(inputs + attention_output)
    proj_output = self.dense_proj(proj_input)
    return self.layernorm_2(proj_input + proj_output)


### Decoder

In [None]:
class TransformerDecoder(Layer):
  """One decoder layer: causal self-attention, cross-attention over the
  encoder outputs and a feed-forward block, each followed by a residual
  connection and layer normalization.

  Args:
    embedding_dims: width of the inputs and outputs of the layer.
    latent_dims: hidden width of the feed-forward block.
    num_heads: number of attention heads.
  """
  def __init__(self, embedding_dims, latent_dims, num_heads):
    super(TransformerDecoder, self).__init__()
    self.embedding_dims = embedding_dims
    self.latent_dims = latent_dims
    self.num_heads = num_heads
    self.attention_1 = MultiHeadAttention(
        num_heads=num_heads, key_dim=embedding_dims
    ) # self attention
    self.attention_2 = MultiHeadAttention(
        num_heads=num_heads, key_dim=embedding_dims
    ) # cross-attention with encoder's outputs
    self.dense_proj = tf.keras.Sequential(
        [Dense(latent_dims, activation='relu'), Dense(embedding_dims)]
    ) # feed forward layer
    self.layernorm_1 = LayerNormalization() # one layer norm per sub-layer
    self.layernorm_2 = LayerNormalization()
    self.layernorm_3 = LayerNormalization()
    self.supports_masking = True

  def call (self, inputs, encoder_outputs, enc_mask, mask=None):
    """Decode `inputs` against `encoder_outputs`.

    Args:
      inputs: decoder-side embeddings, (batch, target_length, embedding_dims).
      encoder_outputs: encoder result used as keys and values of the
        cross-attention.
      enc_mask: padding mask of the encoder tokens, (batch, source_length),
        or None to attend to every encoder position.
      mask: padding mask of the decoder tokens, (batch, target_length), or None.

    Returns:
      A tuple `(outputs, scores)`: the layer output and the cross-attention
      scores returned by the second attention layer.
    """
    batch_size = tf.shape(inputs)[0]
    num_queries = tf.shape(inputs)[1] # number of decoder positions

    # The causal (lower-triangular) mask stops a position from attending to
    # later tokens. It is applied whether or not a padding mask is available.
    combined_mask = tf.linalg.band_part(
        tf.ones([batch_size, num_queries, num_queries], dtype=tf.int32), -1, 0)

    if mask is not None:
      padding_mask = tf.repeat(
          tf.cast(mask[:, tf.newaxis, :], dtype='int32'), num_queries, axis=1)
      combined_mask = tf.minimum(padding_mask, combined_mask) # padding AND causal

    # Built from enc_mask alone: one row per decoder query, one column per
    # encoder token. None lets the attention layer see every encoder position.
    cross_attn_mask = None
    if enc_mask is not None:
      cross_attn_mask = tf.repeat(
          tf.cast(enc_mask[:, tf.newaxis, :], dtype='int32'), num_queries, axis=1)

    attention_output_1 = self.attention_1(
        query=inputs, key=inputs, value=inputs,
        attention_mask=combined_mask # masked self attention of the decoder
    )

    out_1 = self.layernorm_1(inputs + attention_output_1) # residual + norm, input of the cross attention

    attention_output_2, scores = self.attention_2(
        query=out_1, key=encoder_outputs, value=encoder_outputs,
        attention_mask=cross_attn_mask, # hides padded encoder tokens
        return_attention_scores=True # scores are returned for visualization
    )

    out_2 = self.layernorm_2(out_1 + attention_output_2) # residual + norm, input of the feed forward block

    proj_output = self.dense_proj(out_2)

    return self.layernorm_3(out_2 + proj_output), scores # the last norm layer

### Full transformer Model

In [None]:
# Hyper-parameters of the transformer assembled in the next cell.
EMBEDDING_DIMS = 512 # width of every token vector; same value as EMBEDDINGS_DIM set earlier
LATENT_DIMS = 2048 # hidden width of the feed-forward block in each encoder/decoder layer
NUM_HEADS = 8 # attention heads per attention layer
NUM_LAYERS = 1 # number of encoder layers and of decoder layers
NUM_EPOCHS = 10 # not used in the cells shown here; presumably the number of training epochs
attention_scores = {} # receives the cross-attention scores of every decoder layer

In [None]:
# Encoder side: token ids -> embeddings -> NUM_LAYERS encoder layers.
# The input name 'input_1' matches the dict key produced by selector/vectorizer.
encoder_inputs = Input(shape=(None,), dtype='int64', name='input_1')
embeddings = Embeddings(ENGLISH_SEQUENCE_LENGTH, VOCAB_SIZE, EMBEDDING_DIMS)
x = embeddings(encoder_inputs)
# Padding mask of the encoder tokens, passed explicitly to every decoder layer below.
enc_mask = embeddings.compute_mask(encoder_inputs)


for _ in range(NUM_LAYERS): # there can be N number of layers as mentioned by paper
  x = TransformerEncoder(EMBEDDING_DIMS, LATENT_DIMS, NUM_HEADS)(x)
encoder_outputs = x

# Decoder side: the name 'input_2' matches the second key of the inputs dict.
decoder_inputs = Input(shape=(None,), dtype='int64', name='input_2')
x = Embeddings(FRENCH_SEQUENCE_LENGTH, VOCAB_SIZE, EMBEDDING_DIMS)(decoder_inputs)

for i in range(NUM_LAYERS):
  x, scores = TransformerDecoder(EMBEDDING_DIMS, LATENT_DIMS, NUM_HEADS)(x, encoder_outputs, enc_mask)
  # Keep the cross-attention scores of each decoder layer under a readable key.
  attention_scores[f'decoder_layer{i+1}_block2'] = scores

# Dropout, then a softmax over the vocabulary for every decoder position.
x = tf.keras.layers.Dropout(0.5)(x)
decoder_outputs = Dense(VOCAB_SIZE, activation='softmax')(x)

# Same inputs as the transformer, but outputs the collected attention scores.
attention_score_model = tf.keras.Model(
    [encoder_inputs, decoder_inputs],
    attention_scores, name='attention_score_model'
)


# The translation model: outputs per-position token probabilities.
transformer = tf.keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs, name='transformer'
)

transformer.summary()

### Training:

# BLEU (Bilingual Evaluation Understudy)

BLEU is a metric for evaluating the quality of text generated by machine translation systems by comparing the generated text (candidate translation) with one or more reference translations. It provides a numerical score that reflects how closely the generated text matches the reference translations.

---

## How BLEU Works

### 1. **n-gram Precision**
- BLEU calculates how many n-grams (contiguous sequences of `n` words) in the candidate translation appear in the reference translations.
- **n-grams** can range from unigrams (single words) to higher-order n-grams like bigrams (two words), trigrams (three words), etc.

**Example:**
Candidate: `the cat is on the mat`  
Reference: `the cat sat on the mat`  
- Unigrams: `the`, `cat`, `is`, `on`, `the`, `mat`
- Bigrams: `the cat`, `cat is`, `is on`, `on the`, `the mat`

---

### 2. **Clipping**
- To prevent overcounting, BLEU uses **clipping** for n-grams. The count of an n-gram in the candidate is clipped to the maximum count of that n-gram in the reference(s).

**Example:**
Candidate: `the the the the`  
Reference: `the cat is on the mat`  
- Without clipping: Unigram `the` count = 4.
- With clipping: Unigram `the` count = 2 (since it appears twice in the reference).

---

### 3. **Precision for Different n-grams**
- BLEU computes precision for unigrams, bigrams, trigrams, and so on.

**Formula for n-gram precision:**
$$
P_n = \frac{\text{Number of clipped n-gram matches}}{\text{Total number of candidate n-grams}}
$$

---

### 4. **Geometric Mean of Precision Scores**
- BLEU combines the precision scores of all n-grams using the **geometric mean**, giving equal weight to each precision:
$$
P = \left( \prod_{n=1}^N P_n \right)^{1/N}
$$

---

### 5. **Brevity Penalty (BP)**
- BLEU penalizes short translations that match n-grams but fail to capture the full meaning of the reference.

**Brevity penalty formula:**
$$
BP =
\begin{cases}
1 & \text{if } c > r, \\
e^{1-r/c} & \text{if } c \leq r,
\end{cases}
$$

Where:
- $c$ = length of the candidate translation.
- $r$ = length of the closest reference translation.

---

### 6. **Final BLEU Score**
- The BLEU score combines the geometric mean of n-gram precisions with the brevity penalty:
$$
\text{BLEU} = BP \cdot P
$$

---

## Strengths of BLEU
1. **Language-Agnostic**: Works across languages as it relies on n-grams.
2. **Multiple References**: Accommodates multiple reference translations to account for variability in phrasing.
3. **Efficient**: Computationally inexpensive compared to human evaluation.

---

## Limitations of BLEU
1. **Surface-Level Matching**: Only matches n-grams and doesn't capture semantic meaning or grammatical correctness.
2. **Insensitive to Context**: Ignores sentence structure and context.
3. **Poor for Short Texts**: Fails to give meaningful scores for very short texts.
4. **Overemphasis on Precision**: Does not explicitly account for recall, potentially penalizing candidates with good coverage but few exact matches.

---

## When to Use BLEU
BLEU is widely used for:
- **Machine Translation**: Evaluating the quality of translations.
- **Text Generation**: Assessing tasks like text summarization or dialogue generation.
- **Natural Language Processing Benchmarks**: Providing a standardized comparison metric.


In [None]:
class BLEU(tf.keras.metrics.Metric):
  """Clipped unigram precision of the predicted tokens, averaged over a batch.

  For every sentence, counts how many predicted tokens (up to the first pad
  id 0) also occur in the reference, consuming each reference token at most
  once, and divides by the number of non-zero predicted tokens.

  NOTE(review): the Python-level loops iterate over tensor values, so this
  looks written for eager execution (e.g. `run_eagerly=True`) — confirm.

  Args:
    name: name reported for the metric.
    **kwargs: forwarded to `tf.keras.metrics.Metric`.
  """
  def __init__(self, name='bleu_score', **kwargs):
    super(BLEU, self).__init__(name=name, **kwargs)
    self.bleu_score=0
    self.num_sentences=0

  def update_state(self, y_true, y_pred, sample_weight=None):
    """Score one batch. The state is reset first, so only the latest batch counts.

    Args:
      y_true: reference token ids, one row per sentence.
      y_pred: per-token scores over the vocabulary; the argmax is scored.
      sample_weight: accepted for API compatibility and ignored.
    """
    y_pred=tf.argmax(y_pred, -1)
    y_true=tf.cast(y_true, y_pred.dtype) # so token ids compare with equal dtypes
    self.bleu_score=0
    self.num_sentences=0
    for candidate, reference in zip(y_pred, y_true):
      self.num_sentences+=1
      total_words=tf.math.count_nonzero(candidate)
      total_matches=0
      for word in candidate:
        if word==0: # first pad id ends the candidate
          break
        for q in range(len(reference)):
          if reference[q]==0: # first pad id ends the reference
            break
          if word==reference[q]:
            total_matches+=1
            # Drop the matched reference token so it cannot be matched twice (clipping).
            reference=tf.concat([reference[:q], reference[q+1:]], axis=0)
            break

      # A prediction made only of pad ids contributes 0 instead of dividing by zero.
      if total_words>0:
        self.bleu_score+=total_matches/total_words

  def result(self):
    """Return the mean sentence score of the last batch (0.0 before any update)."""
    if self.num_sentences==0:
      return 0.0
    # Divide by the sentences actually seen, so a smaller final batch is averaged correctly.
    return self.bleu_score/self.num_sentences

  def reset_state(self):
    """Clear the accumulated score and sentence count."""
    self.bleu_score=0
    self.num_sentences=0

## Learning Rate Scheduler

In [None]:
from keras.optimizers.schedules import LearningRateSchedule

class Schedular(LearningRateSchedule):
  """Learning-rate schedule with a linear warm-up followed by inverse-sqrt decay.

  Computes `d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)`.

  Args:
    d_model: model width used to scale the learning rate.
    warmup_steps: number of steps over which the rate grows linearly.
  """
  def __init__(self, d_model, warmup_steps):
    super(Schedular, self).__init__()
    self.d_model=tf.cast(d_model, tf.float64)
    self.warmup_steps=tf.cast(warmup_steps, dtype=tf.float64)

  def __call__(self, step):
    """Return the learning rate for optimizer step `step` as a float64 tensor."""
    step=tf.cast(step, dtype=tf.float64)
    # While step < warmup_steps the second term is the smaller one (linear
    # warm-up); afterwards step**-0.5 takes over (decay).
    return (self.d_model**(-0.5))*tf.math.minimum(step**(-0.5), step * (self.warmup_steps ** -1.5))