In [1]:
# importing the required libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import os
import re
import string
import tensorflow_datasets as tfds
import time

### Importing data

#### Train

In [2]:
# importing train data
train_data=pd.read_csv("data/TRAIN.csv")

In [3]:
# checking the shape
train_data.shape

(315488, 2)

In [4]:
# checking the null values
train_data.isna().sum()

english     0
hinglish    5
dtype: int64

#### Validation

In [5]:
# importing the validation data
val_data=pd.read_csv("data/VALIDATION.csv")

In [6]:
# checking the shape
val_data.shape

(41112, 2)

In [7]:
# checking for null values
val_data.isna().sum()

english     0
hinglish    0
dtype: int64

### Pre-Processing

In [8]:
# defining a function that cleans the text
def clean_text(s):
  # changing the short form words like didn't, wouldn't
  cleaned = re.sub(r"n\'t", " not", s)
  cleaned = re.sub(r"\'re", " are", cleaned)
  cleaned = re.sub(r"\'s", " is", cleaned)
  cleaned = re.sub(r"\'d", " would", cleaned)
  cleaned = re.sub(r"\'ll", " will", cleaned)
  cleaned = re.sub(r"\'t", " not", cleaned)
  cleaned = re.sub(r"\'ve", " have", cleaned)
  cleaned = re.sub(r"\'m", " am", cleaned)
  cleaned = re.sub("(\s+)", " ", cleaned)
  out = cleaned.translate(str.maketrans('', '', string.punctuation))
  return out

# defining a function to add special tokens
def add_special_tokens(s):
  return '<SOS>'+' '+s+' '+'<EOS>'

# defining a function which preprocess the data
def preprocess(df):

  # remove the NaNs
  df=df.dropna()
  # removing the duplicates
  df=df.drop_duplicates()
  # applying the clean_text function to remove punctuations from hindi
  df['hinglish']=df['hinglish'].apply(lambda x: clean_text(x))
  # making all hinglish words into lower case
  df['hinglish']=df['hinglish'].apply(lambda x: x.lower())

  # applying the clean_text function to english
  df['english']=df['english'].apply(lambda x: clean_text(x))
  # making all english words into lower case
  df['english']=df['english'].apply(lambda x: x.lower())

  return df

In [9]:
# preprocessing train_data
train_df = preprocess(train_data)

# preprocessing the validation data
val_df = preprocess(val_data)


In [10]:
import random

In [11]:
# testing
ii=random.randint(0,1000)
train_df.iloc[ii][1]

'thodi dark comedy involved hai ek point me mean girls me se ek ko bus thok deti hai to family night se jyada date night k liye hai'

In [12]:
for j,i in enumerate(train_df.iterrows()):
  print(i[1][1])
  if j==3:
    break


hi
tumne konsi movie dekhi
hello tum kaise ho kya tumne batman begins ke bare mein suna hai kya great movie hai
nahi aur batao


## Tokenization

In [13]:
# Define file paths for saved tokenizers
hindi_tokenizer_path = "./hindi_tokenizer.subwords"
en_tokenizer_path = "./en_tokenizer.subwords"

# Check if the saved tokenizers exist
if os.path.exists(hindi_tokenizer_path+".subwords") and os.path.exists(en_tokenizer_path+".subwords"):
    # Load the Hindi tokenizer
    hindi_tokenizer = tfds.deprecated.text.SubwordTextEncoder.load_from_file(hindi_tokenizer_path)

    # Load the English tokenizer
    en_tokenizer = tfds.deprecated.text.SubwordTextEncoder.load_from_file(en_tokenizer_path)
    print("Tokenizers loaded successfully.")
else:
    print("Tokenizers not found. Building from scratch")
    SOS_TOKEN='<SOS>'
    EOS_TOKEN='<EOS>'

    # building the tokenizer for hinglish
    hindi_tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        corpus_generator=(row[1][1] for row in train_df.iterrows()),
        target_vocab_size=2**13,
        reserved_tokens=[SOS_TOKEN,EOS_TOKEN])

    # building the tokenizer for english
    en_tokenizer=tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
                corpus_generator=(row[1][0] for row in train_df.iterrows()),
                target_vocab_size=2**13,
                reserved_tokens=[SOS_TOKEN,EOS_TOKEN])


Tokenizers loaded successfully.


In [14]:
# # saving the tokenizers

# # Define file paths to save the tokenizers
# hindi_tokenizer_path = "hindi_tokenizer"
# en_tokenizer_path = "en_tokenizer"

# # Save the Hindi tokenizer
# hindi_tokenizer.save_to_file(hindi_tokenizer_path)

# # Save the English tokenizer
# en_tokenizer.save_to_file(en_tokenizer_path)


In [15]:
# testing the tokenization
sample_string="<SOS> hello tum kaise ho kya tumne batman begins ke bare mein suna hai kya great movie hai <EOS>"

tokenized_string=hindi_tokenizer.encode(sample_string)
og_string=hindi_tokenizer.decode(tokenized_string)

for token in tokenized_string:
  print(f"{str(token)}-----> {hindi_tokenizer.decode([token])}")

1-----> <SOS>
7911----->  
1189-----> hello 
620-----> tum 
367-----> kaise 
137-----> ho 
6-----> kya 
1510-----> tumne 
1607-----> batman 
7847-----> begins
7911----->  
3-----> ke 
318-----> bare 
139-----> mein 
2545-----> suna 
12-----> hai 
6-----> kya 
1042-----> great 
216-----> movie 
12-----> hai 
2-----> <EOS>


In [16]:
# finding the vocab sizes
print("Hindi Vocab size",hindi_tokenizer.vocab_size)
print("English Vocab size",en_tokenizer.vocab_size)

Hindi Vocab size 8135
English Vocab size 8188


In [17]:
# defining a function which adds CLS and SEP tokens in the beginning and in the end

def encode(lang1,lang2):
  """
  lang1: hindi
  lang2: english
  encodes the tokens to indices
  """
  lang1= hindi_tokenizer.encode(lang1.numpy())
  lang2= en_tokenizer.encode(lang2.numpy())
  return lang1,lang2


# wrapping our encode function using tf_encode function
def tf_encode(hin,en):
  hin_result,en_result=tf.py_function(encode,[hin,en],[tf.int64,tf.int64])
  hin_result.set_shape([None])
  en_result.set_shape([None])
  return hin_result,en_result

# writing a function to filter our data ie remove datapoints (hin,en) where if any one (pt or en) has more than 40 tokens
def max_len_filter(hin,en,max_len=40):
  # returning a mask (1 if both pt and en has less than 40 tokens each else 0)
  return tf.logical_and(tf.size(hin)<=max_len,
                        tf.size(en)<=max_len)

In [18]:
# converting pandas dataframe to tensorflow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_df['hinglish'].values, train_df['english'].values))
val_dataset = tf.data.Dataset.from_tensor_slices((val_df['hinglish'].values, val_df['english'].values))


In [19]:
# applying the tf_encode to our train dataset
train_data=train_dataset.map(tf_encode)
# filtering the train data
train_data=train_data.filter(max_len_filter)

# applying the tf_encode to our validation dataset
val_data=val_dataset.map(tf_encode)
# filtering the validation data
val_data=val_data.filter(max_len_filter)

In [20]:
# defining the global variables
BATCH_SIZE=64
BUFFER_SIZE=2000

# caching the train data
train_data=train_data.cache()
#shuffling and padding the train data
train_data=train_data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
# prefetching batches in training data
train_data=train_data.prefetch(tf.data.experimental.AUTOTUNE)

# caching the val data
val_data=val_data.cache()
# shuffling and padding the val data
val_data=val_data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
# prefetching batches in val data
val_data=val_data.prefetch(tf.data.experimental.AUTOTUNE)



In [21]:
# printing a batch from the validation dataset
""" a batch is of dimension (batch_size,maxlen)"""
hin_batch,en_batch=next(iter(train_data))
print(hin_batch,'\n')
print(en_batch)

tf.Tensor(
[[ 319  129    7 ...    0    0    0]
 [ 649 3571  139 ...    0    0    0]
 [ 369  161   12 ...    0    0    0]
 ...
 [  50  111 1310 ...    0    0    0]
 [  27   23  567 ...    0    0    0]
 [   6 4391 7091 ...    0    0    0]], shape=(64, 38), dtype=int64) 

tf.Tensor(
[[  13    6   20 ...    0    0    0]
 [  45    6 5939 ...    0    0    0]
 [ 213 1586  369 ...    0    0    0]
 ...
 [   7 1051   20 ...    0    0    0]
 [  25    8   11 ...    0    0    0]
 [  49   34 6913 ...    0    0    0]], shape=(64, 34), dtype=int64)


## Positional Encodings



$$\Large{PE_{(pos, 2i)} = sin(pos / 10000^{2i / d_{model}})} $$
$$\Large{PE_{(pos, 2i+1)} = cos(pos / 10000^{2i / d_{model}})} $$

In [22]:
# definfing the function to create angle_matrix which contains angles for sin and cos
def get_angles(pos,i,d_model):
  """
  pos     : column vector (pos,1) having position values 0 to pos-1
  i       : row vector (1,d_model) having values from 0 to d_models-1
  d_model : embedding_dim

  returns
  angle_matrix: a matrix of dim (pos,d_model)

  """
  angles= 1/ np.power (10000,(2*(i//np.float32(d_model))))
  # assume pos=5 and d_model=512 then (5,1)*(1,512) => (5,512) dimensions of angles will get broadcasted to match the dim of pos
  angle_matrix=pos*angles

  return angle_matrix


def positional_encodings(pos,d_model):

  # creating a column vector (pos,1) from 0 to pos-1
  pos=np.arange(pos)[:,np.newaxis]
  # creating a row vector (1,d_model) from 0 to d_model-1
  i=np.arange(d_model)[np.newaxis,:]
  # passing the two vectors and d_model scalar to the get_angle fuction which returns a 2d matrix of dim (pos,d_model)
  angle_matrix = get_angles(pos,i,d_model)

  # applying sin function to even col indices in the matrix
  angle_matrix[:,0::2]=np.sin(angle_matrix[:,0::2])

  # applying the cos function to odd col indices in the matrix
  angle_matrix[:,1::2]=np.cos(angle_matrix[:,1::2])

  # after we apply sin and cos to the angle matrix it becomes positional encodings and we convert this matrix into a tensor
  pos_encodings=angle_matrix[np.newaxis,...]
  pos_encodings = tf.convert_to_tensor(pos_encodings, dtype=tf.float32)

  return pos_encodings

### Utility functions

In [23]:
# defining a function which creates padding mask ie it returns a binary vecotor where 1 represent corresponding token is a padding token ie "0"
def create_padding_mask(seq):
  mask=tf.cast(tf.math.equal(seq,0),tf.float32)
  # making this as a tensor
  mask=mask[:,tf.newaxis,tf.newaxis,:]
  return mask

# defining a function for look ahead mask
def create_look_ahead_mask(size):
  mask=1-tf.linalg.band_part(tf.ones((size,size)),-1,0)
  return mask

# defining functions which creates aboves two masks
def create_masks(inp,target):
  # creating encoder padding mask
  enc_padding_mask= create_padding_mask(inp)
  # creating decoder padding mask which will be used in the 2nd attention in the decoder layer
  dec_padding_mask= create_padding_mask(inp)
  # creating look_ahead_mask for masking future tokens in the decoder layer which will be used in the 1st attention in the decoder
  look_ahead_mask= create_look_ahead_mask(tf.shape(target)[1])
  # creating the padding mask for decoder
  dec_target_padding_mask= create_padding_mask(target)

  # creating a combined mask
  combined_mask= tf.maximum(dec_target_padding_mask,look_ahead_mask)

  return enc_padding_mask,combined_mask,dec_padding_mask



# Encoder

### Scaled Dot Product Attention:
$${Attention(Q, K, V) = softmax_k(\frac{QK^T}{\sqrt{d_k}}) V} $$

In [24]:
def scaled_dot_product_attention(k,v,q,mask):
  """
  This is a self attention so q,k and v is build from the datamatrix
  q: data tensor after passing through by linear layer Wq , (batch_size,n_heads,seq_len,depth)
  k: data tensor after passing through by linear layer Wk , (batch_size,n_heads,seq_len,depth)
  v: data tensor after passing through by linear layer Wv , (batch_size,n_heads,seq_len,depth)
  """

  """
  k_transpose   : (batch_size,n_heads,depth_seq_len)
  q.k_transpose : (batch_size,n_heads,seq_len,depth) * (batch_size,n_heads,depth,seq_len) ==> (batch_size,n_heads,seq_len,seq_len)
  """
  # matrix multiplication of Q and K.transpose
  matmul_qk=tf.matmul(q,k,transpose_b=True) # now this tensor has logits

  # computing dk(embedding_dim) and casting it to float
  dk=tf.cast(tf.shape(k)[-1],tf.float32)

  # scaling the logits in the matmul_qk using dk
  scaled_logits=matmul_qk/tf.math.sqrt(dk)

  # since padding tokens contribute nothing to the attention we ignore them padding by adding a large negative numbers to the logits of the padding tokens
  if mask is not None:
    scaled_logits+=(mask * -1e9)

  # applying the softmax function
  attention_weights=tf.nn.softmax(scaled_logits,axis=-1)

  # multiplying the attention weights with the v (batch_size,n_heads,seq_len,seq_len)*(batch_size,n_heads,seq_len,depth)
  output=tf.matmul(attention_weights,v)

  """output            : (batch_size,n_heads,seq_len,depth)
     attention_weights : (batch_size,n_heads,seq_len,seq_len) """

  return output,attention_weights

In [25]:
# testing of function
np.set_printoptions(suppress=True)

temp_k = tf.constant([[10,0,0],
                      [0,10,0],
                      [0,0,10],
                      [0,0,10]], dtype=tf.float32)  # (4, 3)

temp_v = tf.constant([[10,0,0],
                      [0,10,0],
                      [0,0,10],
                      [0,0,10]], dtype=tf.float32)  # (4, 2)

# This `query` aligns with the second `key`,
# so the second `value` is returned.
temp_q = tf.constant([[10,0,0],
                      [0,10,0],
                      [0,0,10],
                      [0,0,10]], dtype=tf.float32)  # (1, 3)
# testing the function
temp_out,temp_attn=scaled_dot_product_attention(temp_q,temp_k,temp_v,None)

### Multi Head Attention

<img src="https://www.tensorflow.org/images/tutorials/transformer/multi_head_attention.png" width="500" alt="multi-head attention">


In [26]:
# implementing the Multi Head attention layer
class MultiHeadAttention(tf.keras.layers.Layer):

  def __init__(self,d_model,n_heads):
    """
    d_model: embedding_dim or no of hidden_units
    n_heads: no of heads of self attention

    """
    super(MultiHeadAttention,self).__init__()
    self.n_heads=n_heads
    self.d_model=d_model

    # asserting if the d_model is divisible by the number of heads
    assert d_model % self.n_heads == 0
    # depth is the split of d_model for each head
    self.depth = self.d_model // self.n_heads

    # defining the linear layers for weight matrices Wk,Wq,Wv
    self.wk=tf.keras.layers.Dense(d_model)
    self.wq=tf.keras.layers.Dense(d_model)
    self.wv=tf.keras.layers.Dense(d_model)

    # defining the linear layer for the last weight matrix W0
    self.w0=tf.keras.layers.Dense(d_model)


  def split_heads(self,x,batch_size):
    """
    x: tensor of dim (batch_size,seq_len,embedding)
    splits the tensors along last (embeddings) dim to pass that slice for each head
    """
    # splitting the last dimension of x into (n_heads,depth)
    x=tf.reshape(x,(batch_size,-1,self.n_heads,self.depth))

    # after reshaping the shape of x is (batch_size,seq_len,n_heads,depth) but we want to change the dim order to (batch_size,n_heads,seq_len,depth) so permuting it
    x=tf.transpose(x,perm=[0,2,1,3])

    return x


  def call(self,k,v,q,mask):
    """
    k: tensor of shape (batch_size,seq_len,embedding)
    q: tensor of shape (batch_size,seq_len,embedding)
    v: tensor of shape (batch_size,seq_len,embedding)
    """
    # getting the batch size
    batch_size=tf.shape(q)[0]

    # passing the k,v,q to the linear layers wk,wv,wq respectively
    k=self.wk(k)  # (batch_size, seq_len, d_model)
    v=self.wv(v)  # (batch_size, seq_len, d_model)
    q=self.wq(q)  # (batch_size, seq_len, d_model)

    # splitting the last dimension of k,v,q
    k=self.split_heads(k,batch_size)  # (batch_size, n_heads, seq_len, depth)
    v=self.split_heads(v,batch_size)  # (batch_size, n_heads, seq_len, depth)
    q=self.split_heads(q,batch_size)  # (batch_size, n_heads, seq_len, depth)


    # passing the k,v,q to the scaled attention function to compute attention weights
    scaled_attention, attention_weights = scaled_dot_product_attention(k,v,q,mask)
    """
    shape of scaled_attention : (batch_size,n_heads,seq_len,depth)
    shape of attention_weights: (batch_size,n_heads,seq_len,seq_len)
    """
    # transposing the scaled attention to make it prepare for reshaping
    scaled_attention= tf.transpose(scaled_attention,perm=[0,2,1,3]) # (batch_size, seq_len, n_heads, depth)

    # reshaping the scaled_attention back to the (batch_size,seq_len,d_model) d_model=n_heads*depth
    concat_attention= tf.reshape(scaled_attention, (batch_size,-1,self.d_model))

    # passing the concat_attention to the final linear layer which consists of w0
    output=self.w0(concat_attention) # (batch_size, seq_len, d_model)

    return output,attention_weights


In [27]:
# testing the MultiHeadAttention
temp_mha = MultiHeadAttention(d_model=512, n_heads=8)
y = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
out, attn = temp_mha(y,y,y, mask=None)
out.shape, attn.shape

(TensorShape([1, 60, 512]), TensorShape([1, 8, 60, 60]))

### Feed forward network
After self attention we have layer norm followed by a feed forward network for each token

In [28]:
def point_wise_fc_layer(d_model,dff):
  """
  d_model: dimensionality of the input
  dff    : hidden_units in the dense_layer

  it returns a model consisting 2 FC layers. 1st layers reduces the dimensionality of input to dff, the 2nd layer converts the dimensionality of input back to d_model"""
  return tf.keras.Sequential([tf.keras.layers.Dense(dff,activation='relu'), # (batch_size,seq_len,dff)
                              tf.keras.layers.Dense(d_model)])              # (batch_size,seq_len,d_model)

In [29]:
# testing of point_wise_fc_layer
sample_ffn = point_wise_fc_layer(512, 2048)
sample_ffn(tf.random.uniform((64, 50, 512))).shape

TensorShape([64, 50, 512])

### Encoder Layer

In [30]:
# implementing Single Encoder layer

class EncoderLayer(tf.keras.layers.Layer):

  def __init__(self,d_model,num_heads,dff,rate=0.1):
    super(EncoderLayer,self).__init__()

    # defining the multi head attention layer
    self.mha=MultiHeadAttention(d_model,num_heads)
    # defining the point wise feed forward layer
    self.fc=point_wise_fc_layer(d_model,dff)

    # defining the layer norm 1 and 2
    self.layer_norm1=tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layer_norm2=tf.keras.layers.LayerNormalization(epsilon=1e-6)

    # defining the dropouts
    self.dropout1=tf.keras.layers.Dropout(rate)
    self.dropout2=tf.keras.layers.Dropout(rate)


  def call(self,x,training,mask):

    # since it is a self attention we pass the same input x as key,query,value to the multi-head-attention
    attn_output,_= self.mha(k=x,v=x,q=x,mask=mask) # (batch_size,seq_len,d_model)
    # passing the attn_output through the dropout
    attn_output= self.dropout1(attn_output,training=training)

    # passing the output throught the layer norm
    out1= self.layer_norm1(x+attn_output)

    # passing the output from the layer norm to the FC layer
    fc_output=self.fc(out1)
    # passing throught the dropout
    fc_output=self.dropout2(fc_output)

    # passing the ouput to the layer norm2
    out2=self.layer_norm2(fc_output) # (batch_size, seq_len, d_model)

    return out2

In [31]:
# testing of the encoder layer class
sample_encoder_layer = EncoderLayer(512, 8, 2048)

sample_encoder_layer_output = sample_encoder_layer(
    tf.random.uniform((64, 43, 512)), False, None)

sample_encoder_layer_output.shape  # (batch_size, input_seq_len, d_model)

TensorShape([64, 43, 512])

### Encoder

In [32]:
class Encoder(tf.keras.layers.Layer):

  def __init__(self,num_layers,d_model,num_heads,dff,input_vocab_size,max_pos_encoding,rate=0.1):
    """
    num_layers        : no of encoder layers
    d_model           : dimensionality of input embeddings
    num_heads         : no of heads in multi-head attention
    input_vocab_size  : no of words in input language vocab
    max_pos_encoding  :
    dff               :
    max_pos_encoding  :
    """
    super(Encoder,self).__init__()

    self.d_model=d_model
    self.num_heads=num_heads
    self.num_layers=num_layers

    # defining the embedding layer
    self.embeddings=tf.keras.layers.Embedding(input_vocab_size,d_model)
    # defining the positional encoding layer
    self.pos_encodings=positional_encodings(max_pos_encoding,self.d_model)

    # defining the encoding layers
    self.enc_layers=[EncoderLayer(d_model,num_heads,dff,rate) for _ in range(num_layers)]

    self.dropout=tf.keras.layers.Dropout(rate)



  def call(self,x,training,mask):

    """
    x        : input tensor of shape (batch_size, seq_len)
    training : bool variable to indicate whether we are in training
    mask     : masks for padding

    """

    # getitng the seq_len of the inputs
    seq_len=tf.cast(tf.shape(x)[1], tf.int32)

    # passing the input throught the emebdding layer
    x=self.embeddings(x)  # x:(batch_size, seq_len, d_model)

    # normalization
    x*=tf.math.sqrt(tf.cast(self.d_model,tf.float32))

    # passing the embedding to the positional encodng
    x+=self.pos_encodings[:,:seq_len,:]

    # adding dropout
    x=self.dropout(x,training=training)

    # adding encoder layers
    for i in range(self.num_layers):
      x=self.enc_layers[i](x, training, mask)

    return x

In [33]:
  # testing of Encoder
sample_encoder = Encoder(num_layers=2, d_model=512, num_heads=8,
                         dff=2048, input_vocab_size=8500,
                         max_pos_encoding=10000)
temp_input = tf.random.uniform((64, 62), dtype=tf.int64, minval=0, maxval=200)
sample_encoder_output = sample_encoder(temp_input, training=False, mask=None)
print (sample_encoder_output.shape)  # (batch_size, input_seq_len, d_model)

(64, 62, 512)


### Decoder Layer

In [34]:
# implementing Single Decoder layer

class DecoderLayer(tf.keras.layers.Layer):

  def __init__(self,d_model,num_heads,dff,rate=0.1):
    super(DecoderLayer,self).__init__()

    # defining the multi head attention layer1
    self.mha1=MultiHeadAttention(d_model,num_heads)
    # definfing the multi head attention layer2 (encoder-decoder attention)
    self.mha2=MultiHeadAttention(d_model,num_heads)

    # defining the point wise feed forward layer
    self.fc=point_wise_fc_layer(d_model,dff)

    # defining the layer norm 1,2 and 3
    self.layer_norm1=tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layer_norm2=tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layer_norm3=tf.keras.layers.LayerNormalization(epsilon=1e-6)


    # defining the dropouts
    self.dropout1=tf.keras.layers.Dropout(rate)
    self.dropout2=tf.keras.layers.Dropout(rate)
    self.dropout3=tf.keras.layers.Dropout(rate)



  def call(self,x,enc_output,training,look_ahead_mask,padding_mask):
    """
    x          : input at the current timestep t
    enc_output : output of the encoder with shape (batch_size, seq_len, d_model)
    """

    # passing the target as the inputs to the multi-head-attention
    attn1,attn_weights1,=self.mha1(x,x,x,look_ahead_mask)
    # passing throught dropouts
    attn1=self.dropout1(attn1,training=training)
    # passing through layer norm
    out1=self.layer_norm1(attn1 + x)

    # passing the encoders output as v and k and passing out1 as q to the multi-head-attention(encoder_decoder_attention)
    attn2,attn_weights2=self.mha2(enc_output,enc_output,out1,padding_mask)
    # passing through dropout
    attn2=self.dropout2(attn2,training=training)
    # passing through selfnorm
    out2=self.layer_norm2(attn2 + out1)

    # passing through the fc layer
    fc_output=self.fc(out2)
    # passing through dropout
    fc_output=self.dropout3(fc_output)
    # passing through layer norm
    out3=self.layer_norm3(fc_output+out2)

    return out3,attn_weights1,attn_weights2

In [35]:
# testing of decoder layer
sample_decoder_layer = DecoderLayer(512, 8, 2048)

sample_decoder_layer_output, _, _ = sample_decoder_layer(
    tf.random.uniform((64, 50, 512)), sample_encoder_layer_output,
    False, None, None)

sample_decoder_layer_output.shape  # (batch_size, target_seq_len, d_model)

TensorShape([64, 50, 512])

### Decoder

In [36]:
class Decoder(tf.keras.layers.Layer):

  def __init__(self,num_layers,d_model,num_heads,dff,target_vocab_size,max_pos_encoding,rate=0.1):
    """
    num_layers        : number of decoder layers
    d_model           : dimensionality of target embedding
    num_heads         : number of heads in decoder multi-head attention
    dff               : hidden units in the decoder fc layer
    target_vocab_size : vocab size of target
    """

    super(Decoder,self).__init__()

    self.d_model=d_model
    self.num_layers=num_layers

    # defining the embedding layer in the decoder
    self.embeddings=tf.keras.layers.Embedding(target_vocab_size,d_model)

    # defining the positional encoding layer
    self.pos_encodings=positional_encodings(max_pos_encoding,self.d_model)

    # defining the encoding layers
    self.dec_layers=[DecoderLayer(d_model,num_heads,dff,rate) for _ in range(num_layers)]

    self.dropout=tf.keras.layers.Dropout(rate)



  def call(self,x,enc_output,training,look_ahead_mask,padding_mask):
    """
    x          : input tensor of target words with shape (batch_size,seq_len)
    enc_output : output from the encoding layer with shape (batch_size, seq_len, d_model)
    training   : boolean variable to indicate whether we are training

    """

    # getting the seq_len of the target inputs
    seq_len=tf.cast(tf.shape(x)[1],tf.float32)

    attn_weights={}

    # passing the input target words to the embedding layer
    x=self.embeddings(x)  # x: (batch_size, seq_len, d_model)

    # normalization
    x*=tf.math.sqrt(tf.cast(self.d_model,tf.float32))

    # positional encodings
    x += self.pos_encodings[:, :tf.cast(seq_len, tf.int32), :]


    # adding dropouts
    x=self.dropout(x,training=training)

    # passing through the decoder layers
    for i in range(self.num_layers):
      x,block1,block2=self.dec_layers[i](x,enc_output,training,look_ahead_mask,padding_mask)
      attn_weights[f'decoder_layer{i+1}_block1']=block1
      attn_weights[f'decoder_layer{i+1}_block2']=block2

    return x,attn_weights # x: (batch_size,seq_len,d_model)




In [37]:
# TESING OF DECODER
sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8,
                         dff=2048, target_vocab_size=8000,
                         max_pos_encoding=5000)
temp_input = tf.random.uniform((64, 26), dtype=tf.int64, minval=0, maxval=200)
output, attn = sample_decoder(temp_input,
                              enc_output=sample_encoder_output,
                              training=False,
                              look_ahead_mask=None,
                              padding_mask=None)
output.shape, attn['decoder_layer2_block2'].shape

(TensorShape([64, 26, 512]), TensorShape([64, 8, 26, 62]))

### Transformer

In [38]:
class Transformer(tf.keras.Model):

  def __init__(self,num_layers,d_model,num_heads,dff,input_vocab_size,target_vocab_size,pe_input,pe_target,rate=0.1):
    """
    num_layers        : number of encoder/decoder layers
    d_model           : dimensionality of target embedding
    num_heads         : number of heads in decoder multi-head attention
    dff               : hidden units in the decoder fc layer
    input_vocab_size  : no of words in input language vocab
    target_vocab_size : vocab size of target
    pe_iput           : positional encodings of input embeddings
    pe_output         : positional encodings of target embeddings

    """

    super(Transformer,self).__init__()

    # defining the encoder
    self.encoder= Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)

    # defining the decoder
    self.decoder= Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)

    # defining the final classfier layer
    self.final_layer=tf.keras.layers.Dense(target_vocab_size)



  def call(self,inp,target,training,enc_padding_mask,look_ahead_mask,dec_padding_mask):
    """
    inp    : input tesor of shape (batch_size, input_seq_len)
    target : target tensor of shape (batch_size, target_seq_len)

    """

    # passing the input to the encoder
    enc_output= self.encoder(inp,training,enc_padding_mask)  #(batch_size,input_seq_len,d_model)

    # passing the target to the decoder
    dec_output,attention_weights=self.decoder(target,enc_output,training,look_ahead_mask,dec_padding_mask)

    # passing the decoder output to the final layer
    final_output=self.final_layer(dec_output)  # (batch_size,target_seq_len,target_vocab_size)

    return final_output,attention_weights

In [39]:
# TESTING OF TRANSFORMER
sample_transformer = Transformer(
    num_layers=2, d_model=512, num_heads=8, dff=2048,
    input_vocab_size=8500, target_vocab_size=8000,
    pe_input=10000, pe_target=6000)

temp_input = tf.random.uniform((64, 38), dtype=tf.int64, minval=0, maxval=200)
temp_target = tf.random.uniform((64, 36), dtype=tf.int64, minval=0, maxval=200)

fn_out, _ = sample_transformer(temp_input, temp_target, training=False,
                               enc_padding_mask=None,
                               look_ahead_mask=None,
                               dec_padding_mask=None)

fn_out.shape  # (batch_size, tar_seq_len, target_vocab_size)

TensorShape([64, 36, 8000])

#### Custom Scheduler

$$\Large{lrate = d_{model}^{-0.5} * min(step{\_}num^{-0.5}, step{\_}num * warmup{\_}steps^{-1.5})}$$


In [40]:
# creating a custom scheduler which changes the learning rate according the formula in the research paper
class CustomScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):

  def __init__(self,d_model,warmup_steps=4000):
    self.d_model=d_model
    self.d_model=tf.cast(self.d_model,tf.float32)
    self.warmup_steps=warmup_steps


  def __call__(self,step):
    step = tf.cast(step, tf.float32)
    arg1= tf.math.rsqrt(step)
    arg2= step * (self.warmup_steps ** -1.5)
    return tf.math.rsqrt(self.d_model)*tf.math.minimum(arg1,arg2)

### Loss Function

In [41]:
# creating the loss object
loss_obj=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction='none')

# creating a loss function
def loss_function(real,pred):

  # creating a mask because we need to ignore padding tokens while calculating the loss
  mask=tf.math.logical_not(tf.math.equal(real,0)) # this is a tensor which contains 0 if the corresponding token is a padding token ie 0 else 1

  # creating the loss obj
  loss_=loss_obj(real,pred)

  # casting the mask
  mask=tf.cast(mask,dtype=loss_.dtype)

  # we need to multiply the loss with the mask to ignore the losses corresponding to the token
  loss_*=mask

  # calculating the avg loss
  total_loss= tf.reduce_sum(loss_) # total loss for the non padding tokens
  non_padding_tokens= tf.reduce_sum(mask) # total number of non padding tokens
  avg_loss= total_loss/non_padding_tokens

  return avg_loss

### Accuracy function

In [42]:
def accuracy_func(real, pred):
    """
    real : ground truth matrix of shape (batch_size, seq_len)
    pred : predicted tensor of shape (batch_size, seq_len, target_voc_size)
    """

    # computing the correctly predicted words
    correctly_predicted = tf.equal(real, tf.argmax(pred, axis=2))

    # creating a mask for padding tokens in the real
    mask = tf.math.logical_not(tf.math.equal(real, 0))

    # calculating the correctly predicted words excluding the padding tokens
    accuracies = tf.math.logical_and(correctly_predicted, mask)

    # calculating the accuracy
    accuracy = tf.reduce_sum(tf.cast(accuracies, dtype=tf.float32)) / tf.reduce_sum(tf.cast(mask, dtype=tf.float32))

    return accuracy


### Training

In [43]:
# defining the hyper parameters
NUM_LAYERS = 4
D_MODEL = 128
DFF = 512
NUM_HEADS = 8
DROPOUT_RATE = 0.1

In [44]:
# defining a transformer model
transformer= Transformer(num_layers=NUM_LAYERS,
                         d_model=D_MODEL,
                         num_heads=NUM_HEADS,
                         dff=DFF,
                         input_vocab_size=hindi_tokenizer.vocab_size,
                         target_vocab_size=en_tokenizer.vocab_size,
                         pe_input=1000,
                         pe_target=1000,
                         rate=DROPOUT_RATE)

In [45]:
# creating learning rate custom scheduler
learning_rate= CustomScheduler(D_MODEL)
# creating the optimizer
optimizer= tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [54]:
checkpoint_path = "./checkpoints/train"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

Latest checkpoint restored!!


#### Train Step

In [47]:
# creating metrics for train
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]

# wrapping the training function using tf.function wrapper to compile the steps into a TF Graph for faster execution
@tf.function(input_signature=train_step_signature)
def train_step(inp,target):
  """
  inp    : input tensor of shape (batch_size, inp_seq_len)
  target : target tensor of shape (batch_size, target_seq_len)

  """
  # slicing the target by excluding the last token of all sequences in the batch to pass it to the decoder as input
  tar_inp= target[:,:-1]
  # slicing the target by excluding the first token of all sequences in the batch to pass it as the label for the decoder to compute loss
  tar_real= target[:,1:]

  # creating masks
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

  # Gradient Calculation
  with tf.GradientTape() as tape:

    # making predictions
    predictions, _ = transformer(inp,
                                 tar_inp,
                                 True,
                                 enc_padding_mask,
                                 combined_mask,
                                 dec_padding_mask)
    # computing the loss
    loss = loss_function(tar_real,predictions)


    # computing the gradients
    gradients = tape.gradient(loss, transformer.trainable_variables)

    # updating the params by applying the gradients
    optimizer.apply_gradients(zip(gradients,transformer.trainable_variables))

    # making the loss and accuracies as the metrics
    train_loss(loss)
    train_accuracy(accuracy_func(tar_real,predictions))

#### Val step

In [48]:
# creating metrics for validation
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.Mean(name='val_accuracy')


val_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]


# wrapping the val_step function using tf.function wrapper to compile the steps into a TF Graph for faster execution
@tf.function(input_signature=val_step_signature)
def val_step(inp,target):
  """
  inp    : input tensor of shape (batch_size, inp_seq_len)
  target : target tensor of shape (batch_size, target_seq_len)

  """
  # slicing the target by excluding the last token of all sequences in the batch to pass it to the decoder as input
  tar_inp= target[:,:-1]
  # slicing the target by excluding the first token of all sequences in the batch to pass it as the label for the decoder to compute loss
  tar_real= target[:,1:]

  # creating masks
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)


  # making predictions
  predictions, _ = transformer(inp,
                                 tar_inp,
                                 False,
                                 enc_padding_mask,
                                 combined_mask,
                                 dec_padding_mask)
  # computing the loss
  loss = loss_function(tar_real,predictions)

  # computing the accuracy
  accuracy = accuracy_func(tar_real, predictions)

  # making the loss and accuracies as the metrics
  val_loss(loss)
  val_accuracy(accuracy)


#### Fitting

In [None]:
# TRAINING
num_epochs=40

for epoch in range(num_epochs):

  # noting the time of the start
  start=time.time()

  # resetting the train loss and train accuracy to calculate fresh for the each epoch
  train_loss.reset_states()
  train_accuracy.reset_states()

  # resetting the validation loss and val accuracy to calculate fresh for each epoch
  val_loss.reset_states()
  val_accuracy.reset_states()




  # TRAINING
  for (batch,(inp,tar)) in enumerate(train_data):
    # training the batch using train_step function
    train_step(inp,tar)

    if batch % 250 == 0:
      print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

  # VALIDATION
  for (batch,(inp, tar)) in enumerate(val_data):
    # validating the batch using val_step fuction
    val_step(inp, tar)



  # Print training and validation metrics
  print(f'\n Epoch {epoch + 1} \n Training Loss: {train_loss.result():.4f} Training Accuracy: {train_accuracy.result():.4f} \n Validation Loss: {val_loss.result():.4f} Validation Accuracy: {val_accuracy.result():.4f}')
  print(f'Time taken for epoch {epoch + 1}: {time.time() - start:.2f} secs\n')

  # Save checkpoint
  ckpt_save_path = ckpt_manager.save()
  print(f'Saving checkpoint for epoch {epoch + 1} at {ckpt_save_path}')


### Model Inference

In [56]:
# getting the index for the <EOS> token
EOS_INDEX=en_tokenizer.encode('<EOS>')[0]

In [61]:
# defining a function to evaluate query at runtime
def translate(sentence,maxlen=10):

  """sentence : hinglish sentence"""

  # adding <SOS> and <EOS> tokens to the sentence
  sentence= add_special_tokens(sentence)
  target="<SOS>"


  # tokenization (converting the sequence to indices)
  sentence=hindi_tokenizer.encode(sentence)
  target=en_tokenizer.encode(target)

  # converting the sentence to tf tensor
  sentence=tf.convert_to_tensor(sentence)      # (,inp_len)

  # adding a new dim to the sentence tensor
  sentence = tf.expand_dims(sentence, axis=0)
  encoder_input=sentence                       # (1, inp_len)

  # converting the target to a tensor
  target= tf.convert_to_tensor(target)         # (1, )
  # adding a new dimension to the target tensor
  target = tf.expand_dims(target, axis=0)      # (1, 1)

  """
  Initially @ 1st timestep we pass <EOS> token to the transformer,
  it generates maxlen ie 40 words, but we take the last word as the output of the 1st timestep
  now we concatenate this ouput to the <EOS> and pass it to the transformer as the input @ 2nd timestep and
  and again it generates 40 words we take the last word and concatenate it and pass it as the input @ 3rd timestep
  """

  for i in range(maxlen):

    # creating masks
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, target)

    #enccoder_input : (1, inp_len)
    # target        : (1,1)

    # making predictions of shape (batch_size,target_seq_len,target_vocab_size)
    predictions,attn_weights=transformer(encoder_input,
                                         target,
                                         False,
                                         enc_padding_mask,
                                         combined_mask,
                                         dec_padding_mask)



    # selecting the last word from predictions @ curr timestep
    last_word = predictions[:,-1,:] # (batch_size,1,target_vocab_size)

    # computing the index of the highest probability and last_word is a eager tensor containing the index
    last_word = tf.argmax(last_word,axis=-1)

    # converting the eager tensor to a int for if condition
    last_word_idx = last_word.numpy().item()


    # casting the last_word to int32
    last_word_id  = last_word
    last_word_id = tf.cast(last_word_id,tf.int32)
    # reshaping last_word_id to match the dimension of target to concat both of them
    last_word_id = tf.reshape(last_word_id, [1, 1])


    # adding the last predicted word(index) @ curr timestep to the input for the transformer @ next timestep
    target= tf.concat([target,last_word_id],axis=-1)

    # if the last predicted word is <EOS> break the loop and stop generating
    if last_word_idx==EOS_INDEX:
      break

  # decoding the indices in targets to texts
  target=list(target.numpy()[0])
  decoded=en_tokenizer.decode(target)

  # removing the special tokens
  decoded_lst= decoded.split(" ")
  decoded=[i for i in decoded_lst if i not in ('<SOS>','<EOS>') ]
  decoded=" ".join(decoded)

  return decoded


### Testing

In [124]:
sample_sentence=input("Enter a hinglish sentence: ")
translated_sentence=translate(sample_sentence)
print(sample_sentence)
print(translated_sentence)

haan interesting tha jab stars sab apne areas par cross overs karthe new one ke saath
yeah it is interesting when stars in other areas cross over to new ones right
