# English to Hindi Translation Using IIT Bombay dataset.

### Importing the Libraries

In [None]:
# !pip install tensorflow==2.11

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
tf.__version__

'2.13.0'

### Downloading the Dataset From Kaggle

In [None]:
!pip install -q kaggle
from google.colab import files

# Choose the kaggle.json file that you downloaded
files.upload()

# Make directory named kaggle and copy kaggle.json file there.
!mkdir ~/.kaggle

!cp kaggle.json ~/.kaggle/

# Change the permissions of the file.
!chmod 600 ~/.kaggle/kaggle.json


# !kaggle datasets list

Saving kaggle.json to kaggle.json


In [None]:
# Downloading the dataset
!kaggle datasets download -d "vaibhavkumar11/hindi-english-parallel-corpus"

Downloading hindi-english-parallel-corpus.zip to /content
100% 112M/112M [00:06<00:00, 23.0MB/s]
100% 112M/112M [00:06<00:00, 18.1MB/s]


### Unzipping the dataset

In [None]:
import zipfile

# Extract the downloaded archive into the current working directory.
# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile("/content/hindi-english-parallel-corpus.zip") as zip_ref:
  zip_ref.extractall()

In [None]:
# Another way to extract data using cmd
# !unzip "/content/hindi-english-parallel-corpus.zip"

## Data Preprocessing

### Loading the Dataset

In [None]:
# Setting data directory path
data_dir = "/content/hindi_english_parallel.csv"

In [None]:
dataset = pd.read_csv(data_dir)
dataset.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [None]:
dataset.describe()

Unnamed: 0,hindi,english
count,1555785,1561116
unique,983939,1015945
top,Name,Your names
freq,1093,363


In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1561841 entries, 0 to 1561840
Data columns (total 2 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   hindi    1555785 non-null  object
 1   english  1561116 non-null  object
dtypes: object(2)
memory usage: 23.8+ MB


In [None]:
# Checking for null values in the hindi column of the dataframe
dataset["hindi"].isnull().values.any()

True

In [None]:
# Counting the number of null values in the hindi column
dataset.hindi.isnull().values.sum()

6056

In [None]:
# Checking for null values in the english column of the dataframe
dataset.english.isnull().values.any()

True

In [None]:
# Counting the number of null values in the english column
dataset.english.isnull().values.sum()

725

In [None]:
# Checking and counting null values in the dataframe
dataset.isnull().values.any(), dataset.isnull().values.sum()

(True, 6781)

In the case of `Language Translation` we can't perform `Imputation` as we normally do, like imputing the value by mean, median or most frequent, etc.

In this case we are going to `remove the rows` having null values from both the columns.

### Dropping the null values from the dataset

In [None]:
data = dataset.dropna()
data.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1555727 entries, 0 to 1561839
Data columns (total 2 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   hindi    1555727 non-null  object
 1   english  1555727 non-null  object
dtypes: object(2)
memory usage: 35.6+ MB


In [None]:
data.describe()

Unnamed: 0,hindi,english
count,1555727,1555727
unique,983909,1011562
top,Name,Your names
freq,1093,363


In [None]:
# Checking the number of values in each columns
len(data["hindi"]), len(data["english"])

(1555727, 1555727)

### Calculate the average number of words in the sentences

In [None]:
# Whitespace-separated token count of every Hindi sentence
num_words_in_hindi_sentences = list(map(lambda sentence: len(sentence.split()), data["hindi"]))

# Mean sentence length = total number of tokens / number of sentences
total_hindi_words = sum(num_words_in_hindi_sentences)
average_words_in_hindi_sentences = total_hindi_words / len(data["hindi"])
average_words_in_hindi_sentences

13.751966765377215

In [None]:
np.percentile(num_words_in_hindi_sentences, 97)

53.0

In [None]:
# Whitespace-separated token count of every English sentence
num_words_in_english_sentences = list(map(lambda sentence: len(sentence.split()), data["english"]))

# Mean sentence length = total number of tokens / number of sentences
total_english_words = sum(num_words_in_english_sentences)
average_words_in_english_sentences = total_english_words / len(data["english"])
average_words_in_english_sentences

12.734555612906378

In [None]:
np.percentile(num_words_in_english_sentences, 97)

47.0

### Vectorizing the Sentences by using two different TextVectorization Layer

In [None]:
eng_sentences = data["english"]
hin_sentences = data["hindi"]

In [None]:
# Surround every Hindi sentence with explicit start/end markers so that both
# marker tokens end up in the Hindi vocabulary when the vectorizer is adapted.
sentence_hindi = ["startofseq {} endofseq".format(sen) for sen in hin_sentences]
sentence_hindi[:10]

['startofseq अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें endofseq',
 'startofseq एक्सेर्साइसर पहुंचनीयता अन्वेषक endofseq',
 'startofseq निचले पटल के लिए डिफोल्ट प्लग-इन खाका endofseq',
 'startofseq ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका endofseq',
 'startofseq उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है endofseq',
 'startofseq अवधि को हाइलाइट रकें endofseq',
 'startofseq पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि endofseq',
 'startofseq सीमांत (बोर्डर) के रंग को हाइलाइट करें endofseq',
 'startofseq हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता।  endofseq',
 'startofseq भराई के रंग को हाइलाइट करें endofseq']

In [None]:
# Stream both corpora in batches of 256 sentences (with prefetching);
# these datasets are what the TextVectorization layers are adapted on.
eng_sen_prefetched = (
    tf.data.Dataset.from_tensor_slices(eng_sentences)
    .batch(256)
    .prefetch(tf.data.AUTOTUNE)
)
hin_sen_prefetched = (
    tf.data.Dataset.from_tensor_slices(sentence_hindi)
    .batch(256)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
vocab_size = 30000
max_token_length = 53  # 97th percentile of the Hindi sentence lengths computed above
# input_vocab_size = 10000
# output_vocab_size = 15000
# max_token_length_input = 47
# max_token_length_output = 53

# Both vectorizers share the same vocabulary cap and padded/truncated output length
vectorizer_config = dict(max_tokens=vocab_size,
                         output_sequence_length=max_token_length)

# Hindi vectorizer (decoder side / targets)
text_vec_hindi = tf.keras.layers.TextVectorization(**vectorizer_config)

# English vectorizer (encoder side / inputs)
text_vec_english = tf.keras.layers.TextVectorization(**vectorizer_config)

# Learn the English vocabulary from the raw English sentences
text_vec_english.adapt(eng_sen_prefetched)

# Learn the Hindi vocabulary; the sentences in hin_sen_prefetched already carry
# the startofseq / endofseq markers, so both markers become vocabulary tokens
text_vec_hindi.adapt(hin_sen_prefetched)

In [None]:
text_vec_hindi.get_vocabulary()[-10:]

['सिंधियों',
 'सारहीन',
 'साफतौर',
 'सादापाठ',
 'सागरतट',
 'साकेत',
 'साइरिलिकयूक्रेनी',
 'साइडिंग',
 'साइंसेज',
 'सांद्रण']

In [None]:
text_vec_english.get_vocabulary()[-10:]

['lyricism',
 'ly',
 'luxemburg',
 'lunatic',
 'lumpsum',
 'lumbini',
 'lubrication',
 'luanda',
 'ls',
 'lotion']

### Splitting the dataset into train and validation sets

We can not use train test split method to split the dataset.

In [None]:
# Number of leading sentence pairs used for training; everything after it is
# held out for validation. Defined once so all six slices stay in sync.
train_size = 1_400_001

# Encoder inputs: raw English sentences
X_train = eng_sentences[:train_size]
X_valid = eng_sentences[train_size:]

# Decoder inputs: Hindi sentences prefixed with the start marker
X_train_dec = [f"startofseq {sen}" for sen in hin_sentences[:train_size]]
X_valid_dec = [f"startofseq {sen}" for sen in hin_sentences[train_size:]]

# Targets: Hindi sentences followed by the end marker, vectorized to token ids
y_train = text_vec_hindi([f"{sen} endofseq" for sen in hin_sentences[:train_size]])
y_valid = text_vec_hindi([f"{sen} endofseq" for sen in hin_sentences[train_size:]])

In [None]:
# # Prefetching and batching the dataset
# X_train = tf.data.Dataset.from_tensor_slices((X_train, X_train_dec))
# X_valid = tf.data.Dataset.from_tensor_slices((X_valid, X_valid_dec))

# # X_train_dec = tf.data.Dataset.from_tensor_slices(X_train_dec)
# # X_valid_dec = tf.data.Dataset.from_tensor_slices(X_valid_dec)

# y_train = tf.data.Dataset.from_tensor_slices(y_train)
# y_valid = tf.data.Dataset.from_tensor_slices(y_valid)

# X_train_data = tf.data.Dataset.zip((X_train, y_train)).batch(32).prefetch(tf.data.AUTOTUNE)
# X_valid_data = tf.data.Dataset.zip((X_valid, y_valid)).batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
# Wrap the English (encoder-input) sentences in tf.data Datasets, one element per sentence.
# No batching/prefetching is applied here; that happens once, after the inputs are zipped with the targets.
# NOTE: this rebinds X_train / X_valid, so re-run the split cell before re-running this one.
X_train = tf.data.Dataset.from_tensor_slices(X_train) # .batch(32).prefetch(tf.data.AUTOTUNE)
X_valid = tf.data.Dataset.from_tensor_slices(X_valid) # .batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
# Wrap the Hindi decoder-input sentences ("startofseq ...") in tf.data Datasets, one element per sentence.
# No batching/prefetching is applied here; that happens once, after the inputs are zipped with the targets.
# NOTE: this rebinds X_train_dec / X_valid_dec, so re-run the split cell before re-running this one.
X_train_dec = tf.data.Dataset.from_tensor_slices(X_train_dec) # .batch(32).prefetch(tf.data.AUTOTUNE)
X_valid_dec = tf.data.Dataset.from_tensor_slices(X_valid_dec) # .batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
# Wrap the vectorized Hindi targets (token ids) in tf.data Datasets, one element per sentence.
# No batching/prefetching is applied here; that happens once, after the inputs are zipped with the targets.
# NOTE: this rebinds y_train / y_valid, so re-run the split cell before re-running this one.
y_train = tf.data.Dataset.from_tensor_slices(y_train) # .batch(32).prefetch(tf.data.AUTOTUNE)
y_valid = tf.data.Dataset.from_tensor_slices(y_valid) # .batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
# Each element is ((encoder sentence, decoder sentence), target token ids),
# matching the two-input / one-output signature of the models below.
# Batches of 256 examples are prefetched so input preparation overlaps with training.
X_train_dataset = (
    tf.data.Dataset.zip(((X_train, X_train_dec), y_train))
    .batch(256)
    .prefetch(tf.data.AUTOTUNE)
)
X_valid_dataset = (
    tf.data.Dataset.zip(((X_valid, X_valid_dec), y_valid))
    .batch(256)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
y_train, y_valid

(<_TensorSliceDataset element_spec=TensorSpec(shape=(53,), dtype=tf.int64, name=None)>,
 <_TensorSliceDataset element_spec=TensorSpec(shape=(53,), dtype=tf.int64, name=None)>)

In [None]:
# X_train[:5], X_valid[:5]

In [None]:
# X_valid_dec[:5], X_train[:5]

In [None]:
# y_train[:2], y_valid[:2]

# Model_0:  Building the LSTM model

In [None]:
# Setting the embedding output size
embed_size = 128

In [None]:
# Both model inputs are scalar strings: one raw sentence per example.

# Encoder side: raw English sentence -> English token ids
encoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
encoder_input_ids = text_vec_english(encoder_inputs)

# Decoder side: raw Hindi sentence (with start marker) -> Hindi token ids
decoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
decoder_input_ids = text_vec_hindi(decoder_inputs)

In [None]:
# Separate embedding tables for the English (encoder) and Hindi (decoder) vocabularies.
# mask_zero=True marks token id 0 (padding) as masked for the layers that follow.
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)

# Look up the embeddings of the vectorized token ids
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [None]:
# Encoder LSTM: return_state=True additionally yields its final states,
# which are collected into `encoder_state` and handed to the decoder.
encoder = tf.keras.layers.LSTM(512, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

In [None]:
# Decoder LSTM: emits one output per time step (return_sequences=True) and
# starts from the encoder's final states.
decoder = tf.keras.layers.LSTM(512, return_state=True, return_sequences=True)
decoder_outputs, *decoder_state = decoder(decoder_embeddings, initial_state=encoder_state)

In [None]:
# Softmax over the Hindi vocabulary at every decoder time step
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
y_prob = output_layer(decoder_outputs)

In [None]:
# Assemble the two-input (English sentence, Hindi decoder sentence) model
model_0 = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[y_prob],
    name="LSTM_language_translation_model",
)

# Sparse categorical cross-entropy: the targets are integer token ids, not one-hot vectors
model_0.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Nadam(),
    metrics=["accuracy"],
)

In [None]:
int(len(X_train_dataset) * 0.1), X_train_dataset

(4375,
 <PrefetchDataset element_spec=((TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.string, name=None)), TensorSpec(shape=(None, 53), dtype=tf.int64, name=None))>)

In [None]:
# Train on 10% of the training batches per epoch and validate on 10% of the validation batches
train_steps_0 = int(len(X_train_dataset) * 0.1)
valid_steps_0 = int(len(X_valid_dataset) * 0.1)

model_0.fit(
    X_train_dataset,
    epochs=10,
    steps_per_epoch=train_steps_0,
    validation_data=X_valid_dataset,
    validation_steps=valid_steps_0,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7d95ca2f9c90>

## Making Prediction

In [None]:
# Writing a function to make prediction
def translate(sentence_eng, model):
  """Greedily translate one English sentence into Hindi.

  The decoder is fed "startofseq" plus the words predicted so far; at each
  step the most probable next word is appended, until "endofseq" is
  predicted or `max_token_length` words have been generated.

  Args:
    sentence_eng: a single English sentence (str).
    model: a trained model taking (encoder text, decoder text) and returning
      per-position probabilities over the Hindi vocabulary.

  Returns:
    The translated Hindi sentence as a string (without the marker tokens).
  """
  # Hoisted out of the loop: neither value changes between decoding steps,
  # and rebuilding the vocabulary list on every step is needlessly expensive.
  vocabulary = text_vec_hindi.get_vocabulary()
  X = np.array([sentence_eng])  # Encoder input (batch of one sentence)

  translation = ""
  for word in range(max_token_length): # One decoding step per output position
    X_dec = np.array(["startofseq" + translation])  # Decoder input: words generated so far
    # verbose=0 suppresses the progress bar that predict() would print on every step
    y_proba = model.predict((X, X_dec), verbose=0)[0, word]  # probabilities at position `word`
    predicted_word_id = np.argmax(y_proba)
    predicted_word = vocabulary[predicted_word_id]
    if predicted_word == "endofseq":
      break
    translation += " " + predicted_word
  return translation.strip()

In [None]:
# translate() takes a single sentence string plus the model to decode with
translate("i love hindi", model_0)

TypeError: ignored

# Model 1: Building Bidirectional LSTM Model

To implement a Bidirectional LSTM layer in Keras, just wrap a recurrent layer in a `tf.keras.layers.Bidirectional layer`.

**Problem with Bidirectional layer:**
* It returns four states instead of two: `the final short-term` and `long-term states of the forward LSTM layer`, and `the final short-term` and `long-term states of the backward LSTM layer`.
* We cannot use this quadruple state directly as the initial state of the decoder's LSTM layer, since it expects just two states (short-term and long-term).
* So, Concatenate the final short-term of forward LSTM layer and final short-term of backward LSTM layer into one.
* Similarly, concatenate the final long-term of forward LSTM layer and final long-term of backward LSTM layer into one.

**Note:** We cannot make the `Decoder Bidirectional`, since it must remain causal: otherwise it would cheat during training and it would not work.

In [None]:
# Model_1 (functional API): bidirectional LSTM encoder + unidirectional LSTM decoder

# ----- Encoder -----
encoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string) # one raw English sentence per example
encoder_embedded = encoder_embedding_layer(text_vec_english(encoder_inputs))
bidirectional_encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(units=256, return_state=True))
encoder_outputs, *encoder_states = bidirectional_encoder(encoder_embedded)

# The wrapper returns four states: (short-term, long-term) of the forward LSTM
# followed by (short-term, long-term) of the backward LSTM. Merge them pairwise
# so the decoder receives exactly one short-term and one long-term state.
forward_h, forward_c, backward_h, backward_c = encoder_states
encoder_states = [tf.concat([forward_h, backward_h], axis=-1),  # short-term states
                  tf.concat([forward_c, backward_c], axis=-1)]  # long-term states

# ----- Decoder -----
decoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
decoder_embedded = decoder_embedding_layer(text_vec_hindi(decoder_inputs))
# 512 units = 2 * 256, the width of the concatenated encoder states
decoder_lstm = tf.keras.layers.LSTM(units=512,
                                    return_sequences=True,
                                    return_state=True)
decoder_output, *decoder_states = decoder_lstm(decoder_embedded,
                                               initial_state=encoder_states)

# ----- Output layer: softmax over the Hindi vocabulary at every time step -----
output_y_prob = tf.keras.layers.Dense(units=vocab_size, activation="softmax")(decoder_output)

# ----- Model -----
model_1 = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=output_y_prob,
    name="Bidirectional_LSTM_model",
)

model_1.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Nadam(),
    metrics=["accuracy"],
)

In [None]:
# Fit the model on 5% of the training batches per epoch and validate on 5% of the validation batches
checkpoint_path_1 = f"Checkpoints/{model_1.name}"

history_1 = model_1.fit(X_train_dataset,
                        epochs=10,
                        steps_per_epoch=int(0.05 * len(X_train_dataset)),
                        validation_data=X_valid_dataset,
                        validation_steps=int(0.05 * len(X_valid_dataset)),
                        callbacks=[# Save the model to checkpoint_path_1 during training
                                   tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path_1),
                                   # Stop after 3 epochs without val_loss improvement and roll back to the best weights
                                   tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                                                    patience=3,
                                                                    restore_best_weights=True)])

# Backward-compatible alias for the original (misspelled) variable name
histroy_1 = history_1

Epoch 1/10



Epoch 2/10



Epoch 3/10



Epoch 4/10



Epoch 5/10



Epoch 6/10



Epoch 7/10



Epoch 8/10



Epoch 9/10



Epoch 10/10





In [None]:
# making translation using the model_1
translate(eng_sentences[49], model_1), eng_sentences[49]



('पूर्ण स्थिति', 'Absolute position')

# Model_2: Encoder-Decoder Model (Bidirectional LSTM) with Attention Mechanism

In this model:
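* Instead of sending only the encoder's final hidden state to the decoder (along with the previous target word at each step), we now also send all of the encoder's outputs to the decoder.
* An `Attention` layer then combines the decoder outputs (queries) with the encoder outputs (values) before the final softmax layer.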

In [None]:
# Encoder: two stacked bidirectional LSTMs
encoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
x = text_vec_english(encoder_inputs)
x = encoder_embedding_layer(x)
# The first layer only passes its output sequence on. It must not use
# return_state=True: that makes the call return a list [outputs, *states],
# and the next layer would then receive that list instead of one sequence tensor.
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=256,
                                                       return_sequences=True))(x)
# The second layer provides the per-step outputs (for attention) and the final states (for the decoder)
encoder_outputs, *encoder_states = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=256,
                                                                                      return_state=True,
                                                                                      return_sequences=True))(x)

# Concatenating the forward/backward encoder states so the decoder gets one pair
encoder_states = [tf.concat(encoder_states[::2], axis=-1), # short-term (0, 2)
                  tf.concat(encoder_states[1::2], axis=-1)] # long-term (1, 3)

# Decoder: two stacked unidirectional LSTMs
decoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
x = text_vec_hindi(decoder_inputs)
# Hindi token ids must be looked up in the decoder's (Hindi) embedding table
x = decoder_embedding_layer(x)
# As in the encoder, the first layer returns only its output sequence
x = tf.keras.layers.LSTM(units=512,
                         return_sequences=True)(x, initial_state=encoder_states)
decoder_outputs, *decoder_states = tf.keras.layers.LSTM(units=512,
                                                        return_state=True,
                                                        return_sequences=True)(x)

# Adding attention layer: decoder outputs are the queries, encoder outputs the values
attention_outputs = tf.keras.layers.Attention()([decoder_outputs, encoder_outputs])

# Output layer: softmax over the Hindi vocabulary at every decoder time step
output_y_prob = tf.keras.layers.Dense(units=vocab_size, activation="softmax")(attention_outputs)


# Create the model
model_2 = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                         outputs=output_y_prob, name="encoder_decoder_with_attention")

# Compile the model
model_2.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Nadam(),
                metrics=["accuracy"])

In [None]:
# Train on 15% of the training batches per epoch and validate on 15% of the validation batches
checkpoint_path_2 = f"Checkpoints/{model_2.name}"

train_steps_2 = int(0.15 * len(X_train_dataset))
valid_steps_2 = int(0.15 * len(X_valid_dataset))

# Stop after 3 epochs without val_loss improvement and roll back to the best weights
early_stopping_2 = tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                                    patience=3,
                                                    restore_best_weights=True)

history_2 = model_2.fit(
    X_train_dataset,
    epochs=4,
    steps_per_epoch=train_steps_2,
    validation_data=X_valid_dataset,
    validation_steps=valid_steps_2,
    # Checkpointing is currently disabled:
    # tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path_2)
    callbacks=[early_stopping_2],
)

In [None]:
import multiprocessing

cores = multiprocessing.cpu_count() # Count the number of cores in a computer
cores

2

# Model_3: Transformer Model

In [None]:
# Raw-string inputs of the Transformer. The double-s names keep these Input
# tensors distinct from `encoder_inputs` / `decoder_inputs`, which are rebound
# further down to the position-aware embeddings.

# Encoder side: raw English sentence -> English token ids
encoder_inputss = tf.keras.layers.Input(shape=(), dtype=tf.string)
encoder_input_ids = text_vec_english(encoder_inputss)

# Decoder side: raw Hindi sentence (with start marker) -> Hindi token ids
decoder_inputss = tf.keras.layers.Input(shape=(), dtype=tf.string)
decoder_input_ids = text_vec_hindi(decoder_inputss)

In [None]:
embed_size = 128
max_token_length = 53

# Fresh embedding tables for the Transformer (these rebind the names used by the LSTM models).
# mask_zero=True marks token id 0 (padding) as masked for the layers that follow.
embedding_kwargs = dict(input_dim=vocab_size, output_dim=embed_size, mask_zero=True)
encoder_embedding_layer = tf.keras.layers.Embedding(**embedding_kwargs)
decoder_embedding_layer = tf.keras.layers.Embedding(**embedding_kwargs)

# Look up the embeddings of the vectorized token ids
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

### Creating Positional Encoder

**Positional encoding:** A dense vector that encodes the position of a word within a sentence: the i-th positional encoding is added to the word embedding of the i-th word in the sentence.
* The easiest way to implement this is to use an Embedding layer and make it encode all the positions from 0 to the maximum sequence length in the batch, then add the result to the word embeddings.
* The rules of broadcasting will ensure that the positional encodings get applied to every input sequence.
* The encoder and decoder share the same Embedding layer for the positional encodings, since they have the same embedding size.
* Instead of using trainable positional encodings, the authors of the Transformer paper chose to use fixed positional encodings, based on the sine and cosine functions at different frequencies.
* We are also going to create the same positional encoding as a custom layer.

In [None]:
embed_size = 128
max_token_length = 53

# Trainable positional embeddings: one vector per position 0 .. max_token_length - 1,
# shared by the encoder and the decoder.
pos_embed_layer = tf.keras.layers.Embedding(input_dim=max_token_length,
                                            output_dim=embed_size)

# Encoder: add the position vectors to the word embeddings (broadcast over the batch axis)
batch_max_len_encoder = tf.shape(encoder_embeddings)[1]
encoder_positions = tf.range(batch_max_len_encoder)
encoder_inputs = encoder_embeddings + pos_embed_layer(encoder_positions)

# Decoder: same, using the decoder's own sequence length
batch_max_len_decoder = tf.shape(decoder_embeddings)[1]
decoder_positions = tf.range(batch_max_len_decoder)
decoder_inputs = decoder_embeddings + pos_embed_layer(decoder_positions)

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
  """Adds fixed (non-trainable) sine/cosine positional encodings to its inputs.

  A table of shape (1, max_token_size, embed_size) is precomputed once:
  even feature indices hold sin(p / 10000 ** (i / embed_size)) and odd
  feature indices hold the matching cosine, where p is the position and
  i the even feature index.

  Args:
    max_token_size: number of positions to precompute (longest supported sequence).
    embed_size: size of the embedding vectors; must be even.
    dtype: dtype of the layer and of the encoding table.
  """

  def __init__(self, max_token_size, embed_size, dtype=tf.float32, **kwargs):
    super().__init__(dtype=dtype, **kwargs)
    assert embed_size % 2 == 0, "embed_size must be even"
    # Size the table from the constructor argument, not from a notebook-level global
    p, i = np.meshgrid(np.arange(max_token_size), 2 * np.arange(embed_size // 2))
    pos_emb = np.empty((1, max_token_size, embed_size))
    pos_emb[0, :, ::2] = np.sin(p / 10000 ** (i / embed_size)).T
    pos_emb[0, :, 1::2] = np.cos(p / 10000 ** (i / embed_size)).T
    self.pos_encodings = tf.constant(pos_emb.astype(self.dtype))
    self.supports_masking = True  # let an incoming mask pass through unchanged

  # Defined at class level so Keras uses it as the layer's forward pass
  def call(self, inputs):
    """Return `inputs` plus the encodings for the first `inputs.shape[1]` positions."""
    batch_max_length = tf.shape(inputs)[1]
    return inputs + self.pos_encodings[:, :batch_max_length]

Alternatively, we can use fixed, non-trainable positional encodings:

In [None]:
# # Adding positional encoding to the encoder's inputs
# pos_embed_layer = PositionalEncoding(max_token_size=max_token_length, embed_size=embed_size)

# # Now passing the encoder and decoder's embedding to pos_embed_layer respectively
# encoder_inputs = pos_embed_layer(encoder_embeddings)
# decoder_inputs = pos_embed_layer(decoder_embeddings)

### Encoder of the Transformer model

In [None]:
# Creating the Encoder of the Transformer:
# N identical blocks, each = self-attention + feed-forward, both wrapped in a
# residual (skip) connection followed by layer normalization.

N = 3 # number of stacked blocks (the decoder loop reuses the same N)
num_heads = 8   # number of parallel attention heads in every MultiHeadAttention layer
dropout_rate = 0.1
n_units = 128 # For the first dense layer in each Feedforward block
# True for real tokens, False for padding (token id 0); a new axis is inserted at position 1
encoder_pad_mask = tf.math.not_equal(encoder_input_ids, 0)[:, tf.newaxis]
Z = encoder_inputs

for _ in range(N):
  # Self-attention sub-layer: query and value are both Z; padding is masked out
  skip = Z
  attention_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads,
                                                       key_dim=embed_size,
                                                       dropout=dropout_rate)
  Z = attention_layer(Z, value=Z, attention_mask=encoder_pad_mask)
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

  # Feed-forward sub-layer: expand to n_units, project back to embed_size, then dropout
  skip = Z # Update the value of skip with Z
  Z = tf.keras.layers.Dense(units=n_units, activation="relu")(Z)
  Z = tf.keras.layers.Dense(units=embed_size)(Z)
  Z = tf.keras.layers.Dropout(dropout_rate)(Z)
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

### Decoder of the Transformer model

In [None]:
# True for real decoder tokens, False for padding (token id 0); a new axis is inserted at position 1
decoder_is_token = tf.math.not_equal(decoder_input_ids, 0)
decoder_pad_mask = decoder_is_token[:, tf.newaxis]

# Lower-triangular boolean matrix: position t may only attend to positions <= t
all_positions = tf.ones((batch_max_len_decoder, batch_max_len_decoder), tf.bool)
causal_mask = tf.linalg.band_part(all_positions, -1, 0)

In [None]:
# Creating the Decoder of the Transformer:
# N identical blocks, each = masked self-attention + cross-attention over the
# encoder outputs + feed-forward, every sub-layer wrapped in a residual (skip)
# connection followed by layer normalization.
encoder_outputs = Z # Saving the encoder's final output
Z = decoder_inputs  # Decoder starts with its own inputs

for _ in range(N):
  # 1) Masked self-attention: causal mask hides future positions, pad mask hides padding
  skip = Z
  masked_attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads,
                                                        key_dim=embed_size,
                                                        dropout=dropout_rate)
  Z = masked_attention(Z, value=Z, attention_mask=causal_mask & decoder_pad_mask)
  # The result must be stored back in Z; otherwise this residual + normalization step is discarded
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

  # 2) Cross-attention: decoder positions query the encoder outputs (encoder padding masked out)
  skip = Z # Updating the skip value with updated Z
  attention_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads,
                                                       key_dim=embed_size,
                                                       dropout=dropout_rate)
  Z = attention_layer(Z, value=encoder_outputs, attention_mask=encoder_pad_mask)
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

  # 3) Feed-forward: expand to n_units, project back to embed_size
  skip = Z
  Z = tf.keras.layers.Dense(units=n_units, activation="relu")(Z)
  Z = tf.keras.layers.Dense(units=embed_size)(Z)
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

In [None]:
# Output layer: softmax over the Hindi vocabulary at every decoder position
transformer_output_layer = tf.keras.layers.Dense(units=vocab_size, activation="softmax")
output_y_prob = transformer_output_layer(Z)

# The model takes the raw-string Input tensors (double-s names), not the
# position-aware embeddings that now live in encoder_inputs / decoder_inputs
model_3 = tf.keras.models.Model(
    inputs=[encoder_inputss, decoder_inputss],
    outputs=[output_y_prob],
)

# Sparse categorical cross-entropy: the targets are integer token ids
model_3.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Nadam(),
    metrics=["accuracy"],
)

# Print the layer-by-layer summary
model_3.summary()

### Fitting the Transformer model on 1% of the training batches and validating it on 1% of the validation batches

In [None]:
# Train on 1% of the training batches per epoch and validate on 1% of the validation batches
train_steps_3 = int(0.01 * len(X_train_dataset))
valid_steps_3 = int(0.01 * len(X_valid_dataset))

history_3 = model_3.fit(
    X_train_dataset,
    epochs=3,
    steps_per_epoch=train_steps_3,
    validation_data=X_valid_dataset,
    validation_steps=valid_steps_3,
)

# history_3 = model_3.fit((X_train, X_train_dec),
#                         y_train,
#                         epochs=3,
#                         steps_per_epoch=int(0.05 * len(X_train_dataset)),
#                         validation_data=((X_valid, X_valid_dec), y_valid),
#                         validation_steps=int(0.01 * len(X_valid_dataset)))

### Making prediction

In [None]:
translate("I love hindi", model_3), #eng_sentences[0]

In [None]:
# Debugging cell: the same greedy decoding loop as translate(), unrolled at
# notebook level against model_3 and printing the decoder input at every step.
# It leaves X, X_dec and translation behind for inspection in later cells.
sentence_eng = "application and accessibility"

translation = ""
for word in range(max_token_length): # at most one decoding step per output position
  X = np.array([sentence_eng])  # Encoder input (batch of one sentence)
  # print(X)
  X_dec = np.array(["startofseq" + translation])  # Decoder input: words generated so far
  print(X_dec)
  y_proba = model_3.predict((X, X_dec))[0, word]  # model output at position `word`
  # print(y_proba)
  predicted_word_id = np.argmax(y_proba)  # greedy choice: highest-scoring vocabulary index
  # print(predicted_word_id)
  predicted_word = text_vec_hindi.get_vocabulary()[predicted_word_id]
  # print(predicted_word)
  if predicted_word == "endofseq":  # end marker predicted: stop decoding
    break
  translation += " " + predicted_word

In [None]:
# translation, y_proba[0],
pred = model_3.predict((X, X_dec))
translation

In [None]:
pred.shape, pred.ndim, pred[0, 1]

In [None]:
# Small 2 x 3 x 3 array used to compare the two indexing spellings
arr = np.array([
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    [[11, 12, 13], [14, 15, 16], [17, 18, 19]],
])

# Chained indexing and tuple indexing select the same row
arr[0][1], arr[0, 1]