In [1]:
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

## Parsing the data

In [2]:
text_file = r"npi.txt"
with open(text_file, encoding="utf8") as f:
    lines=f.read().split("\n")[:-2]
text_pairs = []
for line in lines:
    eng,nep= line.split("\t")[:2]
    nep = "[Start]"+ nep+" [end]"
    text_pairs.append((eng,nep))
print(text_pairs[:5])


[('Who?', '[Start]को? [end]'), ('Hide.', '[Start]लुकाउनुहोस्। [end]'), ('Hide.', '[Start]लुक। [end]'), ('Stay.', '[Start]बस्नुहोस्। [end]'), ('Hello!', '[Start]नमस्ते! [end]')]


In [3]:
import pandas as pd
excel_file= r"C:\Users\Leapfrog\Desktop\Nlp\Transformer\english-nepali.xlsx"
sheet_name = "Sheet1"
df = pd.read_excel(excel_file, sheet_name=sheet_name)

rows_as_tuples = [(row[0],f"[Start] {row[1]} [end]") for row in df.values]

print(rows_as_tuples[:5])

[('it happened on the third day, that behold, a man came out of the camp from Saul, with his clothes torn, and earth on his head: and so it was, when he came to David, that he fell to the earth, and showed respect.', '[Start] तब तेस्रो दिनमा एउटा जवान सैनिक सिकलगमा आयो। त्यो मानिस शाऊलको छाउनीबाट आएको थियो। त्यसका लुगाहरू च्यतिएको र शिरमा मैला लागेको थियो। त्यसले दाऊदको अघि धोप्टो परेर उनलाई सम्मान गर्न दण्डवत् गर्यो। [end]'), ('David said to him, "Where do you come from?" He said to him, "I have escaped out of the camp of Israel."', '[Start] दाऊदले त्यसलाई सोधे, “तिमी कहाँबाट आयौ?” त्यस मानिसले जवाफ दियो, “म इस्राएली पालबाट आउँदैछु।” [end]'), ('David said to him, "How did it go? Please tell me." He answered, "The people have fled from the battle, and many of the people also have fallen and are dead; and Saul and Jonathan his son are dead also."', '[Start] दाऊदले भने, “मलाई भन, के भयो?” त्यसले भन्यो, “हाम्रा सबै सैनिकहरू भागे। धेरै मानिसहरू मारिए। शाऊल र तिनका छोरा जोनाथन पनि मरे।” [en

In [4]:
text_pairs =text_pairs+rows_as_tuples


# Here's what our sentence pairs look like

In [5]:
for _ in range(5):
    print(random.choice(text_pairs))

('The waters surrounded me, even to the soul. The deep was around me. The weeds were wrapped around my head.', '[Start] “पानीले मलाई मेरो प्राण सम्मै घेर्यो, गहिरो सगर मेरो चारैतिर थियो, मेरो टाउको भरि समुद्रका झ्याउहरु बेहेरिएका थिए। [end]')
('Now when they had departed, behold, an angel of the Lord appeared to Joseph in a dream, saying, "Arise and take the young child and his mother, and flee into Egypt, and stay there until I tell you, for Herod will seek the young child to destroy him."', '[Start] ज्योतिषीहरू गएपछि, परमप्रभुको एउटा दूत सपनामा यूसुफकहाँ देखा परे। दूतले भने, “उठ, साना बालक र उहाँकी आमालाई साथमा लिएर मिश्रदेशमा जाऊ। हेरोदले साना बालकहरूलाई खोज्न लागिरहेकोछ। उसले सानो बालकलाई मार्न चाहन्छ। मैले नभनेसम्म तिमी त्यहीं बस्नु।” [end]')
('This is the inheritance of the children of Zebulun according to their families, these cities with their villages.', '[Start] यसरी, यी शहरहरू अनि ती वरिपरिका खेतहरू जबूलूनलाई दिए। जबूलूनका प्रत्येक कुलले भूमिका आफ्ना अंश प्राप्त गरे। [end]')

## Let's split the sentence pairs into a training set , a validation set and a test set.

In [6]:
random.shuffle(text_pairs)
num_val_samples = int(0.15*len(text_pairs))
num_train_samples = len(text_pairs) - 2*num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples: num_train_samples+num_val_samples]
test_pairs = text_pairs[num_train_samples +num_val_samples:]

print(f'{len(text_pairs)} total pairs')
print(f'{len(train_pairs)} training pairs')
print(f'{len(val_pairs)} validation pairs')
print(f'{len(test_pairs)} test pairs')

34707 total pairs
24295 training pairs
5206 validation pairs
5206 test pairs


# Vectorizing the text data

In [7]:
strip_chars = string.punctuation + "?"
print(strip_chars)
strip_chars = strip_chars.replace("[","")
strip_chars = strip_chars.replace("]","")

vocab_size = 15000
sequence_length = 10
batch_size = 2


def custom_standardization(input_string):
    lowercase = tf.string.lower(input_string)
    return tf.string.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

eng_vectorization = TextVectorization(
    max_tokens = vocab_size, output_mode = "int", output_sequence_length=sequence_length,)
nep_vectorization = TextVectorization(
    max_tokens = vocab_size,
    output_mode = "int",
    output_sequence_length = sequence_length+1,
    #standardize = custom_standardization,
)

train_eng_texts = [pair[0] for pair in train_pairs]
train_nep_texts = [pair[1] for pair in train_pairs]
# print(train_nep_texts)
eng_vectorization.adapt(train_eng_texts)
nep_vectorization.adapt(train_nep_texts)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~?


In [8]:
def format_dataset(eng,nep):
    eng = eng_vectorization(eng)
    nep = nep_vectorization(nep)
    return({"encoder_inputs": eng, "decoder_inputs": nep[:, :-1],}, nep[:,1:])

def make_dataset(pairs):
    eng_texts, nep_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    nep_texts = list(nep_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, nep_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [9]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f'targets.shape: {targets.shape}')

inputs["encoder_inputs"].shape: (2, 10)
inputs["decoder_inputs"].shape: (2, 10)
targets.shape: (2, 10)


## Building Model

In [10]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_din, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads = num_heads, key_dim=embed_din
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation = "relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True
        

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value = inputs, key= inputs, attention_mask= padding_mask
        )
        proj_input = self.layernorm_1(inputs+attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)
    

In [11]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        # Generate causal mask
        causal_mask = self.get_causal_attention_mask(inputs)
        
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="float32")
            combined_mask = causal_mask * padding_mask  # Logical AND for masks
        else:
            combined_mask = causal_mask

        # Self-attention block
        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        # Encoder-decoder attention block
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=combined_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        # Dense projection
        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        sequence_length = input_shape[1]

        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="float32")  # Causal mask (lower triangular)
        mask = tf.reshape(mask, (1, sequence_length, sequence_length))
        mask = tf.tile(mask, [batch_size, 1, 1])  # Tile across the batch size
        return mask        

In [12]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim = vocab_size, output_dim = embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim = sequence_length, output_dim= embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit = length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedding_positions = self.position_embeddings(positions)
        return embedded_tokens + embedding_positions

    def compute_mask(self, inputs, mask = None):
        return tf.math.not_equal(inputs,0)
        

## Training a Model for translation

In [13]:
embed_dim = 256
latent_dim = 512
num_heads = 4

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name ="encoder_inputs")
x= PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,),dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name ="decoder_state_inputs")

x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x,encoded_seq_inputs)
x= layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])

transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)



## Training Our Model

In [14]:
epochs = 2

transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)


Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 256)   3842560     ['encoder_inputs[0][0]']         
 alEmbedding)                                                                                     
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 256)   1315840     ['positional_embedding[

In [15]:
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/2


InvalidArgumentError: Graph execution error:

Detected at node 'gradient_tape/transformer/transformer_encoder/multi_head_attention/softmax/add/BroadcastGradientArgs' defined at (most recent call last):
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
      self.io_loop.start()
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\tornado\platform\asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\asyncio\base_events.py", line 603, in run_forever
      self._run_once()
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\asyncio\base_events.py", line 1909, in _run_once
      handle._run()
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell
      await result
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request
      await super().execute_request(stream, ident, parent)
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute
      res = shell.run_cell(
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell
      result = self._run_cell(
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell
      result = runner(coro)
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\IPython\core\async_helpers.py", line 128, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\Leapfrog\AppData\Local\Temp\ipykernel_23300\3062792833.py", line 1, in <module>
      transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\keras\engine\training.py", line 997, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 576, in minimize
      grads_and_vars = self._compute_gradients(
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 634, in _compute_gradients
      grads_and_vars = self._get_gradients(
    File "C:\Users\Leapfrog\anaconda3\envs\oenv\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 510, in _get_gradients
      grads = tape.gradient(loss, var_list, grad_loss)
Node: 'gradient_tape/transformer/transformer_encoder/multi_head_attention/softmax/add/BroadcastGradientArgs'
Incompatible shapes: [2,4,10,10] vs. [2,2,10,10]
	 [[{{node gradient_tape/transformer/transformer_encoder/multi_head_attention/softmax/add/BroadcastGradientArgs}}]] [Op:__inference_train_function_9549]