In [13]:
import pandas as pd
import tensorflow as tf

In [14]:
df = pd.read_pickle("./data/sa_to_hi.pkl")

In [15]:
df.head()

Unnamed: 0,sa,hi
0,<hi2> उद वाचमीरयति हिन्वते मती पुरुष्टुतस्य कत...,[start] उन्होंने अपनी आवाज़ और अपने कई प्यारे ...
1,<hi2> तव देव प्रसादाच्च भ्रातुश्च जयतां वर। कृ...,[start] 'हे विजयी स्वामी वह जो आपकी सहायता से ...
2,<hi2> ये नाकस्याधि रोचने दिवि देवास आसते|,[start] जो आकाश के ऊपर स्वर्ग में देवताओं के र...
3,<hi2> यथैव धेनु स्स्रवति स्नेहाद्वत्सस्य वत्सल...,[start] “जिस प्रकार अपने बछड़े की गाय पालने वा...
4,<hi2> क्षिप्रं भवति धर्मात्मा शश्वच्छान्तिं नि...,[start] जल्दी से वह धर्मी (दिमाग वाला) हो जात...


In [16]:
import random
n = random.randint(0,len(df))

df['sa'][n],df['hi'][n]

('<hi2> ततस्तेनाभ्यनुज्ञाताः प्रसन्नेन महात्मना। ताम्रपर्णीं ग्राहजुष्टां तरिष्यथ महानदीम्।।',
 "[start] 'आप तब महानुभावों से प्रभावित तम्रपर्णी महान नदी को पार करेंगे। प्रसन्न (आप के साथ) ऋषि की अनुमति से इसे पार करें। [end]")

In [17]:
len(df)

7336

In [None]:
import sentencepiece as spm
sanskrit_tokenizer = spm.SentencePieceProcessor()
sanskrit_tokenizer.load("./model/sanskrit_tokenizer_2.model")

True

In [None]:
hindi_tokenizer = spm.SentencePieceProcessor()
hindi_tokenizer.load("./model/konkani_tokenizer.model")

True

In [20]:
hindi_tokenizer.decode(19999)

'ऽ'

In [21]:
import numpy as np
def pad_same_lenth(sent,tokenize,max_len):
    sent = tokenize.encode(sent)
    if len(sent) > max_len:
        return tf.convert_to_tensor(sent[:max_len],dtype="int32")
    else:
        return tf.convert_to_tensor(np.pad(sent, (0, max_len - len(sent))),dtype="int32")

In [22]:
max_len = 20
df["sa"] = df["sa"].apply(lambda x:pad_same_lenth(x,sanskrit_tokenizer,max_len))
df["hi"] = df["hi"].apply(lambda x: pad_same_lenth(x, hindi_tokenizer, max_len+1))

I0000 00:00:1743608861.589765    2672 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4309 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 6GB Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [23]:
len(df["sa"][38]),len(df["hi"][78])

(20, 21)

In [24]:
val_ds = df.iloc[:100]
train_ds = df.iloc[100:]

In [25]:
import tensorflow as tf

In [26]:
batch_size = 32


def format_dataset(sans, hindi):
    return (
        {
            "english": sans,
            "spanish": hindi[:, :-1],
        },
        hindi[:, 1:],
    )


def make_dataset(pairs,src,trg):
    # eng_texts, spa_texts = zip(*pairs)
    src_text = pairs[src].to_list()
    targ_text = pairs[trg].to_list()
    dataset = tf.data.Dataset.from_tensor_slices((src_text, targ_text))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(train_ds,"sa","hi")
val_ds = make_dataset(val_ds,"sa","hi")

In [27]:
for inputs, targets in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['spanish'].shape: {inputs['spanish'].shape}")
    print(f"targets.shape: {targets.shape}")

inputs['english'].shape: (32, 20)
inputs['spanish'].shape: (32, 20)
targets.shape: (32, 20)


2025-04-02 21:17:43.633484: W tensorflow/core/kernels/data/cache_dataset_ops.cc:914] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2025-04-02 21:17:43.634099: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [28]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "dense_dim": self.dense_dim,
            }
        )
        return config

In [29]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "dense_dim": self.dense_dim,
            }
        )
        return config

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            padding_mask = mask
        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

In [30]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    # def compute_mask(self, inputs, mask=None):
    #     return tf.math.not_equal(inputs, 0)

    def get_config(self):
        config = super(PositionalEmbedding, self).get_config()
        config.update(
            {
                "output_dim": self.output_dim,
                "sequence_length": self.sequence_length,
                "input_dim": self.input_dim,
            }
        )
        return config

In [31]:
embed_dim = 256
dense_dim = 2048
num_heads = 8
sequence_length = 20
vocab_size = 20000

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="spanish")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [32]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint(
    "best_model.keras", monitor="val_loss", save_best_only=True, mode="min", verbose=1
)

In [34]:
transformer.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
hist = transformer.fit(train_ds, epochs=6, validation_data=val_ds,callbacks=[checkpoint])

Epoch 1/6


W0000 00:00:1743608975.384808    4615 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m  3/227[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m12s[0m 54ms/step - accuracy: 0.3238 - loss: 3.6578




[1m186/227[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m2s[0m 52ms/step - accuracy: 0.3281 - loss: 3.7486

W0000 00:00:1743608988.573040    4619 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m189/227[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m2s[0m 70ms/step - accuracy: 0.3282 - loss: 3.7490




[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.3290 - loss: 3.7529

W0000 00:00:1743608994.218338    4620 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
W0000 00:00:1743608994.748972    4615 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert



Epoch 1: val_loss did not improve from 5.17980
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 75ms/step - accuracy: 0.3290 - loss: 3.7530 - val_accuracy: 0.2625 - val_loss: 5.3143
Epoch 2/6
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.3539 - loss: 3.5651
Epoch 2: val_loss did not improve from 5.17980
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 51ms/step - accuracy: 0.3539 - loss: 3.5650 - val_accuracy: 0.2600 - val_loss: 5.4192
Epoch 3/6
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.3793 - loss: 3.3015
Epoch 3: val_loss did not improve from 5.17980
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 52ms/step - accuracy: 0.3794 - loss: 3.3014 - val_accuracy: 0.2520 - val_loss: 5.6048
Epoch 4/6
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.4051 - loss: 3.0742
Epoch 4: val_loss did not improve from 

In [123]:
transformer.summary()

In [145]:
tf.convert_to_tensor(hindi_tokenizer.encode(["[start]"]))

<tf.Tensor: shape=(1, 2), dtype=int32, numpy=array([[19877,     3]], dtype=int32)>

In [None]:
import numpy as np

# spa_vocab = target_vectorization.get_vocabulary()
# spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20


def decode_sequence(input_sentence,transformer):
    hehe = sanskrit_tokenizer.encode(input_sentence)[:20]
    tokenized_input_sentence = tf.convert_to_tensor([hehe])
    # print(tokenized_input_sentence)
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = tf.convert_to_tensor([hindi_tokenizer.encode(decoded_sentence)[:][:-1]])
        # print([tokenized_input_sentence, tokenized_target_sentence])
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(predictions[0, i, :])
        print(sampled_token_index)
        sampled_token = hindi_tokenizer.decode(int(sampled_token_index))
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence


# test_eng_texts = [pair[0] for pair in test_pairs]
# for _ in range(20):
#     input_sentence = random.choice(test_eng_texts)
#     print("-")
#     print(input_sentence)
#     print(decode_sequence(input_sentence))

decode_sequence(
    "<hi2> तेन जायामुप परियां मन्दानो याह्यन्धसो योजा|",transformer
)  # [start] Do you celebrate Easter in Brazil? [end]

3
82
4


'[start] [start] हे [end]'

'दिग्राम'