In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv


In [12]:
import tensorflow as tf
import math
import re
import string

In [13]:
# Load the Hindi-English parallel corpus; the frame exposes the text columns
# 'hindi' and 'english' that the rest of the notebook works on.
df = pd.read_csv('/kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv')
df.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [14]:
# Matches an http/https URL up to the next whitespace character.
_URL_RE = re.compile(r"http\S+")
# Latin letters and Devanagari digits that must not remain in the Hindi column.
# (The original range was written `A-z`, which also spans the ASCII symbols
# between 'Z' and 'a'; `A-Z` is what was meant.)
_HINDI_NOISE_RE = re.compile("[a-zA-Z२३०८१५७९४६]")
# Single translation table deleting ASCII digits and ASCII punctuation
# (string.punctuation already contains the single quote).
_DIGITS_AND_PUNCT = str.maketrans("", "", string.digits + string.punctuation)


def _clean_sentence(text, noise_re=None):
    """Normalise one sentence: lowercase, drop URLs, digits, punctuation, extra spaces."""
    text = _URL_RE.sub("", text.lower())
    text = text.translate(_DIGITS_AND_PUNCT)
    if noise_re is not None:
        text = noise_re.sub("", text)
    # Trim the ends, then collapse runs of spaces into a single space.
    return re.sub(" +", " ", text.strip())


def preprocess_text(df):
    """
    Clean the 'english' and 'hindi' columns of a parallel-corpus DataFrame.

    For both columns: missing values become "", text is lowercased, URLs
    (``http\\S+``), ASCII digits and ASCII punctuation are removed, and
    surrounding / repeated spaces are collapsed. The 'hindi' column
    additionally loses Latin letters and Devanagari digits.

    Args:
        df: DataFrame with 'english' and 'hindi' columns.

    Returns:
        A cleaned copy of ``df``; the DataFrame passed in is left unmodified.
    """
    cleaned = df.copy()
    # astype(str) guards against non-string cells, which would make .lower() fail.
    cleaned["english"] = cleaned["english"].fillna("").astype(str).map(_clean_sentence)
    cleaned["hindi"] = (
        cleaned["hindi"]
        .fillna("")
        .astype(str)
        .map(lambda sentence: _clean_sentence(sentence, _HINDI_NOISE_RE))
    )
    return cleaned



In [15]:
# Clean both text columns. `df` is rebound to the cleaned frame, so the raw
# text is no longer reachable under this name after the cell has run.
df = preprocess_text(df)
df.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,accerciser accessibility explorer
2,निचले पटल के लिए डिफोल्ट प्लगइन खाका,the default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका,the default plugin layout for the top panel
4,उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष...,a list of plugins that are disabled by default


In [16]:
def word_count(sentence):
    """Return how many whitespace-separated tokens `sentence` contains."""
    tokens = sentence.split()
    return sum(1 for _ in tokens)


In [17]:
# Keep only sentence pairs in which both sides have at most 10 words.
english_is_short = df['english'].map(word_count).le(10)
hindi_is_short = df['hindi'].map(word_count).le(10)
df = df[english_is_short & hindi_is_short]

In [18]:

# Optional cap on the corpus size for faster experiments (currently disabled):
# df = df.iloc[:300000]
# Rows / columns remaining after the word-count filter.
df.shape

(844857, 2)

In [19]:
# df_fl = df[df['hindi'].apply(lambda x: word_count(x) <= 10)]
# all_unique_words = set()
# for sentence in df_fl['hindi']:
#     all_unique_words.update(unique_words(sentence))
# uwc = len(all_unique_words)
# print(uwc)

In [20]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

# Modified Tokenization function to handle special tokens correctly
def _encode_with_markers(tokenizer, texts):
    """Register <start>/<end> on a fitted tokenizer and return the padded id matrix for `texts`."""
    # The two special ids are placed right after the last id assigned by fitting.
    start_idx = len(tokenizer.word_index) + 1
    end_idx = start_idx + 1
    for token, idx in (('<start>', start_idx), ('<end>', end_idx)):
        tokenizer.word_index[token] = idx
        # Keep the reverse map in sync so the special ids can be decoded as well.
        tokenizer.index_word[idx] = token

    # One vectorised call instead of one texts_to_sequences call per sentence.
    sequences = [
        [start_idx] + seq + [end_idx]
        for seq in tokenizer.texts_to_sequences(texts)
    ]
    # Post-pad with 0 up to the longest sequence.
    return tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')


def tokenize_texts(df, src_col, tgt_col):
    """
    Fit one word-level tokenizer per language and encode both columns.

    Each sentence is converted to word ids, wrapped in that language's
    <start>/<end> ids, and post-padded with 0 to the longest sequence of its
    column. Tokenizers are created with ``filters=''`` so no characters are
    stripped at this stage.

    Args:
        df: DataFrame holding the parallel sentences.
        src_col: Name of the source-language column.
        tgt_col: Name of the target-language column.

    Returns:
        Tuple ``(src_sequences, tgt_sequences, tokenizer_src, tokenizer_tgt)``
        where the sequences are the padded id matrices returned by
        ``pad_sequences``.
    """
    tokenizer_src = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer_tgt = tf.keras.preprocessing.text.Tokenizer(filters='')

    src_texts = df[src_col].values
    tgt_texts = df[tgt_col].values

    # Fit the vocabularies before the special tokens are appended to them.
    tokenizer_src.fit_on_texts(src_texts)
    tokenizer_tgt.fit_on_texts(tgt_texts)

    src_sequences = _encode_with_markers(tokenizer_src, src_texts)
    tgt_sequences = _encode_with_markers(tokenizer_tgt, tgt_texts)

    return src_sequences, tgt_sequences, tokenizer_src, tokenizer_tgt

In [21]:
# Encode English as the source language and Hindi as the target language
src_sequences, tgt_sequences, tokenizer_src, tokenizer_tgt = tokenize_texts(df, 'english', 'hindi')

# Both outputs are already padded 2-D matrices, so the longest sequence length
# is simply the larger column count (no need to iterate over every row).
max_seq_len = max(src_sequences.shape[1], tgt_sequences.shape[1])

# Vocabulary sizes: +1 because id 0 is reserved for padding
input_vocab_size = len(tokenizer_src.word_index) + 1
target_vocab_size = len(tokenizer_tgt.word_index) + 1

# Hold out 20% for testing, then 20% of the remainder for validation
train_src, test_src, train_tgt, test_tgt = train_test_split(src_sequences, tgt_sequences, test_size=0.2, random_state=42)
train_src, val_src, train_tgt, val_tgt = train_test_split(train_src, train_tgt, test_size=0.2, random_state=42)


In [22]:
# Positional Encoding
def positional_encoding(position, d_model):
    """
    Build the sinusoidal positional-encoding table.

    Even feature indices receive the sine of the angle and odd indices the
    cosine. The table is returned as a non-trainable float32 ``tf.Variable``
    with a leading axis of size 1, i.e. shape ``(1, position, d_model)``.
    """
    pos_column = np.arange(position)[:, np.newaxis]
    exponents = (2 * (np.arange(d_model) // 2)) / np.float32(d_model)
    table = pos_column / np.power(10000, exponents)

    # Sine on even columns, cosine on odd columns (written in place).
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    return tf.Variable(tf.cast(table[np.newaxis, ...], dtype=tf.float32), trainable=False)

In [23]:
# Scaled Dot-Product Attention
def scaled_dot_product_attention(q, k, v, mask):
    """
    Compute softmax(q·kᵀ / sqrt(d_k)) · v.

    Args:
        q, k, v: Query, key and value tensors.
        mask: Optional tensor added to the logits after being multiplied by
            -1e9; positions where it is 1 therefore get ~0 attention weight.
            Pass None to skip masking.

    Returns:
        Tuple ``(output, attention_weights)``.
    """
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

    if mask is not None:
        # A large negative logit drives the softmax weight of masked positions to ~0.
        logits = logits + (mask * -1e9)

    weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(weights, v), weights

In [24]:
# Multi-Head Attention Layer
class MultiHeadAttention(tf.keras.layers.Layer):
    """
    Multi-head attention built on `scaled_dot_product_attention`.

    Queries, keys and values are linearly projected to `d_model` features,
    split into `num_heads` heads of size `d_model // num_heads`, attended
    independently, re-merged and passed through a final dense projection.

    Raises:
        ValueError: If `d_model` is not divisible by `num_heads`.
    """

    def __init__(self, d_model, num_heads):
        # Fail early: a non-divisible d_model would otherwise break the head
        # split/merge reshapes at call time.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape to (batch, seq, heads, depth) and transpose to (batch, heads, seq, depth)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, q, k, v, mask):
        """Return ``(projected_output, attention_weights)`` for the given q/k/v and mask."""
        batch_size = tf.shape(q)[0]
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        output, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        # Undo the head split: back to (batch, seq, heads, depth), then merge heads.
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        output = tf.reshape(output, (batch_size, -1, self.d_model))

        return self.dense(output), attention_weights


In [25]:
# Feed Forward Network Layer
class FeedForwardNetwork(tf.keras.layers.Layer):
    """Two dense layers: a ReLU expansion to `dff` units followed by a projection to `d_model`."""

    def __init__(self, d_model, dff):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(dff, activation='relu')
        self.dense2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        """Apply the expansion and then the projection to `x`."""
        return self.dense2(self.dense1(x))

In [26]:
# Encoder Layer
class EncoderLayer(tf.keras.layers.Layer):
    """
    One encoder block: self-attention and a feed-forward network, each followed
    by dropout, a residual connection and layer normalisation.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.3):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask=None):
        """Run the block on `x`; `mask` is forwarded to the self-attention."""
        # Sub-layer 1: self-attention (queries, keys and values are all `x`).
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        attn_block_out = self.layernorm1(x + attended)

        # Sub-layer 2: position-wise feed-forward network.
        transformed = self.dropout2(self.ffn(attn_block_out), training=training)
        return self.layernorm2(attn_block_out + transformed)

In [27]:
class Encoder(tf.keras.layers.Layer):
    """
    Token embedding plus positional encoding followed by a stack of
    `num_layers` EncoderLayer blocks.

    `pe_input` is the number of positions for which the positional-encoding
    table is pre-computed.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, pe_input, rate=0.3):
        super().__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.pos_encoding = positional_encoding(pe_input, d_model)

        self.enc_layers = [
            EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask=None):
        """Embed the token ids in `x` and pass them through every encoder layer."""
        length = tf.shape(x)[1]

        # Scale embeddings by sqrt(d_model) before adding the positional signal.
        hidden = self.embedding(x) * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        hidden = hidden + self.pos_encoding[:, :length, :]
        hidden = self.dropout(hidden, training=training)

        for layer in self.enc_layers:
            hidden = layer(hidden, training=training, mask=mask)

        return hidden

In [28]:
# Decoder Layer
class DecoderLayer(tf.keras.layers.Layer):
    """
    One decoder block: self-attention, attention over the encoder output and a
    feed-forward network, each followed by dropout, a residual connection and
    layer normalisation.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.3):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask=None, dec_padding_mask=None):
        """Return ``(output, self_attention_weights, encoder_attention_weights)``."""
        # Sub-layer 1: self-attention over the decoder input, using look_ahead_mask.
        self_attn, self_attn_weights = self.mha1(x, x, x, look_ahead_mask)
        self_attn = self.dropout1(self_attn, training=training)
        after_self = self.layernorm1(x + self_attn)

        # Sub-layer 2: queries from the decoder, keys/values from the encoder output.
        cross_attn, cross_attn_weights = self.mha2(
            after_self, enc_output, enc_output, dec_padding_mask
        )
        cross_attn = self.dropout2(cross_attn, training=training)
        after_cross = self.layernorm2(after_self + cross_attn)

        # Sub-layer 3: position-wise feed-forward network.
        transformed = self.dropout3(self.ffn(after_cross), training=training)
        result = self.layernorm3(after_cross + transformed)

        return result, self_attn_weights, cross_attn_weights

In [29]:
class Decoder(tf.keras.layers.Layer):
    """
    Target embedding plus positional encoding followed by a stack of
    `num_layers` DecoderLayer blocks.

    `pe_target` is the number of positions for which the positional-encoding
    table is pre-computed.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate=0.3):
        super().__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(pe_target, d_model)

        self.dec_layers = [
            DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask=None, dec_padding_mask=None):
        """
        Decode the token ids in `x` against `enc_output`.

        Returns:
            Tuple ``(output, attention_weights)``; the dict holds both attention
            maps of every layer under 'decoder_layer{n}_block1' / '_block2'.
        """
        length = tf.shape(x)[1]

        # Scale embeddings by sqrt(d_model) before adding the positional signal.
        hidden = self.embedding(x) * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        hidden = hidden + self.pos_encoding[:, :length, :]
        hidden = self.dropout(hidden, training=training)

        attention_weights = {}
        for i, layer in enumerate(self.dec_layers):
            hidden, block1, block2 = layer(
                hidden,
                enc_output,
                training=training,
                look_ahead_mask=look_ahead_mask,
                dec_padding_mask=dec_padding_mask,
            )
            attention_weights[f'decoder_layer{i+1}_block1'] = block1
            attention_weights[f'decoder_layer{i+1}_block2'] = block2

        return hidden, attention_weights

In [30]:
class Transformer(tf.keras.Model):
    """Encoder-decoder model whose final dense layer emits `target_vocab_size` logits per position."""

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.3):
        super().__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training=None, enc_padding_mask=None, look_ahead_mask=None, dec_padding_mask=None):
        """
        Encode `inp`, decode `tar` against it and project to vocabulary logits.

        Returns:
            Tuple ``(logits, attention_weights)`` where `attention_weights` is
            the dict produced by the decoder.
        """
        memory = self.encoder(inp, training=training, mask=enc_padding_mask)
        decoded, attention_weights = self.decoder(
            tar,
            memory,
            training=training,
            look_ahead_mask=look_ahead_mask,
            dec_padding_mask=dec_padding_mask,
        )
        return self.final_layer(decoded), attention_weights

In [31]:
# Transformer hyperparameters (no model is instantiated in this cell).
# NOTE(review): d_model, dff and num_layers are reassigned to different values
# (and num_heads/pe_* re-set) in the later dataset/model setup cell before the
# Transformer is constructed, so the values below are never the ones used.
d_model = 512
dff = 2048
num_layers = 6
num_heads = 8
# Number of positions covered by the encoder / decoder positional encodings
pe_input = max_seq_len
pe_target = max_seq_len


In [32]:
# Helper functions for masks
def create_padding_mask(seq):
    """Return a float mask that is 1.0 where `seq` equals 0 (padding), with two broadcast axes inserted after the batch axis."""
    is_padding = tf.math.equal(seq, 0)
    return tf.cast(is_padding, tf.float32)[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    """Return a (size, size) mask that is 1 strictly above the diagonal and 0 elsewhere."""
    # band_part(..., -1, 0) keeps the lower triangle including the diagonal.
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle

def create_masks(inp, tar):
    """
    Build the three masks consumed by the Transformer.

    Returns:
        Tuple ``(enc_padding_mask, combined_mask, dec_padding_mask)``. Both
        padding masks are derived from `inp`; `combined_mask` is the
        element-wise maximum of the padding mask of `tar` and the look-ahead
        mask sized by the second axis of `tar`.
    """
    # Target side: a position is masked if it is padding OR lies in the future.
    future_mask = create_look_ahead_mask(tf.shape(tar)[1])
    target_padding = create_padding_mask(tar)
    combined_mask = tf.maximum(target_padding, future_mask)

    # Source side: one mask for the encoder, one for the decoder's encoder attention.
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)

    return enc_padding_mask, combined_mask, dec_padding_mask


In [33]:

# Loss function
# Per-token cross-entropy on raw logits; reduction is done manually in loss_function.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Return the cross-entropy averaged over the positions where `real` is not 0 (padding)."""
    per_token_loss = loss_object(real, pred)
    not_padding = tf.cast(tf.math.not_equal(real, 0), dtype=per_token_loss.dtype)
    masked_total = tf.reduce_sum(per_token_loss * not_padding)
    return masked_total / tf.reduce_sum(not_padding)

In [34]:
# Learning rate schedule
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """
    Warm-up learning-rate schedule.

    lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

    Args:
        d_model: Model width used to scale the learning rate.
        warmup_steps: Value of `warmup_steps` in the formula above.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        # Keep the raw value so the schedule can be serialised via get_config().
        self._d_model_config = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step `step`."""
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so an optimizer using this schedule can be saved."""
        return {
            "d_model": self._d_model_config,
            "warmup_steps": self.warmup_steps,
        }

# Metrics
# Running metrics, one loss/accuracy pair per data split. The *_loss objects
# average the values passed to update_state(); they are reset explicitly by
# the training/evaluation code before each pass.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

In [35]:

# Training and validation step functions
@tf.function
def train_step(inp, tar, transformer, optimizer, loss_function, train_loss, train_accuracy):
    """
    Run one optimisation step on a batch with teacher forcing.

    The decoder is fed `tar` without its last token and is trained to predict
    `tar` shifted left by one. Updates `train_loss` with the batch loss and
    `train_accuracy` with the token accuracy over non-padding positions.
    """
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(
            inp, tar_inp, training=True,
            enc_padding_mask=enc_padding_mask,
            look_ahead_mask=combined_mask,
            dec_padding_mask=dec_padding_mask
        )
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss.update_state(loss)
    # Weight padding positions (id 0) with 0 so they do not count towards the
    # accuracy, mirroring the masking already applied in the loss.
    non_padding = tf.cast(tf.math.not_equal(tar_real, 0), tf.float32)
    train_accuracy.update_state(tar_real, predictions, sample_weight=non_padding)

@tf.function
def val_step(inp, tar, transformer, loss_function, val_loss, val_accuracy):
    """
    Evaluate one validation batch (no gradient update, dropout disabled).

    Updates `val_loss` with the batch loss and `val_accuracy` with the token
    accuracy over non-padding positions.
    """
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    predictions, _ = transformer(
        inp, tar_inp, training=False,
        enc_padding_mask=enc_padding_mask,
        look_ahead_mask=combined_mask,
        dec_padding_mask=dec_padding_mask
    )
    loss = loss_function(tar_real, predictions)

    val_loss.update_state(loss)
    # Exclude padding positions (id 0) from the accuracy, as the loss already does.
    non_padding = tf.cast(tf.math.not_equal(tar_real, 0), tf.float32)
    val_accuracy.update_state(tar_real, predictions, sample_weight=non_padding)

In [36]:
# NOTE(review): `val_step` is also defined in the preceding cell; because this
# cell runs later, this definition is the one in effect. One of the two should
# be removed.
@tf.function
def val_step(inp, tar, transformer, loss_function, val_loss, val_accuracy):
    """
    Evaluate one validation batch (no gradient update, dropout disabled).

    Updates `val_loss` with the batch loss and `val_accuracy` with the token
    accuracy over non-padding positions.
    """
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    predictions, _ = transformer(
        inp, tar_inp, training=False,
        enc_padding_mask=enc_padding_mask,
        look_ahead_mask=combined_mask,
        dec_padding_mask=dec_padding_mask
    )
    loss = loss_function(tar_real, predictions)

    val_loss.update_state(loss)
    # Exclude padding positions (id 0) from the accuracy, as the loss already does.
    non_padding = tf.cast(tf.math.not_equal(tar_real, 0), tf.float32)
    val_accuracy.update_state(tar_real, predictions, sample_weight=non_padding)

In [37]:
# Test step function
@tf.function
def test_step(inp, tar, transformer, loss_function, test_loss, test_accuracy):
    """
    Evaluate one test batch (no gradient update, dropout disabled).

    Updates `test_loss` with the batch loss and `test_accuracy` with the token
    accuracy over non-padding positions.
    """
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    predictions, _ = transformer(
        inp, tar_inp, training=False,
        enc_padding_mask=enc_padding_mask,
        look_ahead_mask=combined_mask,
        dec_padding_mask=dec_padding_mask
    )
    loss = loss_function(tar_real, predictions)

    test_loss.update_state(loss)
    # Exclude padding positions (id 0) from the accuracy, as the loss already does.
    non_padding = tf.cast(tf.math.not_equal(tar_real, 0), tf.float32)
    test_accuracy.update_state(tar_real, predictions, sample_weight=non_padding)

In [38]:
# Shuffle-buffer size and mini-batch size for the tf.data pipelines
BUFFER_SIZE = 50000
BATCH_SIZE = 128

# Input pipelines: only the training split is shuffled
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_src, train_tgt))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)
val_dataset = tf.data.Dataset.from_tensor_slices((val_src, val_tgt)).batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_src, test_tgt)).batch(BATCH_SIZE)

# Model hyperparameters actually used to build the Transformer
num_layers = 4
d_model = 128
num_heads = 8
dff = 512
pe_input = max_seq_len
pe_target = max_seq_len

transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    pe_input=pe_input,
    pe_target=pe_target,
)

# Adam driven by the warm-up schedule
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# State reserved for early stopping
patience_counter = 0
best_val_loss = float('inf')

In [2]:
EPOCHS = 10
# Print a progress line every this many training batches instead of on every
# batch, which floods the cell output.
LOG_EVERY_N_BATCHES = 100

# Lists to track metrics over epochs for plotting
train_loss_history = []
val_loss_history = []
test_loss_history = []
train_accuracy_history = []
val_accuracy_history = []
test_accuracy_history = []

# Training loop with per-epoch validation.
# NOTE(review): early stopping is NOT implemented here; `best_val_loss` and
# `patience_counter` from the setup cell are never read or updated.
for epoch in range(EPOCHS):
    # Metrics accumulate across calls, so start every epoch from a clean state
    train_loss.reset_state()
    train_accuracy.reset_state()
    val_loss.reset_state()
    val_accuracy.reset_state()

    for batch, (inp, tar) in enumerate(train_dataset):
        train_step(inp, tar, transformer, optimizer, loss_function, train_loss, train_accuracy)
        if batch % LOG_EVERY_N_BATCHES == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Train Loss {train_loss.result():.4f} Train Accuracy {train_accuracy.result():.4f}')

    for inp, tar in val_dataset:
        val_step(inp, tar, transformer, loss_function, val_loss, val_accuracy)

    print(f'Epoch {epoch + 1} Train Loss {train_loss.result():.4f} Train Accuracy {train_accuracy.result():.4f}')
    print(f'Epoch {epoch + 1} Validation Loss {val_loss.result():.4f} Validation Accuracy {val_accuracy.result():.4f}')

    # Save metrics for plotting
    train_loss_history.append(train_loss.result().numpy())
    val_loss_history.append(val_loss.result().numpy())
    train_accuracy_history.append(train_accuracy.result().numpy())
    val_accuracy_history.append(val_accuracy.result().numpy())


# Evaluate once on the held-out test split after training has finished
test_loss.reset_state()
test_accuracy.reset_state()

for inp, tar in test_dataset:
    test_step(inp, tar, transformer, loss_function, test_loss, test_accuracy)

print(f'Test Loss: {test_loss.result():.4f}')
print(f'Test Accuracy: {test_accuracy.result():.4f}')

# Save test metrics for plotting (a single point each)
test_loss_history.append(test_loss.result().numpy())
test_accuracy_history.append(test_accuracy.result().numpy())


In [None]:
# `plt` is used below but was never imported anywhere in the notebook.
import matplotlib.pyplot as plt


def _plot_history(train_values, val_values, test_values, metric_name):
    """Plot per-epoch training/validation curves and the final test value(s) for one metric."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(train_values, label=f'Training {metric_name}')
    ax.plot(val_values, label=f'Validation {metric_name}')
    # The test metric is measured once after training, so it is drawn as a
    # marker at the last epoch; a single point with only a line style and no
    # marker would not be visible.
    ax.plot(
        [len(train_values) - 1] * len(test_values),
        test_values,
        label=f'Test {metric_name}',
        linestyle='none',
        marker='o',
    )
    ax.set_xlabel('Epochs')
    ax.set_ylabel(metric_name)
    ax.set_title(f'Training, Validation, and Test {metric_name}')
    ax.legend()
    plt.show()


# Plotting Training, Validation, and Test Loss
_plot_history(train_loss_history, val_loss_history, test_loss_history, 'Loss')

# Plotting Training, Validation, and Test Accuracy
_plot_history(train_accuracy_history, val_accuracy_history, test_accuracy_history, 'Accuracy')


In [None]:
# Inference function with improved handling of special tokens and sequence generation
def evaluate(inp_sentence, max_output_length=10):
    """
    Greedily decode a translation for `inp_sentence`.

    The sentence is encoded with the source tokenizer (wrapped in the source
    <start>/<end> ids and post-padded to `max_seq_len`). Decoding starts from
    the TARGET tokenizer's <start> id and stops when the TARGET <end> id is
    predicted or after `max_output_length` steps.

    Args:
        inp_sentence: Source-language sentence.
        max_output_length: Maximum number of decoding steps.

    Returns:
        1-D int tensor of target token ids, beginning with the target <start>
        id; the <end> id is not included.
    """
    src_start = tokenizer_src.word_index['<start>']
    src_end = tokenizer_src.word_index['<end>']
    # The decoder was trained on target-side ids, so its start/stop markers must
    # come from the target tokenizer; the source ids differ whenever the two
    # vocabularies have different sizes.
    tgt_start = tokenizer_tgt.word_index['<start>']
    tgt_end = tokenizer_tgt.word_index['<end>']

    # Encode the input exactly like the training data: <start> ... <end>, post-padded
    inp_ids = [src_start] + tokenizer_src.texts_to_sequences([inp_sentence])[0] + [src_end]
    inp_ids = tf.keras.preprocessing.sequence.pad_sequences([inp_ids], maxlen=max_seq_len, padding='post')
    encoder_input = tf.convert_to_tensor(inp_ids)

    # Decoder input starts as a batch of one sequence holding only <start>
    output = tf.convert_to_tensor([[tgt_start]], dtype=tf.int32)

    for _ in range(max_output_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, output)

        predictions, _ = transformer(
            encoder_input, output, training=False,
            enc_padding_mask=enc_padding_mask, look_ahead_mask=combined_mask, dec_padding_mask=dec_padding_mask
        )

        # Only the distribution at the last time step decides the next token
        last_step_logits = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(last_step_logits, axis=-1), tf.int32)

        # Stop as soon as the model emits the target <end> token
        if int(predicted_id[0, 0]) == tgt_end:
            break

        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0)

# Translation function
def translate(sentence):
    """Decode `sentence` with `evaluate` and print the input next to the predicted translation."""
    token_ids = evaluate(sentence).numpy()

    # Drop padding (0) and the target-side <start>/<end> markers before decoding to text
    special_ids = (tokenizer_tgt.word_index['<start>'], tokenizer_tgt.word_index['<end>'])
    content_ids = [i for i in token_ids if i > 0 and i not in special_ids]
    predicted_sentence = tokenizer_tgt.sequences_to_texts([content_ids])[0]

    print(f'Input: {sentence}')
    print(f'Predicted translation: {predicted_sentence}')

# Example usage
# NOTE(review): "pridicting" looks like a misspelling of "predicting"; the
# source tokenizer is created without an oov_token, so a word missing from its
# vocabulary is presumably dropped from the encoded input — confirm and fix.
translate("pridicting is not good thing which should be promoted")
