In [107]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

# Load Quran English and Urdu translations
with open('/content/Quran-EN (1)', 'r', encoding='utf-8') as f:
    eng_lines = f.read().strip().split('\n')

with open('/content/Quran-UR (1)', 'r', encoding='utf-8') as f:
    ur_lines = f.read().strip().split('\n')

# Add <start> and <end> tokens to English sentences
en_lines = ['<start> ' + line + ' <end>' for line in eng_lines]

print(f"Total english sentences: {len(eng_lines)}")
print(f"Total urdu sentences: {len(ur_lines)}")



Total english sentences: 6414
Total urdu sentences: 6414


In [108]:
ur_lines = ['<start> ' + line + ' <end>' for line in ur_lines]

In [109]:
ur_lines[0:5]

['<start> \ufeffسب تعریفیں اللہ ہی کے لئے ہیں جو تمام جہانوں کی پرورش فرمانے والا ہے ۔ <end>',
 '<start> نہایت مہربان بہت رحم فرمانے والا ہے ۔ <end>',
 '<start> روزِ جزا کا مالک ہے ۔ <end>',
 '<start> اے اللہ ! ہم تیری ہی عبادت کرتے ہیں اور ہم تجھ ہی سے مدد چاہتے ہیں ۔ <end>',
 '<start> ہمیں سیدھا راستہ دکھا ۔ <end>']

In [110]:
# Urdu tokenizer (input)
ur_tokenizer = Tokenizer(filters='')
ur_tokenizer.fit_on_texts(ur_lines)
ur_tensor = ur_tokenizer.texts_to_sequences(ur_lines)
ur_tensor = pad_sequences(ur_tensor, padding='post')

# English tokenizer (target)
en_tokenizer = Tokenizer(filters='')
en_tokenizer.fit_on_texts(en_lines)  # ✅ Corrected
en_tensor = en_tokenizer.texts_to_sequences(en_lines)  # ✅ Corrected
en_tensor = pad_sequences(en_tensor, padding='post')

In [111]:
!pip install nltk
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [112]:
from sklearn.model_selection import train_test_split

input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    ur_tensor, en_tensor, test_size=0.1)

print("Train size:", len(input_tensor_train))
print("Validation size:", len(input_tensor_val))

Train size: 5772
Validation size: 642


In [113]:
ur_vocab_size = len(ur_tokenizer.word_index) + 1
en_vocab_size = len(en_tokenizer.word_index) + 1

print("Urdu vocab size:", ur_vocab_size)
print("English vocab size:", en_vocab_size)


Urdu vocab size: 8144
English vocab size: 8138


In [114]:
from tensorflow.keras.layers import Embedding, LSTM
import tensorflow as tf

class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()
        self.enc_units = enc_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(enc_units, return_sequences=True, return_state=True)

    def call(self, x):
        x = self.embedding(x)  # shape: (batch_size, seq_len, embedding_dim)
        output, state_h, state_c = self.lstm(x)
        return output, state_h, state_c


In [115]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)  # for encoder hidden states
        self.W2 = tf.keras.layers.Dense(units)  # for decoder hidden state
        self.V = tf.keras.layers.Dense(1)       # for calculating the attention score

    def call(self, encoder_outputs, decoder_hidden):
        # encoder_outputs shape: all hidden states
        # decoder_hidden shape: current hidden state
        # Wused for dimension same
        decoder_hidden_time_axis = tf.expand_dims(decoder_hidden, 1)

        # Score calculation (alignment model)
        score = self.V(tf.nn.tanh(
            self.W1(encoder_outputs) + self.W2(decoder_hidden_time_axis)
        ))  # score shape: (batch_size, max_length, 1)

        # Attention weights (softmax over time steps) gets distribution of all tokens
        attention_weights = tf.nn.softmax(score, axis=1)

        # Context vector: weighted sum of encoder_outputs
        context_vector = attention_weights * encoder_outputs
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights


In [116]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, decoder_units):
        super(Decoder, self).__init__()
        self.decoder_units = decoder_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(decoder_units,
                                         return_sequences=True,
                                         return_state=True,
                                         recurrent_initializer='glorot_uniform')
        self.attention = BahdanauAttention(decoder_units)
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden, enc_output):


        # Step 1: Attention
        context_vector, attention_weights = self.attention(enc_output, hidden)

        # Step 2: Embed current input word (like <start>, then next predicted words)
        x = self.embedding(x)  # (batch_size, 1, embedding_dim)

        # Step 3: Concatenate context vector with embedded input word
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # Step 4: Pass through LSTM
        output, state_h, state_c = self.lstm(x)

        # Step 5: Output shape -> (batch_size, vocab_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)

        return x, state_h, state_c, attention_weights


In [117]:
def translate(inp_sentence):
    # Preprocess input sentence
    inp_sentence = '<start> ' + inp_sentence + ' <end>'  # Add tokens
    inp_sentence_seq = ur_tokenizer.texts_to_sequences([inp_sentence])
    inp_sentence_seq = pad_sequences(inp_sentence_seq, padding='post', maxlen=input_tensor_train.shape[1])

    # Get the encoder's output
    enc_output, enc_hidden_h, enc_hidden_c = encoder(inp_sentence_seq)

    # Start the translation (decoder input: <start> token)
    start_token = en_tokenizer.word_index['<start>']
    dec_input = tf.expand_dims([start_token], 1)

    # Initialize with encoder's hidden state
    dec_hidden_h = enc_hidden_h
    dec_hidden_c = enc_hidden_c

    predicted_sentence = []

    # Run the decoder to predict words step by step
    for t in range(1, en_tensor.shape[1]):  # Max sentence length
        predictions, dec_hidden_h, dec_hidden_c, _ = decoder(dec_input, dec_hidden_h, enc_output)

        predicted_id = tf.argmax(predictions, axis=1).numpy()[0]  # Get the predicted word index

        if predicted_id == 0:  # Skip padding
            continue

        predicted_word = en_tokenizer.index_word.get(predicted_id, '<unknown>')

        if predicted_word == '<end>':
            break

        if predicted_word != '<start>':  # Don't add <start> to the result
            predicted_sentence.append(predicted_word)

        # Use the predicted word as the next input to the decoder
        dec_input = tf.expand_dims([predicted_id], 1)

    return ' '.join(predicted_sentence)

In [118]:
def calculate_bleu_score():
    references = []
    hypotheses = []

    # Only evaluate a sample for faster computation
    sample_size = min(100, len(input_tensor_val))

    for i in range(sample_size):
        # Get input sentence
        inp_sentence = ur_tokenizer.sequences_to_texts([input_tensor_val[i]])[0]

        # Get target sentence and clean it
        targ_sentence = en_tokenizer.sequences_to_texts([target_tensor_val[i]])[0]
        targ_sentence = targ_sentence.replace('<start>', '').replace('<end>', '').strip()

        # Translate the input sentence
        predicted_sentence = translate(inp_sentence)

        # Add to lists
        references.append([targ_sentence.split()])
        hypotheses.append(predicted_sentence.split())

    # Calculate BLEU score with different n-grams
    bleu1 = corpus_bleu(references, hypotheses, weights=(1.0, 0, 0, 0))
    bleu2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
    bleu3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0))
    bleu4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

    print(f"BLEU-1: {bleu1:.4f}")
    print(f"BLEU-2: {bleu2:.4f}")
    print(f"BLEU-3: {bleu3:.4f}")
    print(f"BLEU-4: {bleu4:.4f}")

    return bleu4

In [119]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))  # Mask padded tokens
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


In [120]:
import tensorflow as tf

# Ensure eager execution
tf.compat.v1.enable_eager_execution()
optimizer = tf.keras.optimizers.Adam()

In [121]:
@tf.function
def train_step(inp, targ, enc_hidden_h, enc_hidden_c):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden_h, enc_hidden_c = encoder(inp)

        dec_hidden_h = enc_hidden_h
        dec_hidden_c = enc_hidden_c

        dec_input = tf.expand_dims([en_tokenizer.word_index['<start>']] * inp.shape[0], 1)

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden_h, dec_hidden_c, _ = decoder(dec_input, dec_hidden_h, enc_output)

            loss += loss_function(targ[:, t], predictions)

            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = loss / int(targ.shape[1])

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss


In [122]:

BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 16
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE

embedding_dim = 256
units = 512

encoder = Encoder(ur_vocab_size, embedding_dim, units)
decoder = Decoder(en_vocab_size, embedding_dim, units)


dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

EPOCHS = 5

for epoch in range(EPOCHS):
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset):
        enc_hidden_h = tf.zeros((BATCH_SIZE, units))
        enc_hidden_c = tf.zeros((BATCH_SIZE, units))

        batch_loss = train_step(inp, targ, enc_hidden_h, enc_hidden_c)
        total_loss += batch_loss

    print(f'Epoch {epoch+1} Loss {total_loss / steps_per_epoch:.4f}')


Epoch 1 Loss 0.9368
Epoch 2 Loss 0.8173
Epoch 3 Loss 0.7641
Epoch 4 Loss 0.7199
Epoch 5 Loss 0.6828


In [123]:
# Calculate BLEU score on validation set
print("Calculating BLEU score...")
bleu_score = calculate_bleu_score()
print(f"Final BLEU-4 Score: {bleu_score:.4f}")

# Test the translation with a few examples
test_sentences = [
    ur_lines[0].replace('<start>', '').replace('<end>', '').strip(),
    ur_lines[5].replace('<start>', '').replace('<end>', '').strip(),
    ur_lines[10].replace('<start>', '').replace('<end>', '').strip()
]

for sentence in test_sentences:
    translation = translate(sentence)
    print(f"Urdu: {sentence}")
    print(f"Translation: {translation}")
    print("-" * 50)

Calculating BLEU score...


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.0397
BLEU-2: 0.0141
BLEU-3: 0.0044
BLEU-4: 0.0000
Final BLEU-4 Score: 0.0000
Urdu: ﻿سب تعریفیں اللہ ہی کے لئے ہیں جو تمام جہانوں کی پرورش فرمانے والا ہے ۔
Translation: ( o beloved ! ) we have been sent down to the people of the earth .
--------------------------------------------------
Urdu: ان لوگوں کا راستہ جن پر تو نے انعام فرمایا ۔
Translation: ( o beloved ! )
--------------------------------------------------
Urdu: اور وہ لوگ جو آپ کی طرف نازل کیا گیا اور جو آپ سے پہلے نازل کیا گیا سب پر ایمان لاتے ہیں ، اور وہ آخرت پر بھی کامل یقین رکھتے ہیں ۔
Translation: and ( o beloved ! ) we have been sent down to the earth .
--------------------------------------------------
