### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, SimpleRNN, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from joblib import Parallel, delayed
import re

2024-10-13 16:25:02.287669: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-13 16:25:02.288043: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-13 16:25:02.290210: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-13 16:25:02.296730: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-13 16:25:02.306783: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 

### Loading the Data & Preprocessing

In [3]:
# Load the dataset
df = pd.read_excel("parallel-corpus.xlsx")

In [4]:
print(df["SENTENCES "].isnull().sum())
print(df["MEANING"].isnull().sum())
df.dropna(subset=['SENTENCES ', 'MEANING'], inplace=True)

44
546


In [5]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)
def normalize_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [7]:
# Separate the English and Urdu sentences
english_sentences = df['SENTENCES '].values
urdu_sentences = df['MEANING'].values
english_sentences = english_sentences.astype(str)
urdu_sentences = urdu_sentences.astype(str)


In [8]:
# Preprocess the English sentences
english_sentences = [remove_url(sentence) for sentence in english_sentences]
english_sentences = [normalize_whitespace(sentence) for sentence in english_sentences]
english_sentences = [remove_html_tags(sentence) for sentence in english_sentences]
urdu_sentences = [remove_url(sentence) for sentence in urdu_sentences]
urdu_sentences = [normalize_whitespace(sentence) for sentence in urdu_sentences]
urdu_sentences = [remove_html_tags(sentence) for sentence in urdu_sentences]

In [11]:
# Add start and end tokens to Urdu sentences
urdu_sentences = ['start ' + sentence + ' end' for sentence in urdu_sentences]

### Parameters

In [12]:
max_words = 15000  # Maximum vocabulary size for both languages
max_sequence_length = 20  # Max length of sentences
max_urdu_len = max_sequence_length + 2  # Account for start and end tokens

### Tokenization & Padding

In [13]:
# Tokenize English sentences
english_tokenizer = Tokenizer(num_words=max_words)
english_tokenizer.fit_on_texts(english_sentences)
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
english_word_index = english_tokenizer.word_index
english_padded = pad_sequences(english_sequences, maxlen=max_sequence_length, padding='post')


In [14]:
# Tokenize Urdu sentences
urdu_tokenizer = Tokenizer(num_words=max_words)
urdu_tokenizer.fit_on_texts(urdu_sentences)
urdu_sequences = urdu_tokenizer.texts_to_sequences(urdu_sentences)
urdu_word_index = urdu_tokenizer.word_index
urdu_padded = pad_sequences(urdu_sequences, maxlen=max_urdu_len, padding='post')

### Vocabulary Sizes

In [15]:
english_vocab_size = len(english_word_index) + 1
urdu_vocab_size = len(urdu_word_index) + 1
# Print vocabulary sizes
print(f"English Vocabulary Size: {english_vocab_size}")
print(f"Urdu Vocabulary Size: {urdu_vocab_size}")


English Vocabulary Size: 16471
Urdu Vocabulary Size: 17611


### Train-Test Split

In [16]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(english_padded, urdu_padded, test_size=0.2, random_state=42)

### Defining Model Architecture

In [17]:
latent_dim = 256  # Number of units in the SimpleRNN layers

# Encoder
encoder_inputs = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(input_dim=english_vocab_size, output_dim=latent_dim)(encoder_inputs)
encoder_rnn = SimpleRNN(latent_dim, return_state=True)
_, state_h = encoder_rnn(encoder_embedding)
encoder_states = [state_h]

# Decoder
decoder_inputs = Input(shape=(max_urdu_len,))
decoder_embedding = Embedding(input_dim=urdu_vocab_size, output_dim=latent_dim)(decoder_inputs)
decoder_rnn = SimpleRNN(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_rnn(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(urdu_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [18]:
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Compile the model
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


### Model Training 

In [19]:
# Prepare the target sequences with one time step shift for training
y_train_shifted = np.zeros_like(y_train)
y_train_shifted[:, :-1] = y_train[:, 1:]
y_train_shifted[:, -1] = urdu_tokenizer.word_index['end']  # End token

In [20]:
# Prepare the target sequences with one time step shift for evaluation
y_test_shifted = np.zeros_like(y_test)
y_test_shifted[:, :-1] = y_test[:, 1:]
y_test_shifted[:, -1] = urdu_tokenizer.word_index['end'] 

In [15]:
batch_size = 64
epochs = 100

history = model.fit([X_train, y_train], y_train_shifted,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.2)

Epoch 1/100




[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 796ms/step - accuracy: 0.4437 - loss: 4.5457 - val_accuracy: 0.4910 - val_loss: 3.3380
Epoch 2/100
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 599ms/step - accuracy: 0.4951 - loss: 3.2732 - val_accuracy: 0.5092 - val_loss: 3.1911
Epoch 3/100
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 652ms/step - accuracy: 0.5118 - loss: 3.1263 - val_accuracy: 0.5220 - val_loss: 3.0581
Epoch 4/100
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 704ms/step - accuracy: 0.5217 - loss: 3.0507 - val_accuracy: 0.5272 - val_loss: 2.9612
Epoch 5/100
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 650ms/step - accuracy: 0.5343 - loss: 2.8850 - val_accuracy: 0.5396 - val_loss: 2.8708
Epoch 6/100
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 550ms/step - accuracy: 0.5447 - loss: 2.7784 - val_accuracy: 0.5531 - val_loss: 2.7761
Epoch 7/10

### Inference Models

In [21]:
# Encoder model
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h]

decoder_rnn_outputs, state_h_dec = decoder_rnn(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h_dec]
decoder_outputs = decoder_dense(decoder_rnn_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

### Saving the Model

In [17]:
# # Save the full model
# model.save("english_to_urdu_translation_rnn_model.h5")

# # Save the encoder model
# encoder_model.save("encoder_rnn_model.h5")

# # Save the decoder model
# decoder_model.save("decoder_rnn_model.h5")



### Loading the Model

In [22]:
# Load the full model
model = load_model("english_to_urdu_translation_rnn_model.h5")

# Load the encoder and decoder models for inference
encoder_model = load_model("encoder_rnn_model.h5")
decoder_model = load_model("decoder_rnn_model.h5")



### Decoding For Translation

In [23]:
def decode_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)
    
    # Create a target sequence with just the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = urdu_tokenizer.word_index['start']
    
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        # Predict the next token
        output_tokens, h = decoder_model.predict([target_seq] + [states_value])
        
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = ''
        
        for word, index in urdu_tokenizer.word_index.items():
            if sampled_token_index == index:
                sampled_word = word
                break
        
        if sampled_word == 'end' or len(decoded_sentence.split()) > max_urdu_len:
            stop_condition = True
        else:
            decoded_sentence += sampled_word + ' '
        
        # Update the target sequence with the predicted token
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        
        # Update states
        states_value = h
    
    return decoded_sentence

In [24]:
def preprocess_input(sentence):
    # Tokenize the sentence
    sequence = english_tokenizer.texts_to_sequences([sentence])
    # Pad the sequence
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length, padding='post')
    return padded_sequence

test_sentences = [
  "The sun is shining.",
  "I like to read books.",
  "The cat sleeps on the sofa.",
  "The children play in the park.",
    "I like tea.",
    "We go home.",
    "They are happy.",
    "I am a student.",
    "He is my friend.",
    "Work"

]

for sentence in test_sentences:
    input_seq = preprocess_input(sentence)
    decoded_sentence = decode_sequence(input_seq)
    print(f"English: {sentence} \nUrdu: {decoded_sentence}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
English: The sun is shining. 
Urdu: چکن jalapeno 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
English: I like to read books. 
Urdu: میں نے اسے گا۔ نہیں کی تھے۔ ضرورت ہیں۔ 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

### Model Evaluation

In [25]:
def generate_predictions(input_seq):
    decoded_sentence = decode_sequence(input_seq) 
    return decoded_sentence

# compute BLEU score for a single prediction
def compute_bleu(actual, predicted):
    smoothing_function = SmoothingFunction().method1  # Choose your smoothing method
    return sentence_bleu(actual, predicted, smoothing_function=smoothing_function)

# reduced test size
reduced_test_size = 1000  
X_test_reduced = X_test[:reduced_test_size]
y_test_reduced = y_test[:reduced_test_size]


actual_sentences = []
predicted_sentences = []

#  predictions for all test sequences 
for i in range(len(X_test_reduced)):
    input_seq = X_test_reduced[i:i+1]
    decoded_sentence = generate_predictions(input_seq)
    predicted_sentences.append(decoded_sentence.split())
    actual_sentences.append([urdu_tokenizer.sequences_to_texts([y_test_reduced[i]])[0].split()])

# Computing BLEU scores 
bleu_scores = Parallel(n_jobs=-1)(
    delayed(compute_bleu)(actual_sentences[i], predicted_sentences[i]) for i in range(len(X_test_reduced))
)

# Calculating avg BLEU score
average_bleu_score = np.mean(bleu_scores)
print(f"Average BLEU score: {average_bleu_score:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15

In [25]:
def print_model_accuracy(model, X_test, y_test):
    loss, accuracy = model.evaluate([X_test, y_test], y_test_shifted)
    
    print(f"Model Loss: {loss:.4f}")
    print(f"Model Accuracy: {accuracy:.4f}")

print_model_accuracy(model, X_test, y_test)



[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 60ms/step - accuracy: 0.6447 - loss: 2.5049
Model Loss: 2.4759
Model Accuracy: 0.6490
