# Implementing Many-to-Many RNN for English-to-Urdu Language Translation and Exploring Its Limitations

## Importing Libraries

In [52]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import string
from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt
import urduhack
from urduhack.preprocessing import normalize_whitespace
from urduhack.preprocessing import remove_accents
import demoji
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Dense, TimeDistributed
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

## Loading Datasets

In [7]:
df = pd.read_excel("parallel-corpus.xlsx")

df = df.iloc[:, :2]  # since every other column is meaningless and is not being dropped by pandas

df.head()


Unnamed: 0,SENTENCES,MEANING
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,How can I make friends?’,میں دوست کیسے بنائوں ؟
2,Why do I get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"If you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"Depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


## Preprocessing Datasets

### Lowercasing English

In [8]:
df['SENTENCES '] = df['SENTENCES '].str.lower()
df.head()

Unnamed: 0,SENTENCES,MEANING
0,how can i communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,how can i make friends?’,میں دوست کیسے بنائوں ؟
2,why do i get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"if you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


### Removing Null Values

In [9]:
print(df.shape)

(30164, 2)


In [10]:
print("Before null values", df['SENTENCES '].isnull().sum())
print("Before null values", df['MEANING'].isnull().sum())


df = df.dropna(subset=['SENTENCES ', 'MEANING'])

# Verify the size after removing null values
print(df.shape)

print("After null values", df['SENTENCES '].isnull().sum())
print("After null values", df['MEANING'].isnull().sum())


Before null values 47
Before null values 546
(29614, 2)
After null values 0
After null values 0


### Removing URLs

In [11]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [12]:
df['SENTENCES '] = df['SENTENCES '].apply(remove_url)
df['MEANING'] = df['MEANING'].apply(remove_url)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SENTENCES '] = df['SENTENCES '].apply(remove_url)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['MEANING'] = df['MEANING'].apply(remove_url)


### Removing accent Urdu

In [13]:
df['MEANING'] = df['MEANING'].apply(remove_accents)

### Removing HTML Tags

In [14]:
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [15]:
df['SENTENCES '] = df['SENTENCES '].apply(remove_html_tags)
df['MEANING'] = df['MEANING'].apply(remove_html_tags)

### Spelling Correction English

In [16]:
def correct_spell(text):
    return str(TextBlob(text).correct())

In [17]:
# df['SENTENCES '] = df['SENTENCES '].apply(correct_spell)

df.head()

Unnamed: 0,SENTENCES,MEANING
0,how can i communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,how can i make friends?’,میں دوست کیسے بنائوں ؟
2,why do i get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"if you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


### Whitespace Remover

In [18]:
df['SENTENCES '] = df['SENTENCES '].apply(normalize_whitespace)
df['MEANING'] = df['MEANING'].apply(normalize_whitespace)

### Handling Short Conversations English

In [19]:
# Removing Short Convo / Chat Words
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

In [20]:
def chat_conversion(text):
    new_text = []
    for i in text.split():
        if i.upper() in chat_words:
            new_text.append(chat_words[i.upper()])
        else:
            new_text.append(i)
    return " ".join(new_text)

In [21]:
df['SENTENCES '] = df['SENTENCES '].apply(chat_conversion)

df.head()

Unnamed: 0,SENTENCES,MEANING
0,how can i communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,how can i make friends?’,میں دوست کیسے بنائوں ؟
2,why do i get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"if you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں گ...


### Handling Short Conversations Urdu

In [22]:
def filter_short_posts(text):
    words = text.split()
    return len(words) >= 3

In [23]:
mask = df['MEANING'].apply(filter_short_posts)
df = df[mask].reset_index(drop=True)

df.head()

Unnamed: 0,SENTENCES,MEANING
0,how can i communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,how can i make friends?’,میں دوست کیسے بنائوں ؟
2,why do i get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"if you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں گ...


### Tokenization


In [24]:
def tokenize(x):
    tokenizer = Tokenizer(split=' ', char_level=False)
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer

In [25]:
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}


In [26]:
text_sentences = [
    'تیز بھوری گدھ جو سست کتے پر چھلانگ لگاتی ہے۔',
    'اوہ خدا، میری تیز مطالعہ لغت نویسی نے انعام جیتا۔',
    'یہ ایک مختصر جملہ ہے۔'
]
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)

{'تیز': 1, 'ہے۔': 2, 'بھوری': 3, 'گدھ': 4, 'جو': 5, 'سست': 6, 'کتے': 7, 'پر': 8, 'چھلانگ': 9, 'لگاتی': 10, 'اوہ': 11, 'خدا،': 12, 'میری': 13, 'مطالعہ': 14, 'لغت': 15, 'نویسی': 16, 'نے': 17, 'انعام': 18, 'جیتا۔': 19, 'یہ': 20, 'ایک': 21, 'مختصر': 22, 'جملہ': 23}


### Padding

In [27]:
def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])

    return pad_sequences(x, maxlen=length, padding='post', truncating='post')

In [28]:
preproc_english_sentences, english_tokenizer = tokenize(df["SENTENCES "])
preproc_urdu_sentences, urdu_tokenizer = tokenize(df["MEANING"])

In [29]:
# Find the maximum length of both English and Urdu sequences
max_length = max(
    max([len(seq) for seq in preproc_english_sentences]),
    max([len(seq) for seq in preproc_urdu_sentences])
)
# Pad both English and Urdu sequences to the same length
preproc_english_sentences = pad(preproc_english_sentences, length=max_length)
preproc_urdu_sentences = pad(preproc_urdu_sentences, length=max_length)


In [30]:

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(preproc_english_sentences, preproc_urdu_sentences, test_size=0.2, random_state=42)

# Reshape input data
X_train = X_train.reshape((*X_train.shape, 1))
X_val = X_val.reshape((*X_val.shape, 1))


In [53]:
def build_english_to_urdu_rnn(english_vocab_size, urdu_vocab_size, output_sequence_length):
    """
    Build and compile a simple RNN model for English to Urdu translation with an embedding layer
    """
    # Initialize a Sequential model
    model = Sequential()

    # Embedding layer for English sentences
    model.add(Embedding(input_dim=english_vocab_size, output_dim=128, input_length=None))  # input_length will be defined in the first layer

    # RNN layer processing the embeddings
    model.add(SimpleRNN(128, return_sequences=True))

    # Output layer predicting the Urdu vocabulary
    model.add(TimeDistributed(Dense(urdu_vocab_size, activation='softmax')))

    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Get the shapes and vocabulary sizes
english_vocab_size = len(english_tokenizer.word_index) + 1
urdu_vocab_size = len(urdu_tokenizer.word_index) + 1

# Build the model (you can define a fixed output sequence length if needed)
model = build_english_to_urdu_rnn(english_vocab_size, urdu_vocab_size, output_sequence_length)

# Print model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         2092288   
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, None, 128)         32896     
                                                                 
 time_distributed_1 (TimeDi  (None, None, 16338)       2107602   
 stributed)                                                      
                                                                 
Total params: 4232786 (16.15 MB)
Trainable params: 4232786 (16.15 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [54]:
# Check if TensorFlow is using the GPU
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available: 1


In [55]:
# Train the model
# Define EarlyStopping
early_stopping = EarlyStopping(
    monitor='val_loss',  # Metric to monitor
    patience=2,          # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True  # Restore model weights from the epoch with the best value of the monitored quantity
)
# Train the model with EarlyStopping
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=3,
    callbacks=[early_stopping]  # Add the EarlyStopping callback here
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [57]:
 model.save('rnn_model_updated.h5')

In [58]:
# def translate_english_to_urdu(model, sentence, english_tokenizer, urdu_tokenizer, max_length):
#     # Tokenize the input sentence
#     input_seq = english_tokenizer.texts_to_sequences([sentence])
#     input_seq = pad_sequences(input_seq, maxlen=max_length, padding='post')

#     # Get the model's prediction
#     prediction = model.predict(input_seq.reshape((1, max_length, 1)))

#     # Convert the prediction to text
#     output_seq = np.argmax(prediction[0], axis=-1)

#     # Convert token IDs to words
#     output_words = []
#     for token in output_seq:
#         word = urdu_tokenizer.index_word.get(token, '')
#         if word != '' and word != '<PAD>':
#             output_words.append(word)

#     return ' '.join(output_words)

# # Test the translation function with some examples
# test_sentences = [
#     "Hello, how are you?",
#     "What is your name?",
#     "I love learning languages.",
#     "The weather is nice today.",
#     "Why do I get so sad?"
# ]

# for sentence in test_sentences:
#     translated = translate_english_to_urdu(model, sentence, english_tokenizer, urdu_tokenizer, max_length)
#     print(f"English: {sentence}")
#     print(f"Urdu: {translated}")
#     print()

English: Hello, how are you?
Urdu: گوگل کے

English: What is your name?
Urdu: اچھا اچھا کے کے

English: I love learning languages.
Urdu: اچھا کے کے کے

English: The weather is nice today.
Urdu: اچھا

English: Why do I get so sad?
Urdu: اچھا کے کے کے کے کے



In [59]:
# def translate_english_to_urdu(model, sentence, english_tokenizer, urdu_tokenizer, max_length):
#     # Tokenize the input sentence
#     input_seq = english_tokenizer.texts_to_sequences([sentence])
#     input_seq = pad_sequences(input_seq, maxlen=max_length, padding='post')

#     # Get the model's prediction
#     prediction = model.predict(input_seq.reshape((1, max_length, 1)))

#     # Convert the prediction to text
#     output_seq = np.argmax(prediction[0], axis=-1)

#     # Convert token IDs to words
#     output_words = [urdu_tokenizer.index_word.get(token, '<UNK>') for token in output_seq]

#     return ' '.join(output_words).replace('<PAD>', '').strip()


In [60]:
# # Test the translation function with some examples
# test_sentences = [
#     "Hello, how are you?",
#     "What is your name?",
#     "I love learning languages.",
#     "The weather is nice today.",
#     "Why do I get so sad?"
# ]

# for sentence in test_sentences:
#     translated = translate_english_to_urdum(model, sentence, english_tokenizer, urdu_tokenizer, max_length)
#     print(f"English: {sentence}")
#     print(f"Urdu: {translated}")
#     print()

English: Hello, how are you?
Urdu: گوگل کے <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UN

In [65]:
# # Function to convert logits to text
# def logits_to_text(logits, tokenizer):
#     index_to_words = {id: word for word, id in tokenizer.word_index.items()}
#     index_to_words[0] = '<PAD>'

#     return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, axis=1) if index_to_words[prediction] != '<PAD>'])

# # Function to convert token sequences to words
# def token_to_words(sequence, tokenizer):
#     index_to_words = {id: word for word, id in tokenizer.word_index.items()}
#     index_to_words[0] = '<PAD>'

#     return [index_to_words[token] for token in sequence if index_to_words[token] != '<PAD>']

# # Function to display predictions versus gold standard
# def translate(prediction, gold_standard):
#     translation = logits_to_text(prediction[0], urdu_tokenizer)  # Use urdu_tokenizer for your output
#     # The change is in the following line. gold_standard[0] is already a 1D array.
#     # We need to pass this directly to token_to_words
#     standard = ' '.join(token_to_words(gold_standard[0], urdu_tokenizer))

#     print('---- Gold standard ----')
#     print(standard)
#     print()
#     print('---- Prediction ----')
#     for w_t, w_s in zip(translation.split(), standard.split()):
#         if w_t == w_s:
#             print('\033[0;30;0m', '{}'.format(w_t), end=' ')
#         else:
#             print('\033[0;31;47m', w_t, end=' ')
#     print()

# # Print prediction(s)
# print('---- Original ----')
# print(' '.join(token_to_words(X_train[:1][0][:,0],english_tokenizer) ))
# print()
# translate(model.predict(X_train[:1]), y_train[:1])

---- Original ----
food quality is good service is also good parking is bit problematic

---- Gold standard ----
کھانے کی کوالٹی اچھی ہے سروس بھی اچھی ہے پارکنگ تھوڑا مسئلہ ہے۔

---- Prediction ----
[0;31;47m اچھا [0;31;47m اچھا [0;31;47m اچھا [0;31;47m اچھا [0;31;47m کے [0;31;47m کے [0;31;47m کے [0;31;47m کے [0;31;47m کے [0;31;47m کے [0;31;47m کے [0;31;47m کے [0;30;0m ہے۔ 
