# Implementing Many-to-Many RNN for English-to-Urdu Language Translation and Exploring Its Limitations

## Importing Libraries

In [24]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
#from keras.preprocessing.text import Tokenizer
import string
from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt
import urduhack
from urduhack.preprocessing import normalize_whitespace
from urduhack.preprocessing import remove_accents
import demoji

ModuleNotFoundError: No module named 'keras.preprocessing.text'

## Loading Datasets

In [4]:
# Read the parallel corpus and keep only the first two columns; the other
# spreadsheet columns are meaningless but pandas still loads them.
df = pd.read_excel("parallel-corpus.xlsx").iloc[:, :2]

df.head()


Unnamed: 0,SENTENCES,MEANING
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,How can I make friends?’,میں دوست کیسے بنائوں ؟
2,Why do I get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"If you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"Depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


## Preprocessing Datasets

### Lowercasing English

In [5]:
# Lowercase the English column. Note that the column name itself ends with a
# trailing space ('SENTENCES '), exactly as it is used throughout this notebook.
df['SENTENCES '] = df['SENTENCES '].str.lower()
# Show the first rows to check the result.
df.head()

Unnamed: 0,SENTENCES,MEANING
0,how can i communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,how can i make friends?’,میں دوست کیسے بنائوں ؟
2,why do i get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"if you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


### Removing Null Values

In [6]:
# (rows, columns) of the corpus at this point in the notebook.
print(df.shape)

(30164, 2)


In [7]:
# Columns that must both be present for a row to be a usable sentence pair.
parallel_cols = ['SENTENCES ', 'MEANING']

# Null counts per column (English first, then Urdu) before dropping.
for col in parallel_cols:
    print("Before null values", df[col].isnull().sum())

# Discard every row that is missing either side of the pair.
df = df.dropna(subset=parallel_cols)

# Verify the size after removing null values
print(df.shape)

# Null counts per column after dropping.
for col in parallel_cols:
    print("After null values", df[col].isnull().sum())


Before null values 47
Before null values 546
(29614, 2)
After null values 0
After null values 0


### Removing URLs

In [8]:
def remove_url(text):
    """Return ``text`` with URL-like substrings deleted.

    A substring is removed when it starts with ``http://``, ``https://`` or
    ``www.`` and extends up to (not including) the next whitespace character.
    Surrounding whitespace is left as-is.
    """
    url_regex = r'https?://\S+|www\.\S+'
    return re.sub(url_regex, r'', text)

In [9]:
# Strip URLs from the English column first, then from the Urdu column.
for col in ('SENTENCES ', 'MEANING'):
    df[col] = df[col].apply(remove_url)

### Removing Accents (Urdu)

In [10]:
# Run urduhack's remove_accents over every Urdu sentence.
# NOTE(review): presumably this strips diacritic marks — confirm against the urduhack docs.
df['MEANING'] = df['MEANING'].apply(remove_accents)

### Removing HTML Tags

In [11]:
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed individually and the text
    between tags is kept. ``.`` does not match a newline here, so a ``<`` and
    ``>`` on different lines are not treated as one tag.
    """
    tag_regex = '<.*?>'
    return re.sub(tag_regex, r'', text)

In [12]:
# Strip HTML tags from the English column first, then from the Urdu column.
for col in ('SENTENCES ', 'MEANING'):
    df[col] = df[col].apply(remove_html_tags)

### Spelling Correction English

In [13]:
def correct_spell(text):
    """Return the string form of ``TextBlob(text).correct()``."""
    blob = TextBlob(text)
    corrected = blob.correct()
    return str(corrected)

In [14]:
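# Spelling correction is currently disabled: the lines below are commented
# out, so correct_spell is defined but not applied to the corpus.
# df['SENTENCES '] = df['SENTENCES '].apply(correct_spell)

# df.head()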

### Normalizing Whitespace

In [15]:
# Apply urduhack's normalize_whitespace to the English column, then the Urdu column.
for col in ('SENTENCES ', 'MEANING'):
    df[col] = df[col].apply(normalize_whitespace)

### Handling Short Conversations English

In [16]:
# Chat-abbreviation lookup used to expand short "chat words" in the English text.
# Keys are upper-case abbreviations; values are the text substituted for them.
# Each key appears exactly once, and each value contains only the expansion
# (no repeated abbreviation and no editorial remarks), because the value is
# inserted verbatim into the sentence.
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek You",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

In [17]:
# Chat-word keys that are also ordinary English words. The lookup below is
# case-insensitive and the English column has already been lowercased, so
# without this guard a plain "time" would become "Tears in my eyes" and a
# plain "kiss" would become "Keep It Simple, Stupid".
_AMBIGUOUS_CHAT_KEYS = frozenset({"TIME", "KISS", "STATS", "GAL", "ATM"})


def chat_conversion(text, mapping=None, ambiguous=_AMBIGUOUS_CHAT_KEYS):
    """Expand chat abbreviations in ``text``.

    The text is split on whitespace; each token whose upper-cased form is a
    key of ``mapping`` is replaced by the mapped expansion, and all tokens are
    re-joined with single spaces. Tokens with attached punctuation (``"lol,"``)
    are not matched, except where the key itself contains it (``"QPSA?"``).

    Parameters
    ----------
    text : str
        Sentence to process.
    mapping : dict, optional
        Upper-case abbreviation -> expansion. Defaults to the notebook-level
        ``chat_words`` dictionary (looked up at call time).
    ambiguous : collection of str, optional
        Upper-case keys that must never be expanded because they collide with
        everyday words. Pass an empty collection to expand every key.

    Returns
    -------
    str
        The text with abbreviations expanded and whitespace collapsed to
        single spaces.
    """
    if mapping is None:
        mapping = chat_words

    converted = []
    for token in text.split():
        key = token.upper()
        if key in mapping and key not in ambiguous:
            converted.append(mapping[key])
        else:
            # Unknown token, or a real word that merely looks like an abbreviation.
            converted.append(token)
    return " ".join(converted)

In [18]:
# Expand chat abbreviations in every English sentence via chat_conversion.
df['SENTENCES '] = df['SENTENCES '].apply(chat_conversion)

# Show the first rows to check the result.
df.head()

Unnamed: 0,SENTENCES,MEANING
0,how can i communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,how can i make friends?’,میں دوست کیسے بنائوں ؟
2,why do i get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"if you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں گ...


### Handling Short Conversations Urdu

In [19]:
def filter_short_posts(text):
    """Return True when ``text`` has at least three whitespace-separated tokens.

    Any whitespace-delimited token counts, including stand-alone punctuation.
    """
    return len(text.split()) > 2

In [20]:
# Boolean Series: True for rows whose Urdu text passes filter_short_posts.
mask = df['MEANING'].apply(filter_short_posts)
# Keep only those rows and renumber the index from 0 (old index discarded).
df = df[mask].reset_index(drop=True)

# Show the first rows to check the result.
df.head()

Unnamed: 0,SENTENCES,MEANING
0,how can i communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,how can i make friends?’,میں دوست کیسے بنائوں ؟
2,why do i get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"if you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں گ...


### Tokenization Urdu

['میں', 'اپنے', 'والدین', 'سے', 'کیسے', 'بات', 'کروں', '؟']
['میں', 'دوست', 'کیسے', 'بنائوں', '؟']
['میں', 'اتنا', 'اداس', 'کیوں', 'ہوں', '؟', '.']
['اگر', 'آپ', 'نے', 'اپنے', 'آپ', 'سے', 'ایسے', 'سوالات', 'کیے', 'ہیں', '،', 'تو', 'آپ', 'اکیلے', 'نہیں', 'ہیں']
['اس', 'بات', 'پر', 'منحصر', 'ہے', 'کہ', 'آپ', 'رہنمائی', 'کے', 'لیے', 'کہاں', 'گئے', 'ہیں', '،', 'ہو', 'سکتا', 'ہے', 'آپ', 'کو', 'متضاد', 'جوابات', 'دیے', 'گئے', 'ہوں', '۔']
['نوجوانوں', 'کو', 'ٹھوس', 'مشورے', 'حاصل', 'کرنے', 'میں', 'مدد', 'کرنے', 'کے', 'لیے', 'جس', 'پر', 'وہ', 'بھروسہ', 'کر', 'سکتے', 'ہیں', '،', 'جاگو', '!', 'میگزین', 'نے', 'بائبل', 'پر', 'مبنی', 'رسالہ', 'شروع', 'کیا', '۔']
['8', 'جنوری', '1982', '۔', 'دہائیوں', 'کے', 'بعد', '،', 'سیریز', 'اب', 'بھی', 'ایک', 'پرجوش', 'ردعمل', 'کھینچتی', 'ہے', '۔']
['درحقیقت', '،', 'اس', 'بات', 'کا', 'تعین', 'کرنے', 'کے', 'لیے', 'کہ', 'نوجوان', 'کیسے', 'سوچتے', 'اور', 'محسوس', 'کرتے', 'ہیں', '،', 'جاگو', '!']
['جو', 'کتاب', 'آپ', 'کے', 'پاس', 'ہے', 'وہ', 'اصل', 'میں', '1989', '

### Tokenization English