In [2]:
# pip install fasteda
# pip install cufflinks
# pip install plotly_express
# pip install scikit-learn
# pip install transformers -U -q
# pip install sentencepiece

In [3]:
import string
import re
import transformers
import numpy as np
import pandas as pd
import cufflinks as cf
import plotly_express
from fasteda import fast_eda
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Embedding, Input
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)

In [4]:
df = pd.read_csv("Hindi_English_Truncated_Corpus.csv",encoding='utf-8')

In [5]:
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what needs to be done.,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है ."
1,ted,"I'd like to tell you about one such child,","मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,"
2,indic2012,This percentage is even greater than the percentage in India.,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not paying attention.,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called Upanishad.,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [6]:
df.tail()

Unnamed: 0,source,english_sentence,hindi_sentence
127602,indic2012,Examples of art deco construction can be found around Marine Drive and Oval Ground areas.,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल मैदान के किनारे दिखाई देते हैं।
127603,ted,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।
127604,tides,"As for the other derivatives of sulphur , the country 's needs of iron , copper , sodium , etc . sulphates were limited , and the production achieved after the war was generally adequate .","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , देश में लोहे , तांबे , सोडियम , सल्फेट आदि की आवश्यकता सीमित थी और युद्धोपरांत हुआ उत्पादन सामान्य रूप से पर्याप्त था ."
127605,tides,its complicated functioning is defined thus in a popular riddle :,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .
127606,ted,"They've just won four government contracts to build off their 100 ambulances,","हाल ही में उन्हें सरकारी ठेका मिला है करीब सौ नई अम्बुलेन्स बनाने का,"


Missing Values

In [7]:
df.isna().sum()

source              0
english_sentence    2
hindi_sentence      0
dtype: int64

Dropping Missing Records

In [8]:
df = df.dropna()

In [9]:
df.isna().sum()

source              0
english_sentence    0
hindi_sentence      0
dtype: int64

Description of Dataset

In [10]:
df.describe()

Unnamed: 0,source,english_sentence,hindi_sentence
count,127605,127605,127605
unique,3,124317,97662
top,tides,(Laughter),(हँसी)
freq,50000,555,212


Tokens and Char Count

In [11]:
cf.set_config_file(offline=True)
df["eng_char_count"] = df["english_sentence"].str.len()
df["hindi_char_count"] = df["hindi_sentence"].str.len()
df["hindi_tok_count"] = df["hindi_sentence"].str.split(" ").str.len()
df["eng_tok_count"] = df["english_sentence"].str.split(" ").str.len()

Most Common Words

In [12]:
from collections import Counter

tot_eng_sen = ""
for eng_sent in df["english_sentence"].tolist():
    tot_eng_sen += eng_sent + " "
    
Counter(tot_eng_sen.split(" ")).most_common(10)

[('the', 115944),
 ('of', 74298),
 ('and', 55015),
 (',', 52397),
 ('.', 50695),
 ('to', 47045),
 ('in', 41916),
 ('a', 34619),
 ('is', 29280),
 ('that', 17422)]

In [13]:
tot_hindi_sen = ""
for hin_sent in df["hindi_sentence"].tolist():
    tot_hindi_sen += hin_sent + " "
    
Counter(tot_hindi_sen.split(" ")).most_common(10)

[('के', 87750),
 ('में', 63389),
 ('है', 49672),
 ('की', 49069),
 ('.', 47650),
 ('और', 47371),
 ('से', 38061),
 (',', 33932),
 ('का', 33146),
 ('को', 31447)]

In [15]:
df

Unnamed: 0,source,english_sentence,hindi_sentence,eng_char_count,hindi_char_count,hindi_tok_count,eng_tok_count
0,ted,politicians do not have permission to do what needs to be done.,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .",63,68,14,12
1,ted,"I'd like to tell you about one such child,","मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,",42,50,11,9
2,indic2012,This percentage is even greater than the percentage in India.,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।,61,48,9,10
3,ted,what we really mean is that they're bad at not paying attention.,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते,64,46,11,12
4,indic2012,.The ending portion of these Vedas is called Upanishad.,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।,55,43,8,9
...,...,...,...,...,...,...,...
127602,indic2012,Examples of art deco construction can be found around Marine Drive and Oval Ground areas.,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल मैदान के किनारे दिखाई देते हैं।,89,77,15,15
127603,ted,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।,25,31,7,6
127604,tides,"As for the other derivatives of sulphur , the country 's needs of iron , copper , sodium , etc . sulphates were limited , and the production achieved after the war was generally adequate .","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , देश में लोहे , तांबे , सोडियम , सल्फेट आदि की आवश्यकता सीमित थी और युद्धोपरांत हुआ उत्पादन सामान्य रूप से पर्याप्त था .",188,164,34,36
127605,tides,its complicated functioning is defined thus in a popular riddle :,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .,65,49,10,11


Preprocessing

In [16]:
df=df[df['source']=='ted']

In [17]:

df.english_sentence = df.english_sentence.apply(lambda x: x.lower())
df.hindi_sentence = df.hindi_sentence.apply(lambda x: x.lower())

Removing single quotes and replacing comma with spaces

In [18]:
df.english_sentence = df.english_sentence.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' ', x))
df.hindi_sentence = df.hindi_sentence.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' ', x))

In [19]:

df['english_sentence']=df['english_sentence'].apply(lambda x: re.sub("'", '', x))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

In [20]:
exclude = set(string.punctuation)
df['english_sentence']=df['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

Removing all numbers and extra spaces from texts

In [21]:
from string import digits
remove_digits = str.maketrans('', '', digits)
df['english_sentence']=df['english_sentence'].apply(lambda x: x.translate(remove_digits))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: x.translate(remove_digits))
df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

df['english_sentence']=df['english_sentence'].apply(lambda x: x.strip())
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: x.strip())
df['english_sentence']=df['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))

In [22]:
df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x :'START_ ' + x + ' _END')

In [23]:
df.sample(10)

Unnamed: 0,source,english_sentence,hindi_sentence,eng_char_count,hindi_char_count,hindi_tok_count,eng_tok_count
74830,ted,youve already learned first of all,START_ आप पहले से ही सब सीख चुके हैं _END,37,30,8,6
16492,ted,things are changing,START_ चीजें बदल रही हैं। _END,20,18,4,3
43211,ted,trying to understand something of the pain of others,START_ दूसरों का दर्द समझने की कोशिश करना _END,53,35,7,9
122737,ted,for our vacant storefronts so our communities,START_ जिससे आज हमारा समाज हमारी जरूरतों और सपनों को _END,46,45,9,7
79785,ted,he was the president of a corporation in ohio,START_ वे ओहायो की किसी कंपनी के अध्यक्ष थे _END,46,37,8,9
92434,ted,so does that,START_ और इसका भी _END,13,10,3,3
63399,ted,applause,START_ तालियाँ _END,10,9,1,1
53012,ted,and we create everything we can see in any direction right,START_ हर दिशा में दिखने वाली हर चीज़ बनाई जा सकती है _END,60,47,11,11
56162,ted,i see now i never was one and not the other,START_ मेंने अब जाना कि ऎसा कभी नहीं था कि मैं एक थी और दूसरी नहीं _END,45,61,15,11
28544,ted,is understanding relationships,START_ रिश्तों को समझना भर है। _END,31,23,5,3


In [24]:
all_eng_words=set()
for eng in df['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_hindi_words=set()
for hin in df['hindi_sentence']:
    for word in hin.split():
        if word not in all_hindi_words:
            all_hindi_words.add(word)

In [25]:

df['length_eng_sentence']=df['english_sentence'].apply(lambda x:len(x.split(" ")))
df['length_hin_sentence']=df['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [26]:
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence,eng_char_count,hindi_char_count,hindi_tok_count,eng_tok_count,length_eng_sentence,length_hin_sentence
0,ted,politicians do not have permission to do what needs to be done,START_ राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करने कि अनुमति नहीं है _END,63,68,14,12,12,15
1,ted,id like to tell you about one such child,START_ मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी _END,42,50,11,9,9,13
3,ted,what we really mean is that theyre bad at not paying attention,START_ हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते _END,64,46,11,12,12,13
7,ted,and who are we to say even that they are wrong,START_ और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं _END,48,48,13,11,11,15
13,ted,so there is some sort of justice,START_ तो वहाँ न्याय है _END,32,16,4,7,7,6


In [27]:
print('length of english words: ', len(all_eng_words))
print('length of hindi words: ', len(all_hindi_words))

length of english words:  17292
length of hindi words:  22210


In [28]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hindi_words)
num_encoder_tokens, num_decoder_tokens

(17292, 22210)

In [29]:
num_decoder_tokens += 1 #for zero padding

In [30]:

input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [31]:

reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [33]:

df = shuffle(df)
df.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence,eng_char_count,hindi_char_count,hindi_tok_count,eng_tok_count,length_eng_sentence,length_hin_sentence
1304,ted,most dynamic and young photographers,START_ सबसे हुनरशुदा और युवा फोटोग्राफ़रों ने लिये हैं _END,37,48,8,5,5,10
15752,ted,no the boy said to me,START_ नहीं उस लड़के ने मुझसे कहा _END,24,29,7,7,6,8
109093,ted,you make two halfcuts,START_ और दो जगह आधा काट दीजिये। _END,23,25,6,4,4,8
56835,ted,is that were writing things,START_ कि हम लिख रहे हैं _END,29,18,5,5,5,7
11906,ted,especially if you have two options you have the brown one or the clear one,START_ खासकर जब वो आपसे पूछता हो कि भूर वाल दूँ या सफ़ेद । _END,76,52,13,15,15,15
67840,ted,churches temples other things,START_ गिरजे मंदिर इत्यादि _END,33,22,4,5,4,5
38502,ted,but it does tend to leave you waking up crying at three oclock in the morning,START_ मगर हो सकता है कि आप सुबह के तीन बजे जागे हुए पाये जायें। _END,79,57,14,16,16,16
6230,ted,and also this,START_ और यह भी _END,13,8,3,3,3,5
45429,ted,i couldnt believe my eyes when i first saw that,START_ जब मैंने पहली बार इसे देखा तो मुझे अपनी आँखों पर विश्वास नहीं हुआ _END,49,66,14,10,10,16
116527,ted,laughter,START_ हँसी _END,10,6,1,1,1,3


In [34]:
# getting maximum sentence length of english sentences
length_list = []
for l in df.english_sentence:
    length_list.append(len(l.split(' ')))

max_input_length = np.max(length_list)
print('max_input_length: ', max_input_length)

max_input_length:  21


In [35]:
 #getting maximum sentence length of Hindi sentences
length_list = []
for l in df.hindi_sentence:
    length_list.append(len(l.split(' ')))

max_output_length = np.max(length_list)
print('max_output_length: ', max_output_length)

max_output_length:  32


In [36]:
df_filtered = df[(df['length_eng_sentence'] <= 20) & (df['length_hin_sentence'] <= 20)]

df_=df[df['length_eng_sentence']<=20]
df_=df[df['length_hin_sentence']<=20]


In [37]:

print("maximum length of Hindi Sentence ",max(df_['length_hin_sentence']))
print("maximum length of English Sentence ", max(df_['length_eng_sentence']))


maximum length of Hindi Sentence  20
maximum length of English Sentence  20


In [38]:
x, y = df_filtered['english_sentence'], df_filtered['hindi_sentence']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3,random_state=42)
x_train.shape, x_test.shape


((27687,), (11867,))

In [39]:
x_train

9628                                     to talk about an idea worth spreading
20845                                    is this something other people can do
50705                                     and this is how we develop our ideas
31618                         that this separation is within acceptable levels
117212    what if my roommate knew about my wonderfully ambitious hair braider
                                          ...                                 
91375                                          and i needed to be back helping
37516                                          no machines have been developed
105192                                     they said “we look at it every day”
81887                                                           we smile often
75950                                   i had a friend a history major like me
Name: english_sentence, Length: 27687, dtype: object

In [40]:
y_train

9628                                                           START_ एक बाँटने योग्य विचार के बारे में बात करने के लिए _END
20845                                                                   START_ क्या ये कुछ ऐसा है जो और लोग कर सकते हैं _END
50705                                                                         START_ और इस तरह हम अपनी योजनाएं बनाते है _END
31618                                                                            START_ की यह अंतर स्वीकार्य स्तर पर है _END
117212    START_ क्या होता यदि मेरी रूममेट को मेरी चोटी बनानेवाली उस ज़बर्दस्त महत्वाकांक्षी लड़की के बारे में पता होता _END
                                                                 ...                                                        
91375                                                                           START_ और मुझे वापस जाकर मदद करनी चाहिए _END
37516                                                                 START_ कोई मशीन नहीं बनाई गयी है इस काम के लिये । _END


In [41]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import tensorflow as tf

In [47]:
# Training
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")


In [49]:
# Inference with the fine-tuned model
def translate_any_sentence(in_text):
    model_inputs = tokenizer(in_text, return_tensors="pt")

    generated_tokens = model.generate(
        **model_inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"]
    )
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return translation[0] if translation else "Translation not available"

english_sentence = "What are you doing?"
translated_hindi_sentence = translate_any_sentence(english_sentence)

print(f"English : {english_sentence}\nTranslated Hindi : {translated_hindi_sentence}")

English : What are you doing?
Translated Hindi : तुम क्या कर रहे हो?
