## Neural Machine Translation Experiments for Hindi-English using an Encoder-Decoder Architecture along with the Attention mechanism

In [1]:
# Imports
import numpy as np
import pandas as pd
import re
import string
from string import digits
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrea\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, Bidirectional, RepeatVector, Concatenate, Dot, Lambda
from keras.callbacks import ModelCheckpoint
from keras.models import Input, Model
import keras.backend as K
from sklearn.model_selection import train_test_split

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [3]:
# Reading data
data = pd.read_csv('hi_en_corpus.csv')
data.shape

(127607, 3)

In [4]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [5]:
data['source'].value_counts()

tides        50000
ted          39881
indic2012    37726
Name: source, dtype: int64

In [6]:
data = data[data['source']=='ted']
data.drop_duplicates(inplace=True)
data.isna().sum()

source              0
english_sentence    0
hindi_sentence      0
dtype: int64

In [7]:
data.shape

(38803, 3)

### Data Preprocessing

In [8]:
# Converting null data from float to string
data['english_sentence'] = data['english_sentence'].astype(str)
data['hindi_sentence'] = data['hindi_sentence'].astype(str)

In [9]:
# Lowercase all characters
data['english_sentence'] = data['english_sentence'].apply(lambda x: x.lower())
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: x.lower())

In [10]:
# Remove quotes
data['english_sentence'] = data['english_sentence'].apply(lambda x: re.sub("'", '', x))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

In [11]:
# Remove all the special characters
exclude = set(string.punctuation) # Set of all special characters
data['english_sentence'] = data['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [12]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
data['english_sentence'] = data['english_sentence'].apply(lambda x: x.translate(remove_digits))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: x.translate(remove_digits))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: re.sub("[०१२३४५६७८९]", "", x))

In [16]:
# Remove extra spaces
data['english_sentence'] = data['english_sentence'].apply(lambda x: x.strip())
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: x.strip())
data['english_sentence'] = data['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))

### Tokenization and Padding

In [17]:
# Add start and end tokens to target sequences
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [18]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,START_ राजनीतिज्ञों के पास जो कार्य करना चाहिए...
1,ted,id like to tell you about one such child,START_ मई आपको ऐसे ही एक बच्चे के बारे में बता...
3,ted,what we really mean is that theyre bad at not ...,START_ हम ये नहीं कहना चाहते कि वो ध्यान नहीं ...
7,ted,and who are we to say even that they are wrong,START_ और हम होते कौन हैं यह कहने भी वाले कि व...
13,ted,so there is some sort of justice,START_ तो वहाँ न्याय है _END


In [19]:
# Creating the English and Hindi Vocabulary
all_eng_words=set()
for eng in data['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_hindi_words=set()
for hin in data['hindi_sentence']:
    for word in hin.split():
        if word not in all_hindi_words:
            all_hindi_words.add(word)

In [20]:
len(all_eng_words)

17345

In [21]:
len(all_hindi_words)

22285

In [22]:
data['length_eng_sentence'] = data['english_sentence'].apply(lambda x:len(x.split(" ")))
data['length_hin_sentence'] = data['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [24]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
0,ted,politicians do not have permission to do what ...,START_ राजनीतिज्ञों के पास जो कार्य करना चाहिए...,12,15
1,ted,id like to tell you about one such child,START_ मई आपको ऐसे ही एक बच्चे के बारे में बता...,9,13
3,ted,what we really mean is that theyre bad at not ...,START_ हम ये नहीं कहना चाहते कि वो ध्यान नहीं ...,12,13
7,ted,and who are we to say even that they are wrong,START_ और हम होते कौन हैं यह कहने भी वाले कि व...,11,15
13,ted,so there is some sort of justice,START_ तो वहाँ न्याय है _END,7,6


In [13]:
max_eng_length = max(data['english_sentence'].apply(lambda x: len(x.split(' '))))
max_hindi_length = max(data['hindi_sentence'].apply(lambda x: len(x.split(' '))))
print(max_eng_length)
print(max_hindi_length)

21
31


In [15]:
MAX_VOCAB = 30000

In [14]:
X = data.english_sentence.values
y = data.hindi_sentence.values