# Neural Machine Translation Experiments for Hindi-English using an Encoder-Decoder Architecture along with the Attention mechanism

In [1]:
# Imports
import numpy as np
import pandas as pd
import re
import string
from string import digits
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrea\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, Bidirectional, RepeatVector, Concatenate, Dot, Lambda
from keras.callbacks import ModelCheckpoint
from keras.models import Input, Model
import keras.backend as K
from sklearn.model_selection import train_test_split

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [3]:
# Reading data
# Load the corpus CSV from the current working directory; the trailing
# expression displays (rows, columns) as a quick sanity check.
data = pd.read_csv('hi_en_corpus.csv')
data.shape

(127607, 3)

In [4]:
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [5]:
# Number of rows contributed by each value of the 'source' column.
data['source'].value_counts()

tides        50000
ted          39881
indic2012    37726
Name: source, dtype: int64

In [6]:
# Keep only the rows whose source is 'ted' and drop exact duplicate rows.
# The filter is chained with drop_duplicates() and an explicit .copy() so that
# `data` is an independent frame: the original in-place drop and the later
# column assignments otherwise operate on a slice of the full corpus, which is
# what triggers pandas' SettingWithCopyWarning.
data = data[data['source'] == 'ted'].drop_duplicates().copy()
# Remaining null count per column.
data.isna().sum()

source              0
english_sentence    0
hindi_sentence      0
dtype: int64

In [7]:
# (rows, columns) after keeping the 'ted' rows and dropping duplicates.
data.shape

(38803, 3)

## Data Preprocessing

In [8]:
# Cast both sentence columns to str so that any non-string entry (e.g. a
# float NaN) becomes a plain string before the text-cleaning steps.
for column in ('english_sentence', 'hindi_sentence'):
    data[column] = data[column].astype(str)

In [9]:
# Lowercase all characters (vectorised string accessor; both columns hold
# plain str values at this point).
data['english_sentence'] = data['english_sentence'].str.lower()
data['hindi_sentence'] = data['hindi_sentence'].str.lower()

In [10]:
# Remove quotes: delete every apostrophe from both sentence columns.
for column in ('english_sentence', 'hindi_sentence'):
    data[column] = data[column].map(lambda text: re.sub("'", '', text))

In [11]:
# Remove all the special characters
exclude = set(string.punctuation) # Set of all special characters


def _without_punctuation(text):
    """Return `text` with every character contained in `exclude` removed."""
    return ''.join(filter(lambda ch: ch not in exclude, text))


# NOTE(review): string.punctuation covers ASCII punctuation only, so
# Devanagari marks such as the danda are left in place — confirm that is intended.
data['english_sentence'] = data['english_sentence'].map(_without_punctuation)
data['hindi_sentence'] = data['hindi_sentence'].map(_without_punctuation)

In [12]:
# Remove all numbers from text
# Translation table that deletes the ASCII digits 0-9.
remove_digits = str.maketrans('', '', digits)
data['english_sentence'] = data['english_sentence'].map(
    lambda text: text.translate(remove_digits)
)
# Hindi: strip the ASCII digits first, then the Devanagari digits.
data['hindi_sentence'] = data['hindi_sentence'].map(
    lambda text: re.sub("[०१२३४५६७८९]", "", text.translate(remove_digits))
)

In [13]:
# Remove extra spaces
def _squeeze_spaces(text):
    """Trim surrounding whitespace, then collapse runs of spaces to one space."""
    return re.sub(" +", " ", text.strip())


for column in ('english_sentence', 'hindi_sentence'):
    data[column] = data[column].map(_squeeze_spaces)

## Tokenization and Padding

As part of this process we tokenize both the input and the output sentences. The important detail here is the addition of the `START_` and `_END` tags to the output sequences: they tell the network where a sequence begins and when to stop processing it. We then pad the token sequences to the length of the longest sentence.

In [14]:
# Add start and end tokens to target sequences (kept disabled: tokenize() below adds the START_ / _END tags itself)
# data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [15]:
# Creating the English and Hindi Vocabulary
# Each vocabulary is the set of distinct whitespace-separated tokens found in
# the corresponding sentence column.
all_eng_words = set()
for eng in data['english_sentence']:
    all_eng_words.update(eng.split())

all_hindi_words = set()
for hin in data['hindi_sentence']:
    all_hindi_words.update(hin.split())

In [16]:
# Number of distinct English tokens.
len(all_eng_words)

17345

In [17]:
# Number of distinct Hindi tokens.
len(all_hindi_words)

22283

In [18]:
# Per-sentence token counts, obtained by splitting on single spaces.
data['length_eng_sentence'] = data['english_sentence'].str.split(" ").str.len()
data['length_hin_sentence'] = data['hindi_sentence'].str.split(" ").str.len()

In [19]:
# Preview the cleaned sentences together with the new length columns.
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
0,ted,politicians do not have permission to do what ...,राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करन...,12,13
1,ted,id like to tell you about one such child,मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,9,11
3,ted,what we really mean is that theyre bad at not ...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते,12,11
7,ted,and who are we to say even that they are wrong,और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं,11,13
13,ted,so there is some sort of justice,तो वहाँ न्याय है,7,4


In [20]:
# Longest sentence, in space-separated tokens, for each language.
eng_token_counts = data['english_sentence'].map(lambda sentence: len(sentence.split(' ')))
hindi_token_counts = data['hindi_sentence'].map(lambda sentence: len(sentence.split(' ')))
max_eng_length = max(eng_token_counts)
max_hindi_length = max(hindi_token_counts)
print(max_eng_length)
print(max_hindi_length)

21
30


In [21]:
# Vocabulary-size cap passed to the Keras Tokenizer as `num_words` in tokenize().
MAX_VOCAB = 30000

In [22]:
# Raw sentence arrays: X holds the English sentences, y the Hindi sentences.
X = data.english_sentence.values
y = data.hindi_sentence.values

In [23]:
def tokenize(x,is_hindi = False):
    """Fit a Keras ``Tokenizer`` on ``x`` and convert the sentences to index sequences.

    Parameters
    ----------
    x : sequence of str
        Sentences to tokenize.
    is_hindi : bool, default False
        When True, ``x`` is treated as the target side: one copy of every
        sentence is prefixed with ``'START_ '`` and a second copy is suffixed
        with ``' _END'``, and a single tokenizer is fitted on both copies.

    Returns
    -------
    tuple
        ``(input_sequences, tokenizer)`` when ``is_hindi`` is False;
        ``(target_input_sequences, target_sequences, tokenizer)`` when True.
        Sequences are lists of integer lists; they are not padded here.

    Notes
    -----
    The tokenizers are built with the Keras defaults, which lowercase the text
    and filter out ``'_'``; the markers are therefore stored in the vocabulary
    as ``'start'`` and ``'end'``.
    NOTE(review): a target sentence that itself contains the word "start" or
    "end" would share an index with the marker — confirm this is acceptable.
    """
    if not is_hindi:
        # Source side: a tokenizer fitted on the sentences as they are.
        tokenizer_input = Tokenizer(num_words=MAX_VOCAB)
        tokenizer_input.fit_on_texts(x)
        input_sequences = tokenizer_input.texts_to_sequences(x)
        return input_sequences,tokenizer_input

    # Target side: materialise once so `x` is traversed a single time.
    sentences = list(x)
    # START_-prefixed copy and _END-suffixed copy of every sentence.
    target_input_sequences = ['START_ ' + sentence for sentence in sentences]
    target_sequences = [sentence + ' _END' for sentence in sentences]

    # One tokenizer fitted on both variants so they share a single index space.
    tokenizer_output = Tokenizer(num_words=MAX_VOCAB)
    tokenizer_output.fit_on_texts(target_input_sequences+target_sequences)
    target_input_sequences = tokenizer_output.texts_to_sequences(target_input_sequences)
    target_sequences = tokenizer_output.texts_to_sequences(target_sequences)
    return target_input_sequences,target_sequences,tokenizer_output

In [24]:
def pad(x, length=None):
    """Bring every sequence in ``x`` to one common length.

    Sequences shorter than the target length are padded at the end and longer
    ones are truncated at the end. When ``length`` is None the length of the
    longest sequence in ``x`` is used. Returns the result of ``pad_sequences``.
    """
    target_len = max(len(sentence) for sentence in x) if length is None else length
    return pad_sequences(x, maxlen=target_len, padding='post', truncating='post')

In [25]:
# English side: index sequences plus the fitted tokenizer.
input_sequences, tokenizer_input = tokenize(X)
# Hindi side: START_-prefixed sequences, _END-suffixed sequences, and the
# tokenizer fitted on both.
target_input_sequences,target_sequences, tokenizer_output = tokenize(y,is_hindi=True)

In [26]:
# Both target variants should contain one sequence per sentence.
len(target_input_sequences),len(target_sequences)

(38803, 38803)

In [27]:
# Pad each set of sequences to the length of its own longest sequence.
input_sequences = pad(input_sequences)
target_input_sequences = pad(target_input_sequences)
target_sequences = pad(target_sequences)

# word -> integer index lookups from the fitted tokenizers
words_input = tokenizer_input.word_index
words_output = tokenizer_output.word_index

# Vocabulary sizes: one more than the number of indexed words.
input_num_words = len(words_input) + 1
target_num_words = len(words_output) + 1

# Padded sequence lengths, read off the first row of each array.
max_input_len = len(input_sequences[0])
max_target_len = len(target_sequences[0])

In [28]:
# Report the vocabulary sizes (number of indexed words + 1) for both languages.
print("English vocab size: ", input_num_words)
print("Hindi vocab size: ", target_num_words)

English vocab size:  17346
Hindi vocab size:  22286


## Model Architecture

Now that the data is prepared, we can proceed by creating the model architecture

We will create the input and the target word embedding matrix.

We will be working with a bidirectional LSTM (BiLSTM), because we need the context of the output to be maintained along with the sequence structure. This is the core advantage of a BiLSTM: it not only preserves information from the past (like a unidirectional LSTM) but also takes future context into account.

The output structure will be closely related to the context of the input, which is essential when translating sentences from one language to another.

In [29]:
# Width of each word-embedding vector (second dimension of the embedding matrices).
DIMENSIONS = 100
# Number of units in each LSTM of the encoder.
LSTM_UNITS = 256

In [30]:
# Converting word to word vectors
# Location of the pretrained GloVe file, named in one place so it can be
# repointed without touching the parsing loop.
GLOVE_PATH = 'C:/Users/shrea/Desktop/Jupyter Notebooks/IITB Internship/Russian Translation/Experiments/Pretrained/glove.6B.100d.txt'

word2Vec = {}
with open(GLOVE_PATH, encoding="utf8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..." separated by single spaces.
        parts = line.rstrip().split(' ')
        if len(parts) < 2:
            # Blank or malformed line: there is no vector to store.
            continue
        # Parse the components into a float vector here, once, instead of
        # keeping raw strings (the last of which would still carry the line's
        # newline) and relying on implicit casting when rows are assigned.
        word2Vec[parts[0]] = np.asarray(parts[1:], dtype='float64')

In [31]:
# Row k holds the pretrained vector of the English word with tokenizer index k;
# words that have no entry in word2Vec keep an all-zero row.
input_embedding_matrix = np.zeros((input_num_words, DIMENSIONS))
for word, k in words_input.items():
    if k >= input_num_words:
        continue
    embedding_vector = word2Vec.get(word)
    if embedding_vector is None:
        continue
    input_embedding_matrix[k] = embedding_vector

In [32]:
# Row k holds the pretrained vector of the Hindi-side word with tokenizer
# index k, so the loop must walk the target vocabulary (words_output).
# Iterating the English vocabulary here would fill the rows with vectors of
# unrelated English words that merely share an index number.
# NOTE(review): word2Vec is loaded from glove.6B, which looks like an English
# embedding file, so presumably most target words will not be found and their
# rows stay zero — confirm whether a Hindi embedding source is intended.
target_embedding_matrix = np.zeros((target_num_words, DIMENSIONS))
for word, k in words_output.items():
    if k < target_num_words:
        embedding_vector = word2Vec.get(word)
        if embedding_vector is not None:
            target_embedding_matrix[k] = embedding_vector

In [34]:
# Creating the input layer embeddings
# The Input layer accepts sequences of max_input_len word indices. The
# Embedding layer maps each index to a DIMENSIONS-wide vector; it is
# initialised from input_embedding_matrix and left trainable.
embedding_input = Input(shape=(max_input_len,))
embedding_input_layer = Embedding(input_num_words, DIMENSIONS, weights=[input_embedding_matrix], trainable=True)
# x: the embedded input, fed to the encoder layers.
x = embedding_input_layer(embedding_input)

Instructions for updating:
Colocations handled automatically by placer.


In Neural Machine Translation, it has been found that deeper architectures tend to perform much better than a single-layer network. In [this paper](https://www.aclweb.org/anthology/W17-4710.pdf) the authors explain why deeper architectures are more useful for NMT and suggest that a Deep Bi-RNN variant with depth 8 performs best. To keep the computation and the explanation simpler, we restrict ourselves to 2 stacked Bi-LSTM layers to improve performance over a single-layer network.

In [35]:
# First bidirectional LSTM layer; return_sequences=True keeps the output for
# every time step so that a second recurrent layer can be stacked on top.
input_lstm1 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))
input_lstm1_output = input_lstm1(x)

# Second bidirectional LSTM layer; its per-time-step output is the encoder output.
input_lstm2 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))
encoder_output = input_lstm2(input_lstm1_output)

In [36]:
def softmax_over_time(x):
    """Softmax normalised along axis 1 of ``x`` (the time axis, going by the name).

    ``x`` must have more than two dimensions. The maximum along axis 1 is
    subtracted before exponentiating, which keeps the exponentials from
    overflowing without changing the result.
    """
    assert(K.ndim(x) > 2)
    shifted = x - K.max(x, axis=1, keepdims=True)
    exponentials = K.exp(shifted)
    return exponentials / K.sum(exponentials, axis=1, keepdims=True)

# Attention

Attention is needed so that long sequences of text can be processed with maximum precision. A repeat vector is used to repeat the same input vector at every step, each time alongside a different hidden state. We use the tanh and softmax activation functions for this.

In [None]:
# Layer objects for the attention block; they are only created here, and how
# they are wired together is not visible in this cell.
# Repeats its input vector max_input_len times (once per encoder time step).
atten_layer_repeat = RepeatVector(max_input_len)
# Joins its inputs along the last (feature) axis.
atten_concatenate = Concatenate(axis=-1)
# Small tanh layer followed by a single-unit layer whose activation is the
# softmax taken over axis 1 (softmax_over_time).
atten_dense1 = Dense(30, activation='tanh')
atten_dense2 = Dense(1, activation=softmax_over_time)
# Dot product over axis 1 of its two inputs.
# NOTE(review): presumably attention weights x encoder outputs — confirm at the call site.
attn_dot = Dot(axes=1)