# Neural Machine Translation Experiments for Hindi-English using an Encoder-Decoder Architecture along with the Attention mechanism

In [1]:
# Imports
import numpy as np
import pandas as pd
import re
import string
from string import digits
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrea\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, Bidirectional, RepeatVector, Concatenate, Dot, Lambda
from keras.callbacks import ModelCheckpoint
from keras.models import Input, Model
import keras.backend as K
from sklearn.model_selection import train_test_split

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [3]:
# Reading data
data = pd.read_csv('hi_en_corpus.csv')
data.shape

(127607, 3)

In [4]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [5]:
data['source'].value_counts()

tides        50000
ted          39881
indic2012    37726
Name: source, dtype: int64

In [6]:
data.shape

(127607, 3)

## Data Preprocessing

In [7]:
# Converting null data from float to string
data['english_sentence'] = data['english_sentence'].astype(str)
data['hindi_sentence'] = data['hindi_sentence'].astype(str)

In [8]:
max_eng_length = max(data['english_sentence'].apply(lambda x: len(x.split(' '))))
max_hindi_length = max(data['hindi_sentence'].apply(lambda x: len(x.split(' '))))
print(max_eng_length)
print(max_hindi_length)

398
418


In [9]:
count = 0
delete_rows = []
for i in range(0, data.shape[0]):
    len_eng = len(data['english_sentence'][i].split())
    len_hin = len(data['hindi_sentence'][i].split())
    if len_eng > 25 or len_hin > 25:
        delete_rows.append(i)
        count = count+1
print(count)

29786


In [10]:
data = data.drop(delete_rows)
data.drop_duplicates(inplace=True)
data.drop(['source'],axis=1,inplace=True)
data.isna().sum()

english_sentence    0
hindi_sentence      0
dtype: int64

In [11]:
data.shape

(95086, 2)

In [12]:
def clean_english_data(sentence):
    exclude = set(string.punctuation)
    remove_digits = str.maketrans('', '', string.digits)
    sentence = sentence.lower()
    sentence = ''.join(ch for ch in sentence if ch not in exclude)
    sentence = sentence.translate(remove_digits)
    sentence = sentence.strip()
    sentence = re.sub(" +", " ", sentence)
    return sentence

In [13]:
def clean_hindi_data(sentence):
    exclude = set(string.punctuation)
    remove_digits = str.maketrans('', '', string.digits)
    sentence = sentence.lower()
    sentence = ''.join(ch for ch in sentence if ch not in exclude)

    sent_temp = ''
    for c in sentence:
        if c == ' ':
            sent_temp += c
        elif ord(u'\u0900') <= ord(c) <= ord(u'\u097F'):
            sent_temp += c
    sentence = sent_temp

    sentence = re.sub('[a-z]', '', sentence)
    sentence = re.sub('[०१२३४५६७८९]', '', sentence)
    sentence = sentence.translate(remove_digits)
    sentence = sentence.strip()
    sentence = re.sub(" +", " ", sentence)
    return sentence

In [14]:
X = [clean_english_data(x) for x in data['english_sentence'].values]
Y = [clean_hindi_data(y) for y in data['hindi_sentence'].values]

## Tokenization and Padding

As a part of this process we'll tokenize both the input and output. The important thing over here is the addition of the <START>(START) and <END>(END) tags on the output sequence. This is for our neural network to understand the sequence and when to break the processing in our architecture. Then we will padd the tokens to the largest length of the sentence.

In [15]:
MAX_VOCAB = 30000

In [16]:
def tokenize(x,is_hindi = False):
    # Add START and END tag to the sentence
    tokenizer_input = Tokenizer(num_words=MAX_VOCAB)
    if is_hindi:
        target_input_sequences = []
        target_sequences = []
        for sentence in x:
            target_input_sequences.append('START_ '+sentence)
            target_sequences.append(sentence+' _END')
        tokenizer_output = Tokenizer(num_words=30000)
        tokenizer_output.fit_on_texts(target_input_sequences+target_sequences)
        target_input_sequences = tokenizer_output.texts_to_sequences(target_input_sequences)
        target_sequences = tokenizer_output.texts_to_sequences(target_sequences)
        return target_input_sequences,target_sequences,tokenizer_output
    else:
        tokenizer_input = Tokenizer(num_words=30000) 
        tokenizer_input.fit_on_texts(x)
        input_sequences = tokenizer_input.texts_to_sequences(x)
        return input_sequences,tokenizer_input 

In [17]:
input_sequences, tokenizer_input = tokenize(X)
target_input_sequences,target_sequences, tokenizer_output = tokenize(Y,is_hindi=True)
print(len(target_input_sequences),len(target_sequences))

95086 95086


In [18]:
def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    padded_x = pad_sequences(x, maxlen = length, padding = 'post', truncating = 'post')
    return padded_x

In [19]:
input_sequences = pad(input_sequences)
target_input_sequences = pad(target_input_sequences)
target_sequences = pad(target_sequences)

input_num_words = len(tokenizer_input.word_index) + 1
target_num_words = len(tokenizer_output.word_index) + 1  

max_input_len = len(input_sequences[0])
max_target_len = len(target_sequences[0])

words_input = tokenizer_input.word_index
words_output = tokenizer_output.word_index

In [20]:
print("English vocab size: ", input_num_words)
print("Hindi vocab size: ", target_num_words)

English vocab size:  49253
Hindi vocab size:  48413


## Model Architecture

Now that the data is prepared, we can proceed by creating the model architecture

We will create the input and the target word embedding matrix.

We will be working with Bi-Directional LSTM as we need to make sure that the context of the output is maintained along with the sequence structure. This is the core advantage of using BiLSTM as it not only preserver information and data from past (like single unit LSTM) but also considers future data. 

The output structure will be closesly related to the context of input which is very essential while converting sentences from one language to another.

In [21]:
DIMENSIONS = 100
LSTM_UNITS = 256

In [22]:
# Converting word to word vectors
word2Vec = {}
with open('C:/Users/shrea/Desktop/Jupyter Notebooks/IITB Internship/Russian Translation/Experiments/Pretrained/glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        line = line.split(' ')
        word = line[0]
        word2Vec[word] = line[1:]

In [23]:
input_embedding_matrix = np.zeros((input_num_words, DIMENSIONS))
for word, k in words_input.items():
    if k < input_num_words:
        embedding_vector = word2Vec.get(word)
        if embedding_vector is not None:
            input_embedding_matrix[k] = embedding_vector

In [24]:
target_embedding_matrix = np.zeros((target_num_words, DIMENSIONS))
for word, k in words_input.items():
    if k < target_num_words:
        embedding_vector = word2Vec.get(word)
        if embedding_vector is not None:
            target_embedding_matrix[k] = embedding_vector

In [25]:
# Creating the input layer embeddings
# We take the 2D input and convert it to a tensor shaped embedding matrix.
embedding_input = Input(shape=(max_input_len,))
embedding_input_layer = Embedding(input_num_words, DIMENSIONS, weights=[input_embedding_matrix], trainable=True)
x = embedding_input_layer(embedding_input)

Instructions for updating:
Colocations handled automatically by placer.


In Neural Machine Translation, it is been found that deeper architectures tends to have much better performance than a single unit neural network. In [this paper](https://www.aclweb.org/anthology/W17-4710.pdf) the author goes on to explain why deeper architectures are much useful for NMT and goes on to suggest that variation of DeepBi-RNN with depth 8 performs the best. To keep the computational and understanding simpler, we will restrict ourselves to 2 units of Bi-LSTM to improve the performance over single unit Neural Nets.

In [26]:
input_lstm1 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))
input_lstm1_output = input_lstm1(x)

input_lstm2 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))
encoder_output = input_lstm2(input_lstm1_output)

In [27]:
def softmax_over_time(x):
    assert(K.ndim(x) > 2)
    e = K.exp(x - K.max(x, axis=1, keepdims=True))
    s = K.sum(e, axis=1, keepdims=True)
    return e / s

# Attention

Attention is needed so that the long sequences of the textual data can be processed with maximum precision. Repeat vector is used to repeat the same input vectors at every iteration with different hidden state. We will use the tanh and softmax activation functions for the same.

In [28]:
atten_layer_repeat = RepeatVector(max_input_len)
atten_concatenate = Concatenate(axis=-1)
atten_dense1 = Dense(30, activation='tanh')
atten_dense2 = Dense(1, activation=softmax_over_time)
attn_dot = Dot(axes=1)

In [29]:
def attention_procedure(h, st_1):
    st_1 = atten_layer_repeat(st_1)
    x = atten_concatenate([h, st_1])
    x = atten_dense1(x)
    alphas = atten_dense2(x)
    context = attn_dot([alphas,h])
    return context

In [30]:
st_0 = Input(shape=(LSTM_UNITS,))
c_0 = Input(shape=(LSTM_UNITS,))
context_last_word_concat_layer = Concatenate(axis=2)

Decoding embedding layer at output and Initiating the Model. We feed the output of the first LSTM layer to input the second LSTM layer before finally applying the softmax activation function on the output. If we feed the softmax to the output of first LSTM unit, it won't give us great results.

We also stack the tensors to get the output and feed it to the model function.

In [31]:
embedding_decoder_input = Input(shape=(max_target_len,))
embedding_decoder_layer = Embedding(target_num_words, DIMENSIONS, weights=[target_embedding_matrix], trainable=True)
decoder_x = embedding_decoder_layer(embedding_decoder_input)

s = st_0
c = c_0
outputs = []
decoder_lstm = LSTM(LSTM_UNITS, return_state=True)
decoder_dense_layer = Dense(target_num_words, activation='softmax')
for i in range(max_target_len):
    context = attention_procedure(encoder_output, s)
    selector = Lambda(lambda x: x[:, i:i+1])
    xt = selector(decoder_x)
    decoder_lstm_input = context_last_word_concat_layer([context, xt])

    decoder_lstm_output, s, c = decoder_lstm(decoder_lstm_input, initial_state=[s,c])

    decoder_output = decoder_dense_layer(decoder_lstm_output)
    outputs.append(decoder_output)

In [32]:
def stack_and_transpose(x):
    x = K.stack(x) 
    x = K.permute_dimensions(x, pattern=(1, 0, 2)) 
    return x

In [33]:
stacker = Lambda(stack_and_transpose)
outputs = stacker(outputs)

model = Model(
  inputs=[
    embedding_input,
    embedding_decoder_input,
    st_0, 
    c_0,
  ],
  outputs=outputs
)

# Model Training

We have prepared the data and the model architecture to train the model. We will use callback functions to regularly save our progress and then train our model.

We have used sparse_categorical_crossentropy as we have taken the target to be integers and not one-hot vectors. [This blog article](https://towardsdatascience.com/choosing-the-right-hyperparameters-for-a-simple-lstm-using-keras-f8e9ed76f046) gives us an idea as to how to how to fine tune the hyperparameters. For our project, rms_prop proved to be a better optimizer than Adam and hence, accepted in our training model.

We split train-test 80-20 and then train our neural network.

In [None]:
from keras.callbacks import ModelCheckpoint

filepath="C:/Users/shrea/Desktop/Jupyter Notebooks/IITB Internship/Russian Translation/Experiments/Attention-Eng-Hin-NMT/Checkpoints/weights-{epoch:02d}-{val_acc:.3f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, mode='max')
callbacks_list = [checkpoint]

In [None]:
from keras.callbacks import *
from keras.models import load_model

model.load_weights('C:/Users/shrea/Desktop/Jupyter Notebooks/IITB Internship/Russian Translation/Experiments/Attention-Eng-Hin-NMT/Checkpoints/weights.hdf5')

In [None]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['acc'])

z = np.zeros((len(target_sequences), LSTM_UNITS))
r = model.fit(
  [input_sequences, target_input_sequences, z, z], target_sequences.reshape(target_sequences.shape[0],target_sequences.shape[1], 1),
  batch_size=64,
  epochs=10,
  validation_split=0.2,
  verbose=1,
  callbacks=callbacks_list,
)