# Implementation of NMT

### Importing libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from string import digits
import re
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Input, Dense,Embedding, Concatenate, TimeDistributed
from tensorflow.keras.models import Model,load_model, model_from_json
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
import pickle as pkl
import numpy as np

In [None]:
from attention import AttentionLayer

In [None]:
# Read the raw tab-separated English/Marathi sentence pairs.
# The file contains Devanagari text, so decode it explicitly as UTF-8 instead
# of relying on the platform's default encoding.
with open('mar.txt', 'r', encoding='utf-8') as f:
    data = f.read()

### Preprocessing

In [None]:
uncleaned_data_list = data.split('\n')

In [None]:
# Preview only the first few raw lines; the full list has tens of thousands of entries.
uncleaned_data_list[:10]

['Go.\tजा.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #3138228 (sabretou)',
 'Run!\tपळ!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #3138217 (sabretou)',
 'Run!\tधाव!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #3138218 (sabretou)',
 'Run!\tपळा!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #3138219 (sabretou)',
 'Run!\tधावा!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #3138220 (sabretou)',
 'Who?\tकोण?\tCC-BY 2.0 (France) Attribution: tatoeba.org #2083030 (CK) & #3138225 (sabretou)',
 'Wow!\tवाह!\tCC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #6728118 (sabretou)',
 'Fire!\tआग!\tCC-BY 2.0 (France) Attribution: tatoeba.org #1829639 (Spamster) & #3232248 (sabretou)',
 'Fire!\tफायर!\tCC-BY 2.0 (France) Attribution: tatoeba.org #1829639 (Spamster) & #3232249 (sabretou)',
 'Help!\tवाचवा!\tCC-BY 2.0 (France) Attribution: tatoeba.org #435084 (lukaszpp) & #2086003 (sabretou)',

As you can see, every line carries a lot of irrelevant text (the licence and attribution metadata). To get a clean dataset we have to remove that text and store the input sentences properly in a DataFrame.

In [None]:
len(uncleaned_data_list)

41029

In [None]:
# Keep only the first 38695 raw lines and drop everything after them.
# NOTE(review): the cutoff is a hard-coded magic number with no stated reason —
# confirm what the discarded trailing lines contain before reusing it on another file.
uncleaned_data_list = uncleaned_data_list[:38695]

In [None]:
len(uncleaned_data_list)

38695

In [None]:
# Each raw line has the form "<english>\t<marathi>\t<attribution>"; keep only
# the first two tab-separated fields.
english_word = []
marathi_word = []
for line in uncleaned_data_list:
    fields = line.split('\t')  # split once instead of once per column
    english_word.append(fields[0])  # English sentence
    marathi_word.append(fields[1])  # Marathi sentence

In [None]:
len(english_word), len(marathi_word)


(38695, 38695)

In [None]:
# Assemble the parallel sentences into a two-column training DataFrame
# ('English' holds the source text, 'Marathi' the target text).
language_data = pd.DataFrame({'English': english_word, 'Marathi': marathi_word})

In [None]:
language_data.to_csv('language_data.csv', index=False)


In [None]:
language_data = pd.read_csv('language_data.csv')


In [None]:
language_data.head()

Unnamed: 0,English,Marathi
0,Go.,जा.
1,Run!,पळ!
2,Run!,धाव!
3,Run!,पळा!
4,Run!,धावा!


In [None]:
english_text = language_data['English'].values
marathi_text = language_data['Marathi'].values

In [None]:
language_data.isnull().sum()

English    0
Marathi    0
dtype: int64

From the output of the cell above, we can conclude that there are no missing values in the DataFrame.

## Data Cleaning

In [None]:
# Lower-case every sentence in both languages.
english_text_ = [sentence.lower() for sentence in english_text]
marathi_text_ = [sentence.lower() for sentence in marathi_text]

In [None]:
# Drop apostrophes from both languages (e.g. "it's" becomes "its").
# The pattern is a plain literal, so str.replace does the same job as re.sub.
english_text_ = [sentence.replace("'", '') for sentence in english_text_]
marathi_text_ = [sentence.replace("'", '') for sentence in marathi_text_]

In [None]:
# Strip ASCII punctuation from every sentence.
def remove_punc(text_list):
    """Return a new list in which every ``string.punctuation`` character is deleted.

    Only the ASCII punctuation set is removed. Whitespace is left untouched,
    so the word boundaries of each sentence are preserved.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    # A space is not a punctuation character, so translating the whole sentence
    # is equivalent to translating each space-separated word and re-joining.
    return [sentence.translate(strip_table) for sentence in text_list]
english_text_ = remove_punc(english_text_)
marathi_text_ = remove_punc(marathi_text_)

In [None]:
# Delete ASCII digits from the English sentences.
remove_digits = str.maketrans('', '', digits)
# Spaces are never digits, so translating a whole sentence gives the same
# result as translating each space-separated word and re-joining.
removed_digits_text = [sent.translate(remove_digits) for sent in english_text_]
english_text_ = removed_digits_text

# Delete Devanagari digits, then zero-width joiners (U+200D), from the Marathi sentences.
marathi_text_ = [
    re.sub("[\u200d]", "", re.sub("[२३०८१५७९४६]", "", sent))
    for sent in marathi_text_
]

# Trim leading and trailing whitespace in both languages.
english_text_ = [sent.strip() for sent in english_text_]
marathi_text_ = [sent.strip() for sent in marathi_text_]

In [None]:
# removing the starting and ending whitespaces
# NOTE: the preceding cell already ends with this same strip step, so running
# it again here changes nothing (stripping an already-stripped string is a no-op).
english_text_ = [x.strip() for x in english_text_]
marathi_text_ = [x.strip() for x in marathi_text_]

In [None]:
# Wrap every Marathi sentence in the 'start' / 'end' marker words; the decoding
# loop later feeds 'start' as the first token and stops when it predicts 'end'.
marathi_text_ = [f"start {sentence} end" for sentence in marathi_text_]

In [None]:
# manipulated_marathi_text_
marathi_text_[0], english_text_[0]

('start जा end', 'go')

In [None]:
X = english_text_
Y = marathi_text_

In [None]:
# Hold out 25% of the sentence pairs for validation/testing.
# A fixed random_state makes the split reproducible, so the fitted vocabularies,
# the pickled arrays and the reported examples stay the same across
# "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
len(X_train), len(y_train), len(X_test), len(y_test)

(29021, 29021, 9674, 9674)

In [None]:
X_train[0], y_train[0]

('she has short hair', 'start त्यांचे छोटे केस आहेत end')

In [None]:
X_test[0], y_test[0]

('do you write love letters', 'start तू प्रेमपत्र लिहितोस का end')

In [None]:
def Max_length(data):
    """Return the largest number of space-separated words found in any sentence of ``data``.

    Sentences are split on a single space character, so consecutive spaces
    count as empty words. Raises ``ValueError`` when ``data`` is empty.
    """
    return max(len(sentence.split(' ')) for sentence in data)

# Training data: longest English / Marathi sentence, counted in space-separated words.
# These two values are the padding lengths used by the tokenization cells.
max_length_english = Max_length(X_train)
max_length_marathi = Max_length(y_train)

# Test data: longest sentences in the held-out split.
# NOTE(review): the padding cells pad the test split with the *training* maxima,
# and these two values are not referenced again in the visible notebook —
# a longer test sentence would presumably be truncated by pad_sequences; confirm.
max_length_english_test = Max_length(X_test)
max_length_marathi_test = Max_length(y_test)

In [None]:
max_length_marathi, max_length_english

(14, 11)

## Tokenization

In [None]:
# Tokenization for English words: the vocabulary is fitted on the training sentences only.
englishTokenizer = Tokenizer()
englishTokenizer.fit_on_texts(X_train)
Eword2index = englishTokenizer.word_index
# One extra slot on top of the vocabulary — presumably for index 0, which the
# padding below uses as filler.
vocab_size_source = 1 + len(Eword2index)

# Convert both splits to integer sequences, zero-padded at the end to max_length_english.
X_train, X_test = (
    pad_sequences(
        englishTokenizer.texts_to_sequences(sentences),
        maxlen=max_length_english,
        padding='post',
    )
    for sentences in (X_train, X_test)
)


In [None]:
# Tokenization for Marathi words: the vocabulary is fitted on the training sentences only.
marathiTokenizer = Tokenizer()
marathiTokenizer.fit_on_texts(y_train)
Mword2index = marathiTokenizer.word_index
# One extra slot on top of the vocabulary — presumably for index 0, which the
# padding below uses as filler.
vocab_size_target = 1 + len(Mword2index)

# Convert both splits to integer sequences, zero-padded at the end to max_length_marathi.
y_train, y_test = (
    pad_sequences(
        marathiTokenizer.texts_to_sequences(sentences),
        maxlen=max_length_marathi,
        padding='post',
    )
    for sentences in (y_train, y_test)
)


In [None]:
vocab_size_source, vocab_size_target

(4595, 10547)

In [None]:
# Serialize the padded train/test arrays so later cells can reload them.
with open('NMT_data.pkl', 'wb') as data_file:
    pkl.dump([X_train, y_train, X_test, y_test], data_file)

In [None]:
# Serialize each tokenizer together with its vocabulary size and word -> index map
# (English first, then Marathi).
for pickle_path, bundle in (
    ('NMT_Etokenizer.pkl', [vocab_size_source, Eword2index, englishTokenizer]),
    ('NMT_Mtokenizer.pkl', [vocab_size_target, Mword2index, marathiTokenizer]),
):
    with open(pickle_path, 'wb') as f:
        pkl.dump(bundle, f)

In [None]:
X_train[0], y_train[0]

(array([ 25,  42, 872, 413,   0,   0,   0,   0,   0,   0,   0], dtype=int32),
 array([   1,  626, 1177,  426,   11,    2,    0,    0,    0,    0,    0,
           0,    0,    0], dtype=int32))

In [None]:
# Make sure all four splits are NumPy arrays. The padded sequences already
# display as arrays in the cell above, so this only produces copies.
X_train, y_train, X_test, y_test = map(np.array, (X_train, y_train, X_test, y_test))

In [None]:
X_train[0], y_train[0]

(array([ 25,  42, 872, 413,   0,   0,   0,   0,   0,   0,   0], dtype=int32),
 array([   1,  626, 1177,  426,   11,    2,    0,    0,    0,    0,    0,
           0,    0,    0], dtype=int32))

## Encoder-Decoder Architecture

In [None]:
# Use the backend that belongs to tensorflow.keras: every layer in this notebook
# comes from tensorflow.keras, and mixing it with the standalone `keras` package
# means clear_session() may act on a different Keras installation's state.
from tensorflow.keras import backend as K

K.clear_session()  # drop any previously built graph/layers so layer naming starts fresh
latent_dim = 500  # size of the embeddings and of every LSTM state

# ---- Encoder ----
# Fixed-length input: English sentences padded to max_length_english token ids.
encoder_inputs = Input(shape=(max_length_english,))
enc_emb = Embedding(vocab_size_source, latent_dim, trainable=True)(encoder_inputs)

# Three stacked LSTMs; each returns the full output sequence plus its final states.
# LSTM 1
encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

# LSTM 2
encoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# LSTM 3 — its outputs feed the attention layer, its final states seed the decoder.
encoder_lstm3 = LSTM(latent_dim, return_state=True, return_sequences=True)
encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)

# ---- Decoder ----
# Variable-length input so the same layers can be reused one token at a time at inference.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(vocab_size_target, latent_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM initialised with the final encoder states.
# Despite their names, decoder_fwd_state / decoder_back_state are simply the
# second and third values returned by the LSTM (its final states); they are not used below.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(
    dec_emb, initial_state=[state_h, state_c]
)

# Attention over the encoder outputs for every decoder time step.
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

# Concatenate the attention output with the decoder LSTM output.
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

# Per-time-step softmax over the Marathi vocabulary.
decoder_dense = TimeDistributed(Dense(vocab_size_target, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)

# Define the training model: (English ids, shifted Marathi ids) -> next-word distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 11)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 11, 500)      2297500     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 11, 500), (N 2002000     embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________

### Building Model

In [None]:
# The targets passed to fit() are integer word ids (not one-hot vectors),
# hence the sparse variant of categorical cross-entropy.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Stop training as soon as val_loss stops improving (avoids over-training) and
# roll the model back to the weights of its best epoch, instead of keeping the
# weights of the last, worse epoch that triggered the stop.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, restore_best_weights=True)

In [None]:
# Teacher forcing: the decoder is fed each target sentence without its last
# token and is trained to predict the same sentence shifted left by one token.
decoder_input_train = y_train[:, :-1]
decoder_target_train = y_train[:, 1:, np.newaxis]  # trailing axis of size 1 for the sparse loss
decoder_input_val = y_test[:, :-1]
decoder_target_val = y_test[:, 1:, np.newaxis]

history = model.fit(
    [X_train, decoder_input_train],
    decoder_target_train,
    epochs=50,
    callbacks=[es],
    batch_size=512,
    validation_data=([X_test, decoder_input_val], decoder_target_val),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 00023: early stopping


In [None]:
# Save the trained model in two parts: the architecture as JSON and the weights as HDF5.
model_json = model.to_json()
with open("NMT_model.json", "w") as json_file:
    json_file.write(model_json)
# Serialize the weights to HDF5; the loading cell reads this file back.
model.save_weights("NMT_model_weight.h5")
print("Saved model to disk")

Saved model to disk


In [None]:
# Load the model architecture and assign the trained weights.
# The context manager guarantees the file is closed even if reading fails.
with open('NMT_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()

# AttentionLayer is a custom class, so it must be registered for deserialization.
model_loaded = model_from_json(loaded_model_json, custom_objects={'AttentionLayer': AttentionLayer})
# Load the weights into the rebuilt model.
model_loaded.load_weights("NMT_model_weight.h5")

In [None]:
# Reload the tokenizers and the padded data written by the pickling cells.
# NOTE: pickle.load can execute arbitrary code — only load files this notebook produced.

# English tokenizer bundle: vocabulary size, word -> index map, fitted tokenizer.
with open('NMT_Etokenizer.pkl','rb') as f:
    vocab_size_source, Eword2index, englishTokenizer = pkl.load(f)

# Marathi tokenizer bundle, same layout as the English one.
with open('NMT_Mtokenizer.pkl', 'rb') as f:
    vocab_size_target, Mword2index, marathiTokenizer = pkl.load(f)

# Padded integer sequences for the train and test splits.
with open('NMT_data.pkl','rb') as f:
    X_train, y_train, X_test, y_test = pkl.load(f)

In [None]:
# Reverse lookups (index -> word) used to turn token ids back into text.
Eindex2word, Mindex2word = englishTokenizer.index_word, marathiTokenizer.index_word

In [None]:
model_loaded.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 11)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 11, 500)      2297500     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 11, 500), (N 2002000     embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________

In [None]:
latent_dim = 500  # must match the latent_dim the loaded model was built with

# ---- Encoder inference model ----
# NOTE(review): the positional layer indices below (3, 5, 6, 7, 10) depend on the
# exact layer order of the saved model — re-check them if the architecture changes.
encoder_inputs = model_loaded.input[0]  # English token ids
encoder_outputs, state_h, state_c = model_loaded.layers[6].output  # last encoder LSTM

print(encoder_outputs.shape)

encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])

# ---- Decoder inference model ----
# These inputs carry the decoder LSTM states from the previous time step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
# Encoder output sequence consumed by the attention layer. Its length is read
# from the loaded encoder instead of being hard-coded to 11.
source_len = int(encoder_outputs.shape[1])
decoder_hidden_state_input = Input(shape=(source_len, latent_dim))

# Reuse the trained decoder input and embedding layer.
decoder_inputs = model_loaded.layers[3].output

print(decoder_inputs.shape)
dec_emb_layer = model_loaded.layers[5]

dec_emb2 = dec_emb_layer(decoder_inputs)

# To predict the next word, initialise the LSTM with the states of the previous step.
decoder_lstm = model_loaded.layers[7]
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c]
)

# Attention and concatenation layers were given explicit names when the model
# was built, so fetch them by name rather than by position.
attn_layer = model_loaded.get_layer('attention_layer')
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])

concate = model_loaded.get_layer('concat_layer')
decoder_inf_concat = concate([decoder_outputs2, attn_out_inf])

# Dense softmax layer producing a probability distribution over the target vocabulary.
decoder_dense = model_loaded.layers[10]
decoder_outputs2 = decoder_dense(decoder_inf_concat)

# Final decoder model: (token, encoder outputs, h, c) -> (word distribution, new h, new c)
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2],
)

(None, 11, 500)
(None, None)


In [None]:
# function representing our predicted sequence (output)
def decode_sequence(input_seq, max_decoded_len=25):
    """Greedily translate one encoded English sentence into Marathi text.

    Parameters
    ----------
    input_seq : array of padded English token ids with a leading batch axis of 1.
    max_decoded_len : upper bound on the number of words generated. The default
        of 25 is the limit that used to be hard-coded as ``26-1``.

    Returns
    -------
    str
        The predicted words, each preceded by a single space (so a non-empty
        result starts with a space), or ``''`` when nothing was generated.
    """
    # Encode the input once; the encoder outputs are reused by attention at every step.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Start decoding from a length-1 target sequence holding the 'start' word.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = Mword2index['start']

    decoded_tokens = []
    while len(decoded_tokens) < max_decoded_len:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Greedy choice: take the most probable word of the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        if sampled_token_index == 0:
            # Index 0 has no entry in the index -> word map (it is the padding value).
            break

        sampled_token = Mindex2word[sampled_token_index]
        if sampled_token == 'end':
            break
        decoded_tokens.append(sampled_token)

        # Feed the sampled word and the updated LSTM states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return ''.join(' ' + token for token in decoded_tokens)

In [None]:
# function for Original Sequence
def seq2summary(input_seq):
    """Turn a padded Marathi id sequence back into text.

    Padding (0) and the 'start' / 'end' marker words are skipped; every kept
    word is followed by a single space, so a non-empty result ends with a space.
    """
    pieces = []
    for token_id in input_seq:
        if token_id == 0 or token_id == Mword2index['start'] or token_id == Mword2index['end']:
            continue
        pieces.append(Mindex2word[token_id] + ' ')
    return ''.join(pieces)

# function for English sequence (input)
def seq2text(input_seq):
    """Turn a padded English id sequence back into text.

    Padding (0) is skipped; every kept word is followed by a single space,
    so a non-empty result ends with a space.
    """
    return ''.join(Eindex2word[token_id] + ' ' for token_id in input_seq if token_id != 0)

In [None]:
# Print the first ten test examples.
# The labels are kept as-is: "Review" is the English source sentence, "Original
# summary" the reference Marathi translation and "Predicted summary" the model output.
for i in range(10):
    print("Review:", seq2text(X_test[i]))
    print("Original summary:", seq2summary(y_test[i]))
    # reshape(1, -1) adds the batch axis without hard-coding the padded source length (11).
    print("Predicted summary:", decode_sequence(X_test[i].reshape(1, -1)))
    print("\n")

Review: do you write love letters 
Original summary: तू प्रेमपत्र लिहितोस का 
Predicted summary:  तुम्ही प्रेमपत्र लिहिता का


Review: its raining 
Original summary: पाऊस पडतोय 
Predicted summary:  पाऊस पडतोय


Review: the police arrested the wrong man 
Original summary: पोलिसांनी चुकीच्या माणसाला अटक केली 
Predicted summary:  पोलिसांनी चोराला अटक केली


Review: you can see the sea on your right 
Original summary: उजवीकडे तू समुद्र बघू शकतोस 
Predicted summary:  उजवीकडे तू समुद्र बघू शकतेस


Review: hes a dj 
Original summary: तो डीजे आहे 
Predicted summary:  तो एक निन्जा आहे


Review: tom is wearing glasses 
Original summary: टॉमने चष्मा घातला आहे 
Predicted summary:  टॉम खाली पडला आहे


Review: where are we going today 
Original summary: आज आपण कुठे चाललो आहोत 
Predicted summary:  आज आपण कुठे चाललो आहोत


Review: i want to become an actress 
Original summary: मला अभिनेत्री बनायचं आहे 
Predicted summary:  मला इंजिनियर बनायचं आहे


Review: she cant find her hat 
Original summary: तिला 