<a href="https://colab.research.google.com/github/sau-rabh999/MachineTranslation/blob/main/Machine_Translation_LSTM(seq2seq_network).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read by path.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


This notebook uses a seq2seq (encoder-decoder LSTM) model for machine translation: translating sentences from English to Hindi.

**SacreBLEU is used to compute the BLEU score, which measures how closely the model's predicted translation matches the reference translation.**

In [None]:
!pip install sacrebleu

Collecting sacrebleu
[?25l  Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)
[K     |██████                          | 10kB 11.2MB/s eta 0:00:01[K     |████████████                    | 20kB 16.3MB/s eta 0:00:01[K     |██████████████████              | 30kB 11.4MB/s eta 0:00:01[K     |████████████████████████        | 40kB 9.1MB/s eta 0:00:01[K     |██████████████████████████████  | 51kB 4.2MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.2MB/s 
[?25hCollecting portalocker==2.0.0
  Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.0.0 sacrebleu-1.5.1


Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import string
import re
import math
import os
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding
from tensorflow.keras.optimizers import RMSprop
from sacrebleu import sentence_bleu
from sklearn.model_selection import train_test_split

Reading dataset

In [None]:
# Tab-separated file without a header row: English sentence, Hindi sentence, attribution text.
dataset_path = "/content/drive/MyDrive/Colab Notebooks/Machine Translation/hin.txt"
column_names = ["english_sentence", "hindi_sentence", "path"]
df = pd.read_csv(dataset_path, sep='\t', header=None, names=column_names)
df.head(10)

Unnamed: 0,english_sentence,hindi_sentence,path
0,Wow!,वाह!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Help!,बचाओ!,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
2,Jump.,उछलो.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
3,Jump.,कूदो.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
4,Jump.,छलांग.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
5,Hello!,नमस्ते।,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
6,Hello!,नमस्कार।,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
7,Cheers!,वाह-वाह!,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
8,Cheers!,चियर्स!,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
9,Got it?,समझे कि नहीं?,CC-BY 2.0 (France) Attribution: tatoeba.org #4...


In [None]:
# The attribution column plays no part in translation, so only the sentence pair is kept.
df = df.drop('path', axis='columns')
df.shape

(2774, 2)

In [None]:
# Per-column count of missing values.
df.isna().sum()

english_sentence    0
hindi_sentence      0
dtype: int64

Data Pre-Processing

In [None]:
# Normalise both columns the same way: lower-case, drop ASCII punctuation and digits,
# then tidy up whitespace.
# NOTE(review): string.punctuation is ASCII-only, so the Devanagari danda '।' stays
# attached to Hindi words (visible in the decoded outputs further down).
for column in ('english_sentence', 'hindi_sentence'):
  cleaned = df[column].str.lower()
  cleaned = cleaned.apply(lambda text: ''.join(ch for ch in text if ch not in string.punctuation))
  # regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by
  # default, which would leave every digit in place. The raw string avoids the invalid
  # '\d' escape warning.
  cleaned = cleaned.str.replace(r'\d+', '', regex=True)
  # Whitespace is collapsed and trimmed last so that gaps left behind by the removed
  # punctuation/digits do not survive into the token lists.
  cleaned = cleaned.str.replace(r' +', ' ', regex=True).str.strip()
  df[column] = cleaned

Marking the start and end of each sentence in the target-language data

In [None]:
# Sentence boundary markers; the embedded spaces keep them as separate tokens when splitting.
start = '<s> '
end = ' </s>'
# Vectorised string concatenation wraps every Hindi sentence in the two markers.
df['hindi_sentence'] = start + df['hindi_sentence'] + end

In [None]:
# Preview the cleaned sentence pairs.
df.head(n=5)

Unnamed: 0,english_sentence,hindi_sentence
0,wow,<s> वाह </s>
1,help,<s> बचाओ </s>
2,jump,<s> उछलो </s>
3,jump,<s> कूदो </s>
4,jump,<s> छलांग </s>


In [None]:
# Word -> occurrence count over all English sentences.
english_vocab = {}
for sentence in df.english_sentence:
  for word in sentence.split():
    english_vocab[word] = english_vocab.get(word, 0) + 1

# Token -> occurrence count over all Hindi sentences (includes the <s> and </s> markers).
hindi_vocab = {}
for sentence in df.hindi_sentence:
  for token in sentence.split():
    hindi_vocab[token] = hindi_vocab.get(token, 0) + 1

In [None]:
# Number of distinct words on each side.
num_encoder_tokens = len(english_vocab)
num_decoder_token = len(hindi_vocab)

# Longest English sentence, measured in space-separated pieces.
length = [len(sentence.split(' ')) for sentence in df.english_sentence]
max_input_length = max(length)
print('max_input_length: ', max_input_length)

# Longest Hindi sentence; the count includes the <s> and </s> markers.
length = [len(sentence.split(' ')) for sentence in df.hindi_sentence]
max_output_length = max(length)
print('max_output_length: ', max_output_length)

max_input_length:  22
max_output_length:  27


In [None]:
# Alphabetically ordered vocabularies; iterating a dict yields its keys.
input_words = sorted(english_vocab)
target_words = sorted(hindi_vocab)

In [None]:
# Index 0 is reserved for padding: both Embedding layers are built with mask_zero=True
# and the data arrays are zero-initialised, so if a real word were given id 0 it would
# be silently masked out as padding. Real words therefore start at 1.
PAD_TOKEN = '<pad>'

input_token_index = {PAD_TOKEN: 0}
input_token_index.update((word, i) for i, word in enumerate(input_words, start=1))

target_token_index = {PAD_TOKEN: 0}
target_token_index.update((word, i) for i, word in enumerate(target_words, start=1))

# The Embedding / Dense layers are sized from these counts, so they must include the
# padding slot; otherwise the highest word id would be out of range.
num_encoder_tokens = len(input_token_index)
num_decoder_token = len(target_token_index)

In [None]:
# Token-id matrices, right-padded with zeros up to the longest sentence on each side.
encoder_input_data = np.zeros((len(df.english_sentence), max_input_length), dtype='float32')
decoder_input_data = np.zeros((len(df.hindi_sentence), max_output_length), dtype='float32')
# One-hot next-token targets. The dtype is given explicitly: np.zeros defaults to
# float64, which doubles the memory of this (sentences x steps x vocab) array and is
# inconsistent with the two float32 arrays above.
decoder_target_data = np.zeros((len(df.hindi_sentence), max_output_length, num_decoder_token), dtype='float32')

In [None]:
# Fill the padded arrays one sentence pair at a time.
for row, (src_sentence, tgt_sentence) in enumerate(zip(df.english_sentence, df.hindi_sentence)):
  for pos, token in enumerate(src_sentence.split()):
    encoder_input_data[row, pos] = input_token_index[token]
  for pos, token in enumerate(tgt_sentence.split()):
    token_id = target_token_index[token]
    decoder_input_data[row, pos] = token_id
    # The target is the decoder input shifted one step left, so the first
    # token (<s>) is never a target.
    if pos >= 1:
      decoder_target_data[row, pos - 1, token_id] = 1

In [None]:
# Shared size of the embedding vectors and of the LSTM hidden/cell states.
latent_dim = 300

The Embedding layer converts each word's token index into a dense word vector.  
With `return_state=True`, the LSTM gives 3 outputs:

    i) output for next layer
    ii) hidden state
    iii) cell memory

But we need only the encoder states, i.e. the hidden state value and the cell memory value, to give as the initial state of the decoder

In [None]:
# Encoder: variable-length sequence of token ids -> embeddings -> LSTM.
encoder_inputs = Input(shape=(None,))
# mask_zero=True tells downstream layers to treat id 0 as padding and skip those timesteps.
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
# With return_state=True the call returns (output, hidden state, cell state).
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# The per-step output is not used; only the final states are handed to the decoder.
encoder_states = [state_h, state_c]

In [None]:
# Decoder (training graph): receives the target sentence as input and predicts the
# next token at every timestep, starting from the encoder's final states.
decoder_inputs = Input(shape=(None,))
# The layer objects are kept in variables so the inference decoder can reuse their weights.
dec_emb_layer = Embedding(num_decoder_token, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# return_sequences=True yields one output per timestep; the returned states are
# discarded here and only needed at inference time.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
# Softmax over the target vocabulary at each timestep.
decoder_dense = Dense(num_decoder_token, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
# Full training model: (source ids, target-input ids) -> per-step token probabilities.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# The targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    702900      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    890700      input_2[0][0]                    
______________________________________________________________________________________________

Fitting the model

In [None]:
 # Train on the whole dataset: the decoder is fed the reference sentence as input
 # and learns to predict the same sentence shifted by one token.
 model.fit(
     [encoder_input_data, decoder_input_data],
     decoder_target_data,
     batch_size=100,
     epochs=25,
 )

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f94ed5e1fd0>

Encoder Model

In [None]:
# Inference-time encoder: maps a source sequence to the [hidden, cell] states,
# built from the same input tensor and layers that were trained above.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)
encoder_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 300)         702900    
_________________________________________________________________
lstm (LSTM)                  [(None, 300), (None, 300) 721200    
Total params: 1,424,100
Trainable params: 1,424,100
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Inference-time decoder. The LSTM states are explicit inputs here so the model can
# be run one step at a time, feeding each step's states into the next.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained embedding, LSTM and dense layers (shared weights) on the new inputs.
dec_emb2= dec_emb_layer(decoder_inputs) 
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# (token ids, h, c) -> (token probabilities, new h, new c)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

Decoder Model

In [None]:
# Print the layer-by-layer structure of the inference decoder.
decoder_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    890700      input_2[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 300)]        0                                            
____________________________________________________________________________________________

In [None]:
# Inverse lookups (token id -> word). The "char" in the names is historical:
# the entries are whole words.
reverse_input_char_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_char_index = {idx: word for word, idx in target_token_index.items()}

In [None]:
def decode_sequence(input_seq, max_tokens=None):
    """Greedily translate one encoded source sentence.

    The encoder states seed the decoder, which is then run one token at a time:
    each step feeds the previously chosen token and states back in and picks the
    most probable next token.

    Args:
        input_seq: array holding a single row of source token ids, in the same
            layout as one row of ``encoder_input_data``.
        max_tokens: upper bound on the number of generated tokens. Defaults to
            ``max_output_length``, the longest target sentence in the data.

    Returns:
        The generated tokens, each preceded by a single space. The string ends
        with ``</s>`` unless the token limit was reached first.
    """
    if max_tokens is None:
        # The limit used to be 50 *characters*, which cut longer translations
        # short; it is now counted in tokens.
        max_tokens = max_output_length

    # verbose=0 keeps predict() from printing a progress line on every step.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Decoding starts from the start-of-sentence marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['<s>']

    decoded_tokens = []
    while len(decoded_tokens) < max_tokens:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Greedy choice: the most probable token at the last timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_target_char_index[sampled_token_index]
        decoded_tokens.append(sampled_word)
        if sampled_word == '</s>':
            break

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(' ' + token for token in decoded_tokens)

Evaluating the quality of the model's translations using the BLEU score

In [None]:
k = 2001
english = df.english_sentence.iloc[k]
actual = df.hindi_sentence.iloc[k]
predicted = decode_sequence(encoder_input_data[k:k+1])
print("The actual english sentence is:",english)
print("The actual hindi sentence is:",actual)
print("The predicted hindi sentence is:",predicted)
# Score only the translated words: the <s> / </s> markers are bookkeeping tokens and
# would otherwise be counted as matching n-grams, inflating BLEU.
reference = actual.replace('<s>', '').replace('</s>', '').strip()
hypothesis = predicted.replace('<s>', '').replace('</s>', '').strip()
print("The BLEU score is :",sentence_bleu(hypothesis,[reference]).score)

The actual english sentence is: the doctor advised him not to smoke
The actual hindi sentence is: <s> डॉक्टर ने उसे सिगरेट न पीने की सलह दी। </s>
The predicted hindi sentence is:  डॉक्टर ने अपने बाल पीने की कोशिश करी। </s>
The BLEU score is : 20.52596383056271


In [None]:
k = 1301
english = df.english_sentence.iloc[k]
actual = df.hindi_sentence.iloc[k]
predicted = decode_sequence(encoder_input_data[k:k+1])
print("The actual english sentence is:",english)
print("The actual hindi sentence is:",actual)
print("The predicted hindi sentence is:",predicted)
# Score only the translated words: the <s> / </s> markers are bookkeeping tokens and
# would otherwise be counted as matching n-grams, inflating BLEU.
reference = actual.replace('<s>', '').replace('</s>', '').strip()
hypothesis = predicted.replace('<s>', '').replace('</s>', '').strip()
print("The BLEU score is :",sentence_bleu(hypothesis,[reference]).score)

The actual english sentence is: you are not coming are you
The actual hindi sentence is: <s> तुम नहीं आ रहे हो ना </s>
The predicted hindi sentence is:  तुम तुम क्यों नहीं कर सकते। </s>
The BLEU score is : 21.53672420052281


In [None]:
k = 1
english = df.english_sentence.iloc[k]
actual = df.hindi_sentence.iloc[k]
predicted = decode_sequence(encoder_input_data[k:k+1])
print("The actual english sentence is:",english)
print("The actual hindi sentence is:",actual)
print("The predicted hindi sentence is:",predicted)
# Score only the translated words: the <s> / </s> markers are bookkeeping tokens and
# would otherwise be counted as matching n-grams, inflating BLEU.
reference = actual.replace('<s>', '').replace('</s>', '').strip()
hypothesis = predicted.replace('<s>', '').replace('</s>', '').strip()
print("The BLEU score is :",sentence_bleu(hypothesis,[reference]).score)

The actual english sentence is: help
The actual hindi sentence is: <s> बचाओ </s>
The predicted hindi sentence is:  मैं मत करो। </s>
The BLEU score is : 35.64026463354184


In [None]:
test_sentence = 'i am fine'
# Only one sentence is translated, so a single zero-padded row is enough; there is
# no need to allocate an array as large as the whole dataset.
encoder_test_data = np.zeros((1, max_input_length), dtype='float32')
# Every word must already be in the training vocabulary; an unknown word raises KeyError.
for t, word in enumerate(test_sentence.split()):
    encoder_test_data[0,t] = input_token_index[word]
decode_sequence(encoder_test_data)

' मैं थक गया हूँ। </s>'