<a href="https://colab.research.google.com/github/roht20/Portfolio/blob/master/Seq2Seq_LSTM_Model_Language_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Seq2Seq_LSTM_Language_Translation**

In [0]:
import numpy as np
# Seed NumPy's global random generator (used later to pick the sample sentences to translate).
# NOTE(review): no TensorFlow/Keras seed is set in the cells shown, so weight initialisation
# and training may still differ between runs — confirm if reproducibility matters.
np.random.seed(42)

1. Prepare data
Data was downloaded from http://www.manythings.org/anki/

1.1 The data consists of sentence pairs: each line holds an English sentence and its Hindi translation, separated by a tab.

In [0]:
!wget http://www.manythings.org/anki/hin-eng.zip --quiet

**Importing the necessary libraries**

In [0]:
import zipfile
import io

In [0]:
# Open the downloaded archive read-only; the next cell reads 'hin.txt' from this handle.
# NOTE(review): the archive is never closed in the cells shown.
zf = zipfile.ZipFile('hin-eng.zip', 'r')

In [0]:
# Decode the archive member as UTF-8 and keep the whole corpus as one string.
with zf.open('hin.txt') as readfile:
  data = ''.join(io.TextIOWrapper(readfile, 'utf-8'))

In [6]:
data[0:50]

'Wow!\tवाह!\nHelp!\tबचाओ!\nJump.\tउछलो.\nJump.\tकूदो.\nJump'

In [0]:
# Break the corpus into one entry per line. From here on `data` is a list of strings rather
# than a single string, so this cell cannot be re-run without first re-running the cell that
# reads the file.
data =  data.split('\n')

In [8]:
len(data)

2867

In [9]:
data[100:105]

["I can't move.\tमैं हिल नहीं सकता।",
 "I don't know.\tमुझे नहीं पता।",
 "I don't know.\tमुझे नहीं मालूम।",
 'I have a car.\tमेरे पास एक गाड़ी है।',
 'I have a dog.\tमेरे पास एक कुत्ता है।']

**Review the data**

**1.2 Separate out Encoder and Decoder input data**

In [0]:
# English source sentences; filled by the parsing loop below.
encoder_text = []

In [0]:
# Hindi target sentences wrapped in '<start>' / '<end>' markers; filled by the parsing loop below.
decoder_text = []

In [0]:
# Split every corpus line into its English and Hindi parts.
#
# Each line is tab-separated: field 0 is the English sentence, field 1 the Hindi
# sentence. Extra fields (some releases of this dataset may append further
# tab-separated columns — verify against the downloaded file) are ignored instead
# of causing the whole line to be dropped.
#
# The lists are emptied first so that re-running this cell does not append
# every pair a second time.
encoder_text.clear()
decoder_text.clear()
malformed_lines = []

for line in data:
    if not line.strip():
        # Blank entry, e.g. the empty string left after the final '\n'.
        continue

    fields = line.split('\t')
    if len(fields) < 2:
        # No tab separator: remember the line so it can be reported, not silently lost.
        malformed_lines.append(line)
        continue

    in_txt, out_txt = fields[0], fields[1]
    encoder_text.append(in_txt)

    # '<start>' marks the beginning of the target sequence and '<end>' its end.
    decoder_text.append('<start> ' + out_txt + ' <end>')

if malformed_lines:
    print('Skipped %d line(s) without a tab separator' % len(malformed_lines))

In [13]:
decoder_text[100:105]

['<start> मैं हिल नहीं सकता। <end>',
 '<start> मुझे नहीं पता। <end>',
 '<start> मुझे नहीं मालूम। <end>',
 '<start> मेरे पास एक गाड़ी है। <end>',
 '<start> मेरे पास एक कुत्ता है। <end>']

In [14]:
encoder_text[100:105]

["I can't move.",
 "I don't know.",
 "I don't know.",
 'I have a car.',
 'I have a dog.']

**1.3 Building Sequences for Encoder and Decoder Input**

**Importing the Tokenizer from the Keras library**

In [0]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [0]:
# Tokenizer for the English side, created with the library's default settings.
encoder_t = Tokenizer()

In [0]:
# Build the English vocabulary (word -> integer index) from the source sentences.
encoder_t.fit_on_texts(encoder_text)

In [0]:
# Replace each English sentence with its list of word indices.
encoder_seq = encoder_t.texts_to_sequences(encoder_text)

In [0]:
# Token count of the longest English sequence; used as the padded width.
max_encoder_seq_length = max(map(len, encoder_seq))

In [20]:
max_encoder_seq_length

22

In [0]:
# Number of distinct English tokens; the encoder Embedding layer later uses this value + 1
# as its input dimension.
encoder_vocab_size = len(encoder_t.word_index)

In [22]:
encoder_vocab_size

2404

In [23]:
encoder_text[100:105]

["I can't move.",
 "I don't know.",
 "I don't know.",
 'I have a car.',
 'I have a dog.']

In [24]:
encoder_seq[100:105]

[[2, 68, 406], [2, 28, 43], [2, 28, 43], [2, 12, 6, 100], [2, 12, 6, 131]]

**Decoder tokenizer**

In [0]:
# Tokenizer for the Hindi side. The filter list is the Keras default with '<' and '>' removed,
# so the '<start>' and '<end>' markers survive tokenization as whole tokens.
decoder_t = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

In [0]:
# Build the Hindi vocabulary, including the '<start>' and '<end>' marker tokens.
decoder_t.fit_on_texts(decoder_text)

In [0]:
# Replace each Hindi sentence (markers included) with its list of word indices.
decoder_seq = decoder_t.texts_to_sequences(decoder_text)

In [0]:
# Token count of the longest Hindi sequence, '<start>' and '<end>' included.
max_decoder_seq_length = max(map(len, decoder_seq))

In [29]:
max_decoder_seq_length

27

In [0]:
# Number of distinct target-side tokens (the '<start>' and '<end>' markers are counted too).
decoder_vocab_size = len(decoder_t.word_index)

In [31]:
decoder_vocab_size

3009

In [32]:
decoder_text[100:105]

['<start> मैं हिल नहीं सकता। <end>',
 '<start> मुझे नहीं पता। <end>',
 '<start> मुझे नहीं मालूम। <end>',
 '<start> मेरे पास एक गाड़ी है। <end>',
 '<start> मेरे पास एक कुत्ता है। <end>']

In [33]:
decoder_seq[100:105]

[[1, 6, 1498, 5, 162, 2],
 [1, 12, 5, 630, 2],
 [1, 12, 5, 1499, 2],
 [1, 28, 40, 20, 106, 3, 2],
 [1, 28, 40, 20, 208, 3, 2]]

**Padding Sequences**

In [0]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [0]:
# Pad every English sequence at the front ('pre') up to the longest sequence length,
# giving one rectangular integer array for the encoder.
encoder_input_data = pad_sequences(encoder_seq, maxlen=max_encoder_seq_length, padding='pre')

In [0]:
# Pad every Hindi sequence at the end ('post') with zeros, so each row begins with the
# '<start>' index followed by the sentence and then padding.
decoder_input_data = pad_sequences(decoder_seq, maxlen=max_decoder_seq_length, padding='post')

In [37]:
encoder_input_data.shape

(2866, 22)

In [38]:
decoder_input_data.shape

(2866, 27)

**Integer to Word converter for Decoder data**

In [0]:
#decoder_t.word_index

In [0]:
# Reverse lookup (index -> word) used to turn predicted indices back into Hindi words.
int_to_word_decoder = {index: word for word, index in decoder_t.word_index.items()}

In [0]:
#int_to_word_decoder

**1.5 Building Decoder Output data**

In [0]:
# Zero-filled array with the same (samples, time steps) shape as the decoder input.
decoder_target_data = np.zeros(decoder_input_data.shape)

In [0]:
# The target at step t is the decoder input at step t+1: copy every row shifted one
# position to the left. The last column is not written and keeps its current value.
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

In [44]:
decoder_input_data[0]

array([  1, 767,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0], dtype=int32)

In [45]:
decoder_target_data[0]

array([767.,   2.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

**Converting the target data into one-hot vectors**

In [0]:
from tensorflow.python.keras.utils import  to_categorical

In [0]:
# One slot per (sentence, time step, vocabulary index). The +1 makes room for index 0,
# which is not present in the tokenizer's word_index and is what padding uses.
#
# float32 is requested explicitly: it is the dtype to_categorical produces by default,
# and it halves the memory of NumPy's default float64 (about 0.9 GB instead of
# about 1.9 GB for the (2866, 27, 3010) shape shown below).
decoder_target_one_hot = np.zeros((decoder_input_data.shape[0],
                                   decoder_input_data.shape[1],
                                   len(decoder_t.word_index)+1),
                                  dtype='float32')

In [0]:
# Fill the one-hot tensor with a single vectorised assignment instead of calling
# to_categorical once for every (sentence, time step) pair in a Python loop.
decoder_target_one_hot[...] = 0  # reset first so re-running the cell gives the same result

# sample_idx[i, j] == i and step_idx[i, j] == j; combined with the integer class index at
# (i, j) they address exactly one cell per time step, which is set to 1.
sample_idx, step_idx = np.indices(decoder_target_data.shape)
decoder_target_one_hot[sample_idx, step_idx, decoder_target_data.astype(int)] = 1

In [49]:
decoder_target_one_hot.shape

(2866, 27, 3010)

**2. Building the Training Model**

In [0]:
from tensorflow.python.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.python.keras.callbacks import ModelCheckpoint

**Define config parameters**

In [0]:
# Width of the word vectors produced by the encoder Embedding layer.
encoder_embedding_size = 50
# Width of the word vectors produced by the decoder Embedding layer.
decoder_embedding_size = 50
# Number of units in both the encoder and decoder LSTM; also the size of the h and c
# state inputs of the inference decoder.
rnn_units = 256

**2.1 Building Encoder layers**

In [0]:
# Encoder input: a sequence of source word indices; the sequence length is left unspecified.
encoder_inputs = Input(shape=(None,))

In [0]:
# Lookup table from English word index to a dense vector; +1 leaves room for index 0 (padding).
# NOTE(review): mask_zero is not set, so padded positions are passed to the LSTM like any
# other token — confirm this is intended.
encoder_embedding = Embedding(encoder_vocab_size+1, encoder_embedding_size)

In [0]:
# Embedded source sequence that feeds the encoder LSTM.
encoder_embedding_output = encoder_embedding(encoder_inputs)

In [0]:
# Run the encoder LSTM. Only the final hidden (h) and cell (c) states are used afterwards;
# the output `x` is not used and the name is rebound by the decoder cells.
x, state_h, state_c = LSTM(rnn_units,return_state=True)(encoder_embedding_output)

In [0]:
# Final encoder states, passed to the decoder LSTM as its initial state.
encoder_states = [state_h, state_c]

**2.2 Building Decoder layers**

In [0]:
# Decoder input: a sequence of target word indices; the sequence length is left unspecified.
decoder_inputs = Input(shape=(None,))

In [0]:
# Lookup table from Hindi word index to a dense vector; +1 leaves room for index 0 (padding).
# The layer object is kept in a variable because the inference decoder reuses it.
decoder_embedding = Embedding(decoder_vocab_size + 1, decoder_embedding_size)

In [0]:
# Embedded target sequence that feeds the decoder LSTM during training.
decoder_embedding_output = decoder_embedding(decoder_inputs)

In [0]:
# return_sequences=True yields an output for every time step; return_state=True also
# returns the final h and c states. The layer object is kept so the inference decoder can
# call the same trained layer again.
decoder_rnn = LSTM(rnn_units, return_sequences=True, return_state=True)

In [0]:
# Start the decoder LSTM from the encoder's final [h, c] states.
# The call returns the output at every time step plus the last h and c states; the two
# states are discarded here because training only needs the per-step outputs.
x,_,_ = decoder_rnn(decoder_embedding_output, initial_state=encoder_states)

In [0]:
# Softmax layer over the target vocabulary (+1 for index 0); reused by the inference decoder.
decoder_dense = Dense(decoder_vocab_size + 1, activation='softmax')

In [0]:
# Word probabilities for the training graph, computed from the decoder LSTM outputs.
decoder_outputs = decoder_dense(x)

**2.3 Building the Model using both Encoder and Decoder layers**

In [0]:
from tensorflow.python.keras.models import Model

In [0]:
# Training model: takes the source indices and the decoder input indices, and outputs the
# decoder's softmax word probabilities.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [0]:
# categorical_crossentropy matches the one-hot targets built earlier.
# NOTE(review): no mask is configured in the cells shown, so padded time steps (one-hot at
# index 0) appear to contribute to the loss as well — confirm this is intended.
model.compile(optimizer='adam', loss='categorical_crossentropy')

**3. Training the model**

In [91]:
# Train with the decoder fed the reference target sequence and asked to predict the same
# sequence shifted by one step. validation_split=0.2 holds out 574 of the 2866 pairs
# (see the log below).
# NOTE(review): Keras documents validation_split as taking the last fraction of the arrays;
# if the corpus is ordered by sentence length, that would be the longest sentences — confirm.
# NOTE(review): ModelCheckpoint is imported above but no callbacks are passed here, and the
# recorded log stops at epoch 74 of 200.
model.fit([encoder_input_data, decoder_input_data], decoder_target_one_hot,
          batch_size=100,
          epochs=200,
          validation_split=0.2)

Train on 2292 samples, validate on 574 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200


<tensorflow.python.keras.callbacks.History at 0x7f68002d3c18>

**4. Building the Model for Prediction**

**4.1 Build the Encoder Model to predict the Encoder States**

In [0]:
# Inference-time encoder: maps a source sequence to the final [h, c] states, built from the
# same input and state tensors as the training model.
encoder_model = Model(encoder_inputs, encoder_states)

**4.2 Build the Decoder Model**

- Define inputs for both the 'h' state and the 'c' state initialization
- Get the RNN outputs along with the h and c states
- Define the Decoder output
- Build the Model

In [0]:
# Placeholder for the decoder LSTM's hidden (h) state, supplied explicitly at inference time.
decoder_state_input_h = Input(shape=(rnn_units,))

In [0]:
# Placeholder for the decoder LSTM's cell (c) state, supplied explicitly at inference time.
decoder_state_input_c = Input(shape=(rnn_units,))

In [0]:
# [h, c] in the order the LSTM expects for initial_state.
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

**Get the RNN outputs and states**

In [0]:
# Reuse the trained decoder embedding layer on the decoder input. Rebinds the name `x`.
x = decoder_embedding(decoder_inputs)

In [0]:
# Call the already-trained decoder LSTM layer again, this time starting from the state
# placeholders instead of the encoder states. The returned h and c states are kept so the
# caller can feed them back in on the next decoding step.
rnn_outputs, state_h, state_c = decoder_rnn(x, initial_state=decoder_states_inputs)

In [0]:

# Updated decoder states, exposed as extra outputs of the inference decoder model.
decoder_states = [state_h, state_c]

**Get the Decoder output**

In [0]:
# Reuse the trained softmax layer. This rebinds `decoder_outputs`, which until now referred
# to the output of the training graph (the training model has already been built from it).
decoder_outputs = decoder_dense(rnn_outputs)

**Build Decoder Model**

In [0]:
# Inference-time decoder.
#   inputs : decoder token indices, h state, c state
#   outputs: softmax word probabilities, updated h state, updated c state
decoder_model = Model([decoder_inputs] + decoder_states_inputs,  #Model inputs
                     [decoder_outputs] + decoder_states)

**5.0 Predicting Output**

**Build a prediction function**

In [0]:
def decode_sentence(input_sequence):
    """Greedily translate one encoded source sentence into target-language text.

    The encoder model turns ``input_sequence`` into the initial decoder states.
    The decoder model is then run one token at a time, starting from '<start>';
    at each step the highest-probability word is taken and fed back in as the
    next decoder input together with the updated states.

    Args:
        input_sequence: Array holding a single padded source sequence, e.g.
            ``encoder_input_data[i : i+1]``.

    Returns:
        The predicted words joined by single spaces, without the '<start>' and
        '<end>' markers. Decoding stops when '<end>' is predicted, when an index
        with no vocabulary entry (such as padding index 0) is predicted, or once
        ``max_decoder_seq_length`` words have been produced.
    """
    # Get the encoder state values; list() so they can be concatenated with [target_seq].
    states_value = list(encoder_model.predict(input_sequence))

    # Build a length-1 sequence holding '<start>' - the first decoder input.
    target_seq = np.zeros((1, 1))
    target_seq[0][0] = decoder_t.word_index['<start>']

    predicted_words = []

    # Cap the number of predicted *words*. Comparing the character length of the
    # output string against max_decoder_seq_length (a token count) would cut off
    # any translation longer than that many characters.
    while len(predicted_words) < max_decoder_seq_length:
        predicted_outputs, h, c = decoder_model.predict([target_seq] + states_value)

        # Index of the most probable word at the last (only) time step.
        predicted_output = int(np.argmax(predicted_outputs[0, -1, :]))

        # .get() because index 0 (padding) has no entry in the vocabulary; treat
        # it like '<end>' instead of raising KeyError.
        predicted_word = int_to_word_decoder.get(predicted_output)
        if predicted_word is None or predicted_word == '<end>':
            break

        predicted_words.append(predicted_word)

        # Feed the predicted word and the updated states into the next step.
        target_seq[0][0] = predicted_output
        states_value = [h, c]

    return ' '.join(predicted_words)

In [102]:
# Pick a random starting position and translate the ten consecutive sentences from there.
start_num = np.random.randint(0, high=len(encoder_text) - 10)
print(start_num)

for i, source_sentence in enumerate(encoder_text[start_num:start_num + 10], start=start_num):
    # Slice (not index) so the model receives a batch of one sequence.
    input_seq = encoder_input_data[i:i + 1]
    predicted_sentence = decode_sentence(input_seq)
    print('--------')
    print('Input sentence: ', source_sentence)
    print('Predicted sentence: ', predicted_sentence)

94
--------
Input sentence:  Bring him in.
Predicted sentence:  उसको अंदर ले आओ।
--------
Input sentence:  Come with us.
Predicted sentence:  हमारे साथ आओ।
--------
Input sentence:  Happy Easter!
Predicted sentence:  एसटर मुबारक हो
--------
Input sentence:  Has Tom left?
Predicted sentence:  टॉम चला गया क्या
--------
Input sentence:  He is French.
Predicted sentence:  वह फ़्रानसीसी है।
--------
Input sentence:  I am at home.
Predicted sentence:  मैं घर पर हूँ।
--------
Input sentence:  I can't move.
Predicted sentence:  मैं हिल नहीं सकता।
--------
Input sentence:  I don't know.
Predicted sentence:  मुझे नहीं पता।
--------
Input sentence:  I don't know.
Predicted sentence:  मुझे नहीं पता।
--------
Input sentence:  I have a car.
Predicted sentence:  मैं गाड़ी चला सकता हूँ।
