### Load tensorflow

In [1]:
import tensorflow as tf
#Clear the TF1-style default graph so state from an earlier run of this kernel is not reused
tf.compat.v1.reset_default_graph()
#Fix TensorFlow's random seed (42) so runs are repeatable.
#NOTE: NumPy's generator, used later to pick sentences for prediction, is not seeded here.
tf.compat.v1.set_random_seed(42)

### Read the data
<font size="2">Data for this exercise can be downloaded from http://www.manythings.org/anki/</font>

In [0]:
#You can use wget to download the file directly
#NOTE(review): this saves hin-eng.zip into the current working directory, while the
#zipfile cell further down opens 'data/hin-eng.zip' - confirm the intended location.
!wget http://www.manythings.org/anki/hin-eng.zip --quiet

In [0]:
!ls -l

In [0]:
!unzip hin-eng.zip

In [0]:
!cat hin.txt

In [3]:
import zipfile
import io

#Read hin.txt out of the zip archive into one string.
#Both the archive and the member file are opened in 'with' blocks so they are
#closed even if reading fails (previously the ZipFile was left open), and the
#text is read in a single call instead of being concatenated line by line.
with zipfile.ZipFile('data/hin-eng.zip', 'r') as zf:
    with zf.open('hin.txt') as readfile:
        data = io.TextIOWrapper(readfile, 'utf-8').read()

In [4]:
len(data)

430654

In [8]:
data[40000:50000]

"minshirui) & #482431 (minshirui)\nIt's not my fault.\tमेरी ग़लती नहीं है।\tCC-BY 2.0 (France) Attribution: tatoeba.org #321324 (CK) & #609869 (minshirui)\nIt's starting now.\tशुरू हो रहा है।\tCC-BY 2.0 (France) Attribution: tatoeba.org #33137 (CK) & #480236 (minshirui)\nMy bag was stolen.\tमेरा बस्ता चोरी हो गया था।\tCC-BY 2.0 (France) Attribution: tatoeba.org #35155 (CK) & #588618 (minshirui)\nMy eyes are tired.\tमेरी आँखें थक गईं हैं।\tCC-BY 2.0 (France) Attribution: tatoeba.org #507513 (CK) & #588532 (minshirui)\nNo one's in sight.\tआसपास कोई नहीं है।\tCC-BY 2.0 (France) Attribution: tatoeba.org #2091110 (CK) & #3216498 (anubhav93)\nPut your hands up!\tहाथ ऊपर करो!\tCC-BY 2.0 (France) Attribution: tatoeba.org #265926 (CK) & #443082 (minshirui)\nShe began to sing.\tवह गाने लगी।\tCC-BY 2.0 (France) Attribution: tatoeba.org #312881 (CK) & #514032 (minshirui)\nShe has blue eyes.\tउसकी नीली आँखें हैं।\tCC-BY 2.0 (France) Attribution: tatoeba.org #472087 (CK) & #588558 (minshirui)\nShe i


### Extract Source and Target Language pairs

In [9]:
#Split by newline character: one dataset record per list entry.
#NOTE: this rebinds `data` from a str to a list, so the cell cannot be re-run
#on its own - re-run the cell that loads the file first.
data =  data.split('\n')

#Show some Data
data[100:105]

['We promised.\tहमने वादा किया।\tCC-BY 2.0 (France) Attribution: tatoeba.org #2107668 (CK) & #8876370 (simranbansal)',
 "What's this?\tयह क्या है?\tCC-BY 2.0 (France) Attribution: tatoeba.org #413821 (CK) & #443158 (minshirui)",
 'Are you sick?\tक्या तुम बीमार हो?\tCC-BY 2.0 (France) Attribution: tatoeba.org #434252 (lukaszpp) & #518699 (minshirui)',
 'Bring him in.\tउसको अंदर ले आओ।\tCC-BY 2.0 (France) Attribution: tatoeba.org #307895 (CK) & #475932 (minshirui)',
 'Come with us.\tहमारे साथ आओ।\tCC-BY 2.0 (France) Attribution: tatoeba.org #433696 (CK) & #485546 (minshirui)']

In [11]:
len(data)

2981

### Separate Source and Target pairs

In [28]:
encoder_text = [] #Initialize Source language list
decoder_text = [] #Initialize Target language list

#Iterate over data.
#A usable record has exactly three tab-separated fields:
#source sentence, target sentence, attribution text.
for line in data:
    fields = line.split('\t')

    #Skip records that do not have the expected layout (e.g. a blank line).
    #This replaces the earlier bare 'except: pass', which also hid unrelated errors.
    if len(fields) != 3:
        continue

    #The attribution field is not needed
    in_txt, out_txt = fields[:2]
    encoder_text.append(in_txt)

    #Add '<start>' as the start-of-sequence marker in the target
    #and '<end>' as the end-of-sequence marker
    decoder_text.append('<start> ' + out_txt + ' <end>')

### Inspect the separated Source and Target pairs

In [29]:
encoder_text[100:105]

['We promised.',
 "What's this?",
 'Are you sick?',
 'Bring him in.',
 'Come with us.']

In [30]:
decoder_text[100:105]

['<start> हमने वादा किया। <end>',
 '<start> यह क्या है? <end>',
 '<start> क्या तुम बीमार हो? <end>',
 '<start> उसको अंदर ले आओ। <end>',
 '<start> हमारे साथ आओ। <end>']

### Tokenize Source language sentences

In [31]:
#Tokenizer for source language, created with its default settings.
#The word_index printed below shows lower-cased words without sentence punctuation.
encoder_t = tf.keras.preprocessing.text.Tokenizer()
encoder_t.fit_on_texts(encoder_text) #Fit it on Source sentences

In [32]:
len(encoder_t.word_index)

2410

In [33]:
encoder_t.word_index

{'the': 1,
 'i': 2,
 'to': 3,
 'you': 4,
 'a': 5,
 'is': 6,
 'he': 7,
 'of': 8,
 'in': 9,
 'it': 10,
 'my': 11,
 'do': 12,
 'have': 13,
 'me': 14,
 'this': 15,
 'was': 16,
 'that': 17,
 'she': 18,
 'for': 19,
 'are': 20,
 'his': 21,
 'what': 22,
 'your': 23,
 "don't": 24,
 'tom': 25,
 'we': 26,
 'on': 27,
 'at': 28,
 'will': 29,
 'like': 30,
 'him': 31,
 'her': 32,
 'go': 33,
 'not': 34,
 'be': 35,
 "i'm": 36,
 'with': 37,
 'how': 38,
 'know': 39,
 'can': 40,
 'and': 41,
 'has': 42,
 'up': 43,
 'there': 44,
 'all': 45,
 'time': 46,
 'very': 47,
 'they': 48,
 "it's": 49,
 'come': 50,
 'want': 51,
 'as': 52,
 'did': 53,
 'here': 54,
 'from': 55,
 'had': 56,
 'please': 57,
 'when': 58,
 'an': 59,
 "can't": 60,
 'by': 61,
 'out': 62,
 'about': 63,
 'going': 64,
 'one': 65,
 'no': 66,
 'get': 67,
 "didn't": 68,
 'been': 69,
 'am': 70,
 'take': 71,
 'would': 72,
 'if': 73,
 'father': 74,
 'were': 75,
 'day': 76,
 "i'll": 77,
 'but': 78,
 'book': 79,
 'now': 80,
 'see': 81,
 'money': 82,
 'us

In [34]:
#Convert English text to indexes
#Each sentence becomes a list of word indexes looked up in encoder_t.word_index (indexes start at 1)
encoder_seq = encoder_t.texts_to_sequences(encoder_text) #Convert sentences to numbers 

encoder_seq[100:105] #Display some converted sentences

[[26, 714], [177, 15], [20, 4, 406], [407, 31, 9], [50, 37, 83]]

In [35]:
encoder_text[100:105]

['We promised.',
 "What's this?",
 'Are you sick?',
 'Bring him in.',
 'Come with us.']

In [36]:
#Number of tokens in the longest source sentence
max_encoder_seq_length = max(map(len, encoder_seq))
print('Maximum sentence length for Source language: ', max_encoder_seq_length)

#Source language vocabulary: count of distinct words known to the tokenizer
source_word_index = encoder_t.word_index
encoder_vocab_size = len(source_word_index)
print('Source language vocablury size: ', encoder_vocab_size)

Maximum sentence length for Source language:  22
Source language vocablury size:  2410


### Tokenize Target language sentences

In [38]:
?tf.keras.preprocessing.text.Tokenizer

In [39]:
#Tokenizer for target language.
#The filter list omits '<' and '>' so that the '<start>' and '<end>' markers
#added to every target sentence are kept as tokens instead of being stripped.
#NOTE: the Devanagari danda '।' is not in the filter list either, so words such as
#'है।' and 'है' get separate vocabulary entries (see the word_index output below).
decoder_t = tf.keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
decoder_t.fit_on_texts(decoder_text) #Fit it on target sentences
decoder_seq = decoder_t.texts_to_sequences(decoder_text) #Convert sentences to numbers 

In [40]:
#Number of tokens in the longest target sentence ('<start>' and '<end>' included)
max_decoder_seq_length = max(map(len, decoder_seq))
print('Maximum sentence length for Target language: ', max_decoder_seq_length)

#Target language vocabulary: count of distinct tokens known to the tokenizer
target_word_index = decoder_t.word_index
decoder_vocab_size = len(target_word_index)
print('Target language vocablury size: ', decoder_vocab_size)

Maximum sentence length for Target language:  27
Target language vocablury size:  3048


In [41]:
decoder_t.word_index

{'<start>': 1,
 '<end>': 2,
 'है।': 3,
 'नहीं': 4,
 'में': 5,
 'मैं': 6,
 'वह': 7,
 'से': 8,
 'मुझे': 9,
 'क्या': 10,
 'है': 11,
 'के': 12,
 'को': 13,
 'की': 14,
 'हैं।': 15,
 'हो': 16,
 'बहुत': 17,
 'एक': 18,
 'का': 19,
 'पर': 20,
 'तुम': 21,
 'उसने': 22,
 'था।': 23,
 'हूँ।': 24,
 'यह': 25,
 'कर': 26,
 'लिए': 27,
 'कि': 28,
 'टॉम': 29,
 'मेरे': 30,
 'हैं': 31,
 'तुम्हें': 32,
 'और': 33,
 'ने': 34,
 'भी': 35,
 'अपने': 36,
 'इस': 37,
 'उसे': 38,
 'मेरी': 39,
 'रहा': 40,
 'मैंने': 41,
 'करने': 42,
 'तो': 43,
 'अपनी': 44,
 'ही': 45,
 'पास': 46,
 'हम': 47,
 'आप': 48,
 'गया।': 49,
 'था': 50,
 'कल': 51,
 'काम': 52,
 'करना': 53,
 'कोई': 54,
 'उसके': 55,
 'बात': 56,
 'साथ': 57,
 'घर': 58,
 'तुम्हारे': 59,
 'उसकी': 60,
 'पता': 61,
 'उस': 62,
 'कुछ': 63,
 'गया': 64,
 'समय': 65,
 'मेरा': 66,
 'सकते': 67,
 'थी।': 68,
 'अभी': 69,
 'रही': 70,
 'रहे': 71,
 'कभी': 72,
 'लगता': 73,
 'यहाँ': 74,
 'किया।': 75,
 'तक': 76,
 'बजे': 77,
 'आ': 78,
 'हूँ': 79,
 'सकता': 80,
 'किसी': 81,
 'आज': 82,
 'चाहिए।': 83

### Compare different sentences length

In [42]:
#Source Language sentences
print('Length for sentence number 100: ', len(encoder_seq[100]))
print('Length for sentence number 2000: ', len(encoder_seq[2000]))

Length for sentence number 100:  2
Length for sentence number 2000:  8


In [44]:
#Target Language sentences
print('Length for sentence number 100: ', len(decoder_seq[100]))
print('Length for sentence number 2000: ', len(decoder_seq[2000]))

Length for sentence number 100:  5
Length for sentence number 2000:  13


In [47]:
encoder_text[100], decoder_text[100]

('We promised.', '<start> हमने वादा किया। <end>')

### How do we make all sentences the same length?

### Padding the sentences

In [48]:
#Source sentences: zeros are added in front ('pre') until every row has
#max_encoder_seq_length entries, so the real words sit at the end of the row
encoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(encoder_seq, 
                                                                   maxlen=max_encoder_seq_length, #22
                                                                   padding='pre')

#Target Sentences: zeros are added after the sentence ('post') until every row
#has max_decoder_seq_length entries, so each row still begins with '<start>'
decoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(decoder_seq, 
                                                                   maxlen=max_decoder_seq_length, #27
                                                                   padding='post')

In [49]:
print('Source data shape: ', encoder_input_data.shape)
print('Target data shape: ', decoder_input_data.shape)

Source data shape:  (2980, 22)
Target data shape:  (2980, 27)


In [50]:
encoder_text[200]

'A man must work.'

In [51]:
encoder_input_data[200]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   5,  92,  96, 104])

In [52]:
decoder_text[200]

'<start> एक आदमी के लिए काम करना ज़रूरी है। <end>'

In [53]:
decoder_input_data[200]

array([  1,  18, 129,  12,  27,  52,  53, 536,   3,   2,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0])

#### Integer to Word converter for Decoder data

In [54]:
#Reverse lookup for the target language: token index -> word
int_to_word_decoder = {index: word for word, index in decoder_t.word_index.items()}

In [56]:
int_to_word_decoder[15]

'हैं।'

### Building Decoder Output

In [57]:
decoder_input_data.shape

(2980, 27)

In [66]:
import numpy as np

#Initialize array with the same (sentences, time steps) shape as the decoder input
decoder_target_data = np.zeros(decoder_input_data.shape)

#Shift Target output by one word: the target at step t is the decoder input at
#step t+1, so the model learns to predict the next word. The last column stays 0.
#A single slice assignment replaces the nested Python loops and gives the same array.
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

In [74]:
#decoder_t.word_index

In [0]:
#<start> yeh kitab hai <end>
#Yeh kitab hai <end> 0

In [72]:
decoder_input_data[0]

array([  1, 796,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0])

In [73]:
decoder_target_data[0]

array([796.,   2.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

#### Convert target data in one hot vector

In [75]:
#Initialize one hot encoding array (all zeros), one vocabulary-sized vector per word position.
#NOTE: with the shape reported below, (2980, 27, 3049), and NumPy's default
#float64 dtype this array takes roughly 2 GB of memory.
decoder_target_one_hot = np.zeros((decoder_input_data.shape[0], #number of sentences
                                   decoder_input_data.shape[1], #Number of words in each sentence
                                   len(decoder_t.word_index)+1)) #Vocab size + 1

In [93]:
decoder_target_one_hot[1]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [94]:
#Build one hot encoded array
#For every (sentence, time step) set a single 1 at the position of the target
#word index. One vectorised assignment replaces a to_categorical call per word.
decoder_target_one_hot.fill(0) #start from zeros so the cell can be re-run safely
sentence_idx, step_idx = np.indices(decoder_target_data.shape)
decoder_target_one_hot[sentence_idx, step_idx, decoder_target_data.astype(int)] = 1

In [113]:
decoder_target_one_hot[0][0]

array([0., 0., 0., ..., 0., 0., 0.])

In [114]:
decoder_target_one_hot.shape

(2980, 27, 3049)

In [115]:
decoder_target_one_hot[0].shape

(27, 3049)

### Building the Training Model

In [121]:
#Define config parameters
encoder_embedding_size = 50  #length of each source-word embedding vector
decoder_embedding_size = 50  #length of each target-word embedding vector
rnn_units = 256              #LSTM units, used for both the encoder and the decoder LSTM

#### Build Encoder

In [119]:
tf.keras.backend.clear_session()

In [123]:
#Input Layer - shape=(None,) leaves the sequence length unspecified
encoder_inputs = tf.keras.layers.Input(shape=(None,))

#Embedding layer - vocabulary size + 1 entries so that index 0 (the padding value) is covered too
encoder_embedding = tf.keras.layers.Embedding(encoder_vocab_size+1, 
                                              encoder_embedding_size)

#Get embedding layer output by feeding inputs
encoder_embedding_output = encoder_embedding(encoder_inputs)

#LSTM Layer and its output - return_state=True also returns the final hidden (h) and cell (c) states.
#The first returned value `x` is not used for the encoder; the name is reassigned in the decoder cell.
x, state_h, state_c = tf.keras.layers.LSTM(rnn_units,return_state=True)(encoder_embedding_output)

#Build a list to feed Decoder - Sentence Embedding
encoder_states = [state_h, state_c]

In [124]:
encoder_embedding_output

<KerasTensor: shape=(None, None, 50) dtype=float32 (created by layer 'embedding_1')>

#### Build Decoder

In [125]:
#Decoder input - padded Target sentences
decoder_inputs = tf.keras.layers.Input(shape=(None,))

#Decoder Embedding layer - vocabulary size + 1 entries so that index 0 (the padding value) is covered too
decoder_embedding = tf.keras.layers.Embedding(decoder_vocab_size + 1, 
                                              decoder_embedding_size)

#Embedding layer output
decoder_embedding_output = decoder_embedding(decoder_inputs)

#Decoder RNN - kept in a variable (as are the embedding and dense layers) because
#the prediction model built later reuses these same trained layers
decoder_rnn = tf.keras.layers.LSTM(rnn_units, 
                                   return_sequences=True, 
                                   return_state=True, )

#Decoder RNN Output, State initialization from Encoder states
#Output will be all hidden sequences, last 'h' state and last 'c' state
#(the two states are discarded here; only the per-step outputs are used for training)
x,_,_ = decoder_rnn(decoder_embedding_output, 
                    initial_state=encoder_states)

#Output Layer - softmax over every target word index
decoder_dense = tf.keras.layers.Dense(decoder_vocab_size + 1, #+1 to make sure one-hot encoding works for highest index value
                                      activation='softmax')

#Output of Dense layer
decoder_outputs = decoder_dense(x)

In [126]:
x

<KerasTensor: shape=(None, None, 256) dtype=float32 (created by layer 'lstm_2')>

In [127]:
decoder_outputs

<KerasTensor: shape=(None, None, 3049) dtype=float32 (created by layer 'dense')>

### Build Model using both Encoder and Decoder

In [128]:
#Training model: source sentence + target sentence in, a word distribution per target position out
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], #2 Inputs to the model
                              decoder_outputs) #Output of the model

In [129]:
model.output

<KerasTensor: shape=(None, None, 3049) dtype=float32 (created by layer 'dense')>

In [130]:
#categorical_crossentropy pairs with the one-hot encoded targets (decoder_target_one_hot) passed to fit
model.compile(optimizer='adam', loss='categorical_crossentropy')

### Train the model

In [131]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, None, 50)     120550      ['input_2[0][0]']                
                                                                                                  
 embedding_2 (Embedding)        (None, None, 50)     152450      ['input_3[0][0]']                
                                                                                              

In [132]:
#Train with teacher forcing: the decoder is fed the true target sentence and has to
#predict the same sentence shifted by one word (decoder_target_one_hot).
#NOTE(review): validation_split=0.2 holds out 20% of the samples for validation; per the
#Keras docs these are taken from the end of the arrays - confirm that is acceptable here.
model.fit([encoder_input_data, decoder_input_data], 
          decoder_target_one_hot,
          batch_size=64,
          epochs=20,
          validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1f8754d5420>

### Save the model for later reuse

In [133]:
#Save the trained model to disk.
#NOTE(review): despite the '.hd5' name, the log output below reports assets being written
#under this path, i.e. the model is stored as a directory - confirm this is intended.
model.save('seq2seq_training_translation.hd5')



INFO:tensorflow:Assets written to: seq2seq_training_translation.hd5\assets


INFO:tensorflow:Assets written to: seq2seq_training_translation.hd5\assets


# Building Model for Prediction

### Build the Encoder Model to predict Encoder States

In [134]:
#Inference-time encoder: reuses the trained encoder layers and returns only the final LSTM states
encoder_model = tf.keras.models.Model(encoder_inputs, #Padded input sequences
                                      encoder_states) #Hidden state and Cell state at last time step

In [135]:
encoder_model.output

[<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'lstm_1')>,
 <KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'lstm_1')>]

### Build the Decoder Model 
<p/>

<ol><li>Define Input for both 'h' state and 'c' state initialization </li>
<li>Get Decoder RNN outputs along with h and c state</li>
<li>Get Decoder Dense layer output</li>
        <li>Build Model</li></ol>

##### Step 1 - Define Input for both 'h' state and 'c' state initialization

In [136]:
#Hidden state input - one vector of rnn_units values per sample
decoder_state_input_h = tf.keras.layers.Input(shape=(rnn_units,))

#Cell state input - same size as the hidden state
decoder_state_input_c = tf.keras.layers.Input(shape=(rnn_units,))

#Putting it together, in the [h, c] order passed to the LSTM as initial_state
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

##### Step 2 - Get Decoder RNN outputs along with h and c state

In [137]:
#Get Embedding layer output (same trained embedding layer as in the training model)
x = decoder_embedding(decoder_inputs)

#We will use the layer which we trained earlier, but start it from states
#that are supplied as model inputs instead of coming straight from the encoder
rnn_outputs, state_h, state_c = decoder_rnn(x, initial_state=decoder_states_inputs)

#The updated states become model outputs so that the prediction loop can feed
#them back in as the initial states when predicting the next word
decoder_states = [state_h, state_c]

##### Step 3 - Get Decoder Dense layer output

In [138]:
#Reuse the trained softmax layer on the inference-time LSTM outputs (rebinds decoder_outputs)
decoder_outputs = decoder_dense(rnn_outputs)

##### Step 4 - Build Decoder Model

In [139]:
#Inference-time decoder: takes [word index input, h, c] and returns [word probabilities, new h, new c]
decoder_model = tf.keras.models.Model([decoder_inputs] + decoder_states_inputs,  #Model inputs
                                      [decoder_outputs] + decoder_states)

# Predicting output from Seq2Seq model

##### Build a prediction function

In [140]:
decoder_t.word_index['<start>']

1

In [141]:
int_to_word_decoder[1]

'<start>'

In [142]:
def decode_sentence(input_sequence):
    """Translate one padded source sequence using greedy decoding.

    The source sequence is run through ``encoder_model`` to get the initial
    decoder states. Starting from the '<start>' token, ``decoder_model`` is
    then called once per word: the most probable word is kept, and that word
    plus the returned states are fed back in for the next step.

    Decoding stops when '<end>' is predicted, when an index without a word
    (such as the padding index 0) is predicted, or after
    ``max_decoder_seq_length`` words.

    Args:
        input_sequence: batch of one padded source sentence, as passed to
            ``encoder_model.predict`` (e.g. ``encoder_input_data[i : i+1]``).

    Returns:
        The predicted words joined by single spaces ('' if nothing was predicted).
    """

    #Get the encoder state values - Sentence embedding.
    #Uses the function argument; previously the global `input_seq` was read
    #and the argument was silently ignored.
    decoder_states_value = list(encoder_model.predict(input_sequence))

    #Build a sequence with '<start>' - starting sequence for Decoder
    target_seq = np.zeros((1,1))
    target_seq[0][0] = decoder_t.word_index['<start>']

    #Words predicted so far
    predicted_words = []

    #Never produce more than max_decoder_seq_length words
    for _ in range(max_decoder_seq_length):

        predicted_outputs, h, c = decoder_model.predict([target_seq] +
                                                        decoder_states_value)

        #Get the predicted word index with highest probability
        predicted_index = int(np.argmax(predicted_outputs[0,-1,:]))

        #Index 0 is padding and has no word; treat it like '<end>' instead of raising KeyError
        predicted_word = int_to_word_decoder.get(predicted_index)
        if predicted_word is None or predicted_word == '<end>':
            break

        predicted_words.append(predicted_word)

        #The predicted word is the decoder input for the next step
        target_seq[0][0] = predicted_index

        #Carry the LSTM states over to the next step
        decoder_states_value = [h, c]

    return ' '.join(predicted_words)

##### Call Prediction function on a random sentence

In [143]:
#Generate a random number - index of the first sentence to translate.
#The upper bound leaves a margin of 10 so start_num + 5 stays inside the data.
start_num = np.random.randint(0, high=len(encoder_text) - 10)

#Predict model output for 5 sentences
for i in range(start_num, start_num + 5):
    #Slice (not index) so the row keeps its leading batch dimension
    input_seq = encoder_input_data[i : i+1]
    predicted_sentence = decode_sentence(input_seq)
    print('--------')
    print ('Input sentence: ', encoder_text[i])
    print ('Predicted sentence: ', predicted_sentence )

--------
Input sentence:  That is not a tiger.
Predicted sentence:  मैं बहुत नहीं है।
--------
Input sentence:  That was my mistake.
Predicted sentence:  मैं एक बहुत नहीं है।
--------
Input sentence:  The skies are clear.
Predicted sentence:  मैं बहुत नहीं है।
--------
Input sentence:  The trees are green.
Predicted sentence:  मैं बहुत नहीं है।
--------
Input sentence:  These are our books.
Predicted sentence:  मैं एक बहुत नहीं है।


##### Save encoder and decoder model

In [144]:
#Compile models to avoid error
#(these two models are only used for prediction; they share their layers with the trained model)
encoder_model.compile(optimizer='adam',loss='categorical_crossentropy')
decoder_model.compile(optimizer='adam',loss='categorical_crossentropy')

#Save the models
#NOTE(review): as the log output below shows, each '.hd5' path is written as a directory with assets
encoder_model.save('seq2seq_encoder_eng_hin.hd5')  #Encoder model
decoder_model.save('seq2seq_decoder_eng_hin.hd5')  #Decoder model



INFO:tensorflow:Assets written to: seq2seq_encoder_eng_hin.hd5\assets


INFO:tensorflow:Assets written to: seq2seq_encoder_eng_hin.hd5\assets


INFO:tensorflow:Assets written to: seq2seq_decoder_eng_hin.hd5\assets


INFO:tensorflow:Assets written to: seq2seq_decoder_eng_hin.hd5\assets


##### Save encoder and decoder tokenizers

In [145]:
import pickle

#Persist both fitted tokenizers so the same word <-> index mappings can be
#reloaded together with the saved models.
#Each file is opened in a 'with' block so it is flushed and closed as soon as
#the dump finishes (previously the handles were never closed).
with open('encoder_tokenizer_eng','wb') as encoder_file:
    pickle.dump(encoder_t, encoder_file)

with open('decoder_tokenizer_hin','wb') as decoder_file:
    pickle.dump(decoder_t, decoder_file)