In [1]:
import tensorflow as tf
tf.reset_default_graph()
tf.set_random_seed(42)

In [2]:
!wget http://www.manythings.org/anki/hin-eng.zip

--2019-11-18 10:45:52--  http://www.manythings.org/anki/hin-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 104.24.108.196, 2606:4700:30::6818:6dc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 126760 (124K) [application/zip]
Saving to: ‘hin-eng.zip’


2019-11-18 10:45:52 (392 KB/s) - ‘hin-eng.zip’ saved [126760/126760]



In [0]:
import zipfile
import io
#Read the zip file
zf = zipfile.ZipFile('hin-eng.zip', 'r')
#Extract data from zip file
data = ''
with zf.open('hin.txt') as readfile:
  for line in io.TextIOWrapper(readfile, 'utf-8'):
    data += line

In [41]:
len(data)

2786

In [28]:
data[400:500]

'.\tCC-BY 2.0 (France) Attribution: tatoeba.org #631038 (Shishir) & #6179123 (fastrizwaan)\nHello!\tनमस्'

In [29]:
#Split by newline character
data = data.split('\n')
#Show some Data
data[100:105]

["I don't know.\tमुझे नहीं पता।\tCC-BY 2.0 (France) Attribution: tatoeba.org #349064 (fatih) & #609376 (minshirui)",
 "I don't know.\tमुझे नहीं मालूम।\tCC-BY 2.0 (France) Attribution: tatoeba.org #349064 (fatih) & #609377 (minshirui)",
 'I have a car.\tमेरे पास एक गाड़ी है।\tCC-BY 2.0 (France) Attribution: tatoeba.org #252272 (CK) & #477720 (minshirui)',
 'I have a dog.\tमेरे पास एक कुत्ता है।\tCC-BY 2.0 (France) Attribution: tatoeba.org #378502 (CK) & #443037 (minshirui)',
 'I understand.\tमैं समझता हूँ।\tCC-BY 2.0 (France) Attribution: tatoeba.org #433468 (CK) & #588495 (minshirui)']

In [0]:
encoder_text = [] #Initialize Source language list
decoder_text = [] #Initialize Target language list
#Iterate over data
for line in data:
  try:
    in_txt, out_txt, a = line.split('\t')
    encoder_text.append(in_txt)
    decoder_text.append('<start>'+out_txt+'<end>')
  except:
    pass

In [36]:
encoder_text[100:105]

["I don't know.",
 "I don't know.",
 'I have a car.',
 'I have a dog.',
 'I understand.']

In [37]:
decoder_text[100:105]

['<start>मुझे नहीं पता।<end>',
 '<start>मुझे नहीं मालूम।<end>',
 '<start>मेरे पास एक गाड़ी है।<end>',
 '<start>मेरे पास एक कुत्ता है।<end>',
 '<start>मैं समझता हूँ।<end>']

In [39]:
#Tokenizer for source language
encoder_t = tf.keras.preprocessing.text.Tokenizer()
encoder_t.fit_on_texts(encoder_text) #Fit it on Source sentences
encoder_seq = encoder_t.texts_to_sequences(encoder_text) #Convert sentences to numbers
encoder_seq[100:105] #Display some converted sentences

[[2, 27, 43], [2, 27, 43], [2, 14, 6, 97], [2, 14, 6, 124], [2, 209]]

In [40]:
#Maximum length of sentence
max_encoder_seq_length = max([len(txt) for txt in encoder_seq])
print('Maximum sentence length for Source language: ', max_encoder_seq_length)
#Source language Vocablury
encoder_vocab_size = len(encoder_t.word_index)
print('Source language vocablury size: ', encoder_vocab_size)

Maximum sentence length for Source language:  22
Source language vocablury size:  2376


In [0]:
#Tokenizer for target language, filters should not <start> and <end>
#remove < and > used in Target language sequences
decoder_t = tf.keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
decoder_t.fit_on_texts(decoder_text) #Fit it on target sentences
decoder_seq = decoder_t.texts_to_sequences(decoder_text) #Convert sentences tonumbers 

In [43]:
#Maximum length of sentence
max_decoder_seq_length = max([len(txt) for txt in decoder_seq])
print('Maximum sentence length for Target language: ', max_decoder_seq_length)
#Target language Vocablury
decoder_vocab_size = len(decoder_t.word_index)
print('Target language vocablury size: ', decoder_vocab_size)

Maximum sentence length for Target language:  25
Target language vocablury size:  3317


In [44]:
#Target Language sentences
print('Length for sentence number 100: ', len(decoder_seq[100]))
print('Length for sentence number 2000: ', len(decoder_seq[2000]))

Length for sentence number 100:  3
Length for sentence number 2000:  7


In [45]:
#Target Language sentences
print('Length for sentence number 100: ', len(encoder_seq[100]))
print('Length for sentence number 2000: ', len(encoder_seq[2000]))

Length for sentence number 100:  3
Length for sentence number 2000:  6


In [0]:
#Source sentences
encoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(encoder_seq,maxlen=max_encoder_seq_length, padding='pre')
#Target Sentences
decoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(decoder_seq, maxlen=max_decoder_seq_length, padding='post')

In [48]:
print('Source data shape: ', encoder_input_data.shape)
print('Target data shape: ', decoder_input_data.shape)

Source data shape:  (2785, 22)
Target data shape:  (2785, 25)


In [0]:
import numpy as np
#Initialize array
decoder_target_data = np.zeros((decoder_input_data.shape[0], decoder_input_data.shape[1]))
#Shift Target output by one word
for i in range(decoder_input_data.shape[0]):
  for j in range(1, decoder_input_data.shape[1]):
    decoder_target_data[i][j-1] = decoder_input_data[i][j]

In [0]:
decoder_target_one_hot = np.zeros((decoder_input_data.shape[0], decoder_input_data.shape[1], len(decoder_t.word_index)+1))

In [0]:
#Build one hot encoded array
for i in range(decoder_target_data.shape[0]):
  for j in range(decoder_target_data.shape[1]):
    decoder_target_one_hot[i][j] = tf.keras.utils.to_categorical(decoder_target_data[i][j],num_classes=len(decoder_t.word_index)+1) 

In [54]:
decoder_target_one_hot.shape

(2785, 25, 3318)

In [55]:
decoder_target_data.shape

(2785, 25)

In [56]:
len(decoder_t.word_index)

3317

In [0]:
int_to_word_decoder = dict((i,c) for c,i in decoder_t.word_index.items())

In [60]:
int_to_word_decoder[5]

'से'

## Encoder model

In [0]:
#Define config parameters
encoder_embedding_size = 50
decoder_embedding_size = 50
rnn_units = 256

In [62]:
#Input Layer
encoder_inputs = tf.keras.layers.Input(shape=(None,))
#Embedding layer
encoder_embedding = tf.keras.layers.Embedding(encoder_vocab_size+1, encoder_embedding_size)
#Get embedding layer output by feeding inputs
encoder_embedding_output = encoder_embedding(encoder_inputs)
#LSTM Layer and its output
x, state_h, state_c = tf.keras.layers.LSTM(rnn_units,return_state=True)(encoder_embedding_output)
#Build a list to feed Decoder
encoder_states = [state_h, state_c]

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [0]:
decoder_inputs = tf.keras.Input(shape=(None,))
#Decoder Embedding layer
decoder_embedding = tf.keras.layers.Embedding(decoder_vocab_size + 1, decoder_embedding_size)
#Embedding layer output
decoder_embedding_output = decoder_embedding(decoder_inputs)
#Decoder RNN
decoder_rnn = tf.keras.layers.LSTM(rnn_units, return_sequences=True, return_state=True)
#Decoder RNN Output, State initialization from Encoder states

#Output will be all hidden sequences, last 'h' state and last 'c' state
x,_,_ = decoder_rnn(decoder_embedding_output, initial_state=encoder_states)
#Output Layer
decoder_dense = tf.keras.layers.Dense(decoder_vocab_size + 1, activation='softmax')
#Output of Dense layer
decoder_outputs = decoder_dense(x)

In [0]:
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], #2 Inputs to the model
                                decoder_outputs) #Output of the model

In [66]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 50)     118850      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 50)     165900      input_2[0][0]                    
______________________________________________________________________________________________

In [0]:
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [68]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_one_hot, batch_size=64, epochs=5, validation_split=0.2)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 2228 samples, validate on 557 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f65d544ed68>

In [0]:
model.save('./seq2seq_training_translation.hd5')

In [0]:
encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

In [0]:
#Hidden state input
decoder_state_input_h = tf.keras.layers.Input(shape=(rnn_units,))
#Cell state input
decoder_state_input_c = tf.keras.layers.Input(shape=(rnn_units,))
#Putting it together
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [0]:
#Get Embedding layer output
x = decoder_embedding(decoder_inputs)
#We will use the layer which we trained earlier
rnn_outputs, state_h, state_c = decoder_rnn(x, initial_state=decoder_states_inputs)
#Why do we need this?
decoder_states = [state_h, state_c]

In [0]:
decoder_outputs=decoder_dense(rnn_outputs)

In [0]:
 decoder_model = tf.keras.models.Model([decoder_inputs] + decoder_states_inputs, #Model inputs
 [decoder_outputs] + decoder_states)

In [0]:
def decode_sentence(input_sequence):

 #Get the encoder state values - Sentence embedding
  decoder_initial_states_value = encoder_model.predict(input_seq)

 #Build a sequence with '<start>' - starting sequence for Decoder
  target_seq = np.zeros((1,1))
  target_seq[0][0] = decoder_t.word_index['<start>']

 #flag to check if prediction should be stopped
  stop_loop = False

  #Initialize predicted sentence
  predicted_sentence = ''
  i = 0
 #start the loop
  while not stop_loop:

    predicted_outputs, h, c = decoder_model.predict([target_seq] + decoder_initial_states_value)
    #Get the predicted word index with highest probability
    predicted_output = np.argmax(predicted_outputs[0,-1,:])
    if(predicted_output ==0):
      predicted_word = ' '
    #Get the predicted word from predicter index
    else:
      predicted_word = int_to_word_decoder[predicted_output]
    
    #Check if prediction should stop
    if(predicted_word == '<end>' or len(predicted_sentence) > max_decoder_seq_length):
      stop_loop = True
      continue
    #Updated predicted sentence
    if (len(predicted_sentence) == 0):
      predicted_sentence = predicted_word
    else:
      predicted_sentence = predicted_sentence + ' ' + predicted_word

    #Update target_seq to be the predicted word index
    target_seq[0][0] = predicted_output

    #Update initial states value for decoder
    decoder_initial_states_value = [h,c]

  return predicted_sentence

In [124]:
start_num = np.random.randint(0, high=len(encoder_text)-10)

for i in range(start_num, start_num+1):
  input_seq = encoder_input_data[i:i+1]
  predicted_sentences = decode_sentence(input_seq)
  print('------------')
  print('Input sentence: ', encoder_text[i])
  print('Predicted sentences: ', predicted_sentences)

------------
Input sentence:  He saw a dog near the door.
Predicted sentences:  में में में                
