# Sequence to Sequence model for english to tamil transilation using rnn encoder and decoder ( at character level)

## dataset is downloaded from http://www.manythings.org/anki/

In [23]:
from __future__ import print_function

from keras.models import Model,load_model
from keras.layers import Input,LSTM,Dense
from keras.callbacks import ModelCheckpoint
import numpy as np

batch_size=128
epochs=25
latent_dim=256
data_path="/home/santhosh/keras/rnn/rnn_encoder_decoder/data/tam.txt"

## vectorizing the data

In [24]:
input_texts=[]
target_texts=[]
input_characters=set()
target_characters=set()
with open(data_path,'r',encoding='utf-8') as f:
    lines=f.read().split('\n')
for line in lines:
    line=line.split('\t')
    if(len(line)!=2):
        continue
    input_text,target_text=line
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text='\t'+target_text+'\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

In [3]:
input_characters=sorted(list(input_characters))
targer_characters=sorted(list(target_characters))
num_encoder_tokens=len(input_characters)
num_decoder_tokens=len(target_characters)
max_encoder_seq_len=max([len(text) for text in input_texts])
max_decoder_seq_len=max([len(text) for text in target_texts])

print('Number of samples:',len(input_texts))
print('number of unique input token:',num_encoder_tokens)
print('number of unique output token:',num_decoder_tokens)
print('Max Sequence length for inputs:',max_encoder_seq_len)
print('Max Sequence length for outputs:',max_decoder_seq_len)


Number of samples: 180
number of unique input token: 53
number of unique output token: 54
Max Sequence length for inputs: 94
Max Sequence length for outputs: 111


## defining token2index

In [25]:
input_token2index=dict([(char,i) for i,char in enumerate(input_characters)])
target_token2index=dict([(char,i) for i,char in enumerate(target_characters)])

## defing encoder_input,decoder_input and decoder_output

In [26]:
encoder_input_data=np.zeros((len(input_texts),max_encoder_seq_len,num_encoder_tokens),dtype='float32')
decoder_input_data=np.zeros((len(input_texts),max_decoder_seq_len,num_decoder_tokens),dtype='float32')
decoder_target_data=np.zeros((len(input_texts),max_decoder_seq_len,num_decoder_tokens),dtype='float32')

## creating training dataset

In [27]:
target_token2index

{'\t': 15,
 '\n': 34,
 ' ': 40,
 '!': 47,
 '(': 18,
 ')': 33,
 ',': 2,
 '.': 8,
 '0': 6,
 '2': 53,
 '?': 43,
 'C': 4,
 'D': 39,
 'அ': 13,
 'ஆ': 36,
 'இ': 25,
 'உ': 14,
 'ஊ': 52,
 'எ': 31,
 'ஏ': 26,
 'ஒ': 1,
 'ஓ': 9,
 'க': 12,
 'ங': 51,
 'ச': 44,
 'ஜ': 7,
 'ஞ': 30,
 'ட': 23,
 'ண': 49,
 'த': 5,
 'ந': 10,
 'ன': 38,
 'ப': 16,
 'ம': 42,
 'ய': 41,
 'ர': 17,
 'ற': 50,
 'ல': 22,
 'ள': 48,
 'ழ': 45,
 'வ': 20,
 'ஷ': 35,
 'ஸ': 24,
 'ா': 28,
 'ி': 21,
 'ீ': 29,
 'ு': 3,
 'ூ': 46,
 'ெ': 27,
 'ே': 32,
 'ை': 37,
 'ொ': 19,
 'ோ': 11,
 '்': 0}

In [28]:
for i,(input_text,target_text) in enumerate(zip(input_texts,target_texts)):
    for t,char in enumerate(input_text):
        encoder_input_data[i,t,input_token2index[char]]=1
    for t,char in enumerate(target_text):
        decoder_input_data[i,t,target_token2index[char]]=1
        # decoder_target_data is ahead of decoder_input_data by one timestep
        if t>0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i,t-1,target_token2index[char]]=1


## Define an input sequence and process it.

In [29]:


encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]




## Set up the decoder, using encoder_states as initial state.

In [30]:


decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)



## Define the model that will turn
## `encoder_input_data` & `decoder_input_data` into `decoder_target_data`

In [31]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=["accuracy"])

## function to test model

In [32]:
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token2index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token2index.items())


def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token2index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_len):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence


## Run training

In [None]:

iteration=0
"""
# load weights
print('loading the weights')
model=load_model('char_level.h5')

# estimate accuracy on whole dataset using loaded weights
scores = model.evaluate([encoder_input_data, decoder_input_data], decoder_target_data,verbose=0)
print("%s: %.2f%%\n\n" % (model.metrics_names[1], scores[1]*100))
print("Testing Samples\n"+"-"*50)
for i in range(5):
    index=np.random.randint(len(input_texts))
    encoded_input_sequence=encoder_input_data[index: index + 1]
    output_sequence=decode_sequence(encoded_input_sequence)
    print(input_texts[index],output_sequence)
print("-"*50)
"""
iteration_file="/home/santhosh/keras/rnn/rnn_encoder_decoder/iteration_char_level.txt"
try:
    file=open(iteration_file,'r')
    last_line=file.read().split('\n')[-2]
    print('file_data,',last_line)
    iteration=int(last_line.split(':')[1])
    #print(iteration)
    file.close()
    
except:
    print('no file exist')

# checkpoint
filepath="weights_best_char_level.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='acc', verbose=0, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

while True:
    print('Iteration:',iteration+1)
    #training
    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2,callbacks=callbacks_list)
    #prepare sample_data to test 5 samples:
    print("-"*50)
    for i in range(5):
        index=np.random.randint(len(input_texts))
        encoded_input_sequence=encoder_input_data[index: index + 1]
        output_sequence=decode_sequence(encoded_input_sequence)
        print(input_texts[index],output_sequence)
    print("-"*50)
    # Save model
    file=open(iteration_file,'a')
    file.write('iteration:'+str(iteration+1)+'\n')
    file.close()
    iteration+=1
    model.save('char_level.h5')

file_data, iteration:10
Iteration: 11
Train on 144 samples, validate on 36 samples
Epoch 1/25


  '. They will not be included '


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--------------------------------------------------
When does it begin? அவன்           ் ் ்் ்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்
Do you have a lot of pens? அவன்           ் ் ் ்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்
Friendship requires mutual trust. அவன்           ் ் ் ்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்
Be kind to old people. அவன்           ் ் ் ்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்
Nobody speaks to me. அவன்           ் ் ் ்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்

Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--------------------------------------------------
I'm taller than you. அவன் பன் புட் பிட் பிட் பிட் பிட் துட்

She is kind. அவன் பன் புட் பிட் பிட் பிட் பிட் துட்

Don't listen to her. அவன் பன் புட் பிட் பிட் பிட் பிட் துட்

She got married to him. அவன் பன் புட் பிட் பிட் பிட் பிட் துட்

He asked us to help him. அவன் பன் புட் பிட் பிட் பிட் பிட் துட்

--------------------------------------------------
Iteration: 14
Train on 144 samples, validate on 36 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/2

Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--------------------------------------------------
She danced with him. அவள் அவனிட் து விட்து தெருக்கிறேன்

Tom runs very fast. அவள் அவனிட் து விட்து வேட்டிடிட்து

He asked us to help him. அவள் அவனிட் து விட்து வேட்டிடிட்து

Do I have to study? அவள் அவனிட் து விட்து வேட்டிடிட்து

Don't drink and drive. அவள் அவனிட் து விட்து வேட்டிடிட்து

--------------------------------------------------
Iteration: 16
Train on 144 samples, validate on 36 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--------------------------------------------------
Do yo

Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--------------------------------------------------
She glanced through the magazine. அவன் அவனிடம் பிடிடு வேண்டு வெருக்கிறேன்

I told him to come. அவன் அவனிடம் பிடியாதா?

My younger sister got married in her teens. அவன் அவனிடம் பிடிடு வேண்டு வெருக்கிறேன்

Keep to the right. அவன் அவனிடம் பிடியாதா?

Don't lie to me. அவன் அவனிடம் பிடியாதா?

--------------------------------------------------
Iteration: 18
Train on 144 samples, validate on 36 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--------------------------------------------------
Come and help us. அவன் அவனிடம் படிதாதாள்

I'm trying to sleep. அவ

Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--------------------------------------------------
It's the third biggest city of Serbia. அவன் அவனுக்கு நிச்சயம்கிக்கள்

He began to run. அவன் அவனுக்கு நிச்சயம்கிக்கள்

Do you have any gum? அவன் அவனுக்கு நிச்சயம்கிக்கள்

Be kind to old people. அவன் அவனுக்கு நிச்சயம்கிக்கள்

This apple is sweet. அவன் அவனுக்கு நிச்சயம்கிக்கேன்

--------------------------------------------------
Iteration: 20
Train on 144 samples, validate on 36 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--------------------------------------------------
She sat next to me. அவன் அவனுக் நிச்சயம்

Do I have to study? அவன் அவனுக் நிச்சயம்

It may rain. அவன் அவனுக் நிச்சயம்

She went out of the room. அவன

Epoch 25/25
--------------------------------------------------
He has a lot of money. அவள் அவனுக்கு நிரும் செய்திற்குக்கே இருக்கின்

I'm short of money. அவள் அவனுக்கு நிரும் செய்திற்குக்கே இருக்கின்

It's been a long time since I've heard anyone use that word. அவள் அவனுக்கு நிரும் செய்திற்குக்கே இருக்கின்

The price of eggs is going up. அவள் அவனுக்கு நிரும் செய்திற்குக்கே இருக்கின்

Raise your hand. அவள் அவனுக்கு நிரும் செய்திற்குக்கே இருக்கின்

--------------------------------------------------
Iteration: 22
Train on 144 samples, validate on 36 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
