# Word to Word conversion
https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html

In [1]:
from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None) 

Using TensorFlow backend.


1. What do the variables below mean? How do they effect the model?

In [0]:
batch_size = 64  # Batch size for training.
epochs = 2  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.
# Path to the data txt file on disk.
data_path = 'hin.txt'

## Vectorize data
Can you use a tokenizer?

In [3]:
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
for line in lines[: min(num_samples, len(lines) - 1)]:
    input_text, target_text = line.split('\t')
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)

print(input_texts)
print(target_texts)

['Wow!', 'Help!', 'Jump.', 'Jump.', 'Jump.', 'Hello!', 'Hello!', 'Cheers!', 'Cheers!', 'Got it?', "I'm OK.", 'Awesome!', 'Come in.', 'Get out!', 'Go away!', 'Goodbye!', 'Perfect!', 'Perfect!', 'Welcome.', 'Welcome.', 'Have fun.', 'Have fun.', 'Have fun.', 'I forgot.', 'I forgot.', "I'll pay.", "I'm fine.", "I'm full.", "Let's go!", 'Answer me.', 'Birds fly.', 'Excuse me.', 'Fantastic!', 'I fainted.', 'I fear so.', 'I laughed.', "I'm bored.", "I'm broke.", "I'm tired.", "It's cold.", 'Well done!', 'Who knows?', 'Who knows?', 'Who knows?', 'Who knows?', 'Wonderful!', 'Birds sing.', 'Come on in.', 'Definitely!', "Don't move.", 'Fire burns.', 'Follow him.', 'I am tired.', 'I can swim.', 'I can swim.', 'I love you.', 'I love you.', 'I love you.', 'I love you.', 'I love you.', 'I will try.', "I'm coming.", "I'm hungry!", "I'm hungry!", 'Let him in.', 'Let him in.', 'Let me out!', 'Once again.', 'Please sit.', 'That a boy!', "What's new?", "What's new?", "Who's that?", "Don't shout.", "Don't 

In [0]:
t1=Tokenizer(num_words=len(input_texts), filters='"#$%&()*+,-/:;<=>?@[\\]^_`{|}~', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)
t2=Tokenizer(num_words=len(target_texts), filters='"#$%&()*+,-/:;<=>?@[\\]^_`{|}~', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)

In [6]:
# Vectorize the data.


t1.fit_on_texts(input_texts)
#num_encoder_tokens=t.word_counts
input_words=t1.word_index
input_words = sorted(list(input_words))

num_encoder_tokens=len(t1.word_index)+1

t2.fit_on_texts(target_texts)
#print(k2)
target_words=t2.word_index
print(target_words)
print(t2.word_counts)
target_words = sorted(target_words)
print(target_words)
num_decoder_tokens=len(t2.word_index)+1

encoder_input_data=t1.texts_to_matrix(input_texts, mode='count')
decoder_input_data=t2.texts_to_matrix(target_texts, mode='count')

max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])



# Create word to token dictionary for both source and target
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

print(encoder_input_data.shape)
print(decoder_input_data.shape)
#print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

{'है।\n': 1, '\n': 2, 'में': 3, 'नहीं': 4, 'से': 5, '\tमैं': 6, 'के': 7, '\tवह': 8, 'है': 9, 'को': 10, '\tमुझे': 11, 'हैं।\n': 12, 'क्या': 13, 'की': 14, 'हो': 15, 'का': 16, 'बहुत': 17, 'पर': 18, 'हूँ।\n': 19, '\tउसने': 20, 'एक': 21, 'था।\n': 22, 'कर': 23, 'लिए': 24, '\tतुम': 25, 'कि': 26, '\tयह': 27, 'हैं': 28, 'भी': 29, 'और': 30, 'वह': 31, 'अपने': 32, 'रहा': 33, 'मैं': 34, '\tमैंने': 35, 'पास': 36, 'ने': 37, 'तो': 38, '\tमेरे': 39, 'करने': 40, 'ही': 41, 'काम': 42, 'गया।\n': 43, '\tक्या': 44, 'अपनी': 45, 'मुझे': 46, '\tहम': 47, 'था': 48, 'साथ': 49, 'उसे': 50, 'करना': 51, 'कोई': 52, 'थी।\n': 53, 'गया': 54, 'घर': 55, 'सकते': 56, 'समय': 57, 'यह': 58, 'कल': 59, 'बात': 60, 'रही': 61, 'मेरे': 62, 'इस': 63, 'पता': 64, 'बजे': 65, '\tइस': 66, '\tमेरी': 67, 'मेरी': 68, 'तक': 69, 'चाहिए।\n': 70, 'अभी': 71, 'लगता': 72, 'किताब': 73, 'दिया।\n': 74, 'पसंद': 75, 'उसके': 76, 'थे।\n': 77, 'कभी': 78, 'रहे': 79, 'कुछ': 80, 'सकता': 81, 'अच्छा': 82, 'मत': 83, 'हुई': 84, 'आ': 85, 'हूँ': 86, 'किया।\n': 87, 'क

In [0]:
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')



for i, (input_word, target_word) in enumerate(zip(input_text, target_text)):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        if i > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            
            for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[target_word]] = decoder_input_data[i, t, target_token_index[target_word]]

What are the dimensions of the encoder input, decoder input and decoder target? How many features and timesteps?

## Model
Encode-Decoder Model. 
1. How are the layers connected?
2. What data flows between the encoder and decoder? 
3. What data flows into the decoder?

In [36]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

W0721 22:58:52.329359 140628630685568 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0721 22:58:52.368342 140628630685568 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0721 22:58:52.375879 140628630685568 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



In [37]:
# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

W0721 22:58:59.410687 140628630685568 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0721 22:58:59.442128 140628630685568 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 2906)   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 5110)   0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 3238912     input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  5495808     input_2[0][0]                    
                                                                 lstm_1[0][1]                     
          

How to save and reload same model?

In [38]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)
# Save model
model.save('s2s.h5')

NameError: ignored

## Inference Mode
1. How is inference different from training?
2. What is input to encoder?
3. What is input to decoder?

In [0]:
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [0]:
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

Why are we saving h, c from decoder?

In [0]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [0]:
for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)