Source - https://nextjournal.com/gkoehler/machine-translation-seq2seq-cpu

Data Source - http://www.manythings.org/anki/

# Data Read

In [28]:
# Load the whole corpus file into memory and split it into one entry per line.
with open('Data/deu-eng/deu.txt', mode='r', encoding='utf-8') as corpus_file:
    lines = corpus_file.read().split('\n')

In [29]:
lines[155]

'Go away!\tSchwirr ab!\tCC-BY 2.0 (France) Attribution: tatoeba.org #433518 (CK) & #2494158 (Pfirsichbaeumchen)'

In [30]:
len(lines)

227081

In [31]:
# Parallel lists of source / target sentences, plus the set of distinct
# characters seen on each side.
input_texts, target_texts = [], []
input_characters, target_characters = set(), set()

# Generate Input and Targets

Creates a set of unique characters in the corpus, separately for the input and the target texts.

In [32]:
num_samples = 10000

In [38]:
'''TEST HOW THIS CODE WORKS'''

# Dry run: the `break` at the end stops the loop after the first corpus line,
# so only one sentence pair is parsed and printed.
for line in lines[:min(num_samples, len(lines) - 1)]:
    input_text, target_text, extra = line.split('\t')
    print(input_text, target_text, extra)
    # Wrap the target between a start marker ('\t') and an end marker ('\n').
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)

    # Collect the distinct characters of each side; a set ignores repeats.
    input_characters.update(input_text)
    target_characters.update(target_text)
    break

Go. Geh. CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8597805 (Roujin)


In [34]:
input_texts

['Go.']

In [35]:
target_texts

['\tGeh.\n']

In [36]:
input_characters

{'.', 'G', 'o'}

In [37]:
target_characters

{'\t', '\n', '.', 'G', 'e', 'h'}

We don't need the input and target characters collected by this test run; they are reset and rebuilt from scratch in the next cell.

In [39]:
# ACTUAL EXECUTION
# Start from empty containers so that nothing appended by the earlier test
# cell leaks into the training data.

input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

num_samples = 10000

for line in lines[:min(num_samples, len(lines) - 1)]:
    # A corpus line is tab-separated: source text, target text and an optional
    # attribution field. Only the first two fields are used; a line with fewer
    # than two fields (e.g. a blank line) is skipped instead of aborting the
    # whole cell with a ValueError on unpacking.
    fields = line.split('\t')
    if len(fields) < 2:
        continue
    input_text, target_text = fields[:2]
    # '\t' marks the start of a target sequence and '\n' marks its end.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update() adds every character of the string; repeats are ignored.
    input_characters.update(input_text)
    target_characters.update(target_text)

In [42]:
print(input_characters)

{'Y', 'H', 's', 'J', 'Q', '7', 'w', 'o', '4', '1', 'n', 'C', 'T', '-', 'a', 'd', 'c', 'M', 'U', 'E', 'm', ' ', 'S', 'P', 'F', 'e', 'L', 'j', '!', '.', 'z', 'D', '3', 'u', 'v', "'", 'x', '9', 'K', 'N', ':', 'p', 'R', 'A', 'G', 'h', '5', '%', '"', 'y', 'O', '2', 'I', '6', ',', 'g', 'b', '0', '?', 'V', 'W', '8', 't', 'l', 'k', 'f', 'i', 'q', '$', 'B', 'r'}


In [45]:
len(input_characters)

71

In [43]:
print(target_characters)

{'\xa0', 'J', '„', 'o', 'U', '\u202f', ' ', 'S', '3', 'L', 'D', 'u', 'N', '9', "'", 'p', 'A', 'h', '5', 'g', '0', 'V', 'W', '8', 'Ü', 'Q', 'w', '1', 'a', 'M', 'c', 'ü', 'P', '\n', 'F', 'O', 'y', ',', 'Ä', 'q', 'H', 'ö', '4', 'n', 'T', '-', 'j', 'e', 'z', 'x', ':', '2', 'I', 'b', '?', 't', 'k', 'Y', 'Ö', 's', '“', '7', '\t', 'C', 'd', 'E', 'm', '!', '.', 'v', 'Z', 'K', 'R', 'G', 'ß', '%', '’', 'ä', '6', 'l', 'f', '$', 'i', 'B', 'r'}


In [46]:
len(target_characters)

85

# Get Configs for this Corpus

Here, we obtain

1. number of input texts (should be equal to number of samples)

2. Total number of input characters for encoder model

3. Total number of output characters for decoder model

4. Maximum sequence length for input and output, used for the binarised one-hot encoding of the texts

In [55]:
# Freeze both vocabularies into sorted lists so every character gets a stable
# position.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

# Longest text on each side, counted in characters (spaces included).
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

print('Number of samples:', len(input_texts))  # we set this earlier
print('Number of unique input tokens (characters):', num_encoder_tokens)
print('Number of unique output tokens (characters):', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 10000
Number of unique input tokens (characters): 71
Number of unique output tokens (characters): 85
Max sequence length for inputs: 15
Max sequence length for outputs: 45


In [56]:
# # TEST - Length includes spaces
# for txt in input_texts[15:17]:
#     print(txt)
#     print(len(txt))

Go on.
6
Hello!
6


In [57]:
# Map every character to its position in the sorted vocabulary. That position
# is the index of the single 1 in the character's one-hot vector (this is a
# plain lookup table, not a learned embedding).

input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

In [58]:
import numpy as np

# All three tensors are indexed as (sample, timestep, token) and start out
# all-zero, so timesteps past the end of a text remain zero padding.
num_pairs = len(input_texts)
encoder_shape = (num_pairs, max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (num_pairs, max_decoder_seq_length, num_decoder_tokens)

encoder_input_data = np.zeros(encoder_shape, dtype='float32')
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

In [59]:
# Fill the one-hot tensors, one sentence pair per row.
for pair_idx, (source, target) in enumerate(zip(input_texts, target_texts)):
    for step, char in enumerate(source):
        encoder_input_data[pair_idx, step, input_token_index[char]] = 1.

    for step, char in enumerate(target):
        token = target_token_index[char]
        # The decoder input holds the full target, start character included.
        decoder_input_data[pair_idx, step, token] = 1.
        if step == 0:
            continue
        # The decoder target is the same sequence shifted one timestep
        # earlier, so it never contains the start character.
        decoder_target_data[pair_idx, step - 1, token] = 1.

## Example - Input Characters

In [66]:
input_texts[1]

'Hi.'

In [67]:
print(input_characters)

[' ', '!', '"', '$', '%', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [75]:
index_h = input_characters.index('H')
index_h

28

In [72]:
encoder_input_data[1][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.], dtype=float32)

In [76]:
encoder_input_data[1][0][index_h]

1.0

In [138]:
encoder_input_data[1].shape

(15, 71)

## Example - Output Characters

In [77]:
target_texts[1]

'\tHallo!\n'

In [118]:
# Show where each character of the example target sits in the sorted
# target vocabulary.
target_vocab = np.array(target_characters)
for each_char in target_texts[1]:
    print("{}: {}".format(each_char, np.where(target_vocab == each_char)))

	: (array([0], dtype=int64),)
H: (array([29], dtype=int64),)
a: (array([47], dtype=int64),)
l: (array([58], dtype=int64),)
l: (array([58], dtype=int64),)
o: (array([61], dtype=int64),)
!: (array([3], dtype=int64),)

: (array([1], dtype=int64),)


In [80]:
print(target_characters)

['\t', '\n', ' ', '!', '$', '%', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', 'Ä', 'Ö', 'Ü', 'ß', 'ä', 'ö', 'ü', '’', '“', '„', '\u202f']


In [133]:
# Print the hot index of every timestep of sample 1 that encodes a character.
for array_ in decoder_input_data[1]:
    hot_positions = np.flatnonzero(array_ == 1)
    # Padding timesteps are all-zero and have no hot index. Test for that
    # explicitly rather than catching the IndexError an empty lookup raises,
    # so unrelated errors are no longer silently swallowed.
    if hot_positions.size:
        print(hot_positions[0])

print("Number of arrays: ", len(decoder_input_data[1])) # Should be equal to max_decoder_seq_length

0
29
47
58
58
61
3
1
Number of arrays:  45


In [135]:
# Expected the array to start from 29, since it is ahead by one step

for array_ in decoder_target_data[1]:
    hot_positions = np.flatnonzero(array_ == 1)
    # Padding timesteps are all-zero and have no hot index. Test for that
    # explicitly rather than catching the IndexError an empty lookup raises,
    # so unrelated errors are no longer silently swallowed.
    if hot_positions.size:
        print(hot_positions[0])

print("Number of arrays: ", len(decoder_target_data[1])) # Should be equal to max_decoder_seq_length

29
47
58
58
61
3
1
Number of arrays:  45


# Building the Model

In [143]:
import keras, tensorflow
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

In [144]:
# Training hyperparameters, gathered in one place for easy tuning.
batch_size = 64    # batch size for training
epochs = 100       # number of epochs to train for
lstm_units = 256   # latent dimensionality of the encoding space

In [146]:
# Encoder: consumes the one-hot encoded input texts.
encoder_inputs = Input(shape=(None, num_encoder_tokens))  # input texts
encoder = LSTM(lstm_units, return_state=True)
# With return_state=True the layer also hands back its hidden and cell states.
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# Only the two state vectors are passed on to the decoder; encoder_outputs is
# not used further.
encoder_states = [state_h, state_c]

In [147]:
# Decoder: consumes the one-hot encoded target texts, starting from the
# encoder's final states.
decoder_inputs = Input(shape=(None, num_decoder_tokens))  # the target texts
decoder_lstm = LSTM(lstm_units, return_state=True, return_sequences=True)
# The states returned here are ignored during training; the layer object itself
# is reused later to build the inference model.
decoder_sequence, _, _ = decoder_lstm(decoder_inputs,
                                      initial_state=encoder_states)
# Softmax over the target vocabulary at every timestep.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_sequence)

In [148]:
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

In [149]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, None, 71)]   0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, None, 85)]   0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 335872      input_4[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  350208      input_5[0][0]                    
                                                                 lstm_1[0][1]                 

In [150]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

OSError: Unable to create file (unable to open file: name = '/results/seq2seq_eng-ger.h5', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 302)

In [151]:
import os

# NOTE(review): this is an absolute path at the filesystem root, unlike the
# relative 'Data/...' path the corpus is read from -- confirm it is intended.
model_path = '/Models/seq2seq_eng-ger.h5'
# Saving fails with "No such file or directory" when the target folder is
# missing, so make sure it exists first.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

# Inference

## Load from Saved Model

In [152]:
## LOAD FROM SAVED MODEL

# model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# model.load_weights()

In [153]:
encoder_model = Model(encoder_inputs, encoder_states)

In [154]:
decoder_state_input_h = Input(shape=(lstm_units, ))
decoder_state_input_c = Input(shape=(lstm_units, ))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [155]:
# Inference decoder: reuses the trained LSTM and Dense layers, but takes its
# initial states from explicit inputs and also returns the updated states so
# they can be fed back in on the next step.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_outputs = decoder_dense(decoder_outputs)
decoder_states = [state_h, state_c]

decoder_model = Model(inputs=[decoder_inputs, *decoder_states_inputs],
                      outputs=[decoder_outputs, *decoder_states])

In [156]:
# Invert the token lookups (index -> character) so that predicted indices can
# be turned back into characters.

reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}

In [161]:
input_sentence = "How are you?"
# A batch of one sentence, zero-padded to the longest training input.
test_sentence_tokenized = np.zeros(
    shape=(1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
print(test_sentence_tokenized.shape)

# Switch on exactly one entry per character position.
for position, char in enumerate(input_sentence):
    token = input_token_index[char]
    test_sentence_tokenized[0, position, token] = 1.
print(test_sentence_tokenized) # the binarised one-hot encoded form

(1, 15, 71)
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]


In [164]:
states_value = encoder_model.predict(test_sentence_tokenized) # getting the states values

In [185]:
# states_value is a list with two elements

states_value[0].shape # recall lstm layer units

(1, 256)

In [184]:
# generate empty target sequence
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq

array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.]]])

In [186]:
# Add 1 to indicate [START] or \t
target_seq[0, 0, target_token_index['\t']] = 1.
target_seq

array([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.]]])

In [189]:
target_seq.shape

(1, 1, 85)

In [187]:
output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
output_tokens

array([[[3.76637938e-04, 9.58756063e-05, 3.31804156e-04, 2.97309336e-04,
         1.14638271e-04, 9.66965308e-05, 9.81786361e-05, 1.78010625e-04,
         6.99794618e-05, 5.93443692e-05, 1.65433070e-04, 3.67576249e-05,
         5.57057247e-05, 2.16987301e-05, 3.71472452e-05, 1.93148371e-04,
         1.62555123e-04, 1.08228276e-04, 4.69805527e-05, 2.52005702e-04,
         3.42866027e-04, 1.91174586e-05, 1.54850539e-03, 1.13971392e-03,
         9.27087895e-05, 8.56718887e-03, 2.10939441e-03, 4.94942418e-04,
         3.12250981e-04, 1.15063193e-03, 1.58461800e-03, 6.51997048e-04,
         8.86965368e-04, 9.14979319e-04, 8.97095480e-04, 2.63134460e-03,
         2.64438451e-04, 1.70949686e-04, 1.40627686e-04, 1.47998182e-03,
         5.60146524e-03, 2.53967335e-03, 1.41218153e-03, 1.70434956e-04,
         9.53538597e-01, 8.70950025e-05, 4.61355579e-04, 7.01689860e-05,
         4.17333613e-05, 9.90965418e-05, 1.19104938e-04, 4.44617384e-04,
         7.30814718e-05, 1.07304550e-04, 1.89090191

In [188]:
output_tokens.shape

(1, 1, 85)

In [198]:
output_tokens[0, -1, :] == output_tokens[0][0]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [199]:
sampled_token_index = np.argmax(output_tokens[0, -1, :])

In [200]:
sampled_token_index

44

In [204]:
# # To explain argmax
# np.where(output_tokens[0][0] == np.max(output_tokens[0, -1, :]))

In [205]:
sampled_char = reverse_target_char_index[sampled_token_index] # Get the character from the dictionary
sampled_char

'W'

In [207]:
decoded_sentence = ''
decoded_sentence += sampled_char # add the character
# This will be in a while loop later to keep adding until the [END] is reached

In [209]:
# STOP CONDITION - if the character is \n or the length of the sentence is more than the max target sequence length we used for training
if (sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length):
    stop_condition = True

In [210]:
# update the target sequence (length 1).
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, sampled_token_index] = 1.

In [211]:
target_seq

array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.]]])

In [212]:
states_value = [h, c]

In [213]:
output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    
# sample a token and add the corresponding character to the 
# decoded sequence
sampled_token_index = np.argmax(output_tokens[0, -1, :])
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence += sampled_char
decoded_sentence

'Wi'

In [214]:
''' Putting all of that together '''

def decode_sequence(input_seq):
    """Greedily decode one encoded input sentence into target text.

    The encoder turns ``input_seq`` into its state vectors. The decoder is then
    run one character at a time, always feeding back the most probable
    character together with the updated states.

    Args:
        input_seq: one-hot encoded input passed straight to
            ``encoder_model.predict``; the callers in this notebook pass a
            single sentence of shape (1, timesteps, num_encoder_tokens).

    Returns:
        The decoded characters joined into a string. The string ends with
        '\\n' when the decoder predicted the stop character; otherwise decoding
        is cut off once more than ``max_decoder_seq_length`` characters have
        been produced.
    """
    # The encoder's state vectors seed the decoder.
    states_value = encoder_model.predict(input_seq)

    # First decoder input: a single timestep holding only the start character.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1

    decoded_chars = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] +
                                                    states_value)

        # Greedy choice: the most probable token of the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_chars.append(sampled_char)

        # Leave the loop on the stop character or once the length limit is
        # exceeded.
        if sampled_char == '\n' or len(decoded_chars) > max_decoder_seq_length:
            break

        # The sampled character becomes the next (length 1) decoder input and
        # the returned states are carried over.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return ''.join(decoded_chars)

In [215]:
def one_hot_encode_sentence(sentence):
    """One-hot encode a single input sentence for the encoder.

    Args:
        sentence: text made of characters present in ``input_token_index``.

    Returns:
        A float32 array of shape (1, timesteps, num_encoder_tokens), where
        ``timesteps`` is ``max_encoder_seq_length`` or the sentence length,
        whichever is larger. Unused trailing timesteps stay all-zero.

    Raises:
        ValueError: if the sentence contains a character that is missing from
            ``input_token_index``.
    """
    unknown_chars = sorted(set(sentence) - set(input_token_index))
    if unknown_chars:
        raise ValueError(
            'Characters not in the input vocabulary: {!r}'.format(unknown_chars))
    # Grow the buffer for sentences longer than the padded training length
    # instead of failing with an IndexError while filling it.
    timesteps = max(max_encoder_seq_length, len(sentence))
    encoded = np.zeros((1, timesteps, num_encoder_tokens), dtype='float32')
    for t, char in enumerate(sentence):
        encoded[0, t, input_token_index[char]] = 1.
    return encoded


input_sentence = "How are you?"
test_sentence_tokenized = one_hot_encode_sentence(input_sentence)
print(input_sentence)
print(decode_sequence(test_sentence_tokenized))

How are you?
Wie geht es auf.



In [222]:
# Decode a few training sentences and compare them with their targets.
for seq_index in range(6, 10):
    # Slice (rather than index) to keep the leading batch dimension.
    input_seq = encoder_input_data[seq_index:seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    # decode_sequence keeps the '\n' stop character at the end of its result;
    # drop it so no stray blank line is printed, matching the stripped target.
    print('Decoded sentence:', decoded_sentence.rstrip('\n'))
    print('Target sentence:', target_texts[seq_index].strip())

-
Input sentence: Wow!
Decoded sentence: Wonnerten?

Target sentence: Donnerwetter!
-
Input sentence: Fire!
Decoded sentence: Gehen.

Target sentence: Feuer!
-
Input sentence: Help!
Decoded sentence: Zie in der auf.

Target sentence: Hilfe!
-
Input sentence: Help!
Decoded sentence: Zie in der auf.

Target sentence: Zu Hülf!


# Model Validation

In [224]:
# Collect a few sentence pairs from beyond the slice that was used for training.
val_input_texts = []
val_target_texts = []
line_ix = 12000  # starting after the first 10000 lines
for line in lines[line_ix:line_ix + 10]:  # getting 10 lines for validation
    # Only the first two tab-separated fields (input, target) are needed.
    # A line with fewer fields (e.g. a blank line) is skipped instead of
    # aborting the cell with a ValueError on unpacking.
    fields = line.split('\t')
    if len(fields) < 2:
        continue
    input_text, target_text = fields[:2]
    val_input_texts.append(input_text)
    val_target_texts.append(target_text)

# Prep the encoder: one-hot tensor padded to the longest validation sentence.
# `default=0` keeps max() from raising when no line was collected.
val_max_len = max((len(txt) for txt in val_input_texts), default=0)
val_encoder_input_data = np.zeros(
    (len(val_input_texts), val_max_len, num_encoder_tokens),
    dtype='float32')

for i, input_text in enumerate(val_input_texts):
    for t, char in enumerate(input_text):
        # NOTE(review): a character that is not a key of input_token_index
        # raises KeyError here.
        val_encoder_input_data[i, t, input_token_index[char]] = 1.

In [225]:
val_input_texts

['I saw him again.',
 'I saw him first.',
 'I saw it coming.',
 'I saw it coming.',
 'I saw it coming.',
 'I saw one today.',
 'I saw something.',
 'I saw that, too.',
 'I saw the movie.',
 'I saw you there.']

In [226]:
# Decode every validation sentence and show it next to its reference text.
for seq_index, (source_text, reference_text) in enumerate(
        zip(val_input_texts, val_target_texts)):
    # Slice (rather than index) to keep the leading batch dimension.
    input_seq = val_encoder_input_data[seq_index:seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', source_text)
    # Remove only the '\n' stop character. Cutting off the last character
    # unconditionally would drop a real character whenever decoding ended at
    # the length limit instead of at '\n'.
    print('Decoded sentence:', decoded_sentence.rstrip('\n'))
    print('Ground Truth sentence:', reference_text)

-
Input sentence: I saw him again.
Decoded sentence: Ich habe einen Siegenerten.
Ground Truth sentence: Ich habe ihn wieder gesehen.
-
Input sentence: I saw him first.
Decoded sentence: Ich habe eine Schleren.
Ground Truth sentence: Ich habe ihn zuerst gesehen.
-
Input sentence: I saw it coming.
Decoded sentence: Ich habe einen Siegerehen.
Ground Truth sentence: Ich habe es kommen sehen.
-
Input sentence: I saw it coming.
Decoded sentence: Ich habe einen Siegerehen.
Ground Truth sentence: Ich habe es geahnt.
-
Input sentence: I saw it coming.
Decoded sentence: Ich habe einen Siegerehen.
Ground Truth sentence: Ich ahnte es.
-
Input sentence: I saw one today.
Decoded sentence: Ich habe eine Sie gefunden.
Ground Truth sentence: Ich habe heute einen gesehen.
-
Input sentence: I saw something.
Decoded sentence: Ich habe einen Schlesen gehen.
Ground Truth sentence: Ich habe etwas gesehen.
-
Input sentence: I saw that, too.
Decoded sentence: Ich habe eine Sie gesehen.
Ground Truth sentence: D