In [1]:

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.layers import Attention, Concatenate

#import keras
import numpy as np

data_path = './dataset/ces.txt'
num_samples=24000
latent_dim = 256
epochs = 50
batch_size = 64

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [2]:
import tensorflow
print(tensorflow.__version__)

2.6.0-dev20210407


In [4]:
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
for line in lines[: min(num_samples, len(lines)-1)]:
    input_text, target_text, _ = line.split('\t')
    
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

In [5]:
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

In [6]:
for i, char in enumerate(target_characters):
    print(char);

	


 
!
"
'
,
-
.
0
1
2
3
4
5
6
7
8
9
:
;
?
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
R
S
T
U
V
W
Y
Z
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
¨
Ú
á
é
í
ó
ú
ý
Č
č
ď
ě
ň
Ř
ř
Š
š
ť
ů
Ž
ž
—
‘
’
‚
“
„
₂


In [7]:
print('Number of sampoles: ', len(input_texts))
print('Number of unique input toklens: ', num_encoder_tokens)
print('Number of unique output toklens: ', num_decoder_tokens)
print('max sequnce len for inputs: ', max_encoder_seq_length)
print('max sequnce len for output: ', max_decoder_seq_length)

Number of sampoles:  24000
Number of unique input toklens:  75
Number of unique output toklens:  100
max sequnce len for inputs:  58
max sequnce len for output:  77


In [8]:
input_token_index = dict(
    [(char, i ) for i, char in enumerate(input_characters)])
target_token_index = dict(
    [(i, char ) for i, char in enumerate(target_characters)])


target_token_index = dict([(value, key) for key, value in target_token_index.items()])

In [9]:
target_token_index

{'\t': 0,
 '\n': 1,
 ' ': 2,
 '!': 3,
 '"': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '0': 9,
 '1': 10,
 '2': 11,
 '3': 12,
 '4': 13,
 '5': 14,
 '6': 15,
 '7': 16,
 '8': 17,
 '9': 18,
 ':': 19,
 ';': 20,
 '?': 21,
 'A': 22,
 'B': 23,
 'C': 24,
 'D': 25,
 'E': 26,
 'F': 27,
 'G': 28,
 'H': 29,
 'I': 30,
 'J': 31,
 'K': 32,
 'L': 33,
 'M': 34,
 'N': 35,
 'O': 36,
 'P': 37,
 'R': 38,
 'S': 39,
 'T': 40,
 'U': 41,
 'V': 42,
 'W': 43,
 'Y': 44,
 'Z': 45,
 'a': 46,
 'b': 47,
 'c': 48,
 'd': 49,
 'e': 50,
 'f': 51,
 'g': 52,
 'h': 53,
 'i': 54,
 'j': 55,
 'k': 56,
 'l': 57,
 'm': 58,
 'n': 59,
 'o': 60,
 'p': 61,
 'q': 62,
 'r': 63,
 's': 64,
 't': 65,
 'u': 66,
 'v': 67,
 'w': 68,
 'x': 69,
 'y': 70,
 'z': 71,
 '¨': 72,
 'Ú': 73,
 'á': 74,
 'é': 75,
 'í': 76,
 'ó': 77,
 'ú': 78,
 'ý': 79,
 'Č': 80,
 'č': 81,
 'ď': 82,
 'ě': 83,
 'ň': 84,
 'Ř': 85,
 'ř': 86,
 'Š': 87,
 'š': 88,
 'ť': 89,
 'ů': 90,
 'Ž': 91,
 'ž': 92,
 '—': 93,
 '‘': 94,
 '’': 95,
 '‚': 96,
 '“': 97,
 '„': 98,
 '₂': 99}

In [10]:
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype = 'float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype = 'float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype = 'float32')


In [11]:
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    encoder_input_data[i, t+1:, input_token_index[' ']] = 1.
    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t>0:
            
            decoder_target_data[i, t-1, target_token_index[char]] =1.
    decoder_input_data[i, t+1:, target_token_index[' ']] =1.
    decoder_target_data[i, t:, target_token_index[' ']] =1.
        

In [13]:
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

encoder_states = [state_h, state_c]

In [14]:
decoder_inputs = Input(shape=(None, num_decoder_tokens))

decoder_lstm = LSTM(latent_dim, return_sequences = True, return_state = True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state = encoder_states)
#attention
#attn_layer = Attention('softmax', name='attention_layer')
#attn_out, attn_states = attn_layer([encoder_inputs, decoder_inputs])
#concat
#decoder_concat_input = Concatenate(axis=-1, name = "concat_layer")([decoder_outputs,attn_out])

decoder_dense = Dense(num_decoder_tokens, activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)



In [15]:
decoder_target_data[0][3]


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [15]:
import tensorflow as tf
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
opt = tf.keras.optimizers.RMSprop(learning_rate=0.0001)
model.compile(optimizer='rmsprop', loss = 'categorical_crossentropy', metrics = ['accuracy'])

model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
         batch_size = batch_size,
         epochs = epochs,
         validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x133c83ee7c0>

In [16]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())


def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [17]:
for seq_index in range(10):
    
    input_seq = encoder_input_data[seq_index: seq_index +1]
    decoded_sequence = decode_sequence(input_seq)
    print('-')
    print("Input sentence: ", input_texts[seq_index])
    print("Decoded senetence: ", decoded_sequence)

-
Input sentence:  Hi.
Decoded senetence:  Je strátila.

-
Input sentence:  Run!
Decoded senetence:  Přestaňte má.

-
Input sentence:  Run!
Decoded senetence:  Přestaňte má.

-
Input sentence:  Who?
Decoded senetence:  Kdo je tvůj otec?

-
Input sentence:  Wow!
Decoded senetence:  Počerejte se.

-
Input sentence:  Fire!
Decoded senetence:  Pojď se.

-
Input sentence:  Fire!
Decoded senetence:  Pojď se.

-
Input sentence:  Hello!
Decoded senetence:  Vyhnu.

-
Input sentence:  Hurry!
Decoded senetence:  Pojďme správnout.

-
Input sentence:  Hurry!
Decoded senetence:  Pojďme správnout.



In [None]:
reverse_target_char_index