# Model sec2sec z użyciem wyrazów

## Stałe

In [1]:
batch_size = 64  
epochs = 50  
latent_dim = 256  
embed_dim = 32
num_samples = 20000
data_path = 'corpora/pol-eng-short.txt'
clean_data_path = 'corpora/clean-pol-eng-short.txt'

## Czyszczenie tekstu

In [2]:
import string

with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read()
    table = str.maketrans('', '', string.punctuation)
    clean_lines = lines.lower().translate(table)

with open(clean_data_path, 'w', encoding='utf-8') as f:
    f.write(clean_lines)

## Wektoryzacja

In [3]:
import sys

input_texts = []
target_texts = []
input_words = set()
target_words = set()

with open(clean_data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

for line in lines[: num_samples]:
    try:
        input_text, target_text = line.split('\t')
        input_text = '<s> ' + input_text + ' <e>'
        target_text = '<s> ' + target_text + ' <e>'
        input_texts.append(input_text)
        target_texts.append(target_text)
        for word in input_text.split(' '):
            if word not in input_words:
                input_words.add(word)
        for word in target_text.split(' '):
            if word not in target_words:
                target_words.add(word)
    except:
        print("Unexpected error:", sys.exc_info()[0])    
        
input_words = sorted(list(input_words))
target_words = sorted(list(target_words))
num_encoder_tokens = len(input_words)
num_decoder_tokens = len(target_words)
max_encoder_seq_length = max([len(txt.split(' ')) for txt in input_texts])
max_decoder_seq_length = max([len(txt.split(' ')) for txt in target_texts])

## Generacja danych

In [4]:
import numpy as np

# slowniki slow
input_token_index = dict(
    [(word, i) for i, word in enumerate(input_words)])
target_token_index = dict(
    [(word, i) for i, word in enumerate(target_words)])

# wypelnienie wejsc zerami
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(target_texts), max_decoder_seq_length),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(target_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for text_index, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for word_index, (word) in enumerate(input_text.split(' ')):
        encoder_input_data[text_index, word_index] = input_token_index[word]
    for word_index, (word) in enumerate(target_text.split(' ')):
        decoder_input_data[text_index, word_index] = target_token_index[word]
        if word_index > 0:
            decoder_target_data[text_index, word_index, target_token_index[word]] = 1


## Model treningowy

In [15]:
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense

# max_encoder_seq_length == 9
# max_decoder_seq_length == 11
# batch_size = 64  
# epochs = 20  
# latent_dim = 256  
# embed_dim = 32
encoder_inputs = Input(shape=(max_encoder_seq_length,), name="encoder_input")
encoder = Embedding(num_encoder_tokens, embed_dim, input_length = max_encoder_seq_length, name="encoder_embedding")(encoder_inputs)
_, state_h, state_c = LSTM(latent_dim, return_state=True, name="encoder_lstm")(encoder)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(max_decoder_seq_length,), name ="decoder_input")
decoder_embedding = Embedding(output_dim = embed_dim, input_dim = num_decoder_tokens, input_length = max_decoder_seq_length, name = "decoder_embedding")(decoder_inputs)
decoder = LSTM(latent_dim, return_state=True, return_sequences=True, name="decoder_lstm")(decoder_embedding, initial_state=encoder_states)
decoder_outputs = Dense(num_decoder_tokens, activation='softmax', name="decoder_output")(decoder)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model(inputs = [encoder_inputs, decoder_inputs], outputs = decoder_outputs)

# Compile & run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!

model.summary()
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)

model.save('s2s-word.h5')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 10)           0                                            
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, 15)           0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, 10, 32)       156512      encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_embedding (Embedding)   (None, 15, 32)       345760      decoder_input[0][0]              
__________________________________________________________________________________________________
encoder_ls

KeyboardInterrupt: 

## Model produkcyjny

In [15]:
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense

model = load_model('s2s-word.h5')

for layer in model.layers:
    print(layer.get_config())
    print('\n')
    

encoder_inputs = Input(shape=(max_encoder_seq_length,))
decoder_inputs = Input(shape=(max_decoder_seq_length,))
encoder_embedding = model.get_layer(name="encoder_embedding")
encoder = model.get_layer(name="encoder_lstm")
decoder_embedding = model.get_layer(name="decoder_embedding")
decoder_lstm = model.get_layer(name="decoder_lstm")
decoder_dense = model.get_layer(name="decoder_output")

#inference encoder
encoder_embedding = encoder_embedding(encoder_inputs)
_, state_h, state_c = encoder(encoder_embedding)
encoder_states = [state_h, state_c]
encoder_model = Model(encoder_inputs, encoder_states)
#init
#inference decoder
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_embedding = decoder_embedding(decoder_inputs)
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.

reverse_input_word_index = dict(
    (i, word) for word, i in input_token_index.items())
reverse_target_word_index = dict(
    (i, word) for word, i in target_token_index.items())

def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, max_decoder_seq_length))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = input_token_index["<s>"]

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    index = 1
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_word_index[sampled_token_index]
        decoded_sentence += sampled_word

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_word == '<e>' or len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq[0, index] = sampled_token_index

        # Update states
        states_value = [h, c]
        index += 1

    return decoded_sentence


for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)
    
    #1 nauczyc na niemicekim - moze polski za trudny
    #2 nie ladowac modelu tylko uzyc gotowego - moze blad w ladowaniu
    #3 zadzwonic po speca

{'batch_input_shape': (None, 10), 'dtype': 'float32', 'sparse': False, 'name': 'encoder_input'}


{'batch_input_shape': (None, 15), 'dtype': 'float32', 'sparse': False, 'name': 'decoder_input'}


{'name': 'encoder_embedding', 'trainable': True, 'batch_input_shape': (None, 10), 'dtype': 'float32', 'input_dim': 4891, 'output_dim': 32, 'embeddings_initializer': {'class_name': 'RandomUniform', 'config': {'minval': -0.05, 'maxval': 0.05, 'seed': None}}, 'embeddings_regularizer': None, 'activity_regularizer': None, 'embeddings_constraint': None, 'mask_zero': False, 'input_length': 10}


{'name': 'decoder_embedding', 'trainable': True, 'batch_input_shape': (None, 15), 'dtype': 'float32', 'input_dim': 10805, 'output_dim': 32, 'embeddings_initializer': {'class_name': 'RandomUniform', 'config': {'minval': -0.05, 'maxval': 0.05, 'seed': None}}, 'embeddings_regularizer': None, 'activity_regularizer': None, 'embeddings_constraint': None, 'mask_zero': False, 'input_length': 15}


{'name': 'encoder_l

IndexError: index 15 is out of bounds for axis 1 with size 15