In [2]:
import requests
from zipfile import ZipFile
from io import BytesIO, StringIO
import os

In [3]:
def download_and_extract(url, path):
    request = requests.get(url)
    with ZipFile(BytesIO(request.content), "r") as file:
         file.extractall(path)

urls = [
    "https://chitanka.info/book/25-andersenovi-prikazki.txt.zip",
    "https://chitanka.info/book/3103-601-izpitani-gotvarski-retsepti.txt.zip",
    "https://chitanka.info/book/6393-bogat-tatko-beden-tatko.txt.zip",
    "https://chitanka.info/book/524-az-i-moreto.txt.zip",
    "https://chitanka.info/book/6548-baba-djado-i-vnuche.txt.zip",
    "https://chitanka.info/book/6938-12-printsipa-na-proizvoditelnostta.txt.zip",
    "https://chitanka.info/book/1581-igrata-na-lisitsite.txt.zip",
    "https://chitanka.info/book/8106-usmivka-v-polunosht.txt.zip",
]
            
for url in urls:
    download_and_extract(url,"./training/")

In [4]:
max_line_len = 50

In [5]:
def read_corpus(path, files=os.listdir("./training/")):
    lines = []

    for file in files:
        with open("./training/{}".format(file), encoding="utf-8-sig") as file_content:
            for line in file_content:
                line = line.strip()
                if(len(line) > max_line_len):
                    tokens = line.split(" ")
                    new_line = ""
                    for token in tokens:
                        if(len(new_line) + len(token) < max_line_len):
                            new_line +=token + " "
                        else:
                            lines.append(new_line)
                            new_line = ""
                else:
                    if len(line) > 3:
                        lines.append(line)
    return lines

In [6]:
lines = read_corpus("./training/")

In [7]:
len(lines)

52963

In [8]:
lines[-3:]

['Пенка Чолчева. 601 изпитани готварски рецепти',
 'Съставител: Ани Чолчева',
 'Издадена от списание „Стил“, София, 1991.']

In [9]:
input_text = [ line.replace(" ", "") for line in lines]

In [10]:
START_CHAR = "\t"
END_CHAR = "\n"

In [11]:
target_text = [ START_CHAR + line + END_CHAR for line in lines]

In [12]:
input_text[:3]

['Баба,дядоивнуче', 'Народниприказки', 'НиколайХайтов']

In [13]:
input_ch = set()
for line in input_text:
    for c in line:
        input_ch.add(c)

In [14]:
input_ch = list(sorted(input_ch))
input_ch_len = len(input_ch)
input_ch_len

167

In [15]:
target_ch = set()
for line in target_text:
    for c in line:
        target_ch.add(c)

In [16]:
target_ch_len = len(target_ch)
target_ch_len

170

In [17]:
input_ch_idx = dict([(char,i) for i, char in enumerate(input_ch)])
target_ch_idx = dict([(char,i) for i, char in enumerate(target_ch)])

In [18]:
max_input_len = max([len(line) for line in input_text])
max_target_len = max([len(line) for line in target_text])

print(max_input_len)
print(max_target_len)

46
52


In [19]:
import numpy as np

In [20]:
encoder_input_data = np.zeros ( (len(input_text), max_input_len, input_ch_len ),dtype="float64" )
decoder_input_data = np.zeros ( (len(input_text), max_target_len, target_ch_len ),dtype="float64" )
decoder_target_data = np.zeros( (len(input_text), max_target_len, target_ch_len ),dtype="float64" )

In [21]:
encoder_input_data.shape

(52963, 46, 167)

In [22]:
decoder_input_data.shape

(52963, 52, 170)

In [23]:
decoder_target_data.shape

(52963, 52, 170)

In [24]:
for i, (input_name, target_name) in enumerate(zip(input_text, target_text)):
    for t, char in enumerate(input_name):
        encoder_input_data[i, t, input_ch_idx[char]] = 1.
    for t, char in enumerate(target_name):
        decoder_input_data[i, t, target_ch_idx[char]] = 1.
        if t > 0:
            decoder_target_data[i, t - 1, target_ch_idx[char]] = 1.

In [25]:
import keras
from keras.models import Model
from keras.layers import Input,Dense,LSTM, Embedding, Dropout

Using TensorFlow backend.


In [26]:
STATE_SPACE_DIM = 256

In [27]:
encoder_inputs = Input(shape=(None, input_ch_len))
encoder = LSTM(STATE_SPACE_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

In [28]:
decoder_inputs = Input(shape=(None, target_ch_len))
dropout = Dropout(0.2)(decoder_inputs)
decoder_lstm = LSTM(STATE_SPACE_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dropout,
                                     initial_state=encoder_states)
decoder_dense = Dense(target_ch_len, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [29]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='categorical_crossentropy')

In [30]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None, 170)    0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, None, 167)    0                                            
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, None, 170)    0           input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 434176      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LS

In [59]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=256,
          epochs=2,
          validation_split=0.2)

Train on 42370 samples, validate on 10593 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f99a7ea07b8>

In [60]:
model.save("char_based_rnn_more_data_2.h5")

  str(node.arguments) + '. They will not be included '


In [33]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(STATE_SPACE_DIM,))
decoder_state_input_c = Input(shape=(STATE_SPACE_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)


In [34]:
reverse_input_ch_idx = dict(
    (i, char) for char, i in input_ch_idx.items())
reverse_target_ch_idx= dict(
    (i, char) for char, i in target_ch_idx.items())

In [35]:
def decode_sequence(input_seq):
  
    states_value = encoder_model.predict(input_seq)

  
    target_seq = np.zeros((1, 1, len(target_ch)))
   
    target_seq[0, 0, target_ch_idx['$']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_ch_idx[sampled_token_index]
        decoded_sentence += sampled_char

        
        if (sampled_char == END_CHAR or
           len(decoded_sentence) > max_target_len):
            stop_condition = True

       
        target_seq = np.zeros((1, 1, len(target_ch)))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]

    return decoded_sentence

In [None]:
for seq_index in range(100):
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_text[seq_index])
    print('Decoded sentence:', decoded_sentence)

In [40]:
def encode(input_name):
    encoded = np.zeros (( 1 , max_input_len, input_ch_len ),dtype="float64" )
    for t, char in enumerate(input_name):
        encoded[0, t, input_ch_idx[char]] = 1.
    return encoded[0:1]

In [61]:
print(decode_sequence(encode("дасмеживииздрави")))
print(decode_sequence(encode("замногогодини")))

да сгеживи издрави

за много години

