In [1]:
import requests
from zipfile import ZipFile
from io import BytesIO, StringIO

In [2]:
def download_and_extract(url, path):
    request = requests.get(url)
    with ZipFile(BytesIO(request.content), "r") as file:
         file.extractall(path)

urls = [
    "https://chitanka.info/book/25-andersenovi-prikazki.txt.zip",
    "https://chitanka.info/book/3103-601-izpitani-gotvarski-retsepti.txt.zip",
    "https://chitanka.info/book/6393-bogat-tatko-beden-tatko.txt.zip",
    "https://chitanka.info/book/524-az-i-moreto.txt.zip",
    "https://chitanka.info/book/6548-baba-djado-i-vnuche.txt.zip",
    "https://chitanka.info/book/6938-12-printsipa-na-proizvoditelnostta.txt.zip"
    
]
            
for url in urls:
    download_and_extract(url,"./data/")

In [3]:
max_line_len = 50

In [4]:
files = [
    "data/Hans_Kristian_Andersen_-_Andersenovi_prikazki_-25-b.txt",
    "data/Penka_Cholcheva_-_601_izpitani_gotvarski_retsepti_-3103-b.txt",
    "data/Robyrt_Kijosaki_-_Bogat_tatko_beden_tatko_-_Na_kakvo_bogatite_uchat_detsata_si_za_parite_a_bednite_i_srednata_klasa_-_ne-6393-b.txt",
    "data/Petja_Dubarova_-_Az_i_moreto_-524-b.txt",
    "data/Baba_djado_i_vnuche_-_Narodni_prikazki-6548-b.txt",
    "data/Haringtyn_Emersyn_-_12_printsipa_na_proizvoditelnostta_-6938-b.txt"
]

In [5]:
def read_corpus(files):
    lines = []

    for file in files:
        with open(file, encoding="utf-8-sig") as file_content:
            for line in file_content:
                line = line.strip()
                if(len(line) > max_line_len):
                    tokens = line.split(" ")
                    new_line = ""
                    for token in tokens:
                        if(len(new_line) + len(token) < max_line_len):
                            new_line +=token + " "
                        else:
                            lines.append(new_line)
                            new_line = ""
                else:
                    if len(line) > 3:
                        lines.append(line)
    return lines

In [6]:
lines = read_corpus(files)

In [7]:
len(lines)

32262

In [8]:
lines[-3:]

['Партиздат, София, 1973',
 'Редактор: Борис Въжаров',
 'Коректор: Веселина Цветкова']

In [9]:
input_text = [ line.replace(" ", "") for line in lines]

In [10]:
START_CHAR = "\t"
END_CHAR = "\n"

In [11]:
target_text = [ START_CHAR + line + END_CHAR for line in lines]

In [12]:
input_text[:3]

['ХансКристианАндерсен', 'Андерсеновиприказки', 'Огнивото']

In [13]:
input_ch = set()
for line in input_text:
    for c in line:
        input_ch.add(c)

In [14]:
input_ch = list(sorted(input_ch))
input_ch_len = len(input_ch)
input_ch_len

148

In [15]:
target_ch = set()
for line in target_text:
    for c in line:
        target_ch.add(c)

In [16]:
target_ch_len = len(target_ch)
target_ch_len

151

In [17]:
input_ch_idx = dict([(char,i) for i, char in enumerate(input_ch)])
target_ch_idx = dict([(char,i) for i, char in enumerate(target_ch)])

In [18]:
max_input_len = max([len(line) for line in input_text])
max_target_len = max([len(line) for line in target_text])

print(max_input_len)
print(max_target_len)

46
52


In [19]:
import numpy as np

In [20]:
encoder_input_data = np.zeros ( (len(input_text), max_input_len, input_ch_len ),dtype="float64" )
decoder_input_data = np.zeros ( (len(input_text), max_target_len, target_ch_len ),dtype="float64" )
decoder_target_data = np.zeros( (len(input_text), max_target_len, target_ch_len ),dtype="float64" )

In [21]:
encoder_input_data.shape

(32262, 46, 148)

In [22]:
decoder_input_data.shape

(32262, 52, 151)

In [23]:
decoder_target_data.shape

(32262, 52, 151)

In [24]:
for i, (input_name, target_name) in enumerate(zip(input_text, target_text)):
    for t, char in enumerate(input_name):
        encoder_input_data[i, t, input_ch_idx[char]] = 1.
    for t, char in enumerate(target_name):
        decoder_input_data[i, t, target_ch_idx[char]] = 1.
        if t > 0:
            decoder_target_data[i, t - 1, target_ch_idx[char]] = 1.

In [25]:
import keras
from keras.models import Model
from keras.layers import Input,Dense,LSTM, Embedding, Dropout

Using TensorFlow backend.


In [26]:
STATE_SPACE_DIM = 256

In [27]:
encoder_inputs = Input(shape=(None, input_ch_len))
encoder = LSTM(STATE_SPACE_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

In [28]:
decoder_inputs = Input(shape=(None, target_ch_len))
dropout = Dropout(0.2)(decoder_inputs)
decoder_lstm = LSTM(STATE_SPACE_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dropout,
                                     initial_state=encoder_states)
decoder_dense = Dense(target_ch_len, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [29]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='categorical_crossentropy')

In [30]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None, 151)    0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, None, 148)    0                                            
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, None, 151)    0           input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 414720      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LS

In [31]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=128,
          epochs=256,
          validation_split=0.2)

Train on 25809 samples, validate on 6453 samples
Epoch 1/256
Epoch 2/256
Epoch 3/256
Epoch 4/256
Epoch 5/256
Epoch 6/256
Epoch 7/256
Epoch 8/256
Epoch 9/256
Epoch 10/256
Epoch 11/256
Epoch 12/256
Epoch 13/256
Epoch 14/256
Epoch 15/256
Epoch 16/256
Epoch 17/256
Epoch 18/256
Epoch 19/256
Epoch 20/256
Epoch 21/256
Epoch 22/256
Epoch 23/256
Epoch 24/256
Epoch 25/256
Epoch 26/256
Epoch 27/256
Epoch 28/256
Epoch 29/256
Epoch 30/256
Epoch 31/256
Epoch 32/256
Epoch 33/256
Epoch 34/256
Epoch 35/256
Epoch 36/256
Epoch 37/256
Epoch 38/256
Epoch 39/256
Epoch 40/256
Epoch 41/256
Epoch 42/256
Epoch 43/256
Epoch 44/256
Epoch 45/256
Epoch 46/256
Epoch 47/256
Epoch 48/256
Epoch 49/256
Epoch 50/256
Epoch 51/256
Epoch 52/256
Epoch 53/256
Epoch 54/256
Epoch 55/256
Epoch 56/256
Epoch 57/256
Epoch 58/256
Epoch 59/256
Epoch 60/256
Epoch 61/256
Epoch 62/256
Epoch 63/256
Epoch 64/256
Epoch 65/256
Epoch 66/256
Epoch 67/256
Epoch 68/256
Epoch 69/256
Epoch 70/256
Epoch 71/256
Epoch 72/256
Epoch 73/256
Epoch 74/25

Epoch 76/256
Epoch 77/256
Epoch 78/256
Epoch 79/256
Epoch 80/256
Epoch 81/256
Epoch 82/256
Epoch 83/256
Epoch 84/256
Epoch 85/256
Epoch 86/256

KeyboardInterrupt: 

In [32]:
model.save("char_based_rnn_baseline_86_epochs.h5")

  str(node.arguments) + '. They will not be included '


In [33]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(STATE_SPACE_DIM,))
decoder_state_input_c = Input(shape=(STATE_SPACE_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)


In [34]:
reverse_input_ch_idx = dict(
    (i, char) for char, i in input_ch_idx.items())
reverse_target_ch_idx= dict(
    (i, char) for char, i in target_ch_idx.items())

In [35]:
def decode_sequence(input_seq):
  
    states_value = encoder_model.predict(input_seq)

  
    target_seq = np.zeros((1, 1, len(target_ch)))
   
    target_seq[0, 0, target_ch_idx['$']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_ch_idx[sampled_token_index]
        decoded_sentence += sampled_char

        
        if (sampled_char == END_CHAR or
           len(decoded_sentence) > max_target_len):
            stop_condition = True

       
        target_seq = np.zeros((1, 1, len(target_ch)))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]

    return decoded_sentence

In [36]:
for seq_index in range(100):
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_text[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: ХансКристианАндерсен
Decoded sentence: Хан „Растици най-дебер

-
Input sentence: Андерсеновиприказки
Decoded sentence: Ан ден себя по при как из

-
Input sentence: Огнивото
Decoded sentence: Огиновото

-
Input sentence: Изширокияпътвървешевойник:едно,две,едно,
Decoded sentence: Из изкили то пръветвеше болни. Ще доде, дедна, 

-
Input sentence: Нагърбасиносешераница,анакръста—сабя,
Decoded sentence: На гъбра си носе зернаци, а на най траса; — бабя 

-
Input sentence: бешеходилнавойнаисегасевръщашеудомаси.
Decoded sentence: беше ходила в някона се изсегат бъдеше магьоса и 

-
Input sentence: пътягосрещнаеднастарамагьосница.Тябеше
Decoded sentence: път ягосреща неднастара магьосница. Тя беше 

-
Input sentence: —Добървечер,войниче!—казамагьосницата.—
Decoded sentence: — Добър вечер, в гоне! — чака за могьосанцита. — 

-
Input sentence: хубавасабяикакваголямараницаимаш!Тиси
Decoded sentence: хубава са бията когава пляма на имаши. — изче 

-
Input sentence: —Благодаряти,ст

-
Input sentence: тяхсивзезлатни.Тойнатъпкасжълтицинесамо
Decoded sentence: тях си взезлят на. Той не так пък в пениците са 

-
Input sentence: ираницата,ноифуражкатаичизмитеси,тъйче
Decoded sentence: и раницата, но и упраканата и черимите, стъче, 

-
Input sentence: можешедаседвижи.Сегатойнаистинаимашемного
Decoded sentence: можеше да седвижи. Саготена соти и самие на можа 

-
Input sentence: Сложипаккучетовърхусандъка,затвориврататаи
Decoded sentence: Сложи пак кучето върху саднок, за тарвори в тарика 

-
Input sentence: —Дърпайнагоре,старамагьоснице!
Decoded sentence: — Дърра сален, преста за магьоснице!

-
Input sentence: —Взелиогнивото?—попитамагьосницата.
Decoded sentence: — Взели ги новито? — попита магьосницата. 

-
Input sentence: —Подяволите!—извикавойникът.—Негосъвсем
Decoded sentence: — Подяволите! — извика возникът. — Него все съга 

-
Input sentence: —Итойсевърна,тавзеогнивото.Тогава
Decoded sentence: — И той се върна, та внезговино. Това го ва 

-
Input sentence: гоизтег