In [1]:
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import io
import re

In [2]:
# delete all symbols excluding [^а-я]
book_name = "data/svoa-komnata.txt"
with io.open(book_name, encoding="utf-8") as book:
    book_text = book.read().lower()
book_text = re.sub(r'[^а-я\s]', "", book_text.replace("\n", " "))
chars = sorted(list(set(book_text)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

print('Book text length: {}, all chars: {}'.format(len(book_text), len(chars)))

Book text length: 164906, all chars: 33


In [3]:
sentence_length = 30
encoding_size = len(chars)

# window
step = 2
sentences = []
next_chars = []
for i in range(0, len(book_text) - sentence_length, step):
    sentences.append(book_text[i : i + sentence_length])
    next_chars.append(book_text[i + sentence_length])

print('Number of sentences: {}'.format(len(sentences)))

Number of sentences: 82438


In [4]:
x = np.zeros((len(sentences), sentence_length, encoding_size), dtype=np.bool)
y = np.zeros((len(sentences), encoding_size), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [5]:
model = keras.Sequential(
    [
        keras.Input(shape=(sentence_length, encoding_size)),
        layers.LSTM(128),
        layers.Dense(encoding_size, activation="softmax")
    ]
)
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 128)               82944     
_________________________________________________________________
dense (Dense)                (None, 33)                4257      
Total params: 87,201
Trainable params: 87,201
Non-trainable params: 0
_________________________________________________________________


In [6]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [11]:
import random

epochs = 15
batch_size = 100

x_train = x[:10000]
y_train = y[:10000]

for epoch in range(epochs):
    model.fit(x_train, y_train, batch_size=batch_size, epochs=1)
    print()
    print("Generating text after epoch: %d" % epoch)

    start_index = random.randint(0, len(book_text) - sentence_length - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print("...Diversity:", diversity)

        generated = ""
        sentence = book_text[start_index : start_index + sentence_length]
        print('...Generating with seed: "' + sentence + '"')

        for i in range(20):
            x_pred = np.zeros((1, sentence_length, encoding_size))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.0
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            sentence = sentence[1:] + next_char
            generated += next_char

        print("...Generated: ", generated)
        print()


Generating text after epoch: 0
...Diversity: 0.2
...Generating with seed: "а это ее и упрекали           "
...Generated:                      

...Diversity: 0.5
...Generating with seed: "а это ее и упрекали           "
...Generated:                      

...Diversity: 1.0
...Generating with seed: "а это ее и упрекали           "
...Generated:                      

...Diversity: 1.2
...Generating with seed: "а это ее и упрекали           "
...Generated:       что вошла со тн


Generating text after epoch: 1
...Diversity: 0.2
...Generating with seed: " разговор расхваставшихся мужч"
...Generated:  ины несем не с отрад

...Diversity: 0.5
...Generating with seed: " разговор расхваставшихся мужч"
...Generated:  ины в весь за нев в 

...Diversity: 1.0
...Generating with seed: " разговор расхваставшихся мужч"
...Generated:  ины если клуские зул

...Diversity: 1.2
...Generating with seed: " разговор расхваставшихся мужч"
...Generated:  ины черад провещная 


Generating text after epoch: 2
