In [None]:
import sys
import re

import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [None]:
# Load the corpus and reduce it to a small character vocabulary.
# The context manager closes the file handle as soon as the text is read
# instead of leaving it open until garbage collection.
with open('./Москва-Петушки.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

text = text.lower()
# Drop every character that is not in а-я, whitespace, a digit, or one of . ! ? -
# NOTE(review): the range а-я does not contain 'ё' (U+0451), so that letter is
# removed as well — confirm this is intended.
text = re.sub(r'[^а-я\s\d.!\?-]+', '', text)
# Collapse runs of '!' / '?' into a single '.', the only sentence terminator used later.
text = re.sub(r'[!\?]+', '.', text)
# Collapse every run of whitespace (including newlines) into one space.
text = re.sub(r'\s+', ' ', text)

# Sorted vocabulary and the two lookup tables between characters and indices.
chars = sorted(set(text))
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = dict(enumerate(chars))

n_chars = len(text)
n_vocab = len(chars)

In [None]:
# Windowing parameters for the character-level training set.
max_sentence_length = 100  # only the first 100 characters of a sentence yield windows
window_length = 25         # number of context characters per training sample
step = 1                   # stride between consecutive windows

# Slide a fixed-size window over every sentence: the window is the input,
# the character immediately after it is the target.
dataX = []
dataY = []
for sentence in text.split('.'):
    if len(sentence) == 0:
        continue
    # split('.') removed the terminator; put it back so it can appear as a target.
    sentence += '.'
    for i in range(0, min(len(sentence), max_sentence_length) - window_length, step):
        seq_in = sentence[i:i + window_length]
        seq_out = sentence[i + window_length]
        dataX.append(seq_in)
        dataY.append(seq_out)

# Despite the name, this is the number of (window, next char) samples.
n_sentences = len(dataX)

# One-hot encode inputs and targets.
# The `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `bool` selects the same dtype on every NumPy version.
X = np.zeros((n_sentences, window_length, n_vocab), dtype=bool)
y = np.zeros((n_sentences, n_vocab), dtype=bool)
for i, sentence in enumerate(dataX):
    for t, char in enumerate(sentence):
        X[i, t, char_to_int[char]] = 1
    y[i, char_to_int[dataY[i]]] = 1
print('X shape:', X.shape)

X shape: (90987, 25, 45)


In [None]:
# Display the total number of characters in `text`.
len(text)

195760

In [None]:
# Character-level language model: two stacked 256-unit LSTM layers, each
# followed by 20% dropout, and a softmax layer over the vocabulary.
model = Sequential()
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(256, input_shape=X.shape[1:], return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

filename = "rnn.hdf5"
# model.load_weights(filename)
model.compile(optimizer='adam', loss="categorical_crossentropy")
model.fit(X, y, epochs=40, batch_size=128)

In [None]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised; one index is then drawn from the resulting distribution.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Scaling factor. Values below 1 sharpen the distribution,
            values above 1 flatten it. ``0`` is treated as the greedy limit
            and returns the index of the largest probability.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype("float64")
    if temperature == 0:
        # Limit of the scaling as temperature -> 0; avoids dividing by zero.
        return np.argmax(preds)
    # log(0) = -inf is fine here (it maps back to probability 0), so silence
    # the divide-by-zero warning it would otherwise emit.
    with np.errstate(divide="ignore"):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. Without this, small temperatures make
    # every exp() underflow to 0 and the normalisation becomes 0/0 = NaN.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
# Pick a random training window as the seed. The upper bound of
# np.random.randint is exclusive, so passing n_sentences makes every window
# (including the last one) selectable and also works when there is only one.
start = np.random.randint(0, n_sentences)
temperature = 0.1
generated = ""
pattern = dataX[start]
print('...Generating with seed: "' + pattern + '"')
# Generate at most 100 characters, stopping early at the end of a sentence.
for i in range(100):
    # One-hot encode the current context window as a batch of size 1.
    x_pred = np.zeros((1, window_length, n_vocab))
    for t, char in enumerate(pattern):
        x_pred[0, t, char_to_int[char]] = 1.0
    preds = model.predict(x_pred, verbose=0)[0]
    next_char = int_to_char[sample(preds, temperature)]
    # Keep the produced text available after the loop, not only on stdout.
    generated += next_char
    # Slide the window: drop the oldest character, append the new one.
    pattern = pattern[1:] + next_char
    sys.stdout.write(next_char)
    if next_char == '.':
        break

...Generating with seed: "го карачарова от серпа и "
молота до карачарова мой бог не мог было приняли давайте и написал ни любить на свете положением сто

In [None]:

# Write the trained model to 'rnn.hdf5'.
# NOTE(review): the accepted `save_format` values differ between Keras
# versions — confirm 'hdf5' is valid for the installed one.
model.save('rnn.hdf5', save_format='hdf5')

### Markov chain

In [None]:
from collections import defaultdict

# Order of the character-level Markov chain: each state is a 5-character context.
markov_window_length = 5
step = 1

# Collect (context, following character) pairs from every non-empty sentence.
markov_dataX = []
markov_dataY = []
for raw_sentence in text.split('.'):
    if not raw_sentence:
        continue
    # Restore the terminator removed by split('.').
    terminated = raw_sentence + '.'
    usable_length = min(len(terminated), max_sentence_length)
    for begin in range(0, usable_length - markov_window_length, step):
        end = begin + markov_window_length
        markov_dataX.append(terminated[begin:end])
        markov_dataY.append(terminated[end])

# nodes[context][char] counts how often `char` followed `context`.
nodes = defaultdict(lambda: defaultdict(int))
for context, follower in zip(markov_dataX, markov_dataY):
    nodes[context][follower] += 1

In [None]:
# The upper bound of np.random.randint is exclusive, so pass the full length
# to make every context (including the last one) selectable as the seed.
start = np.random.randint(0, len(markov_dataX))

pattern = markov_dataX[start]
print('...Generating with seed: "' + pattern + '"')
# Generate at most 100 characters, stopping when the context was never observed.
for i in range(100):
    # .get() avoids inserting an empty entry into the defaultdict for every
    # unseen context, which plain indexing would do.
    followers = nodes.get(pattern)
    if not followers:
        break
    next_chars_pool = list(followers)
    # Turn the observed counts into a probability distribution.
    probas = np.array(list(followers.values()))
    probas = probas / probas.sum()
    next_char = next_chars_pool[np.argmax(np.random.multinomial(1, probas, 1))]
    # Slide the context: drop the oldest character, append the new one.
    pattern = pattern[1:] + next_char
    sys.stdout.write(next_char)

...Generating with seed: "а там"
 сосна он плотно и как же.