In [49]:
import numpy as np
import os
import tensorflow as tf
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow import keras
from typing import Dict, Tuple
import keras.layers as l
from keras import models, callbacks, utils, losses

In [50]:
# Load the whole training corpus into memory as one string, decoding the file
# as windows-1251. No placeholder value is assigned beforehand, so a failed
# read cannot leave an empty `text` behind for later cells to consume silently.
with open('Пикник на обочине.txt', 'r', encoding='windows-1251') as file:
    text = file.read()

def get_features_target(seq: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Split one sequence into a next-element prediction pair.

    Args:
        seq: A sliceable sequence of elements.

    Returns:
        A ``(features, target)`` tuple: ``features`` is ``seq`` without its
        last element and ``target`` is ``seq`` without its first element, so
        ``target[k]`` is the element that follows ``features[k]`` in ``seq``.
    """
    return seq[:-1], seq[1:]

# Number of training windows grouped into one mini-batch.
BATCH_SIZE = 100
# Number of consecutive characters cut into one training window. Kept separate
# from BATCH_SIZE so window length and batch size can be tuned independently.
SEQ_LENGTH = 100

# Character vocabulary: every distinct character of the corpus, in sorted order.
alphabet = np.array(sorted(set(text)))

# Lookup tables between characters and their integer ids (positions in
# `alphabet`). Despite the names, both tables are keyed per character, not per word.
word_index = {char: i for i, char in enumerate(alphabet)}
index_word = dict(enumerate(alphabet))

# Encode the corpus as ids and cut it into non-overlapping windows of
# SEQ_LENGTH ids; a trailing partial window is dropped.
encoded_text = np.array([word_index[char] for char in text])
sequences = Dataset.from_tensor_slices(encoded_text).batch(SEQ_LENGTH, drop_remainder=True)

# Turn every window into an (input, target) pair shifted by one position, so
# each pair holds SEQ_LENGTH - 1 steps.
dataset = sequences.map(get_features_target)

# Group pairs into fixed-size mini-batches (a trailing partial batch is
# dropped), repeat the stream indefinitely and prefetch upcoming batches.
data = dataset.batch(BATCH_SIZE, drop_remainder=True).repeat()
data = data.prefetch(AUTOTUNE)


In [56]:
# Hyper-parameters of the character-level language model.
EMBEDDING_DIM = 100  # width of the learned character embeddings
RNN_UNITS = 512      # hidden size of each recurrent layer
EPOCHS = 20          # number of training epochs

# Embedding -> two stacked stateful SimpleRNN layers -> per-step logits over
# the alphabet. The batch dimension is pinned to BATCH_SIZE via
# batch_input_shape; the sequence dimension is left open (None).
# NOTE(review): stateful=True carries each row's hidden state over to the same
# row of the next batch, while the input pipeline places consecutive windows in
# the same batch rather than in the same row of consecutive batches — confirm
# that stateful training is intended here.
model = keras.Sequential([
    l.Embedding(len(alphabet), EMBEDDING_DIM, batch_input_shape=[BATCH_SIZE, None]),
    l.SimpleRNN(RNN_UNITS, return_sequences=True, stateful=True),
    l.SimpleRNN(RNN_UNITS, return_sequences=True, stateful=True),
    l.Dense(len(alphabet))
])

# The Dense layer has no activation, so the loss is told it receives logits.
model.compile(optimizer='adam', loss=losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

# `data` repeats forever, so the epoch length is given explicitly: the number
# of full mini-batches that fit into the available windows.
model.fit(data, epochs=EPOCHS, verbose=1, steps_per_epoch=len(sequences) // BATCH_SIZE)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7dcac875ac20>

In [57]:
def predict_next(sample: str, model: keras.Sequential, tokenizer: Dict[str, int], vocabulary: Dict[int, str], n_next: int, temperature: float, batch_size: int, word: bool = False) -> str:
    """Extend ``sample`` with ``n_next`` tokens sampled from ``model``.

    The seed is encoded with ``tokenizer`` and fed to the model in a single
    call; afterwards only the most recently sampled token is fed back on each
    step. Feeding one token at a time relies on the model keeping its recurrent
    state between calls (stateful RNN layers). That state is cleared at the
    start of every generation so the result does not depend on earlier calls
    or on the last training batch.

    Args:
        sample: Seed text that the generated sequence starts with.
        model: Callable model. It is invoked with an integer tensor of shape
            ``(batch_size, steps)``; assumed to return per-step logits of shape
            ``(batch_size, steps, vocabulary size)`` — TODO confirm for models
            other than the one built in this notebook.
        tokenizer: Maps a token (character, or word when ``word`` is true) to
            its integer id.
        vocabulary: Maps an integer id back to its token.
        n_next: Number of tokens to generate after the seed. Values <= 0 add
            nothing.
        temperature: Positive divisor applied to the logits before sampling;
            smaller values make sampling more deterministic.
        batch_size: Number of identical rows the input is tiled to, for models
            with a fixed batch dimension. Only row 0 of the output is used.
        word: If true, ``sample`` is split on whitespace into word tokens and
            the result is joined with spaces; otherwise tokens are characters.

    Returns:
        The seed followed by the generated tokens, as one string.

    Raises:
        ValueError: If ``temperature`` is not positive or ``sample`` yields no
            tokens.
        KeyError: If a seed token is missing from ``tokenizer``.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    tokens = sample.split() if word else sample
    predicted = [tokenizer[token] for token in tokens]
    if not predicted:
        raise ValueError('sample must contain at least one token')

    # Start from a clean recurrent state. Layers are reset one by one instead
    # of through the model so that objects without a model-level reset still work.
    for layer in getattr(model, 'layers', []):
        if getattr(layer, 'stateful', False):
            layer.reset_states()

    # First call consumes the whole seed, tiled to the model's batch dimension.
    step_input = tf.repeat(tf.expand_dims(predicted, 0), batch_size, axis=0)
    for _ in range(n_next):
        logits = model(step_input)
        # All rows are identical; keep row 0 and only its last time step, the
        # distribution over the token that follows everything seen so far.
        last_logits = logits[0, -1:, :] / temperature
        next_id = int(tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy())
        predicted.append(next_id)
        # The recurrent state already summarizes the history, so only the new
        # token is fed on the next step.
        step_input = tf.repeat(tf.expand_dims([next_id], 0), batch_size, axis=0)

    pred_seq = [vocabulary[token_id] for token_id in predicted]
    return ' '.join(pred_seq) if word else ''.join(pred_seq)

In [59]:
# Generate 200 characters continuing the seed "Разум" at temperature 0.6.
print(
    predict_next(
        'Разум', model, word_index, index_word,
        n_next=200, temperature=0.6, batch_size=BATCH_SIZE,
    )
)

Разум, говорат и поперал водота, полек посесь постовал ого не в подло на сограсто и свомо и дело воребно, постом бень и в это это стацо на подноже увитальи. Дак замат вез дыль прозал, как там волного в дад


In [60]:
# Generate 100 characters continuing the seed "Сердце" at temperature 0.2.
print(
    predict_next(
        'Сердце', model, word_index, index_word,
        n_next=100, temperature=0.2, batch_size=BATCH_SIZE,
    )
)

Сердце стал продул в постовал с не стал в сказал он в сторал он подерал волько в стал в это выл он в сто м


In [61]:
# Generate 150 characters continuing the seed "Боль" at temperature 0.81.
print(
    predict_next(
        'Боль', model, word_index, index_word,
        n_next=150, temperature=0.81, batch_size=BATCH_SIZE,
    )
)

Боль. Вак нем. Я нак егрыгадь к м горани и скошки протникомняю, плесди ко поволя и вак коже смабалы дал? Нучамы. Рэдола ведо, посерик, на тебоди, разрерну
