In [1]:
import numpy as np
import os
import tensorflow as tf
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow import keras
from typing import Dict, Tuple
import re
import keras.layers as l
from keras import models, callbacks, utils, losses

In [2]:
text = ''
with open('Dead-souls.txt', 'r', encoding='utf-8') as file:
    text = file.read()

def get_features_target(seq: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    features = seq[:-1]
    target = seq[1:]
    return features, target

BATCH_SIZE = 32

words = list(filter(None, [re.sub('[^а-яА-ЯёЁ0-9 ,-]', '', s).strip() for s in text.split('.')]))
alphabet = np.array(sorted(set(' '.join(words).split(' '))))

word_index = {char: i for i, char in enumerate(alphabet)}
index_word = {i: char for i, char in enumerate(alphabet)}

sequences = Dataset.from_tensor_slices(np.array([word_index[word] for word in ' '.join(words).split()])).batch(BATCH_SIZE, drop_remainder=True)
dataset = sequences.map(get_features_target)

data = dataset.batch(BATCH_SIZE, drop_remainder=True).repeat()
data = data.prefetch(AUTOTUNE)


In [3]:
model = keras.Sequential([
    l.Embedding(len(alphabet), BATCH_SIZE, batch_input_shape=[BATCH_SIZE, None]),
    l.Bidirectional(l.LSTM(150, return_sequences=True)),
    l.Dropout(0.2),
    l.LSTM(512, return_sequences=True, stateful=True),
    l.Dense(len(alphabet) / 2, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    l.Dense(len(alphabet), activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (32, None, 32)            961024    
                                                                 
 bidirectional (Bidirection  (32, None, 300)           219600    
 al)                                                             
                                                                 
 dropout (Dropout)           (32, None, 300)           0         
                                                                 
 lstm_1 (LSTM)               (32, None, 512)           1665024   
                                                                 
 dense (Dense)               (32, None, 15016)         7703208   
                                                                 
 dense_1 (Dense)             (32, None, 30032)         450990544 
                                                        

In [11]:
from numba import cuda
cuda.select_device(0)
cuda.close()

In [4]:

model.compile(optimizer='adam', loss=losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
model.fit(data, epochs=35, verbose=1, steps_per_epoch= len(sequences) // BATCH_SIZE)

Epoch 1/35


  output, from_logits = _get_logits(


Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.src.callbacks.History at 0x79242b94bfd0>

In [5]:
def predict_next(sample: str, model: keras.Sequential, tokenizer: Dict[str, int], vocabulary: Dict[int, str], n_next: int, temperature: float, batch_size: int, word: bool = False) -> str:
    if word:
        sample_vector = [tokenizer[word] for word in sample.split()]
    else:
        sample_vector = [tokenizer[char] for char in sample]
    predicted = sample_vector
    sample_tensor = tf.expand_dims(sample_vector, 0)
    sample_tensor = tf.repeat(sample_tensor, batch_size, axis=0)
    for i in range(n_next):
        pred = model(sample_tensor)
        pred = pred[0].numpy() / temperature
        pred = tf.random.categorical(pred, num_samples=1)[-1, 0].numpy()
        predicted.append(pred)
        sample_tensor = predicted[-99:]
        sample_tensor = tf.expand_dims([pred], 0)
        sample_tensor = tf.repeat(sample_tensor, batch_size, axis=0)
    pred_seq = [vocabulary[i] for i in predicted]
    generated = ' '.join(pred_seq) if word else ''.join(pred_seq)
    return generated

In [6]:
print(predict_next(
    sample='Где',
    model=model,
    tokenizer=word_index,
    vocabulary=index_word,
    n_next=20,
    temperature=0.6,
    batch_size=BATCH_SIZE,
    word=True
))

Где разбойничье поднят тверже раздевать подсвечнике неведомый изумления вышить Ребята, неблагоприятных изобрел фигурка скачки, правую фаянсовых сказанное правильные опасности, лавки досаду,


In [7]:
print(predict_next(
    sample='Душ',
    model=model,
    tokenizer=word_index,
    vocabulary=index_word,
    n_next=20,
    temperature=0.6,
    batch_size=BATCH_SIZE,
    word=True
))

Душ бриться, гражданского Эге шахматы спаси приятели, первых-то молоденькую закопался воздвигнуть Изумляются молоденькие туда ризе положила ни, каурой грамотно Экой надлежащих


In [8]:
print(predict_next(
    sample='Поручик',
    model=model,
    tokenizer=word_index,
    vocabulary=index_word,
    n_next=20,
    temperature=0.6,
    batch_size=BATCH_SIZE,
    word=True
))

Поручик устремлено сердито, похлопотать, однако танцевавшее разговора, судьбамиЧичиков прыть поворачивать наплетет, зеркала выехал Петух распечет узел сап чувствами сором пристроил палец
