Устанавливаем необходимые библиотеки

In [1]:
!pip install -q --upgrade keras-nlp
!pip install -q --upgrade keras

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m570.5/570.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m584.9 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

Импортируем библиотеки

In [19]:
import os
import keras_nlp
import keras

import tensorflow as tf
import tensorflow.data as tf_data
import tensorflow.strings as tf_strings

Задаём параметры

In [80]:
BATCH_SIZE = 64
MIN_STRING_LEN = 512
SEQ_LEN = 128

EMBED_DIM = 256
FEED_FORWARD_DIM = 128
NUM_HEADS = 3
NUM_LAYERS = 2
VOCAB_SIZE = 5000

EPOCHS = 70

NUM_TOKENS_TO_GENERATE = 30

Получаем датасет и разделяем его на выборки

In [81]:
dataset_dir = os.path.expanduser("./books.txt")

raw_train_ds = (
    tf_data.TextLineDataset(dataset_dir)
    .filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN)
    .batch(BATCH_SIZE)
    .shuffle(buffer_size=256)
)

raw_val_ds = (
    tf_data.TextLineDataset(dataset_dir)
    .filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN)
    .batch(BATCH_SIZE)
)

Обучаем токенизатор

In [82]:
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    raw_train_ds,
    vocabulary_size=VOCAB_SIZE,
    lowercase=True,
    reserved_tokens=["[PAD]", "[UNK]", "[BOS]"],
)

Загружаем токенизатор

In [83]:
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    sequence_length=SEQ_LEN,
    lowercase=True,
)

Токенезируем данные

In [84]:
start_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=SEQ_LEN,
    start_value=tokenizer.token_to_id("[BOS]"),
)


def preprocess(inputs):
    outputs = tokenizer(inputs)
    features = start_packer(outputs)
    labels = outputs
    return features, labels

train_ds = raw_train_ds.map(preprocess, num_parallel_calls=tf_data.AUTOTUNE).prefetch(
    tf_data.AUTOTUNE
)
val_ds = raw_val_ds.map(preprocess, num_parallel_calls=tf_data.AUTOTUNE).prefetch(
    tf_data.AUTOTUNE
)

Строим модель

In [85]:
inputs = keras.layers.Input(shape=(None,), dtype="int32")

embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(inputs)

for _ in range(NUM_LAYERS):
    decoder_layer = keras_nlp.layers.TransformerDecoder(
        num_heads=NUM_HEADS,
        intermediate_dim=FEED_FORWARD_DIM,
    )
    x = decoder_layer(x)

outputs = keras.layers.Dense(VOCAB_SIZE)(x)
model = keras.Model(inputs=inputs, outputs=outputs)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)
model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity])

Начинаем обучение модели

In [86]:
model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

Epoch 1/70
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 7s/step - loss: 8.3558 - perplexity: 4295.9414 - val_loss: 7.6233 - val_perplexity: 2045.2753
Epoch 2/70
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7s/step - loss: 7.2938 - perplexity: 1489.5649 - val_loss: 6.5895 - val_perplexity: 727.3959
Epoch 3/70
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7s/step - loss: 6.3524 - perplexity: 578.1729 - val_loss: 5.8681 - val_perplexity: 353.5903
Epoch 4/70
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5s/step - loss: 5.7858 - perplexity: 326.4159 - val_loss: 5.4702 - val_perplexity: 237.4990
Epoch 5/70
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7s/step - loss: 5.4065 - perplexity: 222.9250 - val_loss: 5.3016 - val_perplexity: 200.6520
Epoch 6/70
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7s/step - loss: 5.2776 - perplexity: 195.8936 - val_loss: 5.2508 - val_perplexity: 190.7144
E

<keras.src.callbacks.history.History at 0x7c4489fcf2b0>

Заполним нашу модель входной последовательностью, начинающейся с какого-то токена, и постепенно будем выбирать модель, делая прогнозы для каждого последующего токена в цикле

In [95]:
prompt_tokens = start_packer(tokenizer([""]))

def next(prompt, cache, index):
    logits = model(prompt)[:, index - 1, :]
    hidden_states = None
    return logits, hidden_states, cache

Для лучшего ответа используем Top-P search

In [96]:
sampler = keras_nlp.samplers.TopPSampler(p=0.5)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,
)
txt = tokenizer.detokenize(output_tokens)

decoded_txt = ''.join([t.decode('utf-8') for t in txt.numpy()])
print(decoded_txt)



[BOS] мгновенно все преобразилось . люди отодвинулись , уплощаясь , становясь настенными изображениями ; белый стол раздался вширь , обратился в престол алтаря , где восседала в одиночестве жрица . он подошел ; кровь струилась горячо по жилам ; он все стоял и с
