In [None]:
!pip install tensorflow keras numpy

In [None]:
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))

In [None]:
import glob

all_text = ""
for fname in sorted(glob.glob("dataset/*.txt")):
    with open(fname, encoding="utf-8") as f:
        all_text += f.read() + "\n"

print(f"Total corpus length: {len(all_text):,} characters")

In [None]:
vocab = sorted(set(all_text))
char2idx = {u: i for i, u in enumerate(vocab)}
idx2char = {i: u for i, u in enumerate(vocab)}

print(f"Unique characters: {len(vocab)}")

In [None]:
import numpy as np
from keras.utils import Sequence

class CharSequenceGenerator(Sequence):
    def __init__(self, text, seq_length, batch_size, vocab, step=1):
        self.text = text
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.vocab = vocab
        self.char2idx = {c: i for i, c in enumerate(vocab)}
        self.indices = list(range(0, len(text) - seq_length - 1, step))

    def __len__(self):
        return len(self.indices) // self.batch_size

    def __getitem__(self, idx):
        x = np.zeros((self.batch_size, self.seq_length, len(self.vocab)), dtype=np.bool_)
        y = np.zeros((self.batch_size, len(self.vocab)), dtype=np.bool_)
        for i in range(self.batch_size):
            start = self.indices[idx * self.batch_size + i]
            seq = self.text[start:start + self.seq_length]
            next_char = self.text[start + self.seq_length]
            for t, char in enumerate(seq):
                x[i, t, self.char2idx[char]] = 1
            y[i, self.char2idx[next_char]] = 1
        return x, y

In [12]:
SEQ_LENGTH = 100
BATCH_SIZE = 256
generator = CharSequenceGenerator(all_text, SEQ_LENGTH, BATCH_SIZE, vocab, step=10)

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

model = Sequential([
    LSTM(256, input_shape=(SEQ_LENGTH, len(vocab)), return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(len(vocab), activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

In [15]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

EPOCHS = 20

callbacks = [
    ModelCheckpoint("lstm_textgen_best.h5", save_best_only=True),
    EarlyStopping(patience=3, restore_best_weights=True)
]

model.fit(generator, epochs=EPOCHS, callbacks=callbacks)

Epoch 1/20


  self._warn_if_super_not_called()


[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - loss: 2.7228

  if self._should_save_model(epoch, batch, logs, filepath):


[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1748s[0m 133ms/step - loss: 2.7228
Epoch 2/20


  current = self.get_monitor_value(logs)


[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - loss: 1.8865



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1611s[0m 123ms/step - loss: 1.8865
Epoch 3/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step - loss: 1.6129



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1601s[0m 122ms/step - loss: 1.6129
Epoch 4/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - loss: 1.5145



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1574s[0m 120ms/step - loss: 1.5145
Epoch 5/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 1.4610



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1541s[0m 117ms/step - loss: 1.4610
Epoch 6/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 1.4351



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1537s[0m 117ms/step - loss: 1.4351
Epoch 7/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 1.4109



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1539s[0m 117ms/step - loss: 1.4109
Epoch 8/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 1.3894



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1531s[0m 117ms/step - loss: 1.3894
Epoch 9/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 1.3815



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1537s[0m 117ms/step - loss: 1.3815
Epoch 10/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 1.3654



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1536s[0m 117ms/step - loss: 1.3654
Epoch 11/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 1.3582



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1535s[0m 117ms/step - loss: 1.3582
Epoch 12/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 1.3483



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1536s[0m 117ms/step - loss: 1.3483
Epoch 13/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 1.3368



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1531s[0m 117ms/step - loss: 1.3368
Epoch 14/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - loss: 1.3366



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1530s[0m 116ms/step - loss: 1.3366
Epoch 15/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 1.3301



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1535s[0m 117ms/step - loss: 1.3301
Epoch 16/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 1.3261



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1534s[0m 117ms/step - loss: 1.3261
Epoch 17/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 1.3195



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1533s[0m 117ms/step - loss: 1.3195
Epoch 18/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - loss: 1.3189



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1530s[0m 116ms/step - loss: 1.3189
Epoch 19/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 1.3166



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1531s[0m 117ms/step - loss: 1.3166
Epoch 20/20
[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - loss: 1.3110



[1m13137/13137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1529s[0m 116ms/step - loss: 1.3110


<keras.src.callbacks.history.History at 0x7f8325bb6350>