1. Load text → build char-to-index and index-to-char mappings

In [2]:
import os
path = r"./../../../nlp/autocomplete/data/raw/01 Harry Potter and the Sorcerers Stone.txt"
with open(path, "r", encoding="utf-8") as f:
    text = f.read()
chars = sorted(set(text))
char_to_idx = {ch:i for i,ch in enumerate(chars)}
idx_to_char = {i:ch for i,ch in enumerate(chars)}
vocab_size = len(chars)
print("Text length:", len(text), "Vocab size:", vocab_size)

Text length: 439478 Vocab size: 82


2. Create input sequences of length 40 → predict 41st; vectorize text

In [3]:
import numpy as np
seq_length = 40
step = 1
sequences = []
next_chars = []
for i in range(0, len(text) - seq_length, step):
    sequences.append(text[i: i + seq_length])
    next_chars.append(text[i + seq_length])
print("Number of sequences:", len(sequences))

X = np.zeros((len(sequences), seq_length, vocab_size), dtype=np.bool_)
y = np.zeros(len(sequences), dtype=np.int32)
for i, seq in enumerate(sequences):
    for t, ch in enumerate(seq):
        X[i, t, char_to_idx[ch]] = 1
    y[i] = char_to_idx[next_chars[i]]

Number of sequences: 439438


3. Create training sequence dataset

In [4]:
import tensorflow as tf
batch_size = 64
buffer_size = 10000
ds = tf.data.Dataset.from_tensor_slices((X, y))
ds = ds.shuffle(buffer_size).batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

print(ds.element_spec)

(TensorSpec(shape=(64, 40, 82), dtype=tf.bool, name=None), TensorSpec(shape=(64,), dtype=tf.int32, name=None))


4. Build RNN model using SimpleRNN layers

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

model = Sequential([
    SimpleRNN(256, return_sequences=True, input_shape=(seq_length, vocab_size)),
    SimpleRNN(256),
    Dense(vocab_size, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()

  super().__init__(**kwargs)


5. Train the model

In [6]:
epochs = 10
history = model.fit(ds, epochs=epochs)

Epoch 1/10
[1m6866/6866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 16ms/step - loss: 2.6042
Epoch 2/10
[1m6866/6866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 15ms/step - loss: 2.2146
Epoch 3/10
[1m6866/6866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 15ms/step - loss: 2.0935
Epoch 4/10
[1m6866/6866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 15ms/step - loss: 2.0319
Epoch 5/10
[1m6866/6866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 15ms/step - loss: 1.9823
Epoch 6/10
[1m6866/6866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 15ms/step - loss: 1.9434
Epoch 7/10
[1m6866/6866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 16ms/step - loss: 2.1353
Epoch 8/10
[1m6866/6866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 16ms/step - loss: 2.1016
Epoch 9/10
[1m6866/6866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 16ms/step - loss: 2.0585
Epoch 10/10
[1m6866/6866[0m [32m━━━━━━━━━━━

6. Write text generation function

In [7]:
import numpy as np

def sample_with_temperature(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        return np.argmax(preds)
    preds = np.log(preds + 1e-9) / temperature
    exp_preds = np.exp(preds)
    probs = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(probs), p=probs)

def generate_text(model, seed, length=400, temperature=1.0):
    generated = seed
    seq = seed[-seq_length:]
    for _ in range(length):
        x = np.zeros((1, seq_length, vocab_size), dtype=np.bool_)
        for t, ch in enumerate(seq):
            if ch in char_to_idx:
                x[0, t, char_to_idx[ch]] = 1
        preds = model.predict(x, verbose=0)[0]
        next_idx = sample_with_temperature(preds, temperature)
        next_char = idx_to_char[next_idx]
        generated += next_char
        seq = seq[1:] + next_char
    return generated

7. Generate new text and validate

In [8]:
seed = text[:seq_length]
print("Seed:", seed)
print("\nTemperature 0.2\n", generate_text(model, seed, length=300, temperature=0.2))
print("\nTemperature 1.0\n", generate_text(model, seed, length=300, temperature=1.0))
print("\nTemperature 1.2\n", generate_text(model, seed, length=300, temperature=1.2))

Seed: M r. and Mrs. Dursley, of number four, P

Temperature 0.2
 M r. and Mrs. Dursley, of number four, Powed a so was in in and seell and so and stust and stear in the stus the so was his the me the stell in and his hind the gor gor the dich was the clid wing hing stear the deaked the so the clid stee the and steed in in in and his the dook the door the whing the sick the sill the dor the dooked the d

Temperature 1.0
 M r. and Mrs. Dursley, of number four, Peshy;.”

“No E!” you Trear it Hagrid, uld stumchns nighing Mree sruming the reamed hade nolchered, yome you facked, with clan sild at in’t bot

Hinderill,”

Hl yicer niked whood fein to Murstere on it migh his he knyand a mill senn cosndleds chere up hew and gemconing, es Prut,” sim at the mooald at

Temperature 1.2
 M r. and Mrs. Dursley, of number four, Ped, it hine, Harry’le “ as uughed feably; Verytten tall wament wotas;, iuthco araned the sroad exs-les.”

Thove; sis, to sroutos renk. M.…HAVask fot; incs, freinit flidese oll’L