<a href="https://www.kaggle.com/code/samithsachidanandan/one-to-many-learning-to-generate-text?scriptVersionId=282402280" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
!pip install protobuf==3.20.* --force-reinstall --no-deps

Collecting protobuf==3.20.*
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Downloading protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 6.33.0
    Uninstalling protobuf-6.33.0:
      Successfully uninstalled protobuf-6.33.0
Successfully installed protobuf-3.20.3


In [2]:
import os
import numpy as np
import re
import shutil
import tensorflow as tf

import warnings
# Suppress every Python warning for the remainder of the notebook.
warnings.filterwarnings("ignore")

# Base directory for notebook artifacts; model checkpoints go in a subfolder.
DATA_DIR = "./data"
CHECKPOINT_DIR = os.path.join(DATA_DIR, "checkpoints")
# exist_ok=True keeps this cell re-runnable; missing parent folders are created too.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

2025-11-28 10:51:58.916928: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764327119.174872      14 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764327119.246066      14 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
def download_and_read(urls):
    """Download each URL to a local cache file and return the cleaned texts.

    Args:
        urls: iterable of URLs that point at UTF-8 encoded text files.

    Returns:
        A list with one string per URL, in input order. In each string the
        byte-order mark is removed and every run of whitespace (including
        newlines) is collapsed to a single space.
    """
    def _fetch_clean(index, url):
        # Files are cached as ex1-<index>.txt relative to the current directory.
        local_path = tf.keras.utils.get_file(
            "ex1-{:d}.txt".format(index), url, cache_dir="."
        )
        with open(local_path, "r", encoding="utf-8") as handle:
            raw = handle.read()

        # Strip the BOM, flatten newlines, then squeeze whitespace runs.
        without_bom = raw.replace("\ufeff", "")
        single_line = without_bom.replace("\n", " ")
        return re.sub(r'\s+', " ", single_line)

    return [_fetch_clean(index, url) for index, url in enumerate(urls)]

# Fetch the two source texts and merge them into a single training corpus.
# NOTE(review): the first URL uses plain http while the second uses https;
# consider switching the first to https as well.
texts = download_and_read([
    "http://www.gutenberg.org/cache/epub/28885/pg28885.txt",
    "https://www.gutenberg.org/files/12/12-0.txt"
])
# Join with a single space so the documents do not run into each other.
text = " ".join(texts)

Downloading data from http://www.gutenberg.org/cache/epub/28885/pg28885.txt
[1m177646/177646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://www.gutenberg.org/files/12/12-0.txt
[1m172775/172775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
# Vocabulary = every distinct character that occurs in the corpus, in sorted order.
vocab = sorted(set(text))
print("vocab size: {:d}".format(len(vocab)))


# Two-way lookup tables: position in `vocab` -> character, and the inverse.
idx2char = dict(enumerate(vocab))
char2idx = {c: i for i, c in idx2char.items()}

vocab size: 91


In [5]:
# numericize the texts: replace every character by its vocabulary index
text_as_ints = np.array([char2idx[c] for c in text])
data = tf.data.Dataset.from_tensor_slices(text_as_ints)

# number of characters to show before asking for prediction
# each chunk holds seq_length + 1 ids (101) so it can later be split into a
# 100-id input and a 100-id target shifted by one; the short tail is dropped
seq_length = 100
sequences = data.batch(seq_length + 1, drop_remainder=True)

def split_train_labels(sequence):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is everything except the last element; the target is everything
    except the first, so target[k] is the element that follows input[k].
    """
    return sequence[:-1], sequence[1:]
    
# after this map every element is an (input, target) pair, each seq_length long
sequences = sequences.map(split_train_labels)

# set up for training
# batches: [None, 64, 100]
batch_size = 64
# number of full batches in one pass over the data (partial chunks/batches are dropped)
steps_per_epoch = len(text_as_ints) // (seq_length + 1) // batch_size
print(f"Steps per epoch: {steps_per_epoch}")

# shuffle through a 10000-element buffer, then group into fixed-size batches
dataset = sequences.shuffle(10000).batch(batch_size, drop_remainder=True)

Steps per epoch: 50


2025-11-28 10:52:22.638781: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [6]:
class CharGenModel(tf.keras.Model):
    """Character-level generator: Embedding -> stateful GRU -> Dense logits.

    Args:
        vocab_size: number of distinct characters; used both as the embedding
            input range and as the width of the output layer.
        num_timesteps: passed to the GRU as its number of units (despite the
            name, it sets the hidden size, not a sequence length).
        embedding_dim: size of each character embedding vector.
        **kwargs: forwarded to ``tf.keras.Model``.
    """

    def __init__(self, vocab_size, num_timesteps, embedding_dim, **kwargs):
        super().__init__(**kwargs)
        self.embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # stateful=True keeps the recurrent state between successive calls;
        # return_sequences=True yields one output per input position.
        gru_options = dict(
            recurrent_initializer="glorot_uniform",
            recurrent_activation="sigmoid",
            stateful=True,
            return_sequences=True,
        )
        self.rnn_layer = tf.keras.layers.GRU(num_timesteps, **gru_options)
        # No activation: the model emits raw logits over the vocabulary.
        self.dense_layer = tf.keras.layers.Dense(vocab_size)

    def call(self, x):
        """Map a batch of character ids to per-position vocabulary logits."""
        embedded = self.embedding_layer(x)
        hidden = self.rnn_layer(embedded)
        return self.dense_layer(hidden)

# Number of distinct characters: embedding input range and Dense output width.
vocab_size = len(vocab)
# Size of each learned character embedding vector.
embedding_dim = 256

In [7]:
# Training model. Note: seq_length is passed as `num_timesteps`, which
# CharGenModel hands to the GRU as its unit count.
model = CharGenModel(vocab_size, seq_length, embedding_dim)
# Build with the fixed training batch shape (batch_size, seq_length).
model.build(input_shape=(batch_size, seq_length))

In [8]:
def loss(labels, predictions):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `predictions` are treated as unnormalised logits (from_logits=True), which
    matches a model whose final layer has no softmax.
    """
    cross_entropy = tf.losses.sparse_categorical_crossentropy
    return cross_entropy(labels, predictions, from_logits=True)

# Adam with its default settings, trained against the logits-based `loss` function.
model.compile(optimizer=tf.optimizers.Adam(), loss=loss)

In [9]:
def generate_text(model, prefix_string, char2idx, idx2char, 
                  num_chars_to_generate=1000, temperature=1.0):
    """Sample text from the model one character at a time.

    The prefix is fed through the model first; after that, each sampled
    character becomes the next single-character input, relying on the
    recurrent layer's retained state to carry the context forward.

    Args:
        model: model exposing an ``rnn_layer`` with ``reset_states()`` and
            returning logits of shape (1, positions, vocab) when called.
        prefix_string: non-empty seed text; every character must be a key of
            ``char2idx``.
        char2idx: mapping from character to integer id.
        idx2char: mapping from integer id back to character.
        num_chars_to_generate: how many characters to sample after the prefix.
        temperature: positive scaling applied to the logits; values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        ``prefix_string`` followed by the generated characters.

    Raises:
        ValueError: if ``prefix_string`` is empty or ``temperature`` is not
            strictly positive.
        KeyError: if the prefix contains a character missing from ``char2idx``.
    """
    # Fail early with a clear message instead of an obscure shape error
    # (empty prefix) or inf/NaN logits (division by a zero temperature).
    if not prefix_string:
        raise ValueError("prefix_string must contain at least one character")
    if temperature <= 0:
        raise ValueError(
            "temperature must be positive, got {!r}".format(temperature)
        )

    input_eval = [char2idx[s] for s in prefix_string]
    input_eval = tf.expand_dims(input_eval, 0)  # add the batch dimension
    text_generated = []
    # Start from a clean recurrent state so earlier calls do not leak in.
    model.rnn_layer.reset_states()

    for _ in range(num_chars_to_generate):
        preds = model(input_eval)
        # Drop the batch dimension and apply temperature scaling to the logits.
        preds = tf.squeeze(preds, 0) / temperature

        # Sample for every position but keep only the draw for the last one.
        pred_id = tf.random.categorical(preds, num_samples=1)[-1, 0].numpy()
        text_generated.append(idx2char[pred_id])

        # Feed only the new character next; the layer's state holds the history.
        input_eval = tf.expand_dims([pred_id], 0)

    return prefix_string + "".join(text_generated)

In [10]:

# Train in rounds of 10 epochs. After each round: save the weights, load them
# into a separate model built for batch size 1, and print a text sample.
num_epochs = 50
for i in range(num_epochs // 10):
    epochs_done = (i + 1) * 10

    model.fit(dataset.repeat(), epochs=10, steps_per_epoch=steps_per_epoch)

    checkpoint_name = "model_epoch_{:d}.weights.h5".format(epochs_done)
    checkpoint_file = os.path.join(CHECKPOINT_DIR, checkpoint_name)
    model.save_weights(checkpoint_file)

    # A fresh model built with batch size 1 is used for sampling (presumably
    # because the stateful GRU is tied to the batch size it was built with).
    gen_model = CharGenModel(vocab_size, seq_length, embedding_dim)
    gen_model.build(input_shape=(1, seq_length))
    gen_model.load_weights(checkpoint_file)

    print("\nafter epoch: {:d}".format(epochs_done))
    print(generate_text(gen_model, "Alice ", char2idx, idx2char))
    print("---")

Epoch 1/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 149ms/step - loss: 3.7038
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 150ms/step - loss: 2.5637
Epoch 3/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 150ms/step - loss: 2.3333
Epoch 4/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 153ms/step - loss: 2.1990
Epoch 5/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 152ms/step - loss: 2.0966
Epoch 6/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 153ms/step - loss: 2.0206
Epoch 7/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 151ms/step - loss: 1.9590
Epoch 8/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 151ms/step - loss: 1.9032
Epoch 9/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 151ms/step - loss: 1.8554
Epoch 10/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 151ms/step - l