<a href="https://colab.research.google.com/github/sotetsuk/LectureColab/blob/main/char_level_lm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import time
import urllib.request

# The backend must be selected before Keras is imported for the first time.
os.environ["KERAS_BACKEND"] = "jax"

import numpy as np
from keras import layers, models, optimizers, losses, Input, Model
from keras.utils import PyDataset, set_random_seed

In [None]:
# Load and prepare text data
def load_text(
    url="https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt",
    max_chars=100000,
    timeout=30.0,
):
    """Download a UTF-8 text corpus and return a stripped prefix of it.

    Args:
        url: Location of the corpus. Defaults to the Tiny Shakespeare file.
        max_chars: Keep only the first ``max_chars`` characters (non-negative)
            so training stays fast. ``None`` keeps the whole text.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The (possibly truncated) text with leading and trailing whitespace
        removed. Stripping happens after truncation, so the result can be
        slightly shorter than ``max_chars``.

    Raises:
        urllib.error.URLError: If the resource cannot be fetched.
        UnicodeDecodeError: If the payload is not valid UTF-8.
    """
    print("Downloading Shakespeare dataset...")
    # The context manager closes the connection even if read/decode raises.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        text = response.read().decode("utf-8")

    # Use a moderate amount of data for faster training.
    if max_chars is not None:
        text = text[:max_chars]

    return text.strip()


class TextDataset(PyDataset):
    """Fixed-size batches of character-id windows for next-character prediction.

    Every window starting at offset ``s`` yields an input ``text[s:s + L]`` and
    a target ``text[s + 1:s + L + 1]`` (the same window shifted by one
    character), where ``L`` is ``max_seq_length``.

    Args:
        text: The training corpus.
        max_seq_length: Number of characters per window.
        batch_size: Number of windows per batch.
        step: Distance between the starts of consecutive windows.
        **kwargs: Forwarded to ``PyDataset`` unchanged.

    Attributes:
        chars: Sorted list of the distinct characters in ``text``.
        char_to_idx / idx_to_char: Mappings between characters and integer ids.
        num_chars: Vocabulary size.
        encoded: ``text`` as an ``int32`` array of character ids.
        indices: Start offsets of all windows (shuffled in place each epoch).
    """

    def __init__(self, text, max_seq_length, batch_size, step=1, **kwargs):
        super().__init__(**kwargs)
        self.text = text
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size

        # Create character mappings
        self.chars = sorted(set(text))
        self.char_to_idx = {ch: i for i, ch in enumerate(self.chars)}
        self.idx_to_char = {i: ch for ch, i in self.char_to_idx.items()}
        self.num_chars = len(self.chars)

        # Encode the corpus once so that building a batch is pure array
        # indexing instead of per-character dictionary lookups.
        self.encoded = np.array(
            [self.char_to_idx[ch] for ch in text], dtype=np.int32
        )

        # Start offsets of all windows. The upper bound leaves room for the
        # one-character look-ahead needed by the target sequence.
        self.indices = list(range(0, len(text) - max_seq_length, step))

    def __len__(self):
        """Return the number of *full* batches; a trailing partial batch is dropped."""
        return len(self.indices) // self.batch_size

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(X, y)`` int32 arrays of shape (batch_size, max_seq_length)."""
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]

        # Rows beyond the available windows (only possible when ``idx`` is
        # outside ``range(len(self))``) stay zero-filled.
        X = np.zeros((self.batch_size, self.max_seq_length), dtype=np.int32)
        y = np.zeros((self.batch_size, self.max_seq_length), dtype=np.int32)

        if batch_indices:
            starts = np.asarray(batch_indices, dtype=np.int64)[:, None]
            # positions[i, t] is the corpus offset of character t in window i.
            positions = starts + np.arange(self.max_seq_length)
            X[:len(batch_indices)] = self.encoded[positions]
            y[:len(batch_indices)] = self.encoded[positions + 1]

        return X, y

    def on_epoch_end(self):
        """Reshuffle the window order in place (hook called by Keras after each epoch)."""
        np.random.shuffle(self.indices)


def generate_text(model, char_to_idx, idx_to_char, max_seq_length,
                  num_chars, length=100, seed_text="the ", delay=0.1):
    """Generate text with greedy decoding, printing each character as it is produced.

    Args:
        model: Object whose ``predict(x, verbose=0)`` returns per-position
            scores of shape ``(1, max_seq_length, vocabulary)`` for an int32
            input of shape ``(1, max_seq_length)``.
        char_to_idx: Mapping from character to integer id.
        idx_to_char: Mapping from integer id to character.
        max_seq_length: Width of the model input; also the maximum number of
            trailing characters used as context.
        num_chars: Vocabulary size. Unused; kept for interface compatibility.
        length: Number of characters to generate.
        seed_text: Non-empty prompt the generation starts from.
        delay: Seconds to pause after each character (typewriter effect).
            Use ``0`` to disable.

    Returns:
        ``seed_text`` followed by the ``length`` generated characters.

    Raises:
        ValueError: If ``seed_text`` is empty.
        KeyError: If ``seed_text`` contains a character missing from
            ``char_to_idx``.
    """
    if not seed_text:
        # With no context the lookup below would read position -1, i.e. the
        # prediction for a padding slot, and silently produce garbage.
        raise ValueError("seed_text must contain at least one character")

    generated = seed_text
    print(generated, end="", flush=True)

    for _ in range(length):
        # Keep the most recent characters and right-pad with zeros to a fixed
        # width. This assumes the model is causal, so padding to the right
        # cannot influence the prediction read below.
        context = generated[-max_seq_length:]
        x_pred = np.zeros((1, max_seq_length), dtype=np.int32)
        x_pred[0, :len(context)] = [char_to_idx[ch] for ch in context]

        # Predict next character
        preds = model.predict(x_pred, verbose=0)[0]  # Shape: (max_seq_length, num_chars)
        # The output at the last real context position scores the next character.
        next_idx = int(np.argmax(preds[len(context) - 1]))  # Greedy decoding
        next_char = idx_to_char[next_idx]

        generated += next_char
        print(next_char, end="", flush=True)
        if delay > 0:
            time.sleep(delay)

    return generated

## Hyperparameters

In [None]:
max_seq_length = 50  # characters per training window; also the generation context width
batch_size = 256     # windows per gradient step
epochs = 5           # full passes over the dataset

# Seed the Python, NumPy and Keras backend random generators so that
# shuffling and weight initialisation start from the same state on every
# "Restart & Run All".
seed = 42
set_random_seed(seed)

## Loading and preparing data

In [None]:
# Fetch the corpus and wrap it in the batching dataset used for training.
text = load_text()
dataset = TextDataset(text, max_seq_length, batch_size)

# Quick sanity summary of what was loaded.
print(
    f"Text length: {len(text)}",
    f"Number of unique characters: {dataset.num_chars}",
    f"Number of batches: {len(dataset)}",
    sep="\n",
)

Downloading Shakespeare dataset...
Text length: 100000
Number of unique characters: 61
Number of batches: 390


In [None]:
# Peek at the first batch: y should be X shifted left by one character.
batch = dataset[0]
X, y = batch
print(f"X shape: {X.shape}", f"y shape: {y.shape}", sep="\n")
print(X[0], y[0], sep="\n")

X shape: (256, 50)
y shape: (256, 50)
[16 43 52 53 54  1 13 43 54 43 60 39 48  8  0 12 39 40 49 52 39  1 57 39
  1 50 52 49 37 39 39 38  1 35 48 59  1 40 55 52 54 42 39 52  5  1 42 39
 35 52]
[43 52 53 54  1 13 43 54 43 60 39 48  8  0 12 39 40 49 52 39  1 57 39  1
 50 52 49 37 39 39 38  1 35 48 59  1 40 55 52 54 42 39 52  5  1 42 39 35
 52  1]


## Build model with Functional API

In [None]:
# Character ids -> 16-d embeddings -> GRU -> per-timestep softmax over the vocabulary.
inputs = Input(shape=(None,), dtype="int32")  # sequence length is left unspecified
x = layers.Embedding(input_dim=dataset.num_chars, output_dim=16)(inputs)
x = layers.GRU(units=256, return_sequences=True)(x)  # one output per input position
outputs = layers.Dense(units=dataset.num_chars, activation="softmax")(x)
model = Model(inputs=inputs, outputs=outputs)

# Targets are integer ids, hence the sparse variant of cross-entropy.
model.compile(
    loss=losses.sparse_categorical_crossentropy,
    optimizer=optimizers.Adam(learning_rate=0.002),
    metrics=["accuracy"],
)

model.summary()

## Generate text before training

In [None]:
# Baseline sample from the untrained model, for comparison with the post-training sample.
generated_before = generate_text(
    model=model,
    char_to_idx=dataset.char_to_idx,
    idx_to_char=dataset.idx_to_char,
    max_seq_length=max_seq_length,
    num_chars=dataset.num_chars,
    length=200,
)

the EwwMM x:bmnHAIbtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadtEadt

In [None]:
# Keep the History object: it holds the per-epoch loss/accuracy, and binding it
# stops its bare repr from being echoed as the cell output.
history = model.fit(dataset, epochs=epochs, verbose=1)

Epoch 1/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - accuracy: 0.2645 - loss: 2.7307
Epoch 2/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - accuracy: 0.5240 - loss: 1.5956
Epoch 3/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.6401 - loss: 1.1840
Epoch 4/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.7429 - loss: 0.8724
Epoch 5/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.7949 - loss: 0.7204


<keras.src.callbacks.history.History at 0x7981b450dad0>

## Generate text after training

In [None]:
# Sample from the trained model. Stored under its own name so the
# pre-training sample in `generated_before` is not overwritten.
generated_after = generate_text(
        model, dataset.char_to_idx, dataset.idx_to_char,
        max_seq_length, dataset.num_chars, length=200
)

the people and his country:
It shall be so.

COMINIUS:
He's a disease that have belly will be ruled.

BRUTUS:
Come, come, you are well undersaly were resely
If he have put my wisdomance.

Second Soldier:
