In [1]:
import tensorflow as tf

shakespeare_url = "https://homl.info/shakespeare"
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)

Downloading data from https://homl.info/shakespeare
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [2]:
with open(filepath) as f:
    shakespeare_text = f.read()

print(shakespeare_text[62:173])

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.


In [3]:
len(shakespeare_text)

1115394

In [4]:
text_vec_layer = tf.keras.layers.TextVectorization(
    split="character",
    standardize="lower"
)

text_vec_layer.adapt(shakespeare_text)
encoded = text_vec_layer(shakespeare_text)
encoded.shape

TensorShape([1115394])

In [5]:
encoded -= 2 # Because we don't need 0 (pad) and 1 (unknown)
n_tokens = text_vec_layer.vocabulary_size() - 2
dataset_size = len(encoded)

print(f"{dataset_size = }, {n_tokens = }")

dataset_size = 1115394, n_tokens = 39


In [7]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    ds = tf.data.Dataset.from_tensor_slices(sequence)
    ds = ds.window(length+1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda window_ds: window_ds.batch(length+1))
    if shuffle:
        ds = ds.shuffle(buffer_size=100_000, seed=seed)
    ds = ds.batch(batch_size)

    return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [8]:
length = 100
tf.random.set_seed(42)
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
test_set = to_dataset(encoded[1_060_000:], length=length)

In [9]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="nadam",
    metrics=["accuracy"],
)
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "./shakespeare_models/model.keras",
    monitor="val_accuracy",
    save_best_only=True,
)
history = model.fit(
    train_set,
    validation_data=valid_set,
    epochs=10,
    callbacks=[model_ckpt]
)

Epoch 1/10
  31242/Unknown [1m369s[0m 11ms/step - accuracy: 0.5433 - loss: 1.5114



[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m386s[0m 12ms/step - accuracy: 0.5433 - loss: 1.5114 - val_accuracy: 0.5345 - val_loss: 1.5954
Epoch 2/10
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 12ms/step - accuracy: 0.5980 - loss: 1.2913 - val_accuracy: 0.5408 - val_loss: 1.5754
Epoch 3/10
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 12ms/step - accuracy: 0.6026 - loss: 1.2704 - val_accuracy: 0.5444 - val_loss: 1.5624
Epoch 4/10
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 12ms/step - accuracy: 0.6053 - loss: 1.2591 - val_accuracy: 0.5444 - val_loss: 1.5583
Epoch 5/10
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 12ms/step - accuracy: 0.6067 - loss: 1.2525 - val_accuracy: 0.5463 - val_loss: 1.5545
Epoch 6/10
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m440s[0m 12ms/step - accura

In [42]:
shakespeare_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda x: x-2),
    model,
])

In [88]:
input_text = tf.constant(["whomst art tho", "four"])  # string input

y_proba = shakespeare_model.predict(input_text)[:, -1]
y_pred = tf.argmax(y_proba, axis=-1)
for i in y_pred:
    print(text_vec_layer.get_vocabulary()[i+2])

(2,)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
u
:


In [96]:
def next_char(text, temperature=1):
  y_proba = shakespeare_model.predict([text], verbose=False)[0, -1:]
  rescaled_logits = tf.math.log(y_proba) / temperature
  char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]
  return text_vec_layer.get_vocabulary()[char_id + 2]

In [114]:
def extend_text(text, n_chars=50, temperature=1):
  text = tf.constant([text])
  for _ in range(n_chars):
    text += next_char(text, temperature)
  return text.numpy()[0].item().decode('UTF-8')

In [119]:
val = extend_text(["To be or not to be"], temperature = 0.001)
val = val.split(sep='\n')
for s in val:
    print(s)

To be or not to be a provost:
and the servant of the world, and the 


In [120]:
val = extend_text(["To be or not to be"], temperature = 0.01)
val = val.split(sep='\n')
for s in val:
    print(s)

To be or not to be a provost:
and the servant of the world, and the 


In [122]:
val = extend_text(["To be or not to be"], temperature = 0.5)
val = val.split(sep='\n')
for s in val:
    print(s)

To be or not to be so dance
i do have ta'en love with the duke will,


In [121]:
val = extend_text(["To be or not to be"], temperature = 1)
val = val.split(sep='\n')
for s in val:
    print(s)

To be or not to bestrens
fare the intent in a word baght, on one fri


In [123]:
model.save("./shakespeare_models/final_model.keras")