<a href="https://colab.research.google.com/github/prikmm/MLprojects/blob/main/notebooks/ShakespeareanText_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [21]:
# Download the Shakespeare corpus (cached locally by Keras) and read it into memory.
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

## Encoding using Tokenizer:

In [22]:
# Peek at the first 148 characters of the corpus to check that it loaded correctly.
print(shakespeare_text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [23]:
# Character-level tokenizer: every distinct character gets its own integer ID.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# The raw string is passed directly (not wrapped in a list), so it is iterated
# character by character; `tokenizer.document_count` is later used as the
# total number of characters in the corpus.
tokenizer.fit_on_texts(shakespeare_text)

In [24]:
# Encode a few sample strings; upper- and lower-case variants map to the same IDs.
tokenizer.texts_to_sequences(["HIIII", "hiiii", "Hey there"])

[[7, 6, 6, 6, 6], [7, 6, 6, 6, 6], [7, 2, 16, 1, 3, 7, 2, 9, 2]]

In [25]:
# Decode a sequence of character IDs back to text (characters come back space-separated).
tokenizer.sequences_to_texts([[20, 6, 9, 3, 4]])

['f i r t o']

In [26]:
# Corpus statistics taken from the fitted tokenizer.
dataset_size = tokenizer.document_count  # total number of characters
max_id = len(tokenizer.word_index)       # number of distinct characters
print(max_id, dataset_size)

39 1115394


In [27]:
# Encode the whole corpus as one sequence of character IDs.
# Tokenizer IDs start at 1, so shift them to start at 0.
encoded_rows = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1
[encoded] = encoded_rows  # exactly one text was encoded, so unpack its single row
print(encoded)

[19  5  8 ... 20 26 10]


## Splitting a Sequential Dataset:

In [28]:
# Use the first 90% of the characters for training (integer arithmetic only).
train_size = (dataset_size * 90) // 100
train_chars = encoded[:train_size]
dataset = tf.data.Dataset.from_tensor_slices(train_chars)
for char_id in dataset.take(10):
    print(char_id)

tf.Tensor(19, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(18, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)


In [29]:
# Slide a window over the character stream one position at a time.
# Each window holds n_steps input characters plus one extra, so that the
# targets (inputs shifted by one) can be cut from the same window.
n_steps = 100
window_length = n_steps + 1
dataset = dataset.window(size=window_length, shift=1, drop_remainder=True)
for window in dataset.take(1):
    print(window)

<_VariantDataset shapes: (), types: tf.int64>


In [30]:
# Each window is itself a dataset; batching it by its full length turns it
# into a single tensor, and flat_map flattens the result into a dataset of tensors.
dataset = dataset.flat_map(
    lambda nested_window: nested_window.batch(window_length)
)
for window_tensor in dataset.take(1):
    print(window_tensor)

tf.Tensor(
[19  5  8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8  1  0 16  1
  0 22  8  3 18  1  1 12  0  4  9 15  0 19 13  8  2  6  1  8 17  0  6  1
  4  8  0 14  1  0  7 22  1  4 24 26 10 10  4 11 11 23 10  7 22  1  4 24
 17  0  7 22  1  4 24 26 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23
 10 15  3 13  0], shape=(101,), dtype=int64)


In [31]:
# Shuffle the windows (buffer of 10,000), then group them into batches.
batch_size = 32
dataset = dataset.shuffle(10000)
dataset = dataset.batch(batch_size)

In [32]:
# Show one batch of shuffled windows.
for window_batch in dataset.take(1):
    print(window_batch)

tf.Tensor(
[[10 16  3 ...  0  2  6]
 [ 4  9  2 ...  5  2  6]
 [ 8  0 13 ...  3 10  7]
 ...
 [12  0 18 ...  7 11 15]
 [ 0 18  5 ...  0 12  3]
 [ 1  8  7 ...  1  7  0]], shape=(32, 101), dtype=int64)


In [33]:
# Inputs are all characters but the last; targets are the same window shifted by one.
dataset = dataset.map(
    lambda batch: (batch[:, :-1], batch[:, 1:])
)

In [34]:
# Show one (inputs, targets) pair.
for inputs_and_targets in dataset.take(1):
    print(inputs_and_targets)

(<tf.Tensor: shape=(32, 100), dtype=int64, numpy=
array([[ 1,  8,  0, ..., 25,  1,  8],
       [ 0,  3, 19, ...,  0, 15,  3],
       [12,  0, 22, ...,  0, 13, 22],
       ...,
       [ 4,  0,  2, ...,  1, 21,  1],
       [13,  0,  6, ...,  7,  2,  3],
       [13,  2,  5, ...,  1,  4, 11]])>, <tf.Tensor: shape=(32, 100), dtype=int64, numpy=
array([[ 8,  0,  2, ...,  1,  8, 21],
       [ 3, 19,  0, ..., 15,  3, 13],
       [ 0, 22,  8, ..., 13, 22, 17],
       ...,
       [ 0,  2,  4, ..., 21,  1, 11],
       [ 0,  6,  4, ...,  2,  3,  9],
       [ 2,  5,  9, ...,  4, 11,  0]])>)


In [35]:
# One-hot encode the input characters; targets stay as integer class IDs.
dataset = dataset.map(
    lambda char_ids, targets: (tf.one_hot(char_ids, depth=max_id), targets)
)

In [36]:
# Check the shapes of one batch of inputs and targets.
for inputs, targets in dataset.take(1):
    print(inputs.shape, targets.shape)

(32, 100, 39) (32, 100)


In [37]:
# Prepare the next batch while the current one is being consumed.
dataset = dataset.prefetch(buffer_size=1)

## Building Model:

In [None]:
# Two stacked GRU layers followed by a per-time-step softmax over the
# character vocabulary. The time dimension is None, so sequences of any
# length are accepted.
shakespearean_model = keras.models.Sequential()
shakespearean_model.add(
    keras.layers.GRU(128, return_sequences=True, dropout=0.2,
                     input_shape=[None, max_id])
)
shakespearean_model.add(
    keras.layers.GRU(128, return_sequences=True, dropout=0.2)
)
shakespearean_model.add(
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation="softmax"))
)

# Targets are integer class IDs, hence the sparse cross-entropy loss.
shakespearean_model.compile(
    optimizer=keras.optimizers.RMSprop(4e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
history = shakespearean_model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

## Predicting a Character:

In [None]:
def preprocess(texts):
    """One-hot encode a list of strings using the fitted character tokenizer.

    Args:
        texts: list of strings to encode.

    Returns:
        The result of `tf.one_hot` on the zero-based character IDs, with
        depth `max_id`.
    """
    char_ids = np.array(tokenizer.texts_to_sequences(texts))
    # Tokenizer IDs start at 1; shift them to start at 0 before one-hot encoding.
    return tf.one_hot(char_ids - 1, max_id)

In [None]:
# Predict the character that follows "How are yo".
X_new = preprocess(["How are yo"])
# `Sequential.predict_classes()` was removed from recent TensorFlow releases;
# take the argmax over the class probabilities at every time step instead.
Y_pred = np.argmax(shakespearean_model.predict(X_new), axis=-1)
# Shift IDs back to the tokenizer's 1-based range, decode the first (only)
# sequence, and keep the prediction for the last time step.
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

## Predicting multiple characters:

In [None]:
def next_char(text, temperature=1):
    """Sample the character that follows `text` from the model's prediction.

    Args:
        text: the string generated so far.
        temperature: divisor applied to the log-probabilities before sampling.
            Values below 1 sharpen the distribution; values above 1 flatten it.

    Returns:
        The decoded sampled character, as returned by
        `tokenizer.sequences_to_texts`.
    """
    encoded_text = preprocess([text])
    # Keep only the last time step of the single sequence; the `-1:` slice
    # (rather than `-1`) keeps the result 2-D with one row.
    last_step_proba = shakespearean_model.predict(encoded_text)[0, -1:, :]
    scaled_logits = tf.math.log(last_step_proba) / temperature
    sampled_id = tf.random.categorical(scaled_logits, num_samples=1)
    # Shift back to the tokenizer's 1-based IDs before decoding.
    char_id = sampled_id + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [None]:
def complete_text(text, n_chars=50, temperature=1):
    """Extend `text` by `n_chars` characters, sampled one at a time.

    Args:
        text: the seed string.
        n_chars: number of characters to append.
        temperature: passed through to `next_char`.

    Returns:
        The seed string followed by the generated characters.
    """
    generated = text
    for _step in range(n_chars):
        # Each new character is conditioned on everything generated so far.
        generated = generated + next_char(generated, temperature)
    return generated

In [None]:
# Low temperature (0.2): log-probabilities are divided by a small number,
# which sharpens the distribution towards the most likely characters.
print(complete_text("t", temperature=0.2))

In [None]:
# Moderate temperature (0.5): still sharper than the model's raw distribution.
print(complete_text("a", temperature=0.5))

In [None]:
# Temperature 1: sample from the model's predicted distribution unchanged.
print(complete_text("s", temperature=1))

In [None]:
# High temperature (2): log-probabilities are halved, flattening the
# distribution so that unlikely characters are sampled more often.
print(complete_text("r", temperature=2))

## Stateful RNN:

François Chollet gives this definition of statefulness (the `stateful` argument):
<br>Boolean (default False). If True, the last state for each sample at index i in a batch will be used as initial state for the sample of index i in the following batch.
<br>

By default, Keras shuffles (permutes) the samples in $X$, so the dependency between consecutive samples $X_i$ and $X_{i+1}$ is lost. For this explanation, let's assume there is no shuffling.

If the model is stateless, the cell states are reset for every sequence. With a stateful model, the states are carried over to the next batch: the final state of the sample at index $i$, $X_i$, is used as the initial state for the sample $X_{i+bs}$ in the next batch, where $bs$ is the batch size (assuming no shuffling).


In [112]:
batch_size = 32
encoded_parts = np.array_split(encoded[:train_size], batch_size)
datasets = []
for encoded_part in encoded_parts:
    dataset = tf.data.Dataset.from_tensor_slices(encoded_part)
    dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
    dataset = dataset.flat_map(lambda window: window.batch(window_length))
    datasets.append(dataset)
dataset = tf.data.Dataset.zip(tuple(datasets)).map(lambda *windows: tf.stack(windows))
dataset = dataset.repeat().map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)

In [114]:
# Stateful variant of the character model: the batch size is fixed via
# batch_input_shape, and both GRU layers are created with stateful=True.
stateful_model = keras.models.Sequential()
stateful_model.add(
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     dropout=0.2, recurrent_dropout=0.2,
                     batch_input_shape=[batch_size, None, max_id])
)
stateful_model.add(
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     dropout=0.2, recurrent_dropout=0.2)
)
stateful_model.add(
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation="softmax"))
)

In [115]:
class ResetStatesCallback(keras.callbacks.Callback):
    """Reset the model's recurrent states at the start of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        # `logs` defaults to None to match the base Callback signature, so
        # the hook also works when called without a logs argument.
        self.model.reset_states()

In [None]:
# The dataset repeats indefinitely, so Keras must be told how many batches
# make up one epoch.
steps_per_epoch = train_size // batch_size // n_steps
stateful_model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
)
# States are reset at the start of each epoch by the callback.
stateful_model.fit(
    dataset,
    epochs=50,
    steps_per_epoch=steps_per_epoch,
    callbacks=[ResetStatesCallback()],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50