<a href="https://colab.research.google.com/github/space-owner/Tensorflow-2/blob/main/Text_Generation_with_an_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### ***Text generation with an RNN***
This notebook is **based on the TensorFlow tutorial** and was written for study purposes. [Link](https://www.tensorflow.org/text/tutorials/text_generation)

***Learning Point:***
- **```Many-to-Many Architecture```**
- **```tf.keras.layers.experimental.preprocessing.StringLookup()```**
- **```tf.keras.layers.experimental.preprocessing.StringLookup(invert=True, mask_token=None)```**
- **```tf.strings.reduce_join()```**


In [1]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import os
import time

# Record the TensorFlow release this notebook was executed with.
tf_release = tf.__version__
print(">>> tf.version =", tf_release)

>>> tf.version = 2.6.0


In [2]:
# Fetch the Shakespeare corpus through Keras' download helper and keep the
# path of the local copy it returns.
SHAKESPEARE_URL = "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt"
path_to_file = tf.keras.utils.get_file("shakespeare.txt", origin=SHAKESPEARE_URL)

In [3]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8. The context
# manager closes the file handle as soon as the read finishes instead of
# leaving it open until garbage collection.
with open(path_to_file, "rb") as corpus_file:
    text = corpus_file.read().decode(encoding="utf-8")

print("len(text) =", len(text))
print("text[:100] = ", text[:100])

len(text) = 1115394
text[:100] =  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
# The vocabulary is every distinct character of the corpus, in sorted order.
unique_chars = set(text)
vocab = sorted(unique_chars)

print(">>> unique text =", vocab)
print(">>> length of unique text =", len(vocab))

>>> unique text = ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
>>> length of unique text = 65


In [5]:
# Two toy strings of different lengths for the character-splitting demo.
example_texts = ["abcdefg", "xyz"]

# Split every string into its individual UTF-8 characters.
chars = tf.strings.unicode_split(
    example_texts,
    input_encoding="UTF-8",
)
print(">>> chars =", chars)

>>> chars = <tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>


In [6]:
# Forward lookup layer: character -> integer id, built without a mask token.
# In the recorded output the ids sit one above the character's position in
# `vocab` ('a' is at index 39 in `vocab` but maps to id 40).
lookup_vocabulary = list(vocab)
ids_from_chars = preprocessing.StringLookup(vocabulary=lookup_vocabulary, mask_token=None)

ids = ids_from_chars(chars)
print(">>> ids = ", ids)

>>> ids =  <tf.RaggedTensor [[40, 41, 42, 43, 44, 45, 46], [63, 64, 65]]>


In [7]:
# Inverse lookup layer (id -> character). It reuses the forward layer's
# vocabulary so that both directions agree on the mapping.
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    mask_token=None,
    invert=True,
)

# Round trip: decoding the ids reproduces the characters they came from.
chars = chars_from_ids(ids)
print(">>> chars = ", chars)

>>> chars =  <tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>


In [8]:
# Join the characters along the last axis to rebuild one string per row.
joined_rows = tf.strings.reduce_join(chars, axis=-1)
print(">>> tf.strings.reduce_join() =", joined_rows.numpy())

>>> tf.strings.reduce_join() = [b'abcdefg' b'xyz']


In [9]:
def text_from_ids(ids):
    """Decode `ids` to characters with `chars_from_ids` and join them along the last axis.

    Returns the string tensor produced by `tf.strings.reduce_join`.
    """
    decoded_chars = chars_from_ids(ids)
    return tf.strings.reduce_join(decoded_chars, axis=-1)

In [10]:
# Peek at the first 250 characters of the corpus.
preview = text[:250]
print(">>> text[:250] =", preview)

>>> text[:250] = First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [11]:
# Encode the whole corpus: split it into characters, then map each one to its id.
corpus_chars = tf.strings.unicode_split(text, "UTF-8")
all_ids = ids_from_chars(corpus_chars)
print(">>> all_ids =", all_ids)

>>> all_ids = tf.Tensor([19 48 57 ... 46  9  1], shape=(1115394,), dtype=int64)


In [12]:
# Wrap the id tensor in a tf.data pipeline that yields one id per element.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
print(">>> ids_dataset = ", ids_dataset)

# Show the first ten ids decoded back to characters.
# enumerate() replaces the hand-maintained counter, and the loop variable is
# called `char_id` so that it does not overwrite the `ids` tensor created in
# the lookup demo (re-running that demo after this cell would otherwise see a
# scalar id instead of the ragged batch).
for position, char_id in enumerate(ids_dataset.take(10), start=1):
    print(">>> ids {} = {}".format(
        position, chars_from_ids(char_id).numpy().decode("utf-8")))

>>> ids_dataset =  <TensorSliceDataset shapes: (), types: tf.int64>
>>> ids 1 = F
>>> ids 2 = i
>>> ids 3 = r
>>> ids 4 = s
>>> ids 5 = t
>>> ids 6 =  
>>> ids 7 = C
>>> ids 8 = i
>>> ids 9 = t
>>> ids 10 = i


In [13]:
# Number of input characters per training example.
seq_length = 100

# The corpus is later cut into chunks of seq_length + 1 characters, so this is
# the number of whole chunks it yields.
chunk_length = seq_length + 1
examples_per_epoch = len(text) // chunk_length

In [14]:
# Chunk the id stream into windows of seq_length + 1 ids; an incomplete
# trailing window is dropped.
window_size = seq_length + 1
sequences = ids_dataset.batch(window_size, drop_remainder=True)

# Decode the first window character by character.
for seq in sequences.take(1):
    print("chars_from_ids = \n", chars_from_ids(seq))

chars_from_ids = 
 tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)


In [15]:
# Show the first five windows decoded back into text.
for seq in sequences.take(5):
    decoded_window = text_from_ids(seq)
    print("text_from_ids =")
    print(decoded_window)

text_from_ids =
tf.Tensor(b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou ', shape=(), dtype=string)
text_from_ids =
tf.Tensor(b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k', shape=(), dtype=string)
text_from_ids =
tf.Tensor(b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki", shape=(), dtype=string)
text_from_ids =
tf.Tensor(b"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d", shape=(), dtype=string)
text_from_ids =
tf.Tensor(b'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi', shape=(), dtype=string)


In [16]:
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one step.

    The input is everything except the last element and the target is
    everything except the first, so target[i] is the element that follows
    input[i] in the original sequence.
    """
    return sequence[:-1], sequence[1:]

# Quick demonstration on a plain Python list of characters.
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [17]:
# Turn every window into an (input, target) pair.
dataset = sequences.map(split_input_target)

# Inspect the first pair: the target is the input shifted by one character.
for input_example, target_example in dataset.take(1):
    input_bytes = text_from_ids(input_example).numpy()
    target_bytes = text_from_ids(target_example).numpy()
    print("Input :", input_bytes)
    print("Target:", target_bytes)

Input : b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target: b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [18]:
# Number of sequences per training batch.
BATCH_SIZE = 64
# Size of the shuffle buffer (elements are shuffled within this window).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from the previous
# value of `dataset`: the original `dataset = dataset.shuffle(...).batch(...)`
# was self-referential, so re-running the cell batched already-batched data.
# Starting from `sequences` makes the cell idempotent.
# `tf.data.AUTOTUNE` is the non-experimental name of the same constant.
dataset = (
    sequences.map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [21]:
# Size of the model's id space. Taken from the lookup layer rather than from
# `vocab`: the layer's vocabulary has one entry more than `vocab` (66 vs. 65 in
# the recorded outputs; id 0 decodes to "[UNK]"), and the model is built with
# that larger size. Using len(vocab) here would leave the highest id (65, 'z')
# outside the embedding table if this constant were passed to the model.
vocab_size = len(ids_from_chars.get_vocabulary())

# Width of the character embedding vectors.
embedding_dim = 256

# Number of units in the GRU layer.
rnn_units = 1024

In [29]:
class MyModel(tf.keras.Model):
    """Character-level sequence model: Embedding -> GRU -> Dense.

    The Dense layer has `vocab_size` outputs and no activation, so the model
    emits one unnormalised score per vocabulary id at every time step.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the three layers.

        Args:
            vocab_size: number of distinct ids (embedding rows and output units).
            embedding_dim: width of the embedding vectors.
            rnn_units: number of GRU units.
        """
        # The base constructor is called without arguments. The original passed
        # `self` a second time (`super().__init__(self)`), a stray positional
        # argument the subclassed-model constructor has no use for.
        # NOTE(review): newer Keras releases are reported to reject it outright - confirm.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences: emit an output for every time step;
        # return_state: also hand back the final hidden state so that a caller
        # can feed it into a subsequent call.
        self.gru = tf.keras.layers.GRU(
            rnn_units,
            return_sequences=True,
            return_state=True,
        )
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run a forward pass.

        Args:
            inputs: batch of id sequences; in this notebook an int64 tensor of
                shape (batch_size, sequence_length).
            states: optional initial GRU state; when None the GRU's own
                initial state is used.
            return_state: when True, also return the final GRU state.
            training: forwarded to every layer.

        Returns:
            The per-step scores, shape (batch_size, sequence_length, vocab_size)
            in this notebook, or a `(scores, states)` tuple when `return_state`
            is True.
        """
        x = self.embedding(inputs, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        return x

In [31]:
# Size the embedding table and the output layer from the lookup layer's own
# vocabulary so that every id it can emit has a slot in the model.
lookup_vocab_size = len(ids_from_chars.get_vocabulary())
model = MyModel(
    vocab_size=lookup_vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)

In [32]:
# Push one batch through the untrained model to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    predictions_shape = example_batch_predictions.shape
    print(predictions_shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 66) # (batch_size, sequence_length, vocab_size)


In [33]:
# Print the per-layer parameter counts of the model.
model.summary()

Model: "my_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  16896     
_________________________________________________________________
gru_1 (GRU)                  multiple                  3938304   
_________________________________________________________________
dense_1 (Dense)              multiple                  67650     
Total params: 4,022,850
Trainable params: 4,022,850
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Draw one id per time step from the predicted distribution of the first
# example in the batch, then drop the trailing sample axis and convert the
# result to a NumPy array.
first_example_logits = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_example_logits, num_samples=1),
    axis=-1,
).numpy()

In [35]:
# Display the sampled ids (one per time step of the first example).
sampled_indices

array([32, 55, 30, 55,  4, 57, 47, 12, 43, 22,  8, 24, 41, 14, 52, 30, 33,
       62, 54, 57, 25, 41, 20, 57, 35, 35,  8, 13, 46, 53, 58, 26, 62, 57,
        6, 54, 61, 42,  1, 61, 14, 25,  3,  8, 52, 62,  3, 53, 51, 53,  6,
       58,  2,  1, 18, 44, 35,  2, 37,  7, 16, 14, 61, 31, 60, 52, 57, 44,
       55, 31, 24, 64, 23, 41,  7, 56, 48, 62, 51, 14, 48, 25, 44, 62, 33,
       61, 37,  4, 58,  0, 49, 27, 51, 42, 23, 64,  2, 51, 45, 18])

In [36]:
# Decode the first input sequence and the ids sampled from the untrained
# model back into text for a side-by-side look.
first_input_text = text_from_ids(input_example_batch[0]).numpy()
sampled_text = text_from_ids(sampled_indices).numpy()

print("Input:\n", first_input_text)
print()
print("Next Char Predictions:\n", sampled_text)

Input:
 b'to her.\n\nQUEEN ELIZABETH:\nAn honest tale speeds best being plainly told.\n\nKING RICHARD III:\nThen in '

Next Char Predictions:
 b"SpQp$rh;dI-KbAmQTworLbGrVV-?gnsMwr'ovc\nvAL!-mw!nln's \nEeV X,CAvRumrepRKyJb,qiwlAiLewTvX$s[UNK]jNlcJy lfE"
