<a href="https://colab.research.google.com/github/space-owner/Tensorflow-2/blob/main/Text%20Generation%20with%20an%20RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### ***Text generation with an RNN***
This notebook is **based on the TensorFlow tutorial** and was written for study purposes. [Link](https://www.tensorflow.org/text/tutorials/text_generation)

***Learning Points:***
- **```Many-to-Many Architecture```**
- **```tf.keras.layers.experimental.preprocessing.StringLookup()```**
- **```tf.keras.layers.experimental.preprocessing.StringLookup(invert=True, mask_token=None)```**
- **```tf.strings.reduce_join()```**


In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import os
import time

# Record the TensorFlow version this notebook was executed with.
print(f">>> tf.version = {tf.__version__}")

>>> tf.version = 2.6.0


In [None]:
# Fetch the Shakespeare corpus through the Keras download helper and keep
# the local path it returns.
path_to_file = tf.keras.utils.get_file(
    fname="shakespeare.txt",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt",
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [None]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8.
# The context manager closes the file handle as soon as the read is done
# instead of leaving it open for the lifetime of the kernel.
with open(path_to_file, "rb") as corpus_file:
    text = corpus_file.read().decode(encoding="utf-8")

print("len(text) =", len(text))
print("text[:100] = ", text[:100])

len(text) = 1115394
text[:100] =  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [None]:
# Distinct characters that occur in the corpus, in sorted order.
vocab = sorted(set(text))

print(f">>> unique text = {vocab}")
print(f">>> length of unique text = {len(vocab)}")

>>> unique text = ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
>>> length of unique text = 65


In [None]:
# Two sample strings of different lengths to illustrate the tokenisation.
example_texts = ["abcdefg", "xyz"]

# Split every string into its individual UTF-8 characters.
chars = tf.strings.unicode_split(
    example_texts,
    input_encoding="UTF-8",
)
print(">>> chars =", chars)

>>> chars = <tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>


In [None]:
# Lookup layer that maps each character to an integer id.
# Uses the stable `tf.keras.layers.StringLookup` entry point; the
# `layers.experimental.preprocessing` path is a deprecated alias of it.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None
)

# Convert the sample characters to their ids.
ids = ids_from_chars(chars)
print(">>> ids = ", ids)

>>> ids =  <tf.RaggedTensor [[40, 41, 42, 43, 44, 45, 46], [63, 64, 65]]>


In [None]:
# Inverse lookup layer: maps integer ids back to characters.
# It is built from the forward layer's own vocabulary.
# Uses the stable `tf.keras.layers.StringLookup` entry point; the
# `layers.experimental.preprocessing` path is a deprecated alias of it.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None
)

# Round-trip the sample ids back to characters.
chars = chars_from_ids(ids)
print(">>> chars = ", chars)

>>> chars =  <tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>


In [None]:
# Join the characters along the last axis to get one string per row.
print(
    ">>> tf.strings.reduce_join() =",
    tf.strings.reduce_join(chars, axis=-1).numpy(),
)

>>> tf.strings.reduce_join() = [b'abcdefg' b'xyz']


In [None]:
def text_from_ids(ids):
    """Turn ids back into text.

    The ids are mapped to characters with `chars_from_ids`, and the
    characters are then joined along the last axis.
    """
    characters = chars_from_ids(ids)
    return tf.strings.reduce_join(characters, axis=-1)

In [None]:
# Show the first 250 characters of the corpus.
print(f">>> text[:250] = {text[:250]}")

>>> text[:250] = First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [None]:
# Tokenise the whole corpus: split it into characters, then look up the id
# of every character.
all_ids = ids_from_chars(
    tf.strings.unicode_split(text, input_encoding="UTF-8")
)
print(">>> all_ids =", all_ids)

>>> all_ids = tf.Tensor([19 48 57 ... 46  9  1], shape=(1115394,), dtype=int64)


In [None]:
# Wrap the id tensor in a dataset that yields one id per element.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
print(">>> ids_dataset = ", ids_dataset)

# Preview the first ten characters.
# `enumerate` replaces the hand-maintained counter, and the loop variable has
# its own name so it does not overwrite the `ids` tensor created in an
# earlier cell (which would change that cell's behaviour on re-run).
for position, char_id in enumerate(ids_dataset.take(10), start=1):
    print(">>> ids {} = {}".format(
        position, chars_from_ids(char_id).numpy().decode("utf-8")))

>>> ids_dataset =  <TensorSliceDataset shapes: (), types: tf.int64>
>>> ids 1 = F
>>> ids 2 = i
>>> ids 3 = r
>>> ids 4 = s
>>> ids 5 = t
>>> ids 6 =  
>>> ids 7 = C
>>> ids 8 = i
>>> ids 9 = t
>>> ids 10 = i


In [None]:
# Number of input characters in one training example.
seq_length = 100

# How many whole chunks of (seq_length + 1) characters fit in the corpus.
examples_per_epoch = len(text) // (seq_length + 1)

In [None]:
# Group the id stream into chunks of seq_length + 1 ids, dropping the
# final chunk if it is shorter than that.
sequences = ids_dataset.batch(
    batch_size=seq_length + 1,
    drop_remainder=True,
)

# Show the first chunk as individual characters.
for seq in sequences.take(1):
    print("chars_from_ids = \n", chars_from_ids(seq))

tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)


In [None]:
# Show the first five chunks as joined text, with the label on its own line.
for seq in sequences.take(5):
    print("text_from_ids =", text_from_ids(seq), sep="\n")

text_from_ids =
tf.Tensor(b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou ', shape=(), dtype=string)
text_from_ids =
tf.Tensor(b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k', shape=(), dtype=string)
text_from_ids =
tf.Tensor(b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki", shape=(), dtype=string)
text_from_ids =
tf.Tensor(b"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d", shape=(), dtype=string)
text_from_ids =
tf.Tensor(b'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi', shape=(), dtype=string)


In [None]:
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one step.

    The input is every element except the last; the target is every
    element except the first, so target[i] is the element that follows
    input[i] in the original sequence.
    """
    return sequence[:-1], sequence[1:]

# Quick demonstration on a plain Python list of characters.
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [None]:
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(map_func=split_input_target)

# Show the first pair as text; the target is the input shifted by one character.
for input_example, target_example in dataset.take(1):
    input_text = text_from_ids(input_example).numpy()
    target_text = text_from_ids(target_example).numpy()
    print("Input :", input_text)
    print("Target:", target_text)

Input : b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target: b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [None]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64
# Size of the shuffle buffer passed to `Dataset.shuffle`.
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than re-wrapping
# `dataset` itself. With the self-referential form, running this cell a
# second time would shuffle and batch data that is already batched.
# `tf.data.AUTOTUNE` is the stable name for the deprecated
# `tf.data.experimental.AUTOTUNE`.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>