#  Chapter 16: Natural Language Processing with RNNs and Attention

In [26]:
import tensorflow as tf
from pathlib import Path

In [2]:

shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [6]:
all_chars  = "".join(sorted(set(shakespeare_text.lower())))
print(all_chars)
print(len(all_chars))


 !$&',-.3:;?abcdefghijklmnopqrstuvwxyz
39


## Generating Shakespearean Text Using a Character RNN

### Preparing Dataset for a char level rnn model

#### Text Vectorization

In [11]:
text_vec_layer = tf.keras.layers.TextVectorization(split='character', standardize='lower')
text_vec_layer.adapt([shakespeare_text])
encoded = text_vec_layer([shakespeare_text])[0]
encoded

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [13]:
encoded -= 2 # drop 0 for padding and 1 for unkown tokens
n_tokens = text_vec_layer.vocabulary_size()-2
dataset_size = len(encoded)
print("n_tokens:", n_tokens)
print("dataset_size:", dataset_size)

n_tokens: 39
dataset_size: 1115394


it is seq2seq model

In [16]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    dataset = tf.data.Dataset.from_tensor_slices(sequence)
    dataset = dataset.window(length+1, shift=1, drop_remainder =True)
    dataset = dataset.flat_map(lambda window: window.batch(length+1))
    if shuffle:
      dataset = dataset.shuffle(buffer_size=100_000, seed=seed)
    dataset = dataset.batch(batch_size)
    return dataset.map(lambda window: (window[:,:-1], window[:,1:])).prefetch(1)

In [None]:
# There's just one sample in this dataset: the input represents "to b" and the
# output represents "o be"
list(to_dataset(text_vec_layer(["To be"])[0], length=4))

In [17]:
length = 100
tf.random.set_seed(42)

train_set = to_dataset(encoded[:1_000_000], length=100, shuffle=True, seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=100)
test_set = to_dataset(encoded[1_060_000:], length=100)


### Building and Training the Char-RNN Model


In [19]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation='softmax'),
])


model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam',
              metrics=['accuracy'])

model_ckpt = tf.keras.callbacks.ModelCheckpoint(
 "my_shakespeare_model.keras", monitor="val_accuracy", save_best_only=True)

In [21]:
model.summary()

In [20]:
# history = model.fit(train_set, validation_data=valid_set, epochs=4,
#  callbacks=[model_ckpt])


Epoch 1/4
  31245/Unknown [1m417s[0m 13ms/step - accuracy: 0.5475 - loss: 1.4964

  self.gen.throw(typ, value, traceback)


[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m432s[0m 13ms/step - accuracy: 0.5475 - loss: 1.4964 - val_accuracy: 0.5328 - val_loss: 1.6016
Epoch 2/4


KeyboardInterrupt: 

#### shakespeare model

In [22]:
# shakespeare_model = tf.keras.Sequential([
#     text_vec_layer,
#     tf.keras.layers.Lambda(lambda X:X-2),
#     model
# ])

In [33]:
# downloads the pretrained model
url = "https://github.com/sayedgamal99/Data-Science/blob/main/Educational/Hands-On-Machine-Learning/CH16%20Natural%20Language%20Processing%20with%20RNNs%20and%20Attention/models/"
path = tf.keras.utils.get_file("shakespeare_model.keras", url)
shakespeare_model = tf.keras.models.load_model(path)

ValueError: File not found: filepath=/root/.keras/datasets/shakespeare_model.keras. Please ensure the file is an accessible `.keras` zip file.

In [32]:
model_path

PosixPath('/root/.keras/datasets/shakespeare_model.keras')

In [31]:
path

'/root/.keras/datasets/shakespeare_model.keras'

In [24]:
shakespeare_model.summary()

TypeError: 'NoneType' object is not subscriptable

predicting next character:

In [None]:
y_propas = model.predict(['To be or not to b'])[0]
print(y_propas, y_propas.shape)
text_vec_layer.get_vocabulary()[y_propas[-1]+2]
