<a href="https://colab.research.google.com/github/rgobinat/NLP-Projects/blob/master/Shakespeare_Text_Generation_with_an_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
import numpy as np


#Data Preprocessing
#Load File
# get_file returns a local path to the corpus fetched from the given URL.
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

#define Vocab
# Read the raw bytes inside a context manager so the file handle is closed
# as soon as the read finishes, then decode them as UTF-8.
with open(path_to_file, 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding = 'utf-8')
# Sorted list of the distinct characters that occur in the corpus.
vocab = sorted(set(text))

#Char to ID & Id to Char using StringLookup Model
# Forward lookup: character -> integer ID (no mask token is reserved).
ids_from_chars = tf.keras.layers.StringLookup(vocabulary = list(vocab), mask_token = None)
# Inverse lookup built from the forward layer's own vocabulary so that both
# directions agree on the ID assignment.
chars_from_ids = tf.keras.layers.StringLookup(vocabulary = ids_from_chars.get_vocabulary(),invert=True, mask_token=None )
def text_from_ids(ids):
  """Turn token IDs back into text.

  Looks up the character for every ID with ``chars_from_ids`` and joins the
  characters along the last axis into a single string tensor.
  """
  chars = chars_from_ids(ids)
  joined = tf.strings.reduce_join(chars, axis=-1)
  return joined

#Prepare Dataset for Training
# Split the corpus into single characters and map each one to its ID.
all_ids = ids_from_chars(
    tf.strings.unicode_split(text, input_encoding='UTF-8'))

#Create a TensorFlow DataSet
# One dataset element per character ID.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# Group the ID stream into chunks of seq_length + 1 IDs; the extra ID lets each
# chunk be split into an input and a one-step-shifted target of length
# seq_length. A trailing chunk that is too short is dropped.
seq_length = 100
sequences = ids_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)
#Split sequences into Xs and Ys
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair shifted by one position.

    The input is ``sequence`` without its last element and the target is
    ``sequence`` without its first element, so ``target[i]`` is the element
    that follows ``input[i]`` in the original sequence.
    """
    return sequence[:-1], sequence[1:]

# Turn every (seq_length + 1)-long chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

#Dataset to input to Model
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64
# Size of the shuffle buffer: elements are drawn at random from a buffer of
# this many pairs rather than from the whole dataset at once.
BUFFER_SIZE = 10000
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    # Drop the final short batch so every batch has exactly BATCH_SIZE rows.
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE replaces the deprecated tf.data.experimental.AUTOTUNE
    # alias; it lets tf.data choose the prefetch buffer size.
    .prefetch(tf.data.AUTOTUNE))

In [4]:
#DO NOT RUN THIS
#Same Code as above **** Ignore
# NOTE(review): this cell repeats the preprocessing of the first code cell.
# Running it rebinds ids_from_chars, chars_from_ids, all_ids, ids_dataset,
# seq_length, sequences, BATCH_SIZE, BUFFER_SIZE and dataset, and defines the
# helpers under different names (texts_from_ids / split_sequences instead of
# text_from_ids / split_input_target). It still relies on `text` and `vocab`
# from the first cell.
# Character -> ID lookup and its inverse, sharing one vocabulary.
ids_from_chars = tf.keras.layers.StringLookup(vocabulary = list(vocab), mask_token = None)
chars_from_ids = tf.keras.layers.StringLookup(vocabulary = ids_from_chars.get_vocabulary(), invert = True, mask_token = None)
def texts_from_ids(ids):
  """Look up the character for each ID and join them along the last axis."""
  return tf.strings.reduce_join(chars_from_ids(ids), axis = -1)
# Whole corpus as a stream of character IDs, one dataset element per ID.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
# Chunks of seq_length + 1 IDs; the extra ID provides the shifted target.
seq_length = 100
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)
def split_sequences(sequence):
  """Return (sequence without last element, sequence without first element)."""
  input_text = sequence[:-1]
  target_text = sequence[1:]
  return input_text, target_text
dataset = sequences.map(split_sequences)
# Training batch size and shuffle-buffer size.
BATCH_SIZE=64
BUFFER_SIZE=10000
# Shuffle the pairs, group them into fixed-size batches, and prefetch.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [3]:
import os as os

#Switch the runtime to GPU for faster model training

#Define, Compile and Fit the Model
# Model hyperparameters.
# Number of entries in the lookup layer's vocabulary; used for both the
# embedding table size and the width of the output layer.
vocab_size = len(ids_from_chars.get_vocabulary())
# Embedding vector width and number of GRU units.
embedding_dim, rnn_units = 256, 1024

class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: Number of vocabulary entries; sizes the embedding table and
      the Dense output (one logit per entry).
    embedding_dim: Width of the embedding vectors.
    rnn_units: Number of units in the GRU layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # super() already binds the instance; passing `self` again hands the base
    # constructor a stray positional argument, which newer Keras rejects.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences gives an output for every timestep; return_state also
    # returns the final state so generation can carry it to the next call.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    # No activation: the layer emits raw logits.
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Compute next-character logits for every position of `inputs`.

    Args:
      inputs: Batch of token-ID sequences.
      states: Optional GRU state to start from. When None the GRU starts from
        its own default initial state.
      return_state: When True, also return the GRU's final state.
      training: Forwarded to the layers.

    Returns:
      The logits, or a `(logits, states)` tuple when `return_state` is True.
    """
    x = self.embedding(inputs, training=training)
    # Passing initial_state=None lets the GRU create its default initial state
    # itself, avoiding get_initial_state(), whose expected argument differs
    # between Keras versions.
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x

model = MyModel(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units)


#Try the Model - Untrained
# Push one batch through the untrained model to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

# Sample one ID per timestep from the logits of the first example in the batch,
# then drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()

# Show the input text next to what the untrained model samples for it.
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())
#TRY


#Train the Model
# from_logits=True because the model's final Dense layer has no activation and
# therefore outputs raw logits.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

# Loss of the untrained model on the example batch taken above.
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)
# A bare expression in the middle of a cell is evaluated and thrown away, so
# print exp(mean loss) explicitly to make it appear in the output.
print("exp(mean loss):   ", tf.exp(example_batch_mean_loss).numpy())

model.compile(optimizer='adam', loss=loss)
model.summary()

#Execute Training
# Number of passes over the dataset.
EPOCHS = 20

#Configure Checkpoints
# Checkpoints go into this directory; "{epoch}" in the file name is a
# placeholder that the ModelCheckpoint callback fills in when it saves.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights rather than the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix)

# Train, keeping the returned History object for later inspection.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])


(64, 100, 66) # (batch_size, sequence_length, vocab_size)
Input:
 b" seeth not this palpable device?\nYet who's so blind, but says he sees it not?\nBad is the world; and "

Next Char Predictions:
 b"W\nNx:gZbZ:FgJ[UNK]lwB:EPfr,TDF;fSsn;QWNF;RhTGHsiwJNExuY'ztiFOY;VTmJ3'YSlkagxuEBO BXZF:XWCKrlxzLzyccnkXrq"
Prediction shape:  (64, 100, 66)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.189696, shape=(), dtype=float32)
Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  16896     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  67650     
                                                                 
Total 

In [6]:
#Generate Text
import time

# Prompt the generation loop below starts from; it is fed to the model as the
# first input and also forms the beginning of the printed result.
seed_text = 'Shylock:'

class OneStep(tf.keras.Model):
  """Wraps a model so that text can be generated one character per call.

  Args:
    model: Model called as `model(inputs=..., states=..., return_state=True)`
      and returning `(logits, states)`.
    chars_from_ids: Lookup layer mapping token IDs to characters.
    ids_from_chars: Lookup layer mapping characters to token IDs.
    temperature: Divisor applied to the logits before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Build an additive mask over the vocabulary that holds -inf at the ID of
    # "[UNK]" and 0 everywhere else, so "[UNK]" can never be sampled.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocab_len = len(ids_from_chars.get_vocabulary())
    unk_mask = tf.SparseTensor(
        indices=unk_ids,
        values=[-float('inf')] * len(unk_ids),
        dense_shape=[vocab_len])
    self.prediction_mask = tf.sparse.to_dense(unk_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each string in `inputs`.

    Args:
      inputs: Batch of strings to condition on.
      states: Model state from the previous step, or None for the first step.

    Returns:
      A `(predicted_chars, states)` tuple: the sampled characters and the
      model state to pass to the next call.
    """
    # Strings -> characters -> token IDs, densified from ragged form.
    token_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # logits has shape [batch, char, next_char_logits].
    logits, states = self.model(
        inputs=token_ids, states=states, return_state=True)

    # Keep only the last position, scale by the temperature, then add the mask
    # that rules out "[UNK]".
    last_logits = logits[:, -1, :]
    last_logits = last_logits / self.temperature
    last_logits = last_logits + self.prediction_mask

    # Draw one token ID per batch row and drop the num_samples axis.
    sampled_ids = tf.squeeze(
        tf.random.categorical(last_logits, num_samples=1), axis=-1)

    # Token IDs -> characters, returned together with the updated state.
    return self.chars_from_ids(sampled_ids), states
one_step_model = OneStep(
    model=model,
    chars_from_ids=chars_from_ids,
    ids_from_chars=ids_from_chars)

# Time the whole generation, including joining the pieces.
start = time.time()

# Start without a state and with the seed prompt as the first input; every
# later step feeds back the character just sampled together with the state.
states = None
next_char = tf.constant([seed_text])
result = [next_char]
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result += [next_char]

# Concatenate the seed and all sampled characters into one string per batch row.
result = tf.strings.join(result)
end = time.time()

print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

Shylock: there's a my bloody, treacherous,
More than the world and should be his done. Come, officer:
The more he is faked for the new-deliver'd;
He shall appear it in yonder enough:'
bear ye good to him! Come on, come hither;
But as you will command your famous loan,
As rice prepare for all the which your pidges put by
likeness of the miracles of inevite:
But for the loving presence may have we? I will perform'd
My wretchedness but thirst will be my drops?

TYBROSS OF YORK:
Grave heart?

CORIOLANUS:
The thirf, your sheep-speed, sir.

BRUTUS:
'Tis dead;
By thieves I could with those that I love that he hath,
By 'twere expected in his holy house.
This lie, how canst thou made me spile away?

RATCLIFF:
Where is he?

GRUMIO:
Renowned all to the end.

CLAUDIO:
Then show't thyself will off you to the trial; and besee
The tread of traitor spices of histingly there,
By draw any meredest father's life to life
What you cannot had ere it proclaimed aid.
Alone his sister! kiss me ere itward son,
