In [1]:
import tensorflow as tf

import numpy as np
import os
import time

In [2]:
# Location of the training corpus (a UTF-8 text file).
# Set the POEM_TEXT_PATH environment variable to point the notebook at a copy
# of the file on another machine; when it is unset, the original author's
# local path is used unchanged.
path_to_file = os.environ.get(
    "POEM_TEXT_PATH",
    "/Users/AnaPSilva/Documents/Ana/Ironhack/Bootcamp/Final_Project/Data/Poem_Play/maya_angelou.txt")

In [3]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8.
# The context manager closes the file handle as soon as the read finishes
# (the bare open(...).read() left it open until garbage collection).
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

Length of text: 26102 characters


In [4]:
# Sanity check: preview the first 250 characters of the corpus.
print(text[:250])

THE BLACK FAMILY PLEDGE

BECAUSE we have forgotten our ancestors,
our children no longer give us honor.

BECAUSE we have lost the path our ancestors cleared
kneeling in perilous undergrowth,
our children cannot find their way.

BECAUSE we have banish


In [5]:
# Character-level vocabulary: every distinct character in `text`.
# Sorting the set gives a stable ordering for the lookup layers built from it.
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

63 unique characters


In [6]:
## Before training, the strings must be converted to a numerical representation:
## each character is mapped to a numeric ID.
## The lookup layer works on tokens, so the text is first split into characters.
## (`example_texts` is a toy input used only to demonstrate the round trip.)
example_texts = ['abcdefg', 'xyz']

# Split each string into its UTF-8 characters; the two strings have different
# lengths, so the displayed result is a RaggedTensor.
chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

2022-03-12 12:40:07.058820: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [7]:
## From tokens to character IDs: a StringLookup layer built on the corpus vocabulary.
## mask_token=None: no mask token is configured for this layer.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)
# Map the demo characters to their integer IDs.
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[38, 39, 40, 41, 42, 43, 44], [61, 62, 63]]>

In [8]:
## The inverse mapping (invert=True) recovers human-readable characters from IDs.
## It is built from ids_from_chars.get_vocabulary() so both layers share one vocabulary.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

In [9]:
# Round-trip check: decode the demo IDs back into characters.
# Note: this rebinds `chars`, which was first assigned in the unicode_split demo cell.
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [10]:
## Join the characters along the last axis to get one string per example.
tf.strings.reduce_join(chars, axis=-1).numpy()

array([b'abcdefg', b'xyz'], dtype=object)

In [11]:
def text_from_ids(ids):
  """Decode character IDs into text.

  The IDs are mapped to characters with the module-level `chars_from_ids`
  lookup layer, and the characters are then concatenated along the last axis.

  Args:
    ids: character IDs accepted by `chars_from_ids`.

  Returns:
    A string tensor in which the last axis of `ids` has been joined away.
  """
  decoded_chars = chars_from_ids(ids)
  joined = tf.strings.reduce_join(decoded_chars, axis=-1)
  return joined

#### Create training examples and targets

In [12]:
## Convert the whole corpus into a flat stream of character IDs
## (split `text` into UTF-8 characters, then look each one up).
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(26102,), dtype=int64, numpy=array([32, 21, 18, ...,  1,  1,  1])>

In [13]:
# Dataset that yields the corpus one character ID at a time.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
# Print the first ten characters as a sanity check.
# The loop variable is deliberately not named `ids`: that name holds the demo
# ID tensor from the StringLookup cell, and rebinding it here would change the
# result of re-running the round-trip cell that decodes `ids`.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))

T
H
E
 
B
L
A
C
K
 


In [14]:
# Number of characters in each model input.
seq_length = 100
# Training examples are cut from chunks of seq_length+1 characters, so this
# is the number of full chunks the corpus provides.
examples_per_epoch = len(text)//(seq_length+1)
examples_per_epoch

258

In [15]:
## Group the stream of individual character IDs into chunks of seq_length+1.
## drop_remainder=True discards a final chunk shorter than that size.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first chunk decoded back to characters.
for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'T' b'H' b'E' b' ' b'B' b'L' b'A' b'C' b'K' b' ' b'F' b'A' b'M' b'I'
 b'L' b'Y' b' ' b'P' b'L' b'E' b'D' b'G' b'E' b'\n' b'\n' b'B' b'E' b'C'
 b'A' b'U' b'S' b'E' b' ' b'w' b'e' b' ' b'h' b'a' b'v' b'e' b' ' b'f'
 b'o' b'r' b'g' b'o' b't' b't' b'e' b'n' b' ' b'o' b'u' b'r' b' ' b'a'
 b'n' b'c' b'e' b's' b't' b'o' b'r' b's' b',' b'\n' b'o' b'u' b'r' b' '
 b'c' b'h' b'i' b'l' b'd' b'r' b'e' b'n' b' ' b'n' b'o' b' ' b'l' b'o'
 b'n' b'g' b'e' b'r' b' ' b'g' b'i' b'v' b'e' b' ' b'u' b's' b' ' b'h'
 b'o' b'n' b'o'], shape=(101,), dtype=string)


In [16]:
## The chunking is easier to inspect once the tokens are joined back into
## strings: print the first ten chunks.

for seq in sequences.take(10):
  print(text_from_ids(seq).numpy())

b'THE BLACK FAMILY PLEDGE\n\nBECAUSE we have forgotten our ancestors,\nour children no longer give us hono'
b'r.\n\nBECAUSE we have lost the path our ancestors cleared\nkneeling in perilous undergrowth,\nour childre'
b'n cannot find their way.\n\nBECAUSE we have banished the God of our ancestors,\nour children cannot pray'
b'.\n\nBECAUSE the old wails of our ancestors have faded beyond our hearing,\nour children cannot hear us '
b'crying.\n\nBECAUSE we have abandoned our wisdom of mothering and fathering,\nour befuddled children give'
b' birth to children\nthey neither want nor understand.\n\nBECAUSE we have forgotten how to love, the adve'
b'rsary is within our\ngates, an holds us up to the mirror of the world shouting,\n"Regard the loveless"\n'
b'\nTherefore we pledge to bind ourselves to one another, to embrace our\nlowliest, to keep company with '
b'our loneliest, to educate our illiterate,\nto feed our starving, to clothe our ragged, to do all good '
b'things,\nknowing that we are

In [17]:
def split_input_target(sequence):
    """Build an (input, target) pair from one chunk of the corpus.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so at every position the target holds
    the element that follows the corresponding input element.

    Args:
        sequence: any sliceable sequence (tensor, list, string, ...).

    Returns:
        A tuple ``(input_text, target_text)`` of two slices of `sequence`,
        each one element shorter than `sequence`.
    """
    return sequence[:-1], sequence[1:]

In [18]:
# Turn every chunk into an (input, target) pair via split_input_target.
dataset = sequences.map(split_input_target)
# Print the first pair: the target is the input shifted by one character.
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'THE BLACK FAMILY PLEDGE\n\nBECAUSE we have forgotten our ancestors,\nour children no longer give us hon'
Target: b'HE BLACK FAMILY PLEDGE\n\nBECAUSE we have forgotten our ancestors,\nour children no longer give us hono'


#### Create training batches

In [19]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from the previous
# value of `dataset`: the former `dataset = dataset.shuffle(...).batch(...)`
# batched an already batched dataset whenever this cell was run a second time.
# Starting from the source makes the cell idempotent and yields the same
# pipeline on a clean top-to-bottom run.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    # drop_remainder=True keeps every batch at exactly BATCH_SIZE examples.
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE replaces the deprecated tf.data.experimental.AUTOTUNE alias.
    .prefetch(tf.data.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

#### Build The Model

- **tf.keras.layers.Embedding:** The input layer. A trainable lookup table that maps each character ID to a vector with `embedding_dim` dimensions.
- **tf.keras.layers.GRU:** A type of RNN with size `units=rnn_units` (you can also use an LSTM layer here).
- **tf.keras.layers.Dense:** The output layer, with `vocab_size` outputs. It outputs one logit for each character in the vocabulary. These are the log-likelihoods of each character according to the model.

In [20]:
# Number of distinct characters in the corpus.
# NOTE(review): the model cell sizes its layers from
# len(ids_from_chars.get_vocabulary()) rather than from this variable, and the
# prediction shape printed later shows 64 outputs, one more than this value.
vocab_size = len(vocab)
display(vocab_size)
# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

63

In [21]:
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  Given a batch of character-ID sequences, the model returns one vector of
  `vocab_size` logits per input position (the Dense layer has no activation).
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the three layers.

    Args:
      vocab_size: number of distinct character IDs; used both as the
        embedding table size and as the number of output logits.
      embedding_dim: size of each character embedding vector.
      rnn_units: number of units in the GRU layer.
    """
    # Fix: the base initializer is called without arguments. The original
    # `super().__init__(self)` passed the instance as a stray positional
    # argument, unlike the OneStep class in this notebook.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences: emit an output for every timestep;
    # return_state: also hand back the final state so that generation can be
    # resumed from it.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of character IDs.

    Args:
      inputs: batch of character-ID sequences.
      states: optional GRU state to start from; when None the layer's
        initial state is used.
      return_state: when True, also return the GRU state after the last step.
      training: forwarded to every layer.

    Returns:
      The logits, or a `(logits, states)` tuple when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [22]:
# Instantiate the character model with the hyper-parameters chosen above.
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

#### Try the model

In [23]:
# Smoke test: run the untrained model on one batch and print the output shape.
# The loop variables and `example_batch_predictions` are reused by the
# sampling and loss cells that follow.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 64) # (batch_size, sequence_length, vocab_size)


In [24]:
# Print the model's layers and their parameter counts.
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  16384     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  65600     
                                                                 
Total params: 4,020,288
Trainable params: 4,020,288
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Sample one next-character ID per timestep from the logits of the first
# example in the batch, then drop the trailing num_samples axis.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

## This gives us, at each timestep, a prediction of the next character index
sampled_indices

array([ 4, 12, 52,  1, 17, 31, 22, 18, 46, 58, 29, 43, 21, 28, 21, 55, 12,
       56, 27, 54, 56,  6,  7, 50,  7,  0,  0, 32, 25,  2, 41,  0, 42, 33,
        0, 28, 55, 33, 14, 45, 52, 47, 20, 44, 39, 30, 43, 15,  9, 53, 29,
       29, 46, 13, 28, 19,  5, 60, 30, 50,  1, 42, 31, 36, 15, 30, 13, 50,
        9, 62, 17, 39, 44, 10, 50,  0, 48, 47, 15, 57, 56, 41, 54, 45, 39,
       29, 38, 60, 32, 20, 22, 32, 43, 63, 33, 12, 15, 25, 14, 23])

In [26]:
## Decode the first input of the batch and the sampled IDs to see the text
## predicted by this untrained model.
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b' Garden of Babylon\nHanging as eternal beauty\nIn our collective memory\nNot the Grand Canyon\nKindled i'

Next Char Predictions:
 b"':o\nDSIEiuPfHOHr:sNqs),m,[UNK][UNK]TL d[UNK]eU[UNK]OrUAhojGgbRfB.pPPi?OF(wRm\neSYBR?m.yDbg0m[UNK]kjBtsdqhbPawTGITfzU:BLAJ"


#### Train the model
- At this point the problem can be treated as a standard classification problem: given the previous RNN state and the input at this time step, predict the class of the next character.

In [27]:
# Cross-entropy over integer character IDs. from_logits=True because the
# model's final Dense layer is created without an activation.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [28]:
# Loss of the still-untrained model on the example batch, as a sanity check.
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 100, 64)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.1589293, shape=(), dtype=float32)


In [29]:
# Exponential of the mean loss; compare it with the number of output classes
# shown in the prediction shape above.
tf.exp(example_batch_mean_loss).numpy()

64.00296

In [30]:
## Configure the training procedure with tf.keras.Model.compile:
## the Adam optimizer (selected by name, so with default arguments) and the
## `loss` object defined above.
model.compile(optimizer='adam', loss=loss)

In [31]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; the `{epoch}` placeholder makes the name
# differ from one epoch to the next.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_poem_mayaangelou_{epoch}")

# save_weights_only=True: store only the weights, not the full model.
# NOTE(review): the training cell runs 350 epochs, so this presumably leaves
# one checkpoint per epoch on disk - confirm that is intended.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [32]:
# Number of passes over the training dataset.
EPOCHS = 350
# Train the model, writing checkpoints through checkpoint_callback;
# `history` holds the value returned by fit.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/350
Epoch 2/350
Epoch 3/350
Epoch 4/350
Epoch 5/350
Epoch 6/350
Epoch 7/350
Epoch 8/350
Epoch 9/350
Epoch 10/350
Epoch 11/350
Epoch 12/350
Epoch 13/350
Epoch 14/350
Epoch 15/350
Epoch 16/350
Epoch 17/350
Epoch 18/350
Epoch 19/350
Epoch 20/350
Epoch 21/350
Epoch 22/350
Epoch 23/350
Epoch 24/350
Epoch 25/350
Epoch 26/350
Epoch 27/350
Epoch 28/350
Epoch 29/350
Epoch 30/350
Epoch 31/350
Epoch 32/350
Epoch 33/350
Epoch 34/350
Epoch 35/350
Epoch 36/350
Epoch 37/350
Epoch 38/350
Epoch 39/350
Epoch 40/350
Epoch 41/350
Epoch 42/350
Epoch 43/350
Epoch 44/350
Epoch 45/350
Epoch 46/350
Epoch 47/350
Epoch 48/350
Epoch 49/350
Epoch 50/350
Epoch 51/350
Epoch 52/350
Epoch 53/350
Epoch 54/350
Epoch 55/350
Epoch 56/350
Epoch 57/350
Epoch 58/350
Epoch 59/350
Epoch 60/350
Epoch 61/350
Epoch 62/350
Epoch 63/350
Epoch 64/350
Epoch 65/350
Epoch 66/350
Epoch 67/350
Epoch 68/350
Epoch 69/350
Epoch 70/350
Epoch 71/350
Epoch 72/350
Epoch 73/350
Epoch 74/350
Epoch 75/350
Epoch 76/350
Epoch 77/350
Epoch 78

In [33]:
class OneStep(tf.keras.Model):
  """Single-step text sampler built around a trained character model.

  Each call to `generate_one_step` takes the current input strings and an
  optional recurrent state, and returns one sampled next character per input
  string together with the updated state.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the collaborators and precompute the "[UNK]" mask.

    Args:
      model: model called as `model(inputs=..., states=..., return_state=True)`
        and returning `(logits, states)`.
      chars_from_ids: lookup layer mapping IDs to characters.
      ids_from_chars: lookup layer mapping characters to IDs.
      temperature: divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive mask over the vocabulary: -inf at the "[UNK]" position and 0
    # everywhere else, so that "[UNK]" can never be sampled.
    unk_indices = self.ids_from_chars(['[UNK]'])[:, None]
    neg_inf_values = [-float('inf')]*len(unk_indices)
    vocabulary_length = len(ids_from_chars.get_vocabulary())
    self.prediction_mask = tf.sparse.to_dense(
        tf.SparseTensor(
            indices=unk_indices,
            values=neg_inf_values,
            # One mask entry per vocabulary item.
            dense_shape=[vocabulary_length]))

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for every string in `inputs`.

    Args:
      inputs: string tensor with the text to continue.
      states: recurrent state returned by a previous call, or None.

    Returns:
      A `(predicted_chars, states)` tuple: the sampled characters and the
      state to pass to the next call.
    """
    # Strings -> characters -> IDs, converted to a dense tensor.
    token_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # logits.shape is [batch, char, next_char_logits]
    logits, states = self.model(inputs=token_ids, states=states,
                                return_state=True)

    # Keep the last position only, rescale by the temperature, then add the
    # mask that rules out "[UNK]".
    last_logits = logits[:, -1, :]
    last_logits = last_logits/self.temperature
    last_logits = last_logits + self.prediction_mask

    # Draw one ID per batch entry and drop the num_samples axis.
    sampled_ids = tf.squeeze(
        tf.random.categorical(last_logits, num_samples=1), axis=-1)

    # IDs back to characters, returned along with the model state.
    return self.chars_from_ids(sampled_ids), states

In [34]:
# Wrap the trained model for sampling; temperature is left at its default of 1.0.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [37]:
# Generate text: seed with 'Life ', then sample 500 characters one at a time,
# feeding each sampled character and the returned state back into the sampler.
start = time.time()
states = None
next_char = tf.constant(['Life '])
result = [next_char]

for n in range(500):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the seed and all sampled characters into one string tensor
# (this rebinds `result` from a list to a tensor).
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

Life ppanos
upon the laughte is muth and ages.
But today, the Rock cries out to us, clearler she pity we have lived through and live through still,
Have sharpened our senses and tough.
She sent them away,
underground, overland, in coaches and
shoeless.

When you learn, teachers she crick co sace ever in the gings of hearived and the shenes

When we come to it
We, this people, on this wayward, floating body
Created on this earth, ofreg noedgnigg, the nears,
The nearh of my haust,
by lough you a bodder 

________________________________________________________________________________

Run time: 2.1359128952026367


In [38]:
# Export the sampler as a SavedModel into the 'Maya Angelou' directory.
tf.saved_model.save(one_step_model, 'Maya Angelou')





INFO:tensorflow:Assets written to: Maya Angelou/assets


INFO:tensorflow:Assets written to: Maya Angelou/assets
