In [41]:
import tensorflow as tf

import numpy as np
import os
import time

In [42]:
# Location of the training corpus. Set the POEM_TEXT_PATH environment variable
# to read the file from somewhere else; the original local path is the fallback.
path_to_file = os.environ.get(
    "POEM_TEXT_PATH",
    "/Users/AnaPSilva/Documents/Ana/Ironhack/Bootcamp/Final_Project/Data/Poem_Play/emilydickinson.txt",
)

In [43]:
# Read the corpus as bytes and decode it explicitly as UTF-8. The context
# manager closes the file handle as soon as the read has finished.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

Length of text: 69212 characters


In [44]:
# Preview the opening of the corpus (its first 250 characters)
preview_length = 250
print(text[:preview_length])

I.                 
LIFE.

        I.

I'm nobody!  Who are you?
Are you nobody, too?
Then there 's a pair of us -- don't tell!
They 'd banish us, you know.

How dreary to be somebody!
How public, like a frog
To tell your name the livelong day
To an 


In [45]:
# Distinct characters found in the corpus, in sorted order
unique_chars = set(text)
vocab = sorted(unique_chars)
print(f'{len(vocab)} unique characters')

64 unique characters


In [46]:
## The model needs numeric input, so every character must become a numeric ID.
## The first step is to split the text into character tokens.
example_texts = ['abcdefg', 'xyz']

chars = tf.strings.unicode_split(input=example_texts, input_encoding='UTF-8')
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [47]:
## Lookup layer that maps character tokens to integer IDs
ids_from_chars = tf.keras.layers.StringLookup(
    mask_token=None,
    vocabulary=list(vocab),
)
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[39, 40, 41, 42, 43, 44, 45], [62, 63, 64]]>

In [48]:
## Inverse lookup: built from the vocabulary of ids_from_chars so that
## IDs can be turned back into human-readable characters.
lookup_vocabulary = ids_from_chars.get_vocabulary()
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=lookup_vocabulary, mask_token=None, invert=True)

In [49]:
## Map the IDs back to character tokens with the inverted lookup
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [50]:
## Join the characters of each row back into a single string (one per example).
tf.strings.reduce_join(chars, axis=-1).numpy()

array([b'abcdefg', b'xyz'], dtype=object)

In [51]:
def text_from_ids(ids):
  """Turn token IDs into text.

  The IDs are mapped to characters with `chars_from_ids`, and the characters
  are then concatenated along the last axis.
  """
  decoded_chars = chars_from_ids(ids)
  return tf.strings.reduce_join(decoded_chars, axis=-1)

#### Create training examples and targets

In [52]:
## Turn the whole corpus into one stream of character IDs.
text_chars = tf.strings.unicode_split(text, 'UTF-8')
all_ids = ids_from_chars(text_chars)
all_ids

<tf.Tensor: shape=(69212,), dtype=int64, numpy=array([22, 10,  2, ..., 46, 43,  3])>

In [53]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
# Show the first ten characters of the stream. The loop variable is named
# char_id so that it does not overwrite the notebook-level `ids` tensor.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))

I
.
 
 
 
 
 
 
 
 


In [54]:
# Number of characters in each model input sequence
seq_length = 100
# Each example consumes seq_length+1 characters: the input plus the one-step-shifted target
examples_per_epoch = len(text)//(seq_length+1)
examples_per_epoch

685

In [55]:
## Group the character-ID stream into fixed-size chunks of seq_length+1 IDs;
## a final chunk that would come up short is dropped.
sequences = ids_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

for seq in sequences.take(count=1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'I' b'.' b' ' b' ' b' ' b' ' b' ' b' ' b' ' b' ' b' ' b' ' b' ' b' '
 b' ' b' ' b' ' b' ' b' ' b'\n' b'L' b'I' b'F' b'E' b'.' b'\n' b'\n' b' '
 b' ' b' ' b' ' b' ' b' ' b' ' b' ' b'I' b'.' b'\n' b'\n' b'I' b"'" b'm'
 b' ' b'n' b'o' b'b' b'o' b'd' b'y' b'!' b' ' b' ' b'W' b'h' b'o' b' '
 b'a' b'r' b'e' b' ' b'y' b'o' b'u' b'?' b'\n' b'A' b'r' b'e' b' ' b'y'
 b'o' b'u' b' ' b'n' b'o' b'b' b'o' b'd' b'y' b',' b' ' b't' b'o' b'o'
 b'?' b'\n' b'T' b'h' b'e' b'n' b' ' b't' b'h' b'e' b'r' b'e' b' ' b"'"
 b's' b' ' b'a'], shape=(101,), dtype=string)


In [56]:
## Joining the tokens back into strings makes it easier to see
## what each sequence contains; show the first five.

for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

b"I.                 \nLIFE.\n\n        I.\n\nI'm nobody!  Who are you?\nAre you nobody, too?\nThen there 's a"
b" pair of us -- don't tell!\nThey 'd banish us, you know.\n\nHow dreary to be somebody!\nHow public, like "
b'a frog\nTo tell your name the livelong day\nTo an admiring bog!\n\n\n\n\n        II.\n\nI bring an unaccustome'
b'd wine\nTo lips long parching, next to mine,\nAnd summon them to drink.\n\nCrackling with fever, they ess'
b'ay;\nI turn my brimming eyes away,\nAnd come next hour to look.\n\nThe hands still hug the tardy glass;\nT'


In [57]:
def split_input_target(sequence):
    """Build an (input, target) pair from one sequence.

    The input is the sequence without its last element and the target is the
    sequence without its first, so target[t] is the element that follows
    input[t] at every timestep.
    """
    return sequence[:-1], sequence[1:]

In [58]:
## Pair every sequence with its one-step-shifted target, then show one pair.
dataset = sequences.map(map_func=split_input_target)
for input_example, target_example in dataset.take(1):
    input_bytes = text_from_ids(input_example).numpy()
    target_bytes = text_from_ids(target_example).numpy()
    print("Input :", input_bytes)
    print("Target:", target_bytes)

Input : b"I.                 \nLIFE.\n\n        I.\n\nI'm nobody!  Who are you?\nAre you nobody, too?\nThen there 's "
Target: b".                 \nLIFE.\n\n        I.\n\nI'm nobody!  Who are you?\nAre you nobody, too?\nThen there 's a"


#### Create training batches

In [59]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from `dataset`
# itself, so that re-running this cell does not batch already-batched data.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

#### Build The Model

- **tf.keras.layers.Embedding:** The input layer. A trainable lookup table that will map each character-ID to a vector with embedding_dim dimensions;
- **tf.keras.layers.GRU:** A type of RNN with size units=rnn_units (You can also use an LSTM layer here.)
- **tf.keras.layers.Dense:** The output layer, with vocab_size outputs. It outputs one logit for each character in the vocabulary. These logits are the model's unnormalized log-likelihoods for each character.

In [60]:
# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

# Length of the vocabulary in chars
vocab_size = len(vocab)
display(vocab_size)

64

In [61]:
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  For every timestep the model outputs one logit per vocabulary entry.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the layers.

    Args:
      vocab_size: Number of token IDs; sizes both the embedding table and
        the output layer.
      embedding_dim: Width of each embedding vector.
      rnn_units: Number of GRU units.
    """
    # The base constructor takes no positional arguments here; passing
    # `self` a second time hands it a stray positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences gives an output for every timestep; return_state also
    # returns the final state so that generation can be continued later.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token IDs.

    Args:
      inputs: Token IDs to embed and feed through the GRU.
      states: GRU state to start from. When None, the state is obtained
        from `self.gru.get_initial_state`.
      return_state: When True, also return the GRU state after the last step.
      training: Forwarded to every layer.

    Returns:
      The logits, or a `(logits, states)` tuple when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [62]:
# The model's vocabulary size must match the `StringLookup` layers,
# so it is read from the lookup layer's vocabulary.
model = MyModel(
    rnn_units=rnn_units,
    embedding_dim=embedding_dim,
    vocab_size=len(ids_from_chars.get_vocabulary()))

#### Try the model

In [63]:
# Run one batch through the untrained model to check the output shape.
# The loop variables are reused by later cells, so their names matter.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [64]:
# Print the model's layers and their parameter counts
model.summary()

Model: "my_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     multiple                  16640     
                                                                 
 gru_1 (GRU)                 multiple                  3938304   
                                                                 
 dense_1 (Dense)             multiple                  66625     
                                                                 
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [65]:
## Sample one next-character index per timestep from the logits the model
## produced for the first example in the batch.
first_example_logits = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_example_logits, num_samples=1),
    axis=-1,
).numpy()

## This gives us, at each timestep, a prediction of the next character index
sampled_indices

array([62, 24,  2, 27, 56, 39, 30, 13, 33, 39, 24, 13, 57, 22, 56, 10, 36,
       31, 51, 31,  1, 59, 24, 25, 45, 61, 29, 37, 13, 33, 13, 62, 48, 39,
       13,  8, 19, 17, 36, 43, 20, 62, 30, 56, 48, 24, 56, 42, 36, 13, 44,
       20, 25, 50, 42, 12, 23, 53, 35,  5,  8,  5, 16, 55,  1, 44, 22, 13,
       49, 26,  4, 61, 20, 27,  3,  9, 29, 51, 52, 36, 19, 61, 55,  5,  7,
       61, 30, 49,  5,  0, 25, 29, 20, 29, 15, 20,  8, 19, 23, 61])

In [66]:
## Decode the input and the sampled indices back to text to see what the
## still-untrained model predicts.
input_text = text_from_ids(input_example_batch[0]).numpy()
predicted_text = text_from_ids(sampled_indices).numpy()
print("Input:\n", input_text)
print()
print("Next Char Predictions:\n", predicted_text)

Input:
 b'r dimples, too.\n\nI left the place with all my might, --\nMy prayer away I threw;\nThe quiet ages picke'

Next Char Predictions:
 b'xK NraQ?TaK?sIr.WRmR\nuKLgwPX?T?xja?,FDWeGxQrjKrdW?fGLld;JoV\',\'Cq\nfI?kM"wGN!-PmnWFwq\')wQk\'[UNK]LPGPBG,FJw'


#### Train the model
- At this point the problem can be treated as a standard classification problem: given the previous RNN state and the input at this time step, predict the class of the next character.

In [67]:
# The model outputs one logit per character, so the loss is configured with from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [68]:
# Mean loss of the untrained model on the example batch
example_batch_mean_loss = loss(
    y_true=target_example_batch,
    y_pred=example_batch_predictions)
prediction_shape = example_batch_predictions.shape
print("Prediction shape: ", prediction_shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 100, 65)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.17461, shape=(), dtype=float32)


In [69]:
# Exponential of the mean loss. For this untrained model it comes out close to
# the vocabulary size (65), i.e. the predictions are near-uniform.
tf.exp(example_batch_mean_loss).numpy()

65.01449

In [70]:
## Configure training with tf.keras.Model.compile:
## the Adam optimizer with default arguments, plus the loss function defined earlier.
model.compile(optimizer='adam', loss=loss)

In [71]:
# Checkpoints are written under this directory
checkpoint_dir = './training_checkpoints'
# File name pattern; {epoch} is filled in with the epoch number
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_poem_emilydickinson_{epoch}")

# Save only the weights (not the full model) to the checkpoint path
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [72]:
# Number of passes over the dataset; checkpoint_callback is invoked during training
EPOCHS = 225
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/225
Epoch 2/225
Epoch 3/225
Epoch 4/225
Epoch 5/225
Epoch 6/225
Epoch 7/225
Epoch 8/225
Epoch 9/225
Epoch 10/225
Epoch 11/225
Epoch 12/225
Epoch 13/225
Epoch 14/225
Epoch 15/225
Epoch 16/225
Epoch 17/225
Epoch 18/225
Epoch 19/225
Epoch 20/225
Epoch 21/225
Epoch 22/225
Epoch 23/225
Epoch 24/225
Epoch 25/225
Epoch 26/225
Epoch 27/225
Epoch 28/225
Epoch 29/225
Epoch 30/225
Epoch 31/225
Epoch 32/225
Epoch 33/225
Epoch 34/225
Epoch 35/225
Epoch 36/225
Epoch 37/225
Epoch 38/225
Epoch 39/225
Epoch 40/225
Epoch 41/225
Epoch 42/225
Epoch 43/225
Epoch 44/225
Epoch 45/225
Epoch 46/225
Epoch 47/225
Epoch 48/225
Epoch 49/225
Epoch 50/225
Epoch 51/225
Epoch 52/225
Epoch 53/225
Epoch 54/225
Epoch 55/225
Epoch 56/225
Epoch 57/225
Epoch 58/225
Epoch 59/225
Epoch 60/225
Epoch 61/225
Epoch 62/225
Epoch 63/225
Epoch 64/225
Epoch 65/225
Epoch 66/225
Epoch 67/225
Epoch 68/225
Epoch 69/225
Epoch 70/225
Epoch 71/225
Epoch 72/225
Epoch 73/225
Epoch 74/225
Epoch 75/225
Epoch 76/225
Epoch 77/225
Epoch 78

In [73]:
class OneStep(tf.keras.Model):
  """Single-step text generator wrapped around a character model.

  Each call to `generate_one_step` feeds the given text through the model,
  samples one next character per batch entry and returns it together with
  the model state, so the caller can continue generating from there.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the model and lookups and build the "[UNK]" mask.

    Args:
      model: Model called as `model(inputs=..., states=..., return_state=True)`
        and returning `(logits, states)`.
      chars_from_ids: Lookup mapping token IDs to characters.
      ids_from_chars: Lookup mapping characters to token IDs.
      temperature: Positive divisor applied to the logits before sampling.

    Raises:
      ValueError: If `temperature` is not strictly positive.
    """
    super().__init__()
    # The logits are divided by the temperature: zero would produce inf/NaN
    # logits and a negative value would invert the model's preferences.
    if temperature <= 0:
      raise ValueError(f"temperature must be positive, got {temperature!r}")
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: Strings to continue; they are split into UTF-8 characters.
      states: Model state from a previous call, or None to start fresh.

    Returns:
      A `(predicted_chars, states)` tuple: the sampled characters and the
      model state to pass to the next call.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [74]:
# Wrap the trained model for one-character-at-a-time generation (default temperature)
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [76]:
# Number of characters to generate after the seed character.
NUM_GENERATE = 1000

# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by adjustments to the system clock.
start = time.perf_counter()
states = None
# Seed the generation with a single space.
next_char = tf.constant([' '])
result = [next_char]

# Feed each sampled character, plus the returned state, back into the model.
for n in range(NUM_GENERATE):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.perf_counter()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

 a livid claw.

The birds put up the bars to nests,
The cattle fled to barns;
There came one drop of eventained
In any out atain.

I then I wook a deeace that died, -- their fister from the rask
Of vauterful rosts
      The hills just tell the others clause,
And cleebed the ged his stare!




        XXII. 

     PRECODEN SEC.

The room with thee itself I sure, --
And what time the weaver sleeps
   Who spun the breadths of blue!

Write me how the hiles -- that Jesuon out.

  
            VII. 

       IN THE GHEDED.

Be only of the bode!

But I, gropp
dis stirp it erstaming chair,

Some, too frain sele alone,
"thing in the ones that Mie.

The brave a sort mengle of the baye
I hourd I 'm  'r the old horizonts to deam.

The hauph we touch the smmeres pouth,
She ere the heaven recore.

Putites, night, will just agoning;
And triund, and voried
That you, so late, consider me,
The sparrow of your
Beautie lows away.

Obr fore that die aster
For every little knoll,
Busy neetless, on fermocr,
T

In [77]:
# Export the generator (model plus lookup layers) as a SavedModel in the 'Emily Dickinson' directory
tf.saved_model.save(one_step_model, 'Emily Dickinson')



2022-03-12 11:41:25.051861: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: Emily Dickinson/assets


INFO:tensorflow:Assets written to: Emily Dickinson/assets
