In [3]:
import tensorflow as tf

import numpy as np
import os
import time
import warnings
warnings.filterwarnings("ignore")

In [4]:
path_to_file = "/Users/AnaPSilva/Documents/Ana/Ironhack/Bootcamp/Final_Project/Data/Politic Speeches/trump_speeches.txt"

In [5]:
# Read, then decode for py2 compat.
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

Length of text: 1200199 characters


In [6]:
# Take a look at the first 250 characters in text
print(text[:250])

My fellow Americans: Four years ago, we launched a great national effort to rebuild our country, to renew its spirit, and to restore the allegiance of this government to its citizens. In short, we embarked on a mission to make America great again — f


In [7]:
# The unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

97 unique characters


In [8]:
## Before training, you need to convert the strings to a numerical representation.
## convert each character into a numeric ID. 
## It just needs the text to be split into tokens first.
example_texts = ['abcdefg', 'xyz']

chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

2022-03-13 12:50:03.394803: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [9]:
## from tokens to character IDs
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[60, 61, 62, 63, 64, 65, 66], [83, 84, 85]]>

In [10]:
## it will also be important to invert this representation and 
## recover human-readable strings from it.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

In [11]:
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [12]:
## join the characters back into strings.
tf.strings.reduce_join(chars, axis=-1).numpy()

array([b'abcdefg', b'xyz'], dtype=object)

In [13]:
def text_from_ids(ids):
  """join the characters back into strings"""
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

#### Create training examples and targets

In [14]:
## convert the text vector into a stream of character indices.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(1200199,), dtype=int64, numpy=array([43, 84,  1, ..., 62, 60, 13])>

In [15]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

M
y
 
f
e
l
l
o
w
 


In [16]:
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)
examples_per_epoch

11883

In [17]:
## The batch method lets you easily convert 
## these individual characters to sequences of the desired size.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'M' b'y' b' ' b'f' b'e' b'l' b'l' b'o' b'w' b' ' b'A' b'm' b'e' b'r'
 b'i' b'c' b'a' b'n' b's' b':' b' ' b'F' b'o' b'u' b'r' b' ' b'y' b'e'
 b'a' b'r' b's' b' ' b'a' b'g' b'o' b',' b' ' b'w' b'e' b' ' b'l' b'a'
 b'u' b'n' b'c' b'h' b'e' b'd' b' ' b'a' b' ' b'g' b'r' b'e' b'a' b't'
 b' ' b'n' b'a' b't' b'i' b'o' b'n' b'a' b'l' b' ' b'e' b'f' b'f' b'o'
 b'r' b't' b' ' b't' b'o' b' ' b'r' b'e' b'b' b'u' b'i' b'l' b'd' b' '
 b'o' b'u' b'r' b' ' b'c' b'o' b'u' b'n' b't' b'r' b'y' b',' b' ' b't'
 b'o' b' ' b'r'], shape=(101,), dtype=string)


In [18]:
## It's easier to see what this is doing if you join the 
## tokens back into strings

for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

b'My fellow Americans: Four years ago, we launched a great national effort to rebuild our country, to r'
b'enew its spirit, and to restore the allegiance of this government to its citizens. In short, we embar'
b'ked on a mission to make America great again \xe2\x80\x94 for all Americans.As I conclude my term as the 45th Pr'
b'esident of the United States, I stand before you truly proud of what we have achieved together. We di'
b'd what we came here to do \xe2\x80\x94 and so much more.This week, we inaugurate a new administration and pray f'


In [19]:
def split_input_target(sequence):
    """takes a sequence as input, duplicates, 
    and shifts it to align the input and label for each timestep"""
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

In [20]:
dataset = sequences.map(split_input_target)
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'My fellow Americans: Four years ago, we launched a great national effort to rebuild our country, to '
Target: b'y fellow Americans: Four years ago, we launched a great national effort to rebuild our country, to r'


#### Create training batches

In [21]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

#### Build The Model

- **tf.keras.layers.Embedding:** The input layer. A trainable lookup table that will map each character-ID to a vector with embedding_dim dimensions;
- **tf.keras.layers.GRU:** A type of RNN with size units=rnn_units (You can also use an LSTM layer here.)
- **tf.keras.layers.Dense:** The output layer, with vocab_size outputs. It outputs one logit for each character in the vocabulary. These are the log-likelihood of each character according to the model.

In [22]:
# Length of the vocabulary in chars
vocab_size = len(vocab)
display(vocab_size)
# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

97

In [23]:
class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [24]:
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

#### Try the model

In [25]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 98) # (batch_size, sequence_length, vocab_size)


In [26]:
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  25088     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  100450    
                                                                 
Total params: 4,063,842
Trainable params: 4,063,842
Non-trainable params: 0
_________________________________________________________________


In [27]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

## This gives us, at each timestep, a prediction of the next character index
sampled_indices

array([97, 91, 88, 93, 67, 85, 46, 69,  5, 79, 16, 61, 83, 44, 62, 10, 69,
       65, 36, 18,  1, 74, 69, 80, 75,  3, 89, 57, 85, 65,  1, 47, 54, 90,
       73, 36, 55, 70, 94, 75, 78, 82, 93, 57, 29,  2,  5, 14, 56, 36, 24,
       56,  7, 78, 36, 20, 49, 86, 43, 58, 59, 59,  0, 75, 97, 62, 36, 85,
       54, 32, 31,  2, 88, 78, 88, 14, 67, 12, 35, 70, 44, 54, 34, 87, 87,
       15, 26, 84, 93, 15, 67,  2, 30, 67, 33, 84, 87, 82, 45, 80])

In [28]:
## Decode these to see the text predicted by this untrained model
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b've a very special surprise. I am thrilled to inform you that your husband is back from deployment. H'

Next Char Predictions:
 b'\xe2\x80\xa6\xe2\x80\x93\xc3\xa9\xe2\x80\x98hzPj$t1bxNc)jfF3 ojup"\xc3\xb3[zf QX\xe2\x80\x91nFYk\xe2\x80\x99psw\xe2\x80\x98[>!$/ZF9Z&sF5S\xc3\xa0M\\]][UNK]p\xe2\x80\xa6cFzXBA!\xc3\xa9s\xc3\xa9/h-EkNXD\xc3\xa1\xc3\xa10;y\xe2\x80\x980h!?hCy\xc3\xa1wOu'


#### Train the model
- At this point the problem can be treated as a standard classification problem. Given the previous RNN state, and the input this time step, predict the class of the next character.

In [29]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [30]:
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 100, 98)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.5847845, shape=(), dtype=float32)


In [31]:
tf.exp(example_batch_mean_loss).numpy()

97.98207

In [32]:
## training procedure using the tf.keras.Model.compile method. 
## Use tf.keras.optimizers.Adam with default arguments and the loss function.
model.compile(optimizer='adam', loss=loss)

In [33]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_trump_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [34]:
EPOCHS = 50
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50
 12/185 [>.............................] - ETA: 13:11 - loss: 4.5777

KeyboardInterrupt: 

In [None]:
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [None]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [None]:
start = time.time()
states = None
next_char = tf.constant(['I '])
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

I see they’re buying? Because I want to tell you that scorie? I don’t make a good choice?” A lot of the Army wants to get a lot could be—Grabably around the country together, and what governors have done a really good. He was there. We’re stocking people anymore. You made it close, expected. Nobel President China concers and many other people are doing with them, AMy is only matters back into the love of our current system should need.Every American values, having tuese massive tax rule where destiny will never be very successful.We can add that Mariforn Tom Wal and-have machines to kill the very real crisis.THE PRESIDENT: I don’t want to say, “Could you do then, we’ll show you it’s and our roadbablish control. We’cled about our cooperation works, wash an incredible veruetion, because we all unjesten testing. And can we look into our cities, we appreciate it.But, yeah, please, we don’t want to give a listen to a national expand.But they should extend and the numbers — very important. (

In [None]:
tf.saved_model.save(one_step_model, 'Trump')



2022-03-12 09:14:32.051650: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: Trump/assets


INFO:tensorflow:Assets written to: Trump/assets
