# HW_04 Problem #3

<img src="assets/Hull-Robert-HW-04-881d387a.png" width="600" />

### This Code was written by Robert 'Quinn' Hull, and borrowed elements from several other resources:

* The bulk of this script is the TensorFlow article about text generation: https://www.tensorflow.org/tutorials/text/text_generation
* Much of the text was removed to make this shorter and more intelligible
* The text describing my work is available at the end of the script. 




In [212]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Setup

### Import TensorFlow and other libraries

In [265]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import os
import time

### Download the Shakespeare dataset


In [266]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

### Read the data

First, look in the text:

In [267]:
# Read, then decode for py2 compat.
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
# # subset training text (temporary)
# text = text[:11153]
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))
# The unique characters in the file
vocab = sorted(set(text))
print('{} unique characters'.format(len(vocab)))

Length of text: 1115394 characters
65 unique characters


## Process the text

### Vectorize the text


In [268]:
example_texts = ['abcdefg', 'xyz']

chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
print('characters in ', chars)

ids_from_chars = preprocessing.StringLookup(
    vocabulary=list(vocab))

ids = ids_from_chars(chars)
ids

print('IDs ', ids)

chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True)

chars = chars_from_ids(ids)
chars

print('characters out ', chars)

# You can `tf.strings.reduce_join` to join the characters back into strings. 
def text_from_ids(ids):
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

characters in  <tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>
IDs  <tf.RaggedTensor [[41, 42, 43, 44, 45, 46, 47], [64, 65, 66]]>
characters out  <tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>


### Create training examples, targets, batches


In [269]:
# Create examples and targets
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# set constants
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)

# set batches
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# split dataset
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

# create training batches
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

## Build The Model

In [285]:
# models
class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True, 
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else: 
      return x

# new models, an LSTM
class NewModel1(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.lstm = tf.keras.layers.LSTM(rnn_units,
                                   return_sequences=True, 
                                   return_state=False) # change from GRU to LSTM
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    # print(states)
    # print(training)
    if states is None:
      states = self.lstm.get_initial_state(x)
    # print(states)
    x = self.lstm(x) # , initial_state=states) # , training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else: 
      return x

# new models2, an LSTM
class NewModel2(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.lstm = tf.keras.layers.LSTM(rnn_units,
                                   return_sequences=True, 
                                   return_state=True) # change from GRU to LSTM
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    # print(states)
    # print(training)
    if states is None:
      states = self.lstm.get_initial_state(x)
    # print(states)
    x = self.lstm(x , initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else: 
      return x

In [286]:
# model chars
# Length of the vocabulary in chars
vocab_size = len(vocab) # QH NOTE: this = 65. 

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = int(1024) # 1024

# set model
model = NewModel2(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

# optimizer and loss function
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss)

# set number of epochs
EPOCHS = 10

## Training


### Easy Train

In [287]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

# train model (naive)
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10


ValueError: ignored

## Generate text

The following makes a single step prediction:

In [273]:
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature=temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "" or "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['','[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices = skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())]) 
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits] 
    predicted_logits, states =  self.model(inputs=input_ids, states=states, 
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "" or "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)
    
    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [274]:
temp = 1.0
one_step_model = OneStep(model, chars_from_ids, ids_from_chars, temperature=temp)

In [275]:
start = time.time()
states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]

pred_char_len = 50

for n in range(pred_char_len):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()

print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)

print(f"\nRun time: {end - start}")

ROMEO:
Ag:
Ange notifaloulyo ced bleathird thostll?
Thit 

________________________________________________________________________________

Run time: 0.8481040000915527


## Evaluate

### Bleu
* From: https://towardsdatascience.com/how-to-evaluate-text-generation-models-metrics-for-automatic-evaluation-of-nlp-models-e1c251b04ec1
* BLEU is a precision focused metric that calculates n-gram overlap of the reference and generated texts.

In [276]:
# prep
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu

def bleu(ref, gen):
    ''' 
    calculate pair wise bleu score. uses nltk implementation
    Args:
        For word comparison
          references : a list of reference sentences 
          candidates : a list of candidate(generated) sentences
        For character comparison
          references : a list of reference sentences 
          candidates : a list of candidate(generated) sentences
    Returns:
        bleu score(float)
    '''
    ref_bleu = []
    gen_bleu = []
    print('test1')
    for l in gen:
        print('test2')
        gen_bleu.append(l.split())
    print(gen_bleu)
    for i,l in enumerate(ref):
        print('test3')
        ref_bleu.append([l.split()])
    print(ref_bleu)
    cc = SmoothingFunction()
    print(cc)
    score_bleu = corpus_bleu(ref_bleu, gen_bleu) 
    return score_bleu

In [277]:
bleu_score = 'NA' # bleu(['hillo'],['hello'])

# Final Comments and Text
* In general, the script experiments with:
  * hyperparameters: 
     * embedding_dim, rnn_units, temperature, epochs
  * Model Structure
    * LSTM v RNN
* Lingering questions I have:
  * I was mostly unsuccessful at using an LSTM for this. It's definitely in how I am setting up this model.
  * I never figured out how to properly assess this model. I started to experiment with the BLEU algorithm, but I wasn't sure this is built for character generation in this context. I.E., I wasn't sure how to apply this to our use-case where the text generated was mostly 'random'


In [278]:
cats = ['bleu_score', 'Training loss (final)', 'Training sequence length', 
        'Training Buffer Size', 'Training Epochs', 'Training Batch Size', 
        'RNN model: RNN units', 'RNN model: Embedding Dim', 
        'RNN model: Name of loss function', 'RNN model: Summary', 'RNN model: history object',
        'Prediction: character length', 'Prediction: temp constant', 'Prediction: result',
        'Prediction: model object']
      
print(cats, '\n')

# save1 = [bleu_score, history.history['loss'][-1], seq_length, 
#          BUFFER_SIZE, EPOCHS, BATCH_SIZE, 
#          rnn_units, embedding_dim,
#          loss.name, model, history,
#          pred_char_len, temp, result, 
#          one_step_model]
print('Model 1 is the `custom` model using the customized model object in the original code\n',
      'There is no calculation of bleu score for this one, because I havent figured out at this point\n',
      'how to best evaluate the model. \n')
print(save1, '\n\n')

# save2 = [bleu_score, history.history['loss'][-1], seq_length, 
#          BUFFER_SIZE, EPOCHS, BATCH_SIZE, 
#          rnn_units, embedding_dim,
#          loss.name, model, history,
#          pred_char_len, temp, result, 
#          one_step_model]
print('Model 2 is our baseline model using the class MyModel with architecture the same as in the original code\n',
      'From the persective of training loss, it performs nearly as well as the previous custom model\n',
      np.round(save2[1],2), 'versus', np.round(save1[1],2), '\n')
print(save2, '\n\n')

# save3 = [bleu_score, history.history['loss'][-1], seq_length, 
#          BUFFER_SIZE, EPOCHS, BATCH_SIZE, 
#          rnn_units, embedding_dim,
#          loss.name, model, history,
#          pred_char_len, temp, result, 
#          one_step_model]
print('Model 3 is our baseline model using the class MyModel with architecture the same as in the original code\n',
      'to highlight the impact of epochs on training performance, we have reduced it from',save2[4], 'to', save3[4], '\n',
      'From the persective of training loss, it performs worse than before',np.round(save3[1],2), 'versus', np.round(save2[1],2), '\n',
      'The predicted text is less coherent, too:\n', 'Model 3 :', save3[13].numpy()[0], '\n', 'Model 2 :', save2[13].numpy()[0], '\n')
print(save3, '\n\n')

# save4 = [bleu_score, history.history['loss'][-1], seq_length, 
#          BUFFER_SIZE, EPOCHS, BATCH_SIZE, 
#          rnn_units, embedding_dim,
#          loss.name, model, history,
#          pred_char_len, temp, result, 
#          one_step_model]
print('Model 4 is our baseline model using the class MyModel with architecture the same as in the original code\n',
      'to highlight the impact of epochs on training performance, we have increased it from',save3[4], 'to', save4[4], '\n',
      'From the persective of training loss, it performs better than before',np.round(save4[1],2), 'versus', np.round(save3[1],2), '\n',
      'The predicted text is way more coherent, almost Shakespearean:\n', 'Model 4 :', save4[13].numpy()[0], '\n', 'Model 3 :', save3[13].numpy()[0], '\n',
      'This performance bump does come at the expense of time, though (20s / epoch * 30 epochs = 10 minutes, \n')
print(save4, '\n\n')

# save5 = [bleu_score, history.history['loss'][-1], seq_length, 
#          BUFFER_SIZE, EPOCHS, BATCH_SIZE, 
#          rnn_units, embedding_dim,
#          loss.name, model, history,
#          pred_char_len, temp, result, 
#          one_step_model]
print('Model 5 is our baseline model using the class MyModel with architecture the same as in the original code\n',
      'to explore ways to speed up training and preserve text coherence, I reduce the size of the input text \n',
      'by two orders of magnitude: 1115394 characters to 11153 characters and keep epochs =',save5[4], '\n',
      'From the persective of training loss, it performs far worse than before',np.round(save5[1],2), 'versus', np.round(save4[1],2), '\n',
      'The predicted text is nonsense :\n', 'Model 6 :', save5[13].numpy()[0], '\n', 'Model 5 :', save4[13].numpy()[0], '\n',
      'This might be because of undertraining, or an issue in how weve indexed the vocabulary \n',
      'This saves training speed substantially! - 1 s / epoch \n')
print(save5, '\n\n')

# save6 = [bleu_score, history.history['loss'][-1], seq_length, 
#          BUFFER_SIZE, EPOCHS, BATCH_SIZE, 
#          rnn_units, embedding_dim,
#          loss.name, model, history,
#          pred_char_len, temp, result, 
#          one_step_model]
print('Model 6 is our baseline model using the class MyModel, returning to epochs = ', save2[4], 'and characters = 1115394\n',
      'Here we start to vary the architecture of the model, so rnn_units =', save6[6], 'up from', save2[6], '\n',
      'From the persective of training loss, it performs better than baseline (Model 2)',np.round(save6[1],2), 'versus', np.round(save2[1],2), '\n',
      'The predicted text is probably more coherent :\n', 'Model 6 :', save6[13].numpy()[0], '\n', 'Model 2 :', save2[13].numpy()[0], '\n',
      'Worth noting that the run-time nearly tripled, from 20 s / epoch to 55 s / epoch \n')
print(save6, '\n\n')

# save7 = [bleu_score, history.history['loss'][-1], seq_length, 
#          BUFFER_SIZE, EPOCHS, BATCH_SIZE, 
#          rnn_units, embedding_dim,
#          loss.name, model, history,
#          pred_char_len, temp, result, 
#          one_step_model]
print('Model 7 is our baseline model using the class MyModel \n',
      'Where we decrease the rnn_units =', save7[6], 'down from', save6[6], '\n',
      'From the persective of training loss, it performs worse than baseline (Model 2)',np.round(save7[1],2), 'versus', np.round(save2[1],2), '\n',
      'The predicted text is arguably no more or less coherent :\n', 'Model 7 :', save7[13].numpy()[0], '\n', 'Model 2 :', save2[13].numpy()[0], '\n',
      'Worth noting that the run-time didnt change very much \n ')
print(save7, '\n\n')

# save8 = [bleu_score, history.history['loss'][-1], seq_length, 
#          BUFFER_SIZE, EPOCHS, BATCH_SIZE, 
#          rnn_units, embedding_dim,
#          loss.name, model, history,
#          pred_char_len, temp, result, 
#          one_step_model]
print('Model 8 is our first LSTM model using the class NewModel1 \n',
      'Where we return the rnn_units =', save8[6], '\n',
      'From the persective of training loss, it performs _____ than baseline (Model 2)',np.round(save8[1],2), 'versus', np.round(save2[1],2), '\n',
      'The predicted text is totally incoherent :\n', 'Model 8 :', save8[13].numpy()[0], '\n',
      'I have definitely made an error in how I set up this LSTM network \n ')
print(save8, '\n\n')

save9 = [bleu_score, history.history['loss'][-1], seq_length, 
         BUFFER_SIZE, EPOCHS, BATCH_SIZE, 
         rnn_units, embedding_dim,
         loss.name, model, history,
         pred_char_len, temp, result, 
         one_step_model]
print('Model 9 is our 2nd LSTM model using the class NewModel2 \n',
      'Ive tried to mess around with the model so that it passes the state\n',
      'From the persective of training loss, it performs _____ than baseline (Model 2)',np.round(save9[1],2), 'versus', np.round(save2[1],2), '\n',
      'The predicted text is totally incoherent :\n', 'Model 9 :', save9[13].numpy()[0], '\n',
      'I have definitely made an error in how I set up this LSTM network \n ')
print(save9, '\n\n')



['bleu_score', 'Training loss (final)', 'Training sequence length', 'Training Buffer Size', 'Training Epochs', 'Training Batch Size', 'RNN model: RNN units', 'RNN model: Embedding Dim', 'RNN model: Name of loss function', 'RNN model: Summary', 'RNN model: history object', 'Prediction: character length', 'Prediction: temp constant', 'Prediction: result', 'Prediction: model object'] 

Model 1 is the `custom` model using the customized model object in the original code
 There is no calculation of bleu score for this one, because I havent figured out at this point
 how to best evaluate the model. 

['NA', 1.1910383701324463, 100, 10000, 10, 64, 1024, 256, 'sparse_categorical_crossentropy', <__main__.CustomTraining object at 0x7efee66c6cd0>, <tensorflow.python.keras.callbacks.History object at 0x7eff4b279410>, 50, 1.0, <tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'ROMEO:\nO, thy sun doth let\nRend her he should he knife.\n'],
      dtype=object)>, <__main__.OneStep object at 0x7efea