In [1]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import numpy as np
import os
import time

In [None]:
# Fetch the combined Austen + Shakespeare corpus; get_file returns a local path.
path_to_file = tf.keras.utils.get_file(
    'austen_plus_shakespeare.txt',
    'https://raw.githubusercontent.com/sethlinares/rnn-texts-cse450/main/austen_plus_shakespeare.txt')
# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# closes the file handle as soon as the bytes have been read.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print('Length of text: {} characters'.format(len(text)))

Downloading data from https://raw.githubusercontent.com/sethlinares/rnn-texts-cse450/main/austen_plus_shakespeare.txt
Length of text: 10296344 characters


In [None]:
# Show the first 200 characters of the decoded corpus as a quick sanity check.
print(text[:200])

VOLUME I



CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings of
existence; and had lived nearly twenty-


In [None]:
# The vocabulary is every distinct character in the corpus, in sorted order.
vocab = sorted(set(text))
print('{} unique characters'.format(len(vocab)))
print(vocab)

103 unique characters
['\t', '\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '}', '£', 'À', 'Æ', 'Ç', 'É', 'à', 'â', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'œ', '—', '‘', '’', '“', '”', '…']


In [None]:
# Forward lookup layer: character -> integer id, built from the corpus vocabulary.
ids_from_chars = preprocessing.StringLookup(
    vocabulary=list(vocab),
)
# Inverse lookup layer: integer id -> character. It is built from the forward
# layer's own vocabulary so both layers agree on the id assignment.
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True,
)

def text_from_ids(ids):
  """Convert a tensor of character ids back into a Python string.

  The ids are mapped to characters with `chars_from_ids`, joined along the
  last axis, and the resulting bytes are decoded as UTF-8.
  """
  chars = chars_from_ids(ids)
  joined = tf.strings.reduce_join(chars, axis=-1)
  return joined.numpy().decode("utf-8")

In [2]:
# Round-trip check, step 1: map a few characters to their integer ids.
testids = ids_from_chars(["T", "r", "u", "t", "h"])
testids

NameError: name 'ids_from_chars' is not defined

In [None]:
# Round-trip check, step 2: map the ids back to characters.
chars_from_ids(testids)

<tf.Tensor: shape=(5,), dtype=string, numpy=array([b'T', b'r', b'u', b't', b'h'], dtype=object)>

In [None]:
# Round-trip check, step 3: join the recovered characters into one string.
testString = text_from_ids( testids )
testString

'Truth'

In [None]:
# Split the whole corpus into UTF-8 characters and convert each one to its id.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(10296344,), dtype=int64, numpy=array([48, 41, 38, ..., 45,  2,  2])>

In [3]:
# Wrap the id tensor in a tf.data pipeline that yields one character id per element.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

NameError: name 'all_ids' is not defined

In [None]:
# Chunks hold seq_length + 1 ids: split_input_target drops one id from each end,
# leaving seq_length inputs and seq_length targets. drop_remainder discards a
# final chunk that would be shorter than that.
seq_length = 100
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one position.

    The input is the sequence without its last element and the target is the
    sequence without its first element, so the target at each position is the
    element that follows the input at that position.
    """
    return sequence[:-1], sequence[1:]


# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [None]:
# Decode one (input, target) pair to text to confirm the one-character offset.
for input_example, target_example in  dataset.take(1):
    print("Input: ", text_from_ids(input_example))
    print("--------")
    print("Target: ", text_from_ids(target_example))

Input:  VOLUME I



CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happ
--------
Target:  OLUME I



CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy


In [4]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000

# Shuffle within a BUFFER_SIZE-element buffer, group the pairs into full
# batches, and prefetch. NOTE: this cell reads and rebinds `dataset`, so
# running it twice would batch the already-batched data again.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

dataset

NameError: name 'dataset' is not defined

In [None]:
!pip install tensorflow-addons

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (591 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/591.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m591.0/591.0 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.20.0 typeguard-2.13.3


In [None]:
import tensorflow_addons as tfa
class austen_plus_shakespeare(tf.keras.Model):
  """Character-level language model.

  Pipeline: embedding -> dropout -> two stacked LSTMs -> dropout -> dense layer
  producing one logit per vocabulary entry at every sequence position.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the layers.

    Args:
      vocab_size: Number of ids in the vocabulary; also the width of the output.
      embedding_dim: Size of each character embedding vector.
      rnn_units: Number of units in each of the two LSTM layers.
    """
    super().__init__()
    # Both LSTMs return the full output sequence plus their final (h, c) state.
    lstm_options = dict(return_sequences=True, return_state=True)

    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.dropout1 = tf.keras.layers.Dropout(0.1)
    self.rnn1 = tf.keras.layers.LSTM(rnn_units, **lstm_options)
    self.rnn2 = tf.keras.layers.LSTM(rnn_units, **lstm_options)
    self.dropout2 = tf.keras.layers.Dropout(0.1)
    # No activation: the dense layer emits raw logits.
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run a forward pass.

    Args:
      inputs: Character ids to embed.
      states: Optional pair of initial states, one per LSTM. When None, each
        LSTM's own initial state is used.
      return_state: When true, also return the final LSTM states.
      training: Forwarded to every layer (enables dropout during training).

    Returns:
      The logits, or `(logits, [(h1, c1), (h2, c2)])` when `return_state` is set.
    """
    embedded = self.embedding(inputs, training=training)
    embedded = self.dropout1(embedded, training=training)

    if states is None:
      states = [
          self.rnn1.get_initial_state(embedded),
          self.rnn2.get_initial_state(embedded),
      ]

    first_seq, h1, c1 = self.rnn1(embedded, initial_state=states[0], training=training)
    second_seq, h2, c2 = self.rnn2(first_seq, initial_state=states[1], training=training)

    regularized = self.dropout2(second_seq, training=training)
    logits = self.dense(regularized, training=training)

    if not return_state:
      return logits
    return logits, [(h1, c1), (h2, c2)]

# The output width comes from the lookup layer's vocabulary (not `vocab`), so it
# matches every id that `ids_from_chars` can produce.
model = austen_plus_shakespeare(
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=512,
    rnn_units=2048,
)


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
# Run one batch through the untrained model to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(1024, 100, 104) # (batch_size, sequence_length, vocab_size)


In [None]:
# Print the per-layer parameter counts.
model.summary()

Model: "austen_plus_shakespeare"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  53248     
                                                                 
 dropout (Dropout)           multiple                  0         
                                                                 
 lstm (LSTM)                 multiple                  20979712  
                                                                 
 lstm_1 (LSTM)               multiple                  33562624  
                                                                 
 dropout_1 (Dropout)         multiple                  0         
                                                                 
 dense (Dense)               multiple                  213096    
                                                                 
Total params: 54,808,680
Trainable params: 

In [None]:
# from_logits=True because the model's final Dense layer has no activation, and
# the targets are integer character ids rather than one-hot vectors.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss)




In [None]:
# Stop once the monitored loss has not improved for 5 epochs and restore the
# best weights seen. NOTE: no validation data is passed to fit, so the monitored
# 'loss' is the training loss.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
model.fit(dataset, epochs=150, callbacks=[early_stopping])

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7f61cfd8abf0>

In [None]:
!pip install language-tool-python

Collecting language-tool-python
  Downloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.7.1


In [None]:
import language_tool_python

class OneStep(tf.keras.Model):
  """Wraps a trained model to sample one next character per call."""

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=.5):
    """Store the model and lookup layers and build the logit mask.

    Args:
      model: Model called with `inputs`, `states` and `return_state=True`.
      chars_from_ids: Layer mapping ids back to characters.
      ids_from_chars: Layer mapping characters to ids.
      temperature: Divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Build a vocabulary-sized vector that is -inf at the ids of '' and
    # '[UNK]' and zero elsewhere; adding it to the logits suppresses those ids.
    vocab_size = len(ids_from_chars.get_vocabulary())
    skip_ids = self.ids_from_chars(['','[UNK]'])[:, None]
    blocked_values = [-float('inf')] * len(skip_ids)
    sparse_mask = tf.SparseTensor(
        values=blocked_values,
        indices=skip_ids,
        dense_shape=[vocab_size])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask, validate_indices=False)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: String tensor holding the text generated so far (or the prompt).
      states: Model states returned by the previous call, or None to start fresh.

    Returns:
      A `(next_chars, states)` pair: the sampled characters and the updated
      model states to pass to the next call.
    """
    # Strings -> characters -> ids, converted from ragged to a dense tensor.
    split_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(split_chars).to_tensor()

    all_logits, states = self.model(
        inputs=input_ids, states=states, return_state=True)

    # Keep only the last position, apply the temperature, then the mask.
    last_logits = all_logits[:, -1, :] / self.temperature
    last_logits = last_logits + self.prediction_mask

    # Sample one id per row and drop the trailing num_samples axis.
    sampled_ids = tf.squeeze(
        tf.random.categorical(last_logits, num_samples=1), axis=-1)

    return self.chars_from_ids(sampled_ids), states


In [None]:
import difflib


# Wrap the trained model for single-step sampling (default temperature).
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)


# Start with no carried-over state and seed the generator with a prompt.
states = None
next_char = tf.constant(['The world seemed like such a peaceful place until the magic tree was discovered in London.'])
result = [next_char]

# Generate 1000 characters one at a time, feeding each sampled character and
# the returned states back into the next step.
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the prompt and all sampled characters into one string tensor.
result = tf.strings.join(result)

generated_text = result[0].numpy().decode('utf-8')
print("Generated text before correction:")
print(generated_text)


def correct_text(text):
    """Return `text` with LanguageTool's suggested corrections applied.

    Args:
        text: The string to proofread.

    Returns:
        The value returned by `LanguageTool.correct(text)` for 'en-US'.
    """
    tool = language_tool_python.LanguageTool('en-US')
    try:
        return tool.correct(text)
    finally:
        # Release the checker explicitly instead of relying on garbage
        # collection, so repeated calls do not accumulate live instances.
        tool.close()

# Run the grammar checker over the generated sample and show the result.
corrected_text = correct_text(generated_text)
print("\nGenerated text after correction:")
print(corrected_text)

def print_diff(text1, text2):
    """Print a line-by-line `difflib.ndiff` comparison of two strings.

    Every diff entry is printed on its own row, including entries for a final
    input line that has no trailing newline.

    Args:
        text1: The original text.
        text2: The revised text.
    """
    diff = difflib.ndiff(text1.splitlines(keepends=True), text2.splitlines(keepends=True))
    # ndiff copies each input line verbatim after its two-character marker, so
    # an unterminated last line would otherwise run straight into the next entry.
    terminated = (line if line.endswith('\n') else line + '\n' for line in diff)
    print(''.join(terminated))
# Show which lines the grammar checker changed.
print("\nDifferences between the generated text and the corrected text:")
print_diff(generated_text, corrected_text)



Generated text before correction:
The world seemed like such a peaceful place until the magic tree was discovered in London.

The only door and a new intends were false, as the fathers do, and on many
account.  Mrs. Weston was the very month with child.  She had been used
by either and Mrs. Dashwood, and they all come downstairs. Fanny was left with her
in the land, the difference was great. Had she expected that one on the wedding night
she rescended thus unborn in the custom, she said, ‘The pity of it,
    then their mother’s order to another, to steal the stains.


                   45

The chine of all the fair ladies is grown,
Good gracious war, her lips are comprehended,
    Not many doubts and honourable lands.
It is such a common that is honourable.
What was the one?

CORIOLANUS.
The god of soldiers,
With the consent of such time’s time to bear neglect
What we can do now with thee on him they both
To stop and through every occasion
Now to have it but bounds and scorns to heave