<a href="https://colab.research.google.com/github/semaxspaul/semaxspaul_Shakespeare/blob/main/Shakespeare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, GRU
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [None]:
# Absolute path of the training corpus; the file must already be present there
# because the next step opens it for reading.
path_to_file = '/content/shakespeare.txt'

In [None]:
# Read the whole corpus into memory. The context manager closes the file handle
# deterministically, and the explicit encoding keeps decoding independent of
# the platform's default locale.
with open(path_to_file, 'r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

In [None]:
# Every distinct character of the corpus, in sorted order.
vocabulary = sorted({ch for ch in text})

In [None]:
# Number of distinct characters in the corpus.
VOCAB_SIZE = len(vocabulary)

In [None]:
# Lookup table from each vocabulary character to its position in `vocabulary`.
char_to_ind = dict(zip(vocabulary, range(len(vocabulary))))

In [None]:
# Inverse lookup: an array so that indexing with an array of ids decodes them all at once.
ind_to_char = np.asarray(vocabulary)

In [None]:
# The corpus as integer ids, one per character.
encoded_text = np.array(list(map(char_to_ind.__getitem__, text)))

In [None]:
# Display the shape of the encoded corpus (one id per character of `text`).
encoded_text.shape

(5445609,)

In [None]:
# Number of input characters per training sequence; choose it to suit the
# structure of the text (e.g. long enough to span a few lines).
SEQ_LEN = 120

In [None]:
# How many full chunks of SEQ_LEN + 1 characters fit in the corpus.
chunk_len = SEQ_LEN + 1
TOTAL_NUM_SEQ = len(text) // chunk_len
TOTAL_NUM_SEQ

45005

In [None]:
# Wrap the encoded corpus in a tf.data pipeline that yields one character id per element.
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [None]:
# Display the dataset's element spec.
char_dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [None]:
# Optional sanity check: uncomment to print the first 300 characters, one per line.
# for item in char_dataset.take(300):
#   print(ind_to_char[item.numpy()])

In [None]:
# Group the character stream into chunks of SEQ_LEN + 1 ids; the extra id lets
# each chunk be split into an input and a target shifted by one character.
# drop_remainder=True discards a trailing chunk that is shorter than that.
sequences = char_dataset.batch(SEQ_LEN+1, drop_remainder=True)

In [None]:
# Display the element spec of the chunked dataset.
sequences

<_BatchDataset element_spec=TensorSpec(shape=(121,), dtype=tf.int64, name=None)>

In [None]:
def create_seq_targets(seq):
  """Split one chunk into an (input, target) pair offset by one position.

  The input is everything except the last element and the target is
  everything except the first, e.g. "My name is Paul" gives
  ("My name is Pau", "y name is Paul").
  """
  return seq[:-1], seq[1:]

In [None]:
# Turn every chunk into an (input, target) pair via create_seq_targets.
dataset = sequences.map(create_seq_targets)

In [None]:
# Preview one (input, target) pair, both as raw ids and as decoded text.
# The loop variables are deliberately not called `input`, which would shadow
# the builtin input() for the rest of the kernel session.
for input_seq, target_seq in dataset.take(1):
  input_ids = input_seq.numpy()
  target_ids = target_seq.numpy()
  print(input_ids)
  print(''.join(ind_to_char[input_ids]))
  print('\n')
  print(target_ids)
  print(''.join(ind_to_char[target_ids]))

[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75]

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But


[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1]
                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But 


In [None]:
# Number of sequences per training batch.
BATCH_SIZE = 128

In [None]:
buffer_size = 10000  # Shuffle within a 10000-sequence buffer, not the whole dataset

# Build the training pipeline from `sequences` rather than rebinding `dataset`
# onto itself: re-running this cell then yields the same pipeline instead of
# batching data that was already batched.
dataset = (
    sequences
    .map(create_seq_targets)
    .shuffle(buffer_size)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
# Display the element spec of the batched training dataset.
dataset

<_BatchDataset element_spec=(TensorSpec(shape=(128, 120), dtype=tf.int64, name=None), TensorSpec(shape=(128, 120), dtype=tf.int64, name=None))>

In [None]:
# Size of the embedding vector learned for each character; adjust relative to VOCAB_SIZE.
EMBED_DIM = 64
# Number of units in the recurrent (GRU) layer.
RNN_NEURONS = 1024

In [None]:
def sparse_cat_loss(y_true, y_pred):
  """Sparse categorical cross-entropy that treats `y_pred` as raw logits.

  A named wrapper is needed because the loss passed to compile() is called
  with only (y_true, y_pred), leaving no place to set from_logits=True.
  """
  loss = sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
  return loss

In [None]:
def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
  """Build and compile the character-level model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: Number of distinct characters; used as the embedding's input
      size and as the width of the final Dense layer.
    embed_dim: Size of the embedding vector for each character.
    rnn_neurons: Number of units in the GRU layer.
    batch_size: Fixed batch dimension of the input; the GRU is stateful, so
      the batch size is baked into the model's input shape.

  Returns:
    A Sequential model compiled with the Adam optimizer and sparse_cat_loss.
  """
  layers = [
      # The sequence length is left as None so inputs of any length are accepted.
      Embedding(
          input_dim=vocab_size,
          output_dim=embed_dim,
          batch_input_shape=[batch_size, None],
      ),
      GRU(
          rnn_neurons,
          return_sequences=True,
          stateful=True,
          recurrent_initializer='glorot_uniform',
      ),
      # No activation is specified: the layer emits one raw score per character.
      Dense(vocab_size),
  ]

  model = Sequential(layers)
  model.compile(optimizer='adam', loss=sparse_cat_loss)
  return model

In [None]:
# Training model: its batch dimension is fixed to BATCH_SIZE.
model = create_model(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    rnn_neurons=RNN_NEURONS,
    batch_size=BATCH_SIZE,
)

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (128, None, 64)           5376      
                                                                 
 gru (GRU)                   (128, None, 1024)         3348480   
                                                                 
 dense (Dense)               (128, None, 84)           86100     
                                                                 
Total params: 3439956 (13.12 MB)
Trainable params: 3439956 (13.12 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Pull a single batch and run the still-untrained model on it to inspect the output.
input_example_batch, target_example_batch = next(iter(dataset.take(1)))
example_batch_predictions = model(input_example_batch)

In [None]:
# Display the shape of the predictions for the example batch.
example_batch_predictions.shape

TensorShape([128, 120, 84])

In [None]:
# Sample one character id per timestep from the predictions for the first
# sequence of the example batch.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)

In [None]:
# Display the sampled ids (one row per timestep).
sampled_indices

<tf.Tensor: shape=(120, 1), dtype=int64, numpy=
array([[17],
       [50],
       [63],
       [56],
       [43],
       [69],
       [35],
       [66],
       [71],
       [33],
       [19],
       [16],
       [65],
       [68],
       [29],
       [43],
       [24],
       [ 8],
       [ 6],
       [50],
       [74],
       [54],
       [30],
       [25],
       [74],
       [61],
       [11],
       [17],
       [ 9],
       [41],
       [12],
       [74],
       [23],
       [56],
       [24],
       [53],
       [65],
       [79],
       [25],
       [ 5],
       [ 5],
       [75],
       [54],
       [69],
       [29],
       [33],
       [ 9],
       [56],
       [75],
       [21],
       [67],
       [13],
       [66],
       [32],
       [23],
       [10],
       [ 7],
       [40],
       [46],
       [46],
       [80],
       [12],
       [ 1],
       [30],
       [18],
       [63],
       [81],
       [12],
       [48],
       [56],
       [24],
       [19],
       [78],
   

In [None]:
# Flatten the (timesteps, 1) samples into a 1-D NumPy array of ids.
# reshape(-1) is used instead of squeeze(axis=-1) so that re-running this cell
# on the already-flattened array is a no-op rather than an error.
sampled_indices = tf.reshape(sampled_indices, [-1]).numpy()

In [None]:
# Display the flattened ids.
sampled_indices

array([17, 50, 63, 56, 43, 69, 35, 66, 71, 33, 19, 16, 65, 68, 29, 43, 24,
        8,  6, 50, 74, 54, 30, 25, 74, 61, 11, 17,  9, 41, 12, 74, 23, 56,
       24, 53, 65, 79, 25,  5,  5, 75, 54, 69, 29, 33,  9, 56, 75, 21, 67,
       13, 66, 32, 23, 10,  7, 40, 46, 46, 80, 12,  1, 30, 18, 63, 81, 12,
       48, 56, 24, 19, 78, 77, 68, 35, 33, 83, 66, 58, 82, 79,  6, 68, 35,
       20, 47, 29, 52, 42, 33, 42, 72, 78, 80,  7, 45, 14, 25, 62, 31, 54,
       39, 27, 60, 65, 81, 70, 76, 67, 44, 22, 45,  1, 55, 79, 37, 24, 77,
       76])

In [None]:
# Decode the sampled ids back to characters. The model has not been trained at
# this point in the notebook, so the result is not expected to be readable.
ind_to_char[sampled_indices]

array(['6', 'Y', 'h', 'a', 'R', 'n', 'J', 'k', 'p', 'H', '8', '5', 'j',
       'm', 'D', 'R', '>', ',', '(', 'Y', 's', '_', 'E', '?', 's', 'f',
       '0', '6', '-', 'P', '1', 's', '<', 'a', '>', ']', 'j', 'x', '?',
       "'", "'", 't', '_', 'n', 'D', 'H', '-', 'a', 't', ':', 'l', '2',
       'k', 'G', '<', '.', ')', 'O', 'U', 'U', 'y', '1', ' ', 'E', '7',
       'h', 'z', '1', 'W', 'a', '>', '8', 'w', 'v', 'm', 'J', 'H', '}',
       'k', 'c', '|', 'x', '(', 'm', 'J', '9', 'V', 'D', '[', 'Q', 'H',
       'Q', 'q', 'w', 'y', ')', 'T', '3', '?', 'g', 'F', '_', 'N', 'B',
       'e', 'j', 'z', 'o', 'u', 'l', 'S', ';', 'T', ' ', '`', 'x', 'L',
       '>', 'v', 'u'], dtype='<U1')

In [None]:
# Train for a fixed number of passes over the shuffled, batched dataset.
epochs = 10
model.fit(x=dataset, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7816921a1540>

In [None]:
# Persist the trained model under a Google Drive path.
# NOTE(review): this presumably requires Drive to be mounted at /content/drive
# beforehand; no mount step is visible in this notebook — confirm.
model.save('/content/drive/MyDrive/shakespeare')

In [None]:
# Rebuild the same architecture with a batch size of 1 for text generation:
# the stateful GRU fixes the batch dimension, and generation feeds a single
# sequence at a time. The trained weights are then loaded into the new model.
model_two = create_model(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    rnn_neurons=RNN_NEURONS,
    batch_size=1,
)

model_two.load_weights('/content/drive/MyDrive/shakespeare')

model_two.build(tf.TensorShape([1, None]))

In [None]:
# Print the summary of the batch-size-1 generation model.
model_two.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (1, None, 64)             5376      
                                                                 
 gru_2 (GRU)                 (1, None, 1024)           3348480   
                                                                 
 dense_2 (Dense)             (1, None, 84)             86100     
                                                                 
Total params: 3439956 (13.12 MB)
Trainable params: 3439956 (13.12 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
def generate_text(model, start_seed, gen_size=500, temp=1.0):
  """Generate text one character at a time from a stateful character model.

  Args:
    model: Model that is called on a (1, n) batch of character ids, returns
      per-timestep scores over the vocabulary, and provides reset_states().
    start_seed: Non-empty string used to prime the model. Every character
      must be a key of char_to_ind.
    gen_size: Number of characters to generate after the seed.
    temp: Sampling temperature; the scores are divided by it before sampling.
      Must be greater than 0.

  Returns:
    start_seed followed by the gen_size generated characters.

  Raises:
    ValueError: If temp is not positive or start_seed is empty.
    KeyError: If start_seed contains a character missing from char_to_ind.
  """
  # Dividing the scores by zero or a negative number would produce inf/NaN or
  # inverted scores, so reject such temperatures before doing any work.
  if temp <= 0:
    raise ValueError(f"temp must be > 0, got {temp!r}")
  # An empty seed would give the model a zero-length sequence to predict from.
  if not start_seed:
    raise ValueError("start_seed must contain at least one character")

  # Encode the seed and add a leading batch dimension of 1.
  input_eval = tf.expand_dims([char_to_ind[s] for s in start_seed], 0)

  text_generated = []

  # Start from a clean recurrent state so earlier calls do not leak into this one.
  model.reset_states()

  for _ in range(gen_size):
    # Drop the batch dimension and apply the temperature.
    predictions = tf.squeeze(model(input_eval), 0) / temp

    # Sample for every timestep but keep only the last one: the next character.
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Only the new character is fed back; the stateful model retains the context.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(ind_to_char[predicted_id])

  return start_seed + ''.join(text_generated)

In [None]:
# Generate 1000 characters seeded with 'PAUL' using the batch-size-1 model.
generated_sample = generate_text(model_two, 'PAUL', gen_size=1000)
print(generated_sample)

PAULYNET, DUKE OF YORK and the DUKE OF CLARENCE, and all
    And swemp'red desert, and laughter trusts
    The chase of my effect.
    That nothing only guest not what Valentine,
    And there is nurs are punited unto me;
    Here is a fool; all of your nupling beat
    Art prais'd the fortune venved,
    Or at, the Our parl; which shall not yield
    Writin to th' crums. This you shall shake thy.
  ANTIGONUS. I am too guest;
    For we have an ear may here it,
    Or, ave all deed of purpose crivet                    [HORTENSIO] What have I, Thomas Hortensio?
  PAULINA.'Sice.
  EVANS. Could I, behold? Say, what a school by him?
  FIRST SENATOR. Cassio's uncle, A found room is names,
    But give up they were not as freshared,
     That cert's a strange own steady paper.
          Dies particular:
    His liegens winter and unless his flack!
  Friar. Faith, not for thy quanity! I can see
    I charge thee to Miton Antepory.
  BEROWNE. Fair father; any that you have at honestly
    Thou