<a href="https://colab.research.google.com/github/ridwibra/Text-Generation/blob/main/text_generation_shakespeare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import tensorflow as tf 

In [3]:
# Location of the Shakespeare corpus. This is an absolute path, presumably the
# Colab runtime's upload directory (the notebook carries an "Open In Colab" badge);
# adjust it when running elsewhere.
path_to_file = "/content/shakespeare.txt"

In [4]:
# Read the whole corpus into memory. The context manager closes the file handle
# deterministically, and the explicit encoding keeps the result independent of
# the platform's default locale.
with open(path_to_file, 'r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

In [5]:
# Preview a 400-character slice of the corpus to check that it loaded as expected
print(text[400:800])

hat art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's pro


In [6]:
# Build the vocabulary: every distinct character in the corpus, in sorted order
vocab = list(set(text))
vocab.sort()

In [7]:
# Display the full vocabulary
vocab

['\n',
 ' ',
 '!',
 '"',
 '&',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '>',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|',
 '}']

In [8]:
# Vocabulary size (number of distinct characters in the corpus)
len(vocab)

84

In [9]:
# Encoding table: map each vocabulary character to its position in `vocab`
char_to_ind = dict(zip(vocab, range(len(vocab))))

In [10]:
# Spot-check the encoding of a single character
char_to_ind['R']

43

In [11]:
# Decoding table: a NumPy array of the vocabulary, so a single index or an
# array of indices can be mapped back to characters by indexing
ind_to_char = np.array(vocab)

In [12]:
# Spot-check the decoding: the index returned for 'R' above should map back to 'R'
ind_to_char[43]

'R'

In [13]:
# Encode the whole corpus as an array of vocabulary indices, one per character
encoded_text = np.array(list(map(char_to_ind.__getitem__, text)))

In [14]:
# One entry per character of the corpus
encoded_text.shape

(6494185,)

Understand the text sequences;
use TensorFlow datasets (`tf.data`) to generate batches;
shuffle the batches.

In [15]:
# Look at the first 500 characters of the corpus to see how lines and stanzas are laid out
print(text[:500])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bu


In [16]:
# Three sample lines copied from the corpus; their combined length is measured
# in the next cell as a reference when choosing the sequence length
lines = '''
From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
'''

In [17]:
# Character count of the sample lines (newlines and indentation included)
len(lines)

133

In [18]:
# Number of input characters per training sequence
seq_len = 120

In [19]:
# Number of whole (seq_len + 1)-character chunks in the corpus; the extra
# character per chunk supplies the one-step-shifted target
total_num_seq = len(text)//(seq_len+1)

In [20]:
# Display the sequence count
total_num_seq

53670

In [21]:
# Wrap the encoded corpus in a tf.data dataset whose elements are single character indices
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [22]:
# Inspect the dataset type
type(char_dataset)

tensorflow.python.data.ops.dataset_ops.TensorSliceDataset

In [23]:
# Group the character stream into chunks of seq_len + 1 indices;
# drop_remainder=True discards a final chunk that would be shorter than that
sequences = char_dataset.batch(seq_len+1, drop_remainder=True)


In [24]:
# Build (input, target) pairs from a chunk of character indices
def create_seq_targets(seq):
  """Split one chunk into an (input, target) pair offset by one position.

  The input is every element except the last and the target is every element
  except the first, so target[i] is the element that follows input[i].

  Args:
    seq: a sliceable sequence (e.g. a tensor of character indices).

  Returns:
    A tuple ``(seq[:-1], seq[1:])``.
  """
  return seq[:-1], seq[1:]

In [25]:
# Turn every chunk into an (input, target) pair
dataset = sequences.map(create_seq_targets)

In [26]:
# Show the first (input, target) pair, both as raw indices and as decoded text
for input_txt, target_txt in dataset.take(1):
  input_ids = input_txt.numpy()
  target_ids = target_txt.numpy()
  print(input_ids)
  print("".join(ind_to_char[input_ids]))
  print('\n')
  print(target_ids)
  print("".join(ind_to_char[target_ids]))

[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75]

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But


[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1]
                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But 


In [27]:
# Number of sequences per training batch
batch_size = 128

In [28]:
# Size of the shuffle buffer, in sequences
buffer_size = 10000
# Shuffle the (input, target) pairs, then group them into fixed-size batches;
# drop_remainder=True discards a final partial batch.
# NOTE(review): this rebinds `dataset`, so re-running this cell would batch the
# already-batched dataset; re-run the cell that defines `dataset` first.
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

In [29]:
# Inspect the batched dataset's element shapes and dtypes
dataset

<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int64, tf.int64)>

Create the model

In [30]:
# Number of distinct characters; used as the embedding input size and the output layer width
vocab_size = len(vocab)

In [31]:
# Dimensionality of the character embedding vectors
embed_dim = 64

In [32]:
# Number of units in the GRU layer
# NOTE(review): 1026 is an unusual width and may have been meant as 1024 — confirm
# before changing, since the saved weights depend on this value.
rnn_neurons = 1026

In [33]:
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [34]:
def sparse_cat_loss(y_true, y_pred):
  """Sparse categorical cross-entropy computed on raw logits.

  Thin wrapper around Keras' ``sparse_categorical_crossentropy`` that fixes
  ``from_logits=True``, so ``y_pred`` is treated as unnormalized scores rather
  than probabilities.

  Args:
    y_true: integer class labels.
    y_pred: predicted logits.

  Returns:
    The loss value(s) returned by ``sparse_categorical_crossentropy``.
  """
  loss = sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
  return loss

In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [36]:
def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
  """Build and compile the character-level model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: number of distinct characters; sets both the embedding input
      size and the width of the output layer.
    embed_dim: size of each embedding vector.
    rnn_neurons: number of units in the GRU layer.
    batch_size: fixed batch size baked into the input shape; the stateful GRU
      is built for this batch size.

  Returns:
    A compiled ``Sequential`` model using the Adam optimizer and
    ``sparse_cat_loss``.
  """
  layers = [
    # Fixed batch size, variable sequence length
    Embedding(vocab_size, embed_dim, batch_input_shape=[batch_size, None]),
    # One output per time step; recurrent state is kept between calls
    GRU(rnn_neurons, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
    # No activation: the layer emits logits, matching from_logits=True in the loss
    Dense(vocab_size),
  ]
  model = Sequential(layers)
  model.compile('adam', loss=sparse_cat_loss)
  return model

In [37]:
# Build the training model with the training batch size fixed in its input shape
model = create_model(vocab_size=vocab_size, embed_dim=embed_dim, rnn_neurons=rnn_neurons, batch_size=batch_size)

In [38]:
# Print the layer-by-layer summary of the training model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (128, None, 64)           5376      
                                                                 
 gru (GRU)                   (128, None, 1026)         3361176   
                                                                 
 dense (Dense)               (128, None, 84)           86268     
                                                                 
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Number of full passes over the training dataset
epochs = 40

In [40]:
# Train on the shuffled, batched (input, target) dataset
model.fit(dataset, epochs=epochs)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f5ab5255fd0>

In [41]:
# Persist the trained model to 'my_model.h5', a path relative to the current working directory
model.save('my_model.h5') 

In [42]:
from tensorflow.keras.models import load_model

In [43]:
# Rebuild the same architecture with batch size 1 for text generation.
# This rebinds `model`; the trained weights are loaded into it in the next cell.
model = create_model(vocab_size, embed_dim, rnn_neurons, batch_size=1 )


In [44]:
# Load the trained weights from the file written by model.save('my_model.h5').
# The same relative path is used for saving and loading, so this cell no longer
# depends on the working directory being /content.
model.load_weights('my_model.h5')
# Build the model for a batch of one variable-length sequence
model.build(tf.TensorShape([1, None]))

In [45]:
# Confirm the rebuilt model has a batch dimension of 1
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (1, None, 64)             5376      
                                                                 
 gru_1 (GRU)                 (1, None, 1026)           3361176   
                                                                 
 dense_1 (Dense)             (1, None, 84)             86268     
                                                                 
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


In [46]:
def generate_text(model, start_seed, gen_size=500, temp=1.0):
  """Generate text one character at a time from a trained character model.

  The seed is encoded with ``char_to_ind`` and fed to the model; after that,
  only the most recently sampled character is fed back in, so the function
  relies on the model carrying its recurrent state between calls.

  Args:
    model: callable model with a ``reset_states()`` method that accepts a
      ``(1, n)`` tensor of character indices and returns per-position logits
      over the vocabulary.
    start_seed: non-empty string used to prime the model; every character
      must be a key of ``char_to_ind``.
    gen_size: number of characters to generate.
    temp: sampling temperature, must be > 0. The logits are divided by this
      value before sampling, so lower values make the output more predictable
      and higher values make it more surprising.

  Returns:
    ``start_seed`` followed by the generated characters.

  Raises:
    ValueError: if ``start_seed`` is empty or ``temp`` is not positive.
    KeyError: if ``start_seed`` contains a character missing from
      ``char_to_ind``.
  """
  # Fail early with a clear message instead of an obscure tensor-shape error
  # (empty seed) or NaN/inf logits (division by a non-positive temperature).
  if not start_seed:
    raise ValueError("start_seed must contain at least one character")
  if temp <= 0:
    raise ValueError("temp must be a positive number")

  # Encode the seed and add the leading batch dimension
  input_eval = tf.expand_dims([char_to_ind[s] for s in start_seed], 0)
  # Holds the generated characters
  text_generated = []
  # Start from a clean recurrent state
  model.reset_states()
  for _ in range(gen_size):
    # Drop the batch dimension and scale the logits by the temperature
    predictions = tf.squeeze(model(input_eval), 0) / temp
    # Sample a character id for each position and keep the one for the last position
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
    # Feed only the newly sampled character back in on the next step
    input_eval = tf.expand_dims([predicted_id], 0)
    text_generated.append(ind_to_char[predicted_id])
  return start_seed + "".join(text_generated)

In [47]:
# Generate 1000 characters primed with "Juliet" at the default temperature
print(generate_text(model, "Juliet", gen_size=1000))

Juliet
    That running hath not been as sick and flike.
  VIRe sharp up a briefe.
    If thou beest a fair particular and soft
    Of good to learn a good merrily much.
  IAGO.                                               Why, show both your instruments
    Do clear 'gainst all his ventmes of your youth,
    Cut it at him. In him that branch'd fortunes with
    from me that thou art a fool
    That shall remember with another's pardon.
  LEONTES. You know that wish endeavours.
  PATROCLUS. No, Part.
    [To ANTIGONUS] I wish you black not be me.
  DESDEMONA.                 Ret Angelo and hear him draw our lordships, and there is like a
    good sight of his own noses, blessing for his sake, have lov'd me.
  IAGO. If it be so, what Shoft senses tell Again,
    And I for one of Christor Henry did my poor petition
    Is to let him but as fair death in Rome,
    And he's but jest? Why, I thank you,
    Because her honour and my    In your own strife- I am sent to fly a claud
    And Ma