In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import tensorflow as tf

In [3]:
## Location of the text corpus, relative to the notebook's working directory.
## Forward slashes are used because they resolve on Windows, Linux and macOS,
## whereas a backslash is only a path separator on Windows.
path_to_file = "../shakespeare.txt"

In [4]:
## Read the whole corpus into memory. The context manager closes the file
## handle as soon as the read finishes instead of leaving it open.
with open(path_to_file) as corpus_file:
    text = corpus_file.read()

In [5]:
## Preview the first 200 characters of the corpus
text[:200]

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n "

In [6]:
## Vocabulary: every distinct character that occurs in the corpus, sorted
vocab = sorted({ch for ch in text})

In [7]:
## Vectorizing the text: map every unique character to an integer index, and keep a reverse lookup from index back to character

In [8]:
## Lookup table: character -> its position in the sorted vocabulary
char_to_index = dict(zip(vocab, range(len(vocab))))


In [9]:
## Reverse lookup: position i holds the character whose index is i.
## Stored as a NumPy array so it can be indexed with a whole array of ids at once.
index_to_char = np.array(vocab)

In [10]:
## Spot-check: which character has index 33?
index_to_char[33]

'H'

In [11]:
## Encode the entire corpus as an array of integer character ids
encoded_text = np.array(list(map(char_to_index.__getitem__, text)))

In [12]:
## One id per character of the corpus
encoded_text.shape

(5445609,)

In [13]:
## Keep the first 500 characters as a small sample to compare against the encoded ids
sample = text[:500]

In [14]:
## Display the sample text
sample

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bu"

In [15]:
## The same first 500 positions, shown as integer ids
encoded_text[:500]

array([ 0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, 12,  0,  1,  1, 31, 73, 70, 68,  1, 61, 56, 64,
       73, 60, 74, 75,  1, 58, 73, 60, 56, 75, 76, 73, 60, 74,  1, 78, 60,
        1, 59, 60, 74, 64, 73, 60,  1, 64, 69, 58, 73, 60, 56, 74, 60,  8,
        0,  1,  1, 45, 63, 56, 75,  1, 75, 63, 60, 73, 60, 57, 80,  1, 57,
       60, 56, 76, 75, 80,  5, 74,  1, 73, 70, 74, 60,  1, 68, 64, 62, 63,
       75,  1, 69, 60, 77, 60, 73,  1, 59, 64, 60,  8,  0,  1,  1, 27, 76,
       75,  1, 56, 74,  1, 75, 63, 60,  1, 73, 64, 71, 60, 73,  1, 74, 63,
       70, 76, 67, 59,  1, 57, 80,  1, 75, 64, 68, 60,  1, 59, 60, 58, 60,
       56, 74, 60,  8,  0,  1,  1, 33, 64, 74,  1, 75, 60, 69, 59, 60, 73,
        1, 63, 60, 64, 73,  1, 68, 64, 62, 63, 75,  1, 57, 60, 56, 73,  1,
       63, 64, 74,  1, 68, 60, 68, 70, 73, 80, 21,  0,  1,  1, 27, 76, 75,
        1, 75, 63, 70, 76,  1, 58, 70, 69, 75, 73, 56, 58, 75, 60, 59,  1,
       75, 70,  1, 75, 63

In [16]:
## Creating training sequences
## The right sequence length depends on the corpus: look at typical line lengths in the text and
## pick a length that is long enough to capture meaningful structure.

seq_len = 120 ## Number of input characters per training sequence


In [17]:
## Each chunk holds seq_len+1 characters (seq_len inputs plus the one-step-shifted target),
## so this counts how many full chunks fit in the corpus
total_seq = len(text) // (seq_len+1)

In [18]:
## Number of full (seq_len+1)-character chunks available
total_seq

45005

In [19]:
## Wrap the encoded ids in a tf.data pipeline; each element is a single character id
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [20]:
char_dataset ## A tf.data Dataset object: it exposes methods (batch, map, shuffle, take, ...) used below to build the input pipeline

<TensorSliceDataset shapes: (), types: tf.int32>

In [21]:
#for item in char_dataset.take(500):
    #print(index_to_char[item.numpy()])

In [22]:
## Group consecutive character ids into chunks of seq_len + 1 ids each.
## drop_remainder discards the final chunk if it is shorter than the rest.
sequence = char_dataset.batch(batch_size=seq_len + 1, drop_remainder=True)

In [23]:
## 1. Grab input text sequence
## 2. Assign the target text sequence shifted by one step forward and then group them as a tuple

def create_seq_target(seq):
    """Split one chunk into an (input, target) pair for next-character prediction.

    The input is every element except the last; the target is every element
    except the first. The target is therefore the input shifted one step
    forward, so target[i] is the element that follows input[i].
    """
    return seq[:-1], seq[1:]

In [24]:
## Turn every chunk of seq_len+1 ids into an (input, target) pair
dataset = sequence.map(create_seq_target)

In [25]:
## Show the first two (input, target) pairs, both as raw ids and decoded back to text
for input_txt, target_txt in dataset.take(2):
    for ids in (input_txt.numpy(), target_txt.numpy()):
        print(ids)
        print("".join(index_to_char[ids]))

[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75]

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1]
                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But 
[56 74  1 75 63 60  1 73 6

In [26]:
## Number of sequences per training batch
batch_size = 128

In [27]:
## Number of sequences held in the shuffle buffer
buffer_size = 1000

## Build the training pipeline from `sequence` rather than from `dataset` itself,
## so that re-running this cell does not shuffle and batch already-batched data.
dataset = (
    sequence
    .map(create_seq_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

In [28]:
## Check the element shapes of the batched dataset
dataset

<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int32, tf.int32)>

In [30]:
## Number of distinct characters in the corpus
vocab_size = len(vocab)
vocab_size

84

In [31]:
## Size of each character embedding vector
embed_dim = 64

In [53]:
## Number of units in the GRU layer.
## NOTE(review): 1026 looks like it may be a typo for 1024, but the model summaries in this
## notebook show 1026 units, so any saved weights presumably match 1026 - confirm before changing.
rnn_neurons = 1026

In [33]:
from tensorflow.keras.losses import sparse_categorical_crossentropy ## For targets given as integer class ids (not one-hot encoded)

In [54]:
def sparse_cat_loss(y_true,y_pred):
    """Sparse categorical cross-entropy with `from_logits=True`.

    Thin wrapper that forwards `y_true` and `y_pred` unchanged and tells the
    loss that `y_pred` holds raw logits rather than probabilities.
    """
    loss = sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    return loss

In [55]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [56]:
def create_model(vocab,embed_dim,rnn_neurons,batch_size):
    """Build and compile the character-level GRU model.

    Args:
        vocab: Vocabulary size, or the vocabulary itself (any object with a
            length), in which case its length is used as the size.
        embed_dim: Output dimension of the embedding layer.
        rnn_neurons: Number of units in the GRU layer.
        batch_size: Fixed batch size baked into the input shape. It has to be
            fixed because the GRU is stateful.

    Returns:
        A compiled Sequential model (Embedding -> GRU -> Dense) whose Dense
        layer has one output per vocabulary entry. It is compiled with the
        'adam' optimizer and `sparse_cat_loss`.
    """
    # Use the argument instead of the notebook-level `vocab_size`, so the model
    # is defined by what the caller passes in. Both a size and a sized
    # vocabulary are accepted to stay compatible with existing call sites.
    n_chars = len(vocab) if hasattr(vocab, "__len__") else int(vocab)

    model = Sequential()

    model.add(Embedding(n_chars, embed_dim, batch_input_shape=[batch_size, None]))
    model.add(GRU(rnn_neurons, return_sequences=True,
                  stateful=True, recurrent_initializer='glorot_uniform'))
    # No activation on the output layer: it emits raw values, and the loss is
    # configured with from_logits=True.
    model.add(Dense(n_chars))

    model.compile('adam', loss=sparse_cat_loss)

    return model
    

In [57]:
## Training model: fixed to the training batch size
model = create_model(
    vocab=vocab_size,
    embed_dim=embed_dim,
    rnn_neurons=rnn_neurons,
    batch_size=batch_size,
)

In [58]:
## Inspect layer output shapes and parameter counts
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (128, None, 64)           5376      
_________________________________________________________________
gru_4 (GRU)                  (128, None, 1026)         3361176   
_________________________________________________________________
dense_4 (Dense)              (128, None, 84)           86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


In [59]:
## Pull a single batch and run it through the model (before any training) as a shape sanity check
for input_example_batch,target_example_batch in dataset.take(1):
    example_batch_pred = model(input_example_batch)

In [60]:
## Expect (batch_size, seq_len, vocab_size)
example_batch_pred.shape

TensorShape([128, 120, 84])

In [61]:
## Sample one character id per time step from the logits of the first sequence in the batch
sampled_index = tf.random.categorical(example_batch_pred[0],num_samples=1)

In [62]:
## Drop the trailing num_samples axis and convert to a NumPy array of ids
sampled_index = tf.squeeze(sampled_index,axis = -1).numpy()

In [63]:
## Decode the sampled ids back to characters
index_to_char[sampled_index]

array(['b', 'G', '0', 'P', '<', 'z', 'A', 'B', '.', ']', '(', 'W', 'Z',
       "'", '<', 'l', '0', 'p', '<', 'L', 'B', 'H', '7', '!', 'z', 's',
       'A', 'l', '(', '(', 'q', '>', 'c', 'S', 'v', '2', 'u', 'Z', '(',
       '.', 'a', 'B', 'g', 'l', 'q', 'd', ';', '[', 'K', 'D', 'm', 'N',
       '\n', 'J', 'a', 'h', '"', 'r', '_', 't', ':', ',', 'a', '5', '7',
       '|', ' ', 'y', '7', 'v', 'P', 'p', '0', 'w', 'e', 'V', 'k', 'j',
       '<', '}', 'D', 'S', 'T', '0', 'H', 'W', ']', '0', '?', 'h', '_',
       'h', 'U', 'K', 'z', '?', '7', '_', 'x', '2', '?', 'M', 'o', '&',
       'b', ')', 'T', 'r', 'G', 'q', '&', '6', 'w', '-', '9', 'V', 'Y',
       '2', 'F', 'o'], dtype='<U1')

In [47]:
## Number of passes over the dataset during training
epochs = 10

In [None]:
## Train the model.
## NOTE(review): no weights are saved in this cell, yet a later cell loads weights from an .h5 file -
## presumably that file comes from a separate training run; confirm.
model.fit(dataset,epochs = epochs)

In [49]:
## Loading the model weights
## NOTE(review): load_model is not called in the cells shown here (weights are restored with
## model.load_weights instead) - confirm whether this import is still needed.
from tensorflow.keras.models import load_model

In [64]:
## Rebuild the same architecture with batch_size = 1 for generation: the stateful GRU fixes the
## batch size in its input shape, and text is generated one sequence at a time.
## Pass the vocabulary size (not the vocabulary list) as the first argument.
model = create_model(vocab_size,embed_dim,rnn_neurons,batch_size = 1)

## Forward slashes keep the relative path working on every platform
model.load_weights('../shakespeare_gen.h5')

model.build(tf.TensorShape([1,None]))

In [65]:
## Confirm the rebuilt model now has a batch dimension of 1
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (1, None, 64)             5376      
_________________________________________________________________
gru_5 (GRU)                  (1, None, 1026)           3361176   
_________________________________________________________________
dense_5 (Dense)              (1, None, 84)             86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


In [66]:
def gen_text(model,start_seed,gen_size = 500,temp = 1.0):
    """Generate text one character at a time, primed with a seed string.

    Args:
        model: Stateful model that is called on a (1, n) batch of character
            ids and returns logits with a leading batch axis of size 1.
        start_seed: Non-empty string used to prime the model. Every character
            must be a key of `char_to_index`.
        gen_size: Number of characters to generate after the seed.
        temp: Sampling temperature; must be > 0. The logits are divided by it
            before sampling, so values below 1 sharpen the distribution and
            values above 1 flatten it.

    Returns:
        `start_seed` followed by the generated characters.

    Raises:
        ValueError: If `start_seed` is empty or `temp` is not positive.
        KeyError: If `start_seed` contains a character missing from
            `char_to_index`.
    """
    # Fail early with a clear message instead of an obscure error (empty seed)
    # or a division by zero on the logits (temp == 0) inside TensorFlow.
    if not start_seed:
        raise ValueError("start_seed must contain at least one character")
    if temp <= 0:
        raise ValueError("temp must be a positive number")

    # Encode the seed and add the batch axis expected by the model.
    input_eval = tf.expand_dims([char_to_index[s] for s in start_seed], 0)

    text_gen = []

    # Clear any recurrent state left over from a previous call.
    model.reset_states()

    for _ in range(gen_size):
        # Remove the batch axis, then apply the temperature to the logits.
        preds = tf.squeeze(model(input_eval), 0) / temp

        # Sample from the distribution at the last time step only.
        pred_id = tf.random.categorical(preds, num_samples=1)[-1, 0].numpy()

        # Feed just the sampled character back in; the model is stateful, so
        # the earlier context is carried in its recurrent state.
        input_eval = tf.expand_dims([pred_id], 0)

        text_gen.append(index_to_char[pred_id])

    return start_seed + "".join(text_gen)

In [67]:
## Generate 1000 characters primed with the seed "JULIET"
print(gen_text(model,"JULIET",gen_size = 1000))

JULIETH. Teach thy sweet parent castrears safe!
  BOLINGBROKE. They could not cry his conscience like his
    place.
  AGUMCE. It mayst. An honourable, marry, you must put on
    here!                  [They convers'd with Leonato's]
  
                  Re-enter ANNE Pyrmenakes

Enter AEDILIUS  
  HERMIA. Allanon, wrath in deed-window, must of fool;
    Holding the holidims Coriolanus.
    And Duma mistress.
  PERDITA. The oracle of himself?
  VALENTINE. Ay, indeed, she shall be suffer'd in this universar.
  PAULINA. But yet, we'll nothing; and I pray thee, I perceive
    This Bolingbroke.
  SIR TOBY. [To BEROWNE. When did thee twenty bitter breathless, skip
    'Gainst Ajax will divicer his injuries all,  
    And lawfully doth hard of humblest friends.
  KING RICHARD. Away! -will win away to-morrow.
                             Exit a SERVANT

  SECOND LORD. Mistress Barbato, O'ercover my kindred friends- but I
    have business to down like me here is to have her neather
    tell m