In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [2]:
# Path to the plain-text corpus; being relative, it is resolved against the
# notebook's working directory.
path_to_file = 'shakespeare.txt'

In [3]:
# Read the whole corpus into one string. The context manager closes the file
# handle as soon as the read finishes (a bare open(...).read() leaves that to
# the garbage collector), and the explicit encoding keeps decoding independent
# of the platform default.
with open(path_to_file, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [4]:
# Peek at a slice from inside the corpus to see how the raw text is laid out
# (sonnet numbers, indentation, line breaks).
print(text[10000:30000])

 sullied night,
    And all in war with Time for love of you,
    As he takes from you, I engraft you new.


                     16  
  But wherefore do not you a mightier way
  Make war upon this bloody tyrant Time?
  And fortify your self in your decay
  With means more blessed than my barren rhyme?
  Now stand you on the top of happy hours,
  And many maiden gardens yet unset,
  With virtuous wish would bear you living flowers,
  Much liker than your painted counterfeit:
  So should the lines of life that life repair
  Which this (Time's pencil) or my pupil pen
  Neither in inward worth nor outward fair
  Can make you live your self in eyes of men.
    To give away your self, keeps your self still,
    And you must live drawn by your own sweet skill.


                     17
  Who will believe my verse in time to come
  If it were filled with your most high deserts?
  Though yet heaven knows it is but as a tomb
  Which hides your life, and shows not half your parts:  
  If I could

In [5]:
# Character-level vocabulary: every distinct character in the corpus, sorted
# so that each character gets a stable position.
vocab = sorted(set(text))

In [6]:
# vocab

In [7]:
# Vocabulary size, i.e. the number of distinct characters in the corpus.
len(vocab)

84

In [8]:
# for pair in enumerate(vocab):
#     print(pair)

In [9]:
# Lookup table from each character to its integer id, which is simply the
# character's position in the sorted vocabulary.
char_to_ind = dict(zip(vocab, range(len(vocab))))
# Spot-check a single entry.
char_to_ind['H']

33

In [10]:
# Inverse lookup stored as a NumPy array, so indexing with an id (or an array
# of ids) returns the matching character(s).
ind_to_char = np.array(vocab)
ind_to_char

array(['\n', ' ', '!', '"', '&', "'", '(', ')', ',', '-', '.', '0', '1',
       '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?',
       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
       '[', ']', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
       'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
       'w', 'x', 'y', 'z', '|', '}'], dtype='<U1')

In [11]:
# Round-trip check: id 33 should map back to 'H'.
ind_to_char[33]

'H'

In [12]:
# Translate the whole corpus into integer ids, one id per character.
encoded_text = np.array([char_to_ind[c] for c in text])

In [13]:
# 1-D array with one entry per character of the corpus.
encoded_text.shape

(5445609,)

In [14]:
# First 500 characters of the corpus, shown as text for comparison with their
# integer encoding.
sample = text[:500]
print(sample)


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bu


In [15]:
# The same opening 500 characters, viewed as integer ids.
encoded_text[:500]

array([ 0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, 12,  0,  1,  1, 31, 73, 70, 68,  1, 61, 56, 64,
       73, 60, 74, 75,  1, 58, 73, 60, 56, 75, 76, 73, 60, 74,  1, 78, 60,
        1, 59, 60, 74, 64, 73, 60,  1, 64, 69, 58, 73, 60, 56, 74, 60,  8,
        0,  1,  1, 45, 63, 56, 75,  1, 75, 63, 60, 73, 60, 57, 80,  1, 57,
       60, 56, 76, 75, 80,  5, 74,  1, 73, 70, 74, 60,  1, 68, 64, 62, 63,
       75,  1, 69, 60, 77, 60, 73,  1, 59, 64, 60,  8,  0,  1,  1, 27, 76,
       75,  1, 56, 74,  1, 75, 63, 60,  1, 73, 64, 71, 60, 73,  1, 74, 63,
       70, 76, 67, 59,  1, 57, 80,  1, 75, 64, 68, 60,  1, 59, 60, 58, 60,
       56, 74, 60,  8,  0,  1,  1, 33, 64, 74,  1, 75, 60, 69, 59, 60, 73,
        1, 63, 60, 64, 73,  1, 68, 64, 62, 63, 75,  1, 57, 60, 56, 73,  1,
       63, 64, 74,  1, 68, 60, 68, 70, 73, 80, 21,  0,  1,  1, 27, 76, 75,
        1, 75, 63, 70, 76,  1, 58, 70, 69, 75, 73, 56, 58, 75, 60, 59,  1,
       75, 70,  1, 75, 63

In [16]:
# A few typical lines of verse kept as a reference.
# NOTE(review): `lines` is not read anywhere in the visible notebook —
# presumably it was used to eyeball a sensible value for seq_len; confirm.
lines = '''
From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
'''
# Number of input characters per training sequence.
seq_len = 120

In [17]:
# Number of whole (seq_len + 1)-character chunks in the corpus. The extra
# character is needed because each chunk is later split into an input and a
# target shifted forward by one position.
total_num_seq = len(text) // (seq_len+1)
total_num_seq

45005

In [18]:
# Wrap the id array in a tf.data pipeline; slicing along the first axis means
# each dataset element is a single character id.
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [19]:
# Confirm what kind of object from_tensor_slices returned.
type(char_dataset)

tensorflow.python.data.ops.dataset_ops.TensorSliceDataset

In [20]:
# for item in char_dataset.take(300):
#     print(ind_to_char[item.numpy()])

In [21]:
# Group consecutive ids into chunks of seq_len + 1 characters;
# drop_remainder=True discards the short chunk left over at the end.
sequences = char_dataset.batch(seq_len+1, drop_remainder=True)

In [22]:
def create_seq(seq):
    """Split one chunk into an (input, target) pair for next-character prediction.

    Args:
        seq: Any sliceable sequence (here, a chunk of character ids).

    Returns:
        A 2-tuple ``(input, target)``: ``input`` is ``seq`` without its last
        element and ``target`` is ``seq`` without its first element, so
        ``target[i]`` is the element that follows ``input[i]``.
    """
    return seq[:-1], seq[1:]

In [23]:
# Turn every (seq_len + 1)-character chunk into an (input, target) pair.
dataset = sequences.map(create_seq)

In [24]:
# Show one (input, target) pair both as raw ids and as decoded text, to
# confirm that the target is the input shifted forward by one character.
for input_text, target_text in dataset.take(1):
    input_ids = input_text.numpy()
    target_ids = target_text.numpy()
    print(input_ids)
    print(''.join(ind_to_char[input_ids]))
    print(target_ids)
    print(''.join(ind_to_char[target_ids]))

[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75]

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1]
                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But 


In [25]:
# Number of sequences per training batch.
batch_size = 128

In [26]:
# Number of sequences held in the shuffle buffer.
buffer_size = 10000
# Build the training pipeline straight from `sequences` instead of from the
# previous value of `dataset`. The self-referential form
# `dataset = dataset.shuffle(...).batch(...)` batched already-batched data
# whenever this cell was run a second time; this form yields the same result
# on every run.
dataset = (
    sequences
    .map(create_seq)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

In [27]:
# Inspect the batched dataset's element shapes and dtypes.
dataset

<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int64, tf.int64)>

In [28]:
# Vocabulary size, passed to create_model as vocab_size.
vocab_len = len(vocab)
vocab_len

84

In [29]:
# Width of the character embedding vectors.
embedded_dim = 64

In [30]:
# Number of units in the GRU layer.
rnn_neurons = 1024

In [53]:
def sparse_cat_loss(y_true, y_pred):
    """Sparse categorical cross-entropy for a model that outputs raw logits.

    Thin wrapper that fixes ``from_logits=True`` so the function can be handed
    to ``model.compile`` as a plain two-argument loss.

    Args:
        y_true: Integer class labels.
        y_pred: Unnormalised scores (logits) over the classes.

    Returns:
        Whatever ``sparse_categorical_crossentropy`` returns for these inputs.
    """
    loss = sparse_categorical_crossentropy(
        y_true=y_true,
        y_pred=y_pred,
        from_logits=True,
    )
    return loss

In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [33]:
def create_model(vocab_size, embed_size, rnn_neurons, batch_size):
    """Build and compile the character-level network: Embedding -> GRU -> Dense.

    Args:
        vocab_size: Number of distinct character ids; used as the embedding's
            input dimension and as the width of the final Dense layer.
        embed_size: Dimension of the embedding vectors.
        rnn_neurons: Number of units in the GRU layer.
        batch_size: Fixed batch dimension baked into the input shape (the GRU
            is created with ``stateful=True``); the sequence length is left
            open.

    Returns:
        A ``Sequential`` model compiled with the ``'adam'`` optimizer and
        ``sparse_cat_loss``.
    """
    layers = [
        Embedding(vocab_size, embed_size, batch_input_shape=[batch_size, None]),
        GRU(
            rnn_neurons,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform',
        ),
        # No activation: the layer emits one raw score per vocabulary entry,
        # which is why the loss is configured with from_logits=True.
        Dense(vocab_size),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss=sparse_cat_loss)
    return model

In [34]:
# Training-time model: its batch dimension matches the dataset's batch size.
model = create_model(vocab_size=vocab_len, embed_size=embedded_dim, rnn_neurons=rnn_neurons, batch_size=batch_size)

In [35]:
# Print layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 64)           5376      
_________________________________________________________________
gru (GRU)                    (128, None, 1024)         3348480   
_________________________________________________________________
dense (Dense)                (128, None, 84)           86100     
Total params: 3,439,956
Trainable params: 3,439,956
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Push a single batch through the model to check the shape of its output.
for input_ex_batch, target_ex_batch in dataset.take(1):
    ex_batch_pred = model(input_ex_batch)

In [37]:
# Expected layout: (batch_size, seq_len, vocab size).
ex_batch_pred.shape

TensorShape([128, 120, 84])

In [38]:
# Treat the first sequence's outputs as logits and draw one character id per
# time step.
sample_indeces = tf.random.categorical(ex_batch_pred[0], num_samples=1)

In [39]:
# Inspect the sampled ids: one row per time step, one column per sample.
sample_indeces

<tf.Tensor: shape=(120, 1), dtype=int64, numpy=
array([[41],
       [18],
       [43],
       [22],
       [71],
       [31],
       [27],
       [19],
       [78],
       [62],
       [60],
       [69],
       [72],
       [29],
       [68],
       [28],
       [19],
       [19],
       [ 4],
       [ 5],
       [51],
       [ 9],
       [63],
       [ 5],
       [61],
       [78],
       [52],
       [31],
       [77],
       [65],
       [52],
       [26],
       [71],
       [51],
       [48],
       [21],
       [53],
       [11],
       [70],
       [78],
       [62],
       [42],
       [39],
       [73],
       [56],
       [37],
       [19],
       [62],
       [65],
       [11],
       [80],
       [10],
       [80],
       [54],
       [53],
       [82],
       [16],
       [51],
       [ 5],
       [11],
       [49],
       [78],
       [32],
       [10],
       [32],
       [27],
       [76],
       [28],
       [83],
       [82],
       [60],
       [ 0],
       [38],
   

In [40]:
# Flatten the (seq_len, 1) samples into a 1-D NumPy array of ids.
# tf.reshape(..., [-1]) gives the same result as tf.squeeze(..., axis=-1) on
# the first run, and it also accepts the already-flattened array, so
# re-running this cell no longer raises.
sample_indeces = tf.reshape(sample_indeces, [-1]).numpy()

In [41]:
# Decode the sampled ids back into characters.
ind_to_char[sample_indeces]

array(['P', '7', 'R', ';', 'p', 'F', 'B', '8', 'w', 'g', 'e', 'n', 'q',
       'D', 'm', 'C', '8', '8', '&', "'", 'Z', '-', 'h', "'", 'f', 'w',
       '[', 'F', 'v', 'j', '[', 'A', 'p', 'Z', 'W', ':', ']', '0', 'o',
       'w', 'g', 'Q', 'N', 'r', 'a', 'L', '8', 'g', 'j', '0', 'y', '.',
       'y', '_', ']', '|', '5', 'Z', "'", '0', 'X', 'w', 'G', '.', 'G',
       'B', 'u', 'C', '}', '|', 'e', '\n', 'M', 'M', '3', 'X', 'h', '.',
       '!', 'l', '"', 'd', 'k', 'o', '8', 'd', 'l', '<', ',', 'Y', "'",
       '!', 'L', 'C', 'K', 'a', '-', 'v', 'V', 'I', 'q', 'K', ',', 'g',
       'K', 'd', 'o', 'u', '!', ' ', 'S', 'e', '>', 'n', 'T', '|', '6',
       ',', 'B', 'z'], dtype='<U1')

In [42]:
# model.fit(dataset, epochs=30)

In [43]:
from tensorflow.keras.models import load_model

In [56]:
# Rebuild the same architecture with a batch dimension of 1 instead of the
# training batch size.
model = create_model(
    vocab_size=vocab_len,
    embed_size=embedded_dim,
    rnn_neurons=rnn_neurons,
    batch_size=1,
)
model.summary()
# Restoring saved weights is currently disabled.
# NOTE(review): the recorded ValueError (shapes (64, 3072) vs (64, 3078))
# presumably comes from this load. 3072 = 3 * 1024 matches this model's GRU,
# so the checkpoint looks like it was saved with a different rnn_neurons;
# confirm before re-enabling.
# model.load_weights('shakespeare_gen.h5')
# model.build(tf.TensorShape([1, None]))

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (1, None, 64)             5376      
_________________________________________________________________
gru_8 (GRU)                  (1, None, 1024)           3348480   
_________________________________________________________________
dense_8 (Dense)              (1, None, 84)             86100     
Total params: 3,439,956
Trainable params: 3,439,956
Non-trainable params: 0
_________________________________________________________________


ValueError: Shapes (64, 3072) and (64, 3078) are incompatible