<a href="https://colab.research.google.com/github/suhitaghosh10/colab_examples/blob/master/Shakespeare_txt_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
import numpy as np
import os
import time

In [2]:
path_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [3]:
text = open(path_file, 'rb').read().decode(encoding='utf-8')
print ('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [4]:
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [5]:
# The unique characters in the file
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

65 unique characters


In [6]:
print(len(vocab))
vocab[0:5]

65


['\n', ' ', '!', '$', '&']

In [0]:
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])

In [8]:
print('{')
for char,_ in zip(char2idx, range(20)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '$' :   3,
  '&' :   4,
  "'" :   5,
  ',' :   6,
  '-' :   7,
  '.' :   8,
  '3' :   9,
  ':' :  10,
  ';' :  11,
  '?' :  12,
  'A' :  13,
  'B' :  14,
  'C' :  15,
  'D' :  16,
  'E' :  17,
  'F' :  18,
  'G' :  19,
  ...
}


In [9]:
text_as_int.shape
print(text_as_int[0:20])
print(idx2char[text_as_int[0:20]])

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56]
['F' 'i' 'r' 's' 't' ' ' 'C' 'i' 't' 'i' 'z' 'e' 'n' ':' '\n' 'B' 'e' 'f'
 'o' 'r']


In [49]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)
print(examples_per_epoch)


11043


(1115394,)

In [56]:
# Create training examples / targets
val_set_index = int(0.2 * len(text))
train_set = text_as_int[0:val_set_index]
val_set = text_as_int[val_set_index:len(text)]
train_char_dataset = tf.data.Dataset.from_tensor_slices(train_set)
val_char_dataset = tf.data.Dataset.from_tensor_slices(val_set)
for i in train_char_dataset.take(5):
  print(idx2char[i.numpy()])

for i in val_char_dataset.take(10):
  print(idx2char[i.numpy()])


F
i
r
s
t
r
 
y
e
a
r
s


H
a


In [58]:
train_text_sequences = train_char_dataset.batch(seq_length+1, drop_remainder=True)
val_text_sequences = val_char_dataset.batch(seq_length+1, drop_remainder=True)
print(train_text_sequences, val_text_sequences)
for item in train_text_sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))

print('---------')
for item in val_text_sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))


<BatchDataset shapes: (101,), types: tf.int64> <BatchDataset shapes: (101,), types: tf.int64>
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'
---------
"r years\nHath not yet dived into the world's deceit\nNor more can you distinguish of a man\nThan of his "
'outward show; which, God he knows,\nSeldom or never jumpeth with the heart.\nThose uncles which you wan'
"t were dangerous;\nYour grace attended to their sugar'd words,\nBut look'd not on the poison of their h"
'earts :\nGod keep you from 

In [0]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

In [60]:
train_dataset = train_text_sequences.map(split_input_target)
val_dataset = val_text_sequences.map(split_input_target)
train_dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [61]:
for input_example, target_example in  train_dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))


Input data:  'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [0]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [0]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024


In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):

  model = tf.keras.Sequential([
                               
                               tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
                               tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                               tf.keras.layers.Dense(vocab_size)]
      
                              )
  return model


In [0]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)


In [66]:
for input_example_batch, target_example_batch in train_dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [0]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()


In [68]:
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))


Input: 
 "hat do appear,\nTheir needless vouches? Custom calls me to't:\nWhat custom wills, in all things should"

Next Char Predictions: 
 "aheWdqreX:P;hSPkawZSvYDBRRGEOEnZNAWqqVTBxB.,'aI'BvsWhtlQAAda&'DDiJlJm&widtGH?Ec\nhMGFJbHKEVC,iNpRF-s-"


In [69]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())


Prediction shape:  (64, 100, 65)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.1754785


In [0]:
model_checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_loss',
                                           save_best_only=True,
                                           verbose=1,
                                           mode='min')

In [73]:
model.compile(optimizer='adam', loss=loss)


history = model.fit(train_dataset, epochs=10, callbacks=[model_checkpoint], validation_data=val_dataset)


Epoch 1/10
Epoch 00001: val_loss improved from inf to 3.05007, saving model to model.h5
Epoch 2/10
Epoch 00002: val_loss improved from 3.05007 to 2.54596, saving model to model.h5
Epoch 3/10
Epoch 00003: val_loss improved from 2.54596 to 2.38631, saving model to model.h5
Epoch 4/10
Epoch 00004: val_loss improved from 2.38631 to 2.31056, saving model to model.h5
Epoch 5/10
Epoch 00005: val_loss improved from 2.31056 to 2.24577, saving model to model.h5
Epoch 6/10
Epoch 00006: val_loss improved from 2.24577 to 2.18229, saving model to model.h5
Epoch 7/10
Epoch 00007: val_loss improved from 2.18229 to 2.12581, saving model to model.h5
Epoch 8/10
Epoch 00008: val_loss improved from 2.12581 to 2.07517, saving model to model.h5
Epoch 9/10
Epoch 00009: val_loss improved from 2.07517 to 2.03452, saving model to model.h5
Epoch 10/10
Epoch 00010: val_loss improved from 2.03452 to 1.99356, saving model to model.h5


In [0]:
#### prediction
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights('model.h5')

model.build(tf.TensorShape([1, None]))


In [0]:
def generate_text(model, start_string):
  # Evaluation step (generating text using the learned model)

  # Number of characters to generate
  num_generate = 1000

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  print(input_eval)
  input_eval = tf.expand_dims(input_eval, 0)
  print(input_eval.shape)

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.0

  # Here batch size == 1
  model.reset_states()
  for i in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # using a categorical distribution to predict the character returned by the model
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted character as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))


In [76]:
print(generate_text(model, start_string=u"Rome "))


[30, 53, 51, 43, 1]
(1, 5)
Rome have y ah hears,
Make his lise my &Akn'sbay,
detare it in a stond I, why hest turbeld goods tostermen ton uit teir bet;
And tround us'd heave hour fooron feart, sur, we partion. I would bus nat: I hear heard
I was near that ye ervy ather the gods, home
That I lafe come, lettor: whence tham I soat fooret.
Nouthing the searte this were on all todshy.

QUEEN MARGARET:
You eeat; whis is with,
As my bard maven I'st a a place fos tether
To you are of.

KICSICINIUS:
I way not to have crile:
Which he sarvint swarm;
To of diving conds; but cullopaict.

QULER:
And he hadbs,
Alvain I he it.

BRUTUS:
Othy? I fouxt mear,
As nevore!
And your vertence as of your goge,
And theil uc:
Thang as thee flear deditidion'd to tuses thag,?
If thou ard him
ond dive with as mearing atsiness,
And het me rapies, brithrite, then mest upon to since in thoute than wo bejour drear: thou shild,
And math reat, word, had, forkect
Thou be qutered solf face stild of I heart,
QUS:
Gol, of the

In [0]:
###using gradient tape


model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE,
  )


optimizer = tf.keras.optimizers.Adam()



In [0]:
@tf.function
def train_step(inp, target):
  with tf.GradientTape() as tape:
    predictions = model(inp)
    loss = tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(
            target, predictions, from_logits=True))
  grads = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))

  return loss

In [79]:
# Training step
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  # initializing the hidden state at the start of every epoch
  # initally hidden is None
  hidden = model.reset_states()

  for (batch_n, (inp, target)) in enumerate(dataset):
    loss = train_step(inp, target)

    if batch_n % 100 == 0:
      template = 'Epoch {} Batch {} Loss {}'
      print(template.format(epoch+1, batch_n, loss))

  # saving (checkpoint) the model every 5 epochs
  if (epoch + 1) % 5 == 0:
    model.save_weights(checkpoint_prefix.format(epoch=epoch))

  print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))
  print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

model.save_weights(checkpoint_prefix.format(epoch=epoch))


Epoch 1 Batch 0 Loss 4.175806999206543
Epoch 1 Batch 100 Loss 2.3720622062683105
Epoch 1 Loss 2.1846
Time taken for 1 epoch 13.01163125038147 sec

Epoch 2 Batch 0 Loss 2.1800589561462402
Epoch 2 Batch 100 Loss 1.9297655820846558
Epoch 2 Loss 1.9009
Time taken for 1 epoch 11.82506799697876 sec

Epoch 3 Batch 0 Loss 1.7756026983261108
Epoch 3 Batch 100 Loss 1.7015266418457031
Epoch 3 Loss 1.5853
Time taken for 1 epoch 11.859676837921143 sec

Epoch 4 Batch 0 Loss 1.6328121423721313
Epoch 4 Batch 100 Loss 1.5122989416122437
Epoch 4 Loss 1.5188
Time taken for 1 epoch 11.969431161880493 sec

Epoch 5 Batch 0 Loss 1.4673004150390625
Epoch 5 Batch 100 Loss 1.4484620094299316
Epoch 5 Loss 1.4903
Time taken for 1 epoch 11.974555492401123 sec

Epoch 6 Batch 0 Loss 1.3874232769012451
Epoch 6 Batch 100 Loss 1.41978120803833
Epoch 6 Loss 1.3930
Time taken for 1 epoch 12.035968542098999 sec

Epoch 7 Batch 0 Loss 1.3656483888626099
Epoch 7 Batch 100 Loss 1.3535572290420532
Epoch 7 Loss 1.3430
Time take