In [40]:
import os
import time
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

### Data Collection:

* [Shakespeare dataset (TensorFlow-hosted copy, downloaded below)](https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt)
* [Shakespeare dataset (alternative copy on GitHub)](https://raw.githubusercontent.com/reddyprasade/Deep-Learning-with-Tensorflow-2.x/main/Data/input.txt)

In [41]:
# Fetch shakespeare.txt from the given URL with Keras' get_file helper; the
# returned value is used by the next cell as the local path of the file to open.
path_to_file = tf.keras.utils.get_file('shakespeare.txt','https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [42]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8.
# The context manager closes the file handle as soon as the read finishes
# (the original `open(...).read()` left the handle open).
with open(path_to_file, 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding='utf-8')
print("Length of text:{} charaters".format(len(text)))

Length of text:1115394 charaters


In [43]:
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [44]:
# The vocabulary is the sorted list of distinct characters found in the corpus.
vocab = sorted(set(text))
print("{} Unique Characters From Text Data".format(len(vocab)))

65 Unique Characters From Text Data


In [45]:
example_text = ['abcdefg','xyz']

In [46]:
# Split each example string into its individual UTF-8 characters. The strings
# have different lengths, so the result is a RaggedTensor (see the output).
chars = tf.strings.unicode_split(example_text,input_encoding='UTF-8')
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [47]:
# Lookup layer that maps each character to an integer id, built from the
# corpus vocabulary.
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))

In [48]:
ids_from_chars

<tensorflow.python.keras.layers.preprocessing.string_lookup.StringLookup at 0x7f818a9bdac8>

In [49]:
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[41, 42, 43, 44, 45, 46, 47], [64, 65, 66]]>

In [50]:
# Inverse lookup layer (id -> character). It reuses the exact vocabulary of
# `ids_from_chars` so both layers agree on the id assignment.
chars_from_ids = preprocessing.StringLookup(
    invert=True,
    vocabulary=ids_from_chars.get_vocabulary(),
)

In [51]:
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [52]:
tf.strings.reduce_join(chars,axis=-1).numpy()

array([b'abcdefg', b'xyz'], dtype=object)

In [53]:
def text_from_ids(ids):
  """Turn token ids back into text.

  Looks up the character for every id with `chars_from_ids`, then joins the
  characters along the last axis into string(s).
  """
  char_tokens = chars_from_ids(ids)
  return tf.strings.reduce_join(char_tokens, axis=-1)

In [54]:
# Encode the whole corpus: split it into characters, then map each one to its id.
all_ids = ids_from_chars(tf.strings.unicode_split(text,'UTF-8'))
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([20, 49, 58, ..., 47, 10,  2])>

In [55]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [56]:
# Preview the first ten elements of the id dataset as characters.
# A dedicated loop variable is used on purpose: looping with `ids` would
# overwrite the example `ids` RaggedTensor created earlier, so re-running the
# earlier `chars_from_ids(ids)` cell would no longer reproduce its output.
for char_id in ids_dataset.take(10):
  print(chars_from_ids(char_id).numpy().decode('utf-8'))

F
i
r
s
t
 
C
i
t
i


In [57]:
len(text)

1115394

In [58]:
# Each training chunk holds seq_length + 1 characters: the input and the target
# are the same chunk offset by one position (see split_input_target).
seq_length = 100
# Number of whole (seq_length + 1)-character chunks available in the corpus.
example_per_epoch =len(text)//(seq_length+1)

In [59]:
# Group the stream of character ids into fixed chunks of seq_length + 1 ids;
# drop_remainder=True discards a final chunk that would be shorter than that.
sequences = ids_dataset.batch(seq_length+1,drop_remainder=True)

# Show the first chunk as individual characters.
for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)


In [60]:
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
b"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
b'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [61]:
def split_input_target(sequences):
  """Split one sequence into an (input, target) pair offset by one step.

  The input is the sequence without its last element and the target is the
  sequence without its first element, so target[i] is the element that
  follows input[i] in the original sequence.
  """
  return sequences[:-1], sequences[1:]


In [62]:
split_input_target(list('Python'))

(['P', 'y', 't', 'h', 'o'], ['y', 't', 'h', 'o', 'n'])

In [63]:
dataset = sequences.map(split_input_target)

In [64]:
for input_example, target_example in dataset.take(1):
  print("Input :", text_from_ids(input_example).numpy())
  print("Target:", text_from_ids(target_example).numpy())

Input : b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target: b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [65]:
# Number of (input, target) sequence pairs per training batch.
BATCH_SIZE = 64

# Size of the buffer handed to Dataset.shuffle when the training pipeline is built.
BUFFER_SIZE = 10000


In [66]:
# Build the training pipeline from `sequences` instead of from the previous
# value of `dataset`. The original `dataset = dataset.shuffle(...).batch(...)`
# was self-referential, so running the cell a second time would batch an
# already-batched dataset. Starting from `sequences` makes the cell idempotent
# and yields the same pipeline on the first run.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [67]:
dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [68]:
# Size of the id space produced by `ids_from_chars`. This is larger than
# len(vocab): the lookup layer's vocabulary holds extra entries besides the
# corpus characters (the model output has 67 classes for 65 unique characters).
# Using len(vocab) here would be too small for an Embedding/Dense layer.
vocab_size = len(ids_from_chars.get_vocabulary())
embedding_dim = 256
rnn_units = 1024

In [69]:
class MyModel(tf.keras.Model):
  """Character-level next-token model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: Number of distinct token ids. Used as the embedding input
      size and as the width of the output logits.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of units in the GRU layer.
  """

  def __init__(self,vocab_size,embedding_dim,rnn_units):
    # Fix: the original called super().__init__(self), which passes the
    # instance to tf.keras.Model.__init__ as a stray positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size,embedding_dim)
    # return_sequences=True yields an output for every time step;
    # return_state=True additionally returns the final GRU state so that
    # callers can feed it back in on the next call.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    # No activation: the layer emits raw logits, one per vocabulary entry.
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self,inputs,states=None,return_state=False,training=False):
    """Run the model on a batch of token ids.

    Args:
      inputs: Token ids to embed.
      states: Initial GRU state; when None, the GRU's own initial state is used.
      return_state: If True, also return the final GRU state.
      training: Forwarded to every layer.

    Returns:
      The logits, or a `(logits, states)` tuple when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x,training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x,states = self.gru(x, initial_state=states,training=training)
    x = self.dense(x,training=training)
    if return_state:
      return x,states
    else:
      return x

In [70]:
# Instantiate the model. The vocabulary size comes from the lookup layer so the
# embedding and output layers cover every id the layer can emit.
model = MyModel(
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)

In [71]:
# Sanity check: run a single batch through the model and inspect the output
# shape. The batch variables are kept for the following cells.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions  = model(input_example_batch)
  print(example_batch_predictions.shape)

(64, 100, 67)


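* **64** is the batch size (`BATCH_SIZE`)
* **100** is the sequence length (`seq_length`)
* **67** is the model's vocabulary size: the 65 unique characters plus the `''` and `[UNK]` entries in the `StringLookup` vocabulary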

In [72]:
model.summary()

Model: "my_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  17152     
_________________________________________________________________
gru_1 (GRU)                  multiple                  3938304   
_________________________________________________________________
dense_1 (Dense)              multiple                  68675     
Total params: 4,024,131
Trainable params: 4,024,131
Non-trainable params: 0
_________________________________________________________________


In [73]:
# Draw one id per time step from the first example's logits; this samples from
# the predicted distribution instead of taking the most likely id.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert the result to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [74]:
sampled_indices

array([61, 34, 53,  7, 42, 54, 14, 12,  8, 58, 45, 27, 23, 62, 52, 23, 26,
       13, 38, 63, 64, 54, 27, 54, 18, 14, 60, 65, 61, 32, 29, 52, 19, 19,
       27, 41, 39, 58, 36, 63,  2, 18,  5, 43, 59, 11, 44, 38, 65, 55,  8,
       62, 33, 44, 64, 13, 39, 55, 45, 35, 38, 61, 39, 53, 51, 10, 62, 53,
       42, 63, 10, 44, 41, 45, 23, 20, 23, 45, 65, 21, 11, 20, 38, 65, 66,
       30,  9,  8, 54, 39,  3, 11, 66, 36,  0,  8,  1, 45, 32, 66])

In [75]:
print("Input \n",text_from_ids(input_example_batch[0].numpy()))

Input 
 tf.Tensor(b"t wash'd\nMy nose that bled, or foil'd some debile wretch.--\nWhich, without note, here's many else ha", shape=(), dtype=string)


In [76]:
print("Next Char Predication:\n",text_from_ids(sampled_indices).numpy())

Next Char Predication:
 b"uTm'bn?:,reMIvlIL;XwxnMnD?tyuROlEEMaYrVw\nD$cs3dXyo,vSdx;YoeUXuYmk.vmbw.daeIFIeyG3FXyzP-,nY 3zV,[UNK]eRz"


In [77]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [79]:
# Evaluate the loss on the example batch; the model has not been trained yet
# at this point in the notebook.
example_batch_loss  = loss(target_example_batch,example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()

In [80]:
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

Prediction shape:  (64, 100, 67)  # (batch_size, sequence_length, vocab_size)
Mean loss:         4.2058086


In [81]:
tf.exp(mean_loss).numpy()

67.074814

In [82]:
model.compile(optimizer='adam',loss=loss)

In [83]:
# Directory that will hold the checkpoint files.
checkpoint_dir = './training_checkpoints'
# The path keeps a literal `{epoch}` placeholder for the callback to fill in.
checkpoint_prefix = os.path.join(checkpoint_dir,"ckpt_{epoch}")
# save_weights_only=True: checkpoint the weights only, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [84]:
EPOCHS =2

In [85]:
history = model.fit(dataset,epochs=EPOCHS,
                    callbacks=[checkpoint_callback])

Epoch 1/2
Epoch 2/2


### Generate text
![](https://www.tensorflow.org/tutorials/text/images/text_generation_sampling.png)

In [86]:
class OneStep(tf.keras.Model):
  """Wraps a trained model so that each call samples one next character.

  Args:
    model: Model called with `inputs=`, `states=` and `return_state=True`;
      it must return `(logits, states)`.
    chars_from_ids: Lookup layer mapping token ids to characters.
    ids_from_chars: Lookup layer mapping characters to token ids.
    temperature: Divisor applied to the logits before sampling. Values below 1
      sharpen the distribution, values above 1 flatten it.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature=temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "" or "[UNK]" from being generated.
    # `[:, None]` adds a trailing axis so each id forms one index row below.
    skip_ids = self.ids_from_chars(['','[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices = skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())]) 
    # Dense vector: -inf at the skipped ids and 0 elsewhere, so adding it to the
    # logits only affects the skipped ids.
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: String tensor of prompt text; it is split into UTF-8 characters
        and converted to ids before being passed to the model.
      states: State returned by a previous call, or None. It is forwarded to
        the model unchanged.

    Returns:
      A `(predicted_chars, states)` tuple: the sampled next character for each
      input and the state returned by the model.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits] 
    predicted_logits, states =  self.model(inputs=input_ids, states=states, 
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "" or "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [87]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [93]:
# Generate 1000 characters from a single prompt and time the run.
start = time.time()
states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]

# Feed each sampled character and the returned state back into the next step.
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the prompt and all generated pieces element-wise.
result = tf.strings.join(result)
end = time.time()

print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)

print(f"\nRun time: {end - start}")

ROMEO:
Haver, and herred, then hels a friss this hamp hepperourep;
Aroon make God my freen ag that ple wauld Lifter?

Gord:
Ay, to thes I Grom the not then I
To I kand bet theme
Fouthio, faild alluf, the lealy.

MORDANEE:
Foo, by tonquid med, Ay sweat Nor?

KENINSIT:
My to the kind.

merd Ad one way thouss know, in os I have say be off to whe inceres and that?
UMes you savee haw
Kivese enflate or our thesf things Yor wos longe anturd
I
seall and my wainger med ree foo frem fir my kning;
my 'tis and once come ol cheed,
Toll the satatrence aven you.

Secon.
The, where allot-gonder-Chatt hurd make.

CAMIO:
Fare theor have soon fear he is this unis;
Ay preate, preat chomour have comeming.

VARYY:
If ivond Hist kime to thy seeph the king
Yot hear ublid-be outange corded
As thou'd so vist it him the raves: my los:
Wicluch ravore in wry ferter.

First My Clitenty of I
Am all gild bum flow I I love,
I to make is I smecken it a plock:
The causeres a diny the this fanty you nom.

First:
Wh tramb

In [94]:
# Same generation loop as for a single prompt, but with a batch of five
# identical prompts, so five texts are generated in parallel.
start = time.time()
states = None
next_char = tf.constant(['ROMEO:', 'ROMEO:', 'ROMEO:', 'ROMEO:', 'ROMEO:'])
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()

# Print the whole batch tensor rather than a single decoded element.
print(result, '\n\n' + '_'*80)


print(f"\nRun time: {end - start}")

tf.Tensor(
[b"ROMEO:\nMaight in this non?\n\nThes Lirhim,\ntheer payse Masceme of my lets.\n's not icloved thou abin\nWesoush him blows, fere in hig dishag,\nThengefoul seast that ie the un preatus that in you,\nThee! frise these out tiret, lots camo.\n\nLADY CIONCEMHA:\nWhow, ceford's ale!\n\nAULOLENE: shey dad: it? What kenglenfarrass me; by ty ane\nTo eny sence mo juete my lende\nTo trow pasty hear dadse try qoue'e wail;\nThrwe! say, 'tin I lo,\nFor sen all agh thater:\nThe kinds of the slole af the eremous beo, of thoudd,\nHitht by thou papfot?\n\nJlAULENS:\nMy thou agan themel if is anch'd will of Lo, cho packs,\nAnd ame to mask not\nThat a patrery to mor, in shemely of thak tur\nin granken dy ustersed's mastor thim\nFrempece,\nMowhank, I may be come son, ho my, bone Beart a farte.\n\nCARUME:\nIt in anvithouse\nbul nourd wifl Vaistir!\n\nPirilt Cameling\napore sluen's loath ar' in erfor;\nAnd that! ar censte was she to my, lose you sleign?\n\nANCELYO:\n\nHers:\nFad lisiosss stall 

In [95]:
tf.saved_model.save(one_step_model,'one_step')






INFO:tensorflow:Assets written to: one_step/assets


INFO:tensorflow:Assets written to: one_step/assets


In [96]:
one_step_reloaded = tf.saved_model.load('one_step')

In [97]:
# Check that the reloaded SavedModel can generate text: run 100 steps with the
# same feed-back loop used for the in-memory model.
states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]

for n in range(100):
  next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
  result.append(next_char)
print(tf.strings.join(result)[0].numpy().decode("utf-8"))









ROMEO:
Mastingt uthe to miperort?

JUCILIF:
Clids then my frat os a mard.
Go sorblecck? Which sir's fallic
