<a href="https://colab.research.google.com/github/sharad28/Fun_implementation/blob/main/NLP/text_generation/text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generation using RNN

In [1]:
import tensorflow as tf
import numpy as np
import os 
import time

In [2]:
# Create the local "data" directory; exist_ok=True makes re-running this cell harmless.
os.makedirs("data",exist_ok=True)

In [4]:
# dataset_file_name = 'shakespeare.txt'
# dataset_file_origin = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
# dataset_path = os.path.join("data",dataset_file_name)

In [5]:
!curl https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt > './data/shakespeare.txt'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 1089k  100 1089k    0     0  8189k      0 --:--:-- --:--:-- --:--:-- 8189k


In [6]:
class config:
  """Settings shared by the notebook cells: file locations and hyper-parameters."""

  # --- data ---
  path_to_file = os.path.join("data", "shakespeare.txt")
  seq_length = 100       # the character stream is cut into chunks of seq_length + 1

  # --- input pipeline ---
  batch_size = 64
  buffer_size = 10_000   # passed to Dataset.shuffle

  # --- model ---
  embedding_dim = 256
  rnn_units = 1024

  # --- training ---
  EPOCHS = 30
  checkpoint_dir = "./training_ckpt"

In [7]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving an open handle behind in the kernel.
with open(config.path_to_file, "rb") as corpus_file:
  text = corpus_file.read().decode(encoding='utf-8')

# Preview the first 100 characters.
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [8]:
# Distinct characters of the corpus, sorted so the ordering is deterministic.
vocab = sorted(set(text))
config.vocab = vocab

# Kept as the last expression so the vocabulary size is actually displayed
# (a bare expression in the middle of a cell is silently discarded).
len(vocab)

In [9]:
# Lookup table: character -> integer id (its position in the sorted vocab).
char2idx = dict(zip(vocab, range(len(vocab))))
char2idx

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [10]:
# Inspect the (character, index) pairs that the next cell swaps to build idx2char.
char2idx.items()

dict_items([('\n', 0), (' ', 1), ('!', 2), ('$', 3), ('&', 4), ("'", 5), (',', 6), ('-', 7), ('.', 8), ('3', 9), (':', 10), (';', 11), ('?', 12), ('A', 13), ('B', 14), ('C', 15), ('D', 16), ('E', 17), ('F', 18), ('G', 19), ('H', 20), ('I', 21), ('J', 22), ('K', 23), ('L', 24), ('M', 25), ('N', 26), ('O', 27), ('P', 28), ('Q', 29), ('R', 30), ('S', 31), ('T', 32), ('U', 33), ('V', 34), ('W', 35), ('X', 36), ('Y', 37), ('Z', 38), ('a', 39), ('b', 40), ('c', 41), ('d', 42), ('e', 43), ('f', 44), ('g', 45), ('h', 46), ('i', 47), ('j', 48), ('k', 49), ('l', 50), ('m', 51), ('n', 52), ('o', 53), ('p', 54), ('q', 55), ('r', 56), ('s', 57), ('t', 58), ('u', 59), ('v', 60), ('w', 61), ('x', 62), ('y', 63), ('z', 64)])

In [11]:
# Inverse lookup: integer id -> character, built by swapping each pair of char2idx.
idx2char = dict(zip(char2idx.values(), char2idx.keys()))
idx2char

{0: '\n',
 1: ' ',
 2: '!',
 3: '$',
 4: '&',
 5: "'",
 6: ',',
 7: '-',
 8: '.',
 9: '3',
 10: ':',
 11: ';',
 12: '?',
 13: 'A',
 14: 'B',
 15: 'C',
 16: 'D',
 17: 'E',
 18: 'F',
 19: 'G',
 20: 'H',
 21: 'I',
 22: 'J',
 23: 'K',
 24: 'L',
 25: 'M',
 26: 'N',
 27: 'O',
 28: 'P',
 29: 'Q',
 30: 'R',
 31: 'S',
 32: 'T',
 33: 'U',
 34: 'V',
 35: 'W',
 36: 'X',
 37: 'Y',
 38: 'Z',
 39: 'a',
 40: 'b',
 41: 'c',
 42: 'd',
 43: 'e',
 44: 'f',
 45: 'g',
 46: 'h',
 47: 'i',
 48: 'j',
 49: 'k',
 50: 'l',
 51: 'm',
 52: 'n',
 53: 'o',
 54: 'p',
 55: 'q',
 56: 'r',
 57: 's',
 58: 't',
 59: 'u',
 60: 'v',
 61: 'w',
 62: 'x',
 63: 'y',
 64: 'z'}

In [12]:
# Encode the whole corpus as an array of integer ids, one id per character.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

# Show the first 13 characters next to their ids.
text[:13], text_as_int[:13]

('First Citizen', array([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52]))

In [13]:
# Number of complete (seq_length + 1)-character chunks that fit in the corpus.
examples_per_epoch = divmod(len(text), config.seq_length + 1)[0]
examples_per_epoch

11043

In [14]:
# One dataset element per encoded character.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Decode and print the first ten elements, one character per line.
for char_id in char_dataset.take(10):
    print(idx2char[char_id.numpy()])


F
i
r
s
t
 
C
i
t
i


In [15]:
# Group the character stream into chunks of seq_length + 1 ids; an incomplete
# trailing chunk is dropped.
sequences = char_dataset.batch(config.seq_length + 1, drop_remainder=True)

# Decode and print the first five chunks. The loop variable keeps the name
# `item` because the following cell reads it after the loop has finished.
for item in sequences.take(5):
  print("".join(idx2char[token] for token in item.numpy()))

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k
now Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us ki
ll him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be d
one: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citi


In [16]:
"".join([idx2char[i] for i in item.numpy()])

'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'

In [17]:
def split_input_target(chunk):
  """Turn one chunk into an (input, target) pair shifted by one position.

  The input is `chunk` without its last element and the target is `chunk`
  without its first element, so target[k] is the element following input[k].

  Args:
    chunk: any sliceable sequence (the notebook maps this over `sequences`).

  Returns:
    A tuple `(input_text, target_text)`.
  """
  return chunk[:-1], chunk[1:]

# Apply split_input_target to every chunk, yielding (input, target) pairs.
dataset = sequences.map(split_input_target)

In [18]:
# Decode the first (input, target) pair back to text to confirm that the
# target is the input shifted by one character.
for input_example, target_example in dataset.take(1):
  print(
      'input character:',
      "".join(idx2char[i] for i in input_example.numpy()),
      '',
      'target character:',
      "".join(idx2char[i] for i in target_example.numpy()),
      sep='\n',
  )

input character:
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

target character:
irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


In [19]:
# Build the training pipeline from `sequences` instead of rebinding `dataset`
# from itself: the cell is now idempotent, so re-running it no longer
# shuffles/batches a dataset that was already batched by a previous run.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(buffer_size=config.buffer_size)
    .batch(config.batch_size, drop_remainder=True)  # drop the incomplete final batch
)
dataset

<BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [24]:
# Number of distinct characters; stored on config and later passed to build_model.
config.vocab_size = len(vocab)
config.vocab_size

65

In [25]:
def build_model(vocab_size, embedding_dim, rnn_unit,batch_size):
  """Assemble the character model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: number of distinct ids; used as the embedding input size and
      as the width of the final Dense layer.
    embedding_dim: size of each embedding vector.
    rnn_unit: number of units in the GRU layer.
    batch_size: batch dimension fixed into the model's input shape; the
      sequence dimension is left as None.

  Returns:
    A `tf.keras.Sequential` model made of the three layers above.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None],
  )
  # return_sequences=True keeps one output per time step; stateful=True is
  # why the batch dimension is fixed in the input shape above.
  recurrent = tf.keras.layers.GRU(
      rnn_unit,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform',
  )
  # One output value per vocabulary entry at every time step.
  projection = tf.keras.layers.Dense(vocab_size)

  return tf.keras.Sequential([embedding, recurrent, projection])


In [26]:
# Instantiate the network with the sizes collected on `config`
# (argument order: vocab_size, embedding_dim, rnn_unit, batch_size).
model = build_model(
    config.vocab_size,
    config.embedding_dim,
    config.rnn_units,
    config.batch_size,
)

In [28]:
# Print each layer's type, output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 gru (GRU)                   (64, None, 1024)          3938304   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________
