In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

## Text Generation using RNN

### Import TensorFlow and other libraries

In [2]:
# Lets import some libraries
import os
import time
import datetime
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import tensorflow as tf

%matplotlib inline




In [3]:
def fn_verify_dir(_path):
    '''
    Make sure a location exists on disk, creating it as a folder when absent.

    Arg:
        _path: location to check
    returns:
        None; prints whether the path was already there or a folder was created
    '''
    already_there = os.path.exists(_path) # true for an existing file as well as a folder

    if not already_there:

        os.makedirs(_path) # also creates any missing parent folders

        print("Created folder : ", _path)

        return

    print(_path, ' exists') # nothing to create, just tell the user

In [4]:
# Some basic parameters

inpDir = '../input' # location where input data is stored
outDir = '../output' # location to store outputs
modelDir = '../models' # location to store models
subDir = 'text_gen' # sub-folder joined onto inpDir / modelDir for this notebook's files


RANDOM_STATE = 24 # for initialization ----- REMEMBER: to remove at the time of promotion to production

np.random.seed(RANDOM_STATE) # Set Random Seed for reproducible  results

# NumPy's seed does not reach TensorFlow: weight initialisation, dataset shuffling and
# tf.random.categorical sampling all use TensorFlow's own generator, so seed it as well
tf.random.set_seed(RANDOM_STATE)

BATCH_SIZE = 64

EPOCHS = 50 # number of cycles to run

ALPHA = 0.1 # learning rate -- NOTE(review): not passed to any optimizer in this notebook; model.compile is given the string 'adam'

In [5]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
# The setting is applied to every visible GPU: TensorFlow expects memory growth to be
# the same on all of them, so enabling it on the first device only fails on multi-GPU hosts.
physical_devices = tf.config.list_physical_devices('GPU')

for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

### Reading Shakespeare dataset

In [6]:
filePath = os.path.join(inpDir, subDir, 'shakespeare.txt')
filePath

'../input\\text_gen\\shakespeare.txt'

In [7]:
# Read the corpus as raw bytes and decode it as UTF-8.
# The context manager closes the file handle as soon as the read is done.
with open(filePath, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

len(text)

1115395

In [8]:
print(text[:400])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it 


In [9]:
#character level vocabulary

vocab = sorted(set(text))
len(vocab)

65

In [10]:
#vocab

In [11]:
# Give every character of the vocabulary an integer id

# forward lookup: character -> its position in the sorted vocabulary
char2idx = dict(zip(vocab, range(len(vocab))))

# reverse lookup: integer id -> character (kept as an array so it can be indexed with arrays of ids)
idx2char = np.array(vocab)

# the whole corpus re-expressed as integer ids
text_as_int = np.array([char2idx[ch] for ch in text])

text_as_int.shape

(1115395,)

In [12]:
text_as_int

array([18, 47, 56, ...,  8,  0,  0])

In [13]:
type(text_as_int)

numpy.ndarray

In [14]:
text_as_int.shape

(1115395,)

In [15]:
idx2char[47]

'i'

In [16]:
char2idx

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [17]:
dataset = tf.data.Dataset.from_tensor_slices([1.,2.,3.])

print (list(dataset.as_numpy_iterator()))

[1.0, 2.0, 3.0]


In [18]:
print(text[:10], text[1:11])

First Citi irst Citiz


In [19]:
seq_length = 100

# every chunk carries seq_length characters plus one extra, needed for the shifted target
example_per_epoch = len(text) // (seq_length+1)

# dataset that yields the corpus one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# peek at the first ten ids next to the characters they stand for
for char_tensor in char_dataset.take(10):

    char_id = char_tensor.numpy()

    print (char_id, '|', idx2char[char_id])

18 | F
47 | i
56 | r
57 | s
58 | t
1 |  
15 | C
47 | i
58 | t
47 | i


In [20]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True) #sequence lenght in batches and dropping remaining characters

for item in sequences.take(2):    #if take(2)-> two tensors 
    
    print (item)

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int32)
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int32)


In [21]:
#converting back to characters to see sequence
for item in sequences.take(2):
    
    print (repr( ''.join(idx2char[item.numpy()] ) ) )   

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [22]:
#converting sequences into input and ouput text

def split_input_target(chunk):
    '''
    Turn one chunk into an (input, target) pair offset by a single position.

    Arg:
        chunk: sliceable sequence
    returns:
        (chunk without its last element, chunk without its first element)
    '''
    all_but_last, all_but_first = slice(None, -1), slice(1, None)

    return chunk[all_but_last], chunk[all_but_first]

dataset = sequences.map(split_input_target)  #data mapping is complete at this step

In [23]:
for inp_ex, tar_ex in dataset.take (2):
    print (repr( ''.join(idx2char[inp_ex.numpy()] ) ))
    print (repr( ''.join(idx2char[tar_ex.numpy()] ) ))
    print ('*'*50, '\n')

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
************************************************** 

'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you '
're all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
************************************************** 



In [24]:
# Shuffle the (input, target) pairs and group them into training batches.
# shuffle() fills a buffer of BUFFER_SIZE elements and samples from it, so the
# data is shuffled without holding every sequence in memory at once.

BUFFER_SIZE = 10000

# Build the pipeline from `sequences` rather than from `dataset` itself, so running
# this cell a second time does not batch an already batched dataset
dataset = (sequences
           .map(split_input_target)
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE, drop_remainder=True))

dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int32, name=None), TensorSpec(shape=(64, 100), dtype=tf.int32, name=None))>

In [25]:
vocab_size = len(vocab)

embedding_dim = 256     #embedding need to be tuned to create relation

rnn_units = 1024


In [26]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    '''
    Assemble the character-level network: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: number of distinct characters; used as the Embedding input
                    size and as the width of the final Dense layer
        embedding_dim: length of the embedding vector of each character
        rnn_units: number of units in the GRU layer
        batch_size: fixed batch dimension written into the model's input shape
    returns:
        an uncompiled Sequential model; the Dense layer has no activation, so it
        emits one raw score per vocabulary entry for every time step
    '''
    # character ids -> dense vectors; the time axis is left open (None)
    embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                                embedding_dim,
                                                batch_input_shape=[batch_size, None])

    # returns an output for every time step and is created as a stateful layer
    recurrent_layer = tf.keras.layers.GRU(rnn_units,
                                          return_sequences=True,
                                          stateful=True,
                                          recurrent_initializer='glorot_uniform')

    # projects each GRU output onto vocab_size scores
    output_layer = tf.keras.layers.Dense(vocab_size)

    return tf.keras.models.Sequential([embedding_layer, recurrent_layer, output_layer])

## Model without training (predictions from the untrained network)

In [27]:

# vocab_size, embedding_dim, rnn_units, batch_size
model = build_model(vocab_size= len(vocab), 
                    embedding_dim=embedding_dim, 
                    rnn_units = rnn_units,
                    batch_size= BATCH_SIZE)




In [28]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 gru (GRU)                   (64, None, 1024)          3938304   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [29]:
for input_ex_batch, target_ex_batch in dataset.take(1):
    ex_batch_pred = model(input_ex_batch)

ex_batch_pred

<tf.Tensor: shape=(64, 100, 65), dtype=float32, numpy=
array([[[-7.71450205e-03,  1.18302181e-02,  2.66546849e-04, ...,
         -7.36422045e-03, -6.56804396e-03, -2.72386335e-03],
        [ 3.07571143e-03,  4.23251046e-03, -8.56857561e-03, ...,
          9.44747869e-03,  5.40684676e-04,  2.01759534e-03],
        [ 8.26581381e-03,  1.88353881e-02, -1.82907488e-02, ...,
          4.60136682e-03,  6.64701918e-03, -1.03462860e-02],
        ...,
        [-4.96507902e-03,  6.07218593e-03,  2.36177444e-02, ...,
         -1.13936439e-02, -1.64370649e-02, -5.02436282e-03],
        [-3.12405569e-03, -4.86289710e-03,  1.78474188e-02, ...,
          7.91952200e-03, -1.31450510e-02,  1.72061846e-05],
        [-2.17728270e-03, -2.87159067e-03,  8.94669071e-03, ...,
          1.17589831e-02, -8.97912309e-03,  1.05955526e-02]],

       [[ 1.10546816e-02, -7.17321876e-03,  2.05308199e-03, ...,
         -9.39912070e-03, -4.67778649e-03, -1.08755995e-02],
        [ 3.53640947e-03, -9.53755341e-03,  4.18

In [30]:
ex_batch_pred.shape  #64 is batchsize  , timesteps are 100,  features are 65

TensorShape([64, 100, 65])

In [31]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 gru (GRU)                   (64, None, 1024)          3938304   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Model with training

In [32]:
#from 3d to 2d
sampled_indices = tf.random.categorical(ex_batch_pred[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis = -1).numpy()

sampled_indices

array([59,  3, 16,  6, 49, 33,  0, 61,  1, 61, 61, 35, 21, 26, 39, 32, 18,
       32,  5,  0, 56, 41, 46, 52,  1,  7, 48,  9, 51,  8,  5, 50, 58,  0,
       54, 43, 10, 22, 54, 10, 38, 48, 16, 19, 38, 62, 60, 10, 64,  9,  3,
       39, 44, 49, 37, 28, 19, 31,  6, 11, 15, 46, 19, 28, 44,  2,  9,  2,
       50,  1, 32, 22,  9, 27, 21, 21, 55,  7, 28, 13, 32, 52, 14,  3, 22,
       32, 63, 46, 60, 11, 17, 23,  0, 41, 53, 37,  7, 25,  1, 31],
      dtype=int64)

In [33]:
loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [34]:
model.compile(optimizer = 'adam', loss=loss_fn)  #compiling model with adam




In [35]:
# Checkpointing: the callback writes the model weights during training;
# '{epoch}' in the file prefix is filled in with the epoch number
chkPtPath = os.path.join(modelDir, subDir)

chkPtPrefix = os.path.join(chkPtPath, 'chkpt_{epoch}')

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=chkPtPrefix,
    save_weights_only=True, # store the weights only, not the full model
)

In [36]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [37]:
tf.train.latest_checkpoint(chkPtPath)

'../models\\text_gen\\chkpt_50'

In [38]:
# Build the model once again with a batch size of 1 so text can be generated
# from a single sequence, then restore the weights saved during training

latest_chkpt = tf.train.latest_checkpoint(chkPtPath)

# latest_checkpoint returns None when no checkpoint is found; stop with a clear message
# instead of letting load_weights fail on a None path
if latest_chkpt is None:
    raise FileNotFoundError(f'No checkpoint found in {chkPtPath}; run the training cell first')

model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_chkpt)

model.build ( tf.TensorShape ( [1, None ] ) )

In [39]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (1, None, 256)            16640     
                                                                 
 gru_1 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_1 (Dense)             (1, None, 65)             66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [40]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    '''
    Generate text one character at a time, starting from a seed string.

    Args:
        model: stateful model built for a batch size of 1; called with a
               (1, n) tensor of character ids, it returns scores per time step
        start_string: seed text; every character must be a key of char2idx
        num_generate: number of characters to generate (default 1000)
        temperature: divisor applied to the scores before sampling; 1.0 leaves
                     them unchanged, smaller values make the output more
                     predictable, larger values more random
    returns:
        start_string followed by the generated characters
    raises:
        ValueError: if start_string is empty or temperature is not positive
        KeyError: if start_string holds a character that is not in char2idx
    '''
    if not start_string:
        raise ValueError('start_string must contain at least one character')

    if temperature <= 0:
        raise ValueError('temperature must be greater than zero')

    input_eval = [char2idx[s] for s in start_string] # characters -> integer ids
    print (f'Input: {start_string} | {input_eval}\n')
    input_eval = tf.expand_dims(input_eval, 0) # add the batch axis: shape (1, len(start_string))
    text_generated = []

    model.reset_states()  # start from a clean recurrent state

    for _ in range(num_generate):

        predictions = model(input_eval)

        # drop the batch axis and rescale the scores before sampling
        predictions = tf.squeeze(predictions, 0) / temperature

        # sample one id per time step and keep the one for the last time step
        predicted_id = tf.random.categorical(predictions,
                                             num_samples=1)[-1,0].numpy()

        # the sampled id is the only input of the next step; the earlier
        # context lives in the state of the stateful GRU
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id]) # id -> character

    return start_string+''.join(text_generated)

In [41]:
print (generate_text(model, start_string=u'ROMEO:'))

Input: ROMEO: | [30, 27, 25, 17, 27, 10]

ROMEO:
Why, what's the marther
They not hear him health of sweet beauty,
To stop these forthtiolitage.

ROMEO:
It was the infant troop?

BIANCA:
Why, then you please to end whose rod esched;
Who for this controvate will I be
Of 'twere not by judge: they love Julius 'gainst the field
II lesser his smiles lord: it is not known
Whether you can him
Approach'd coloces here; this is a miracle,
I, not my company, stealing how can heaven bid Bolingbroke
and more than you or I yet shues this is the royalties enjoy
your estate and state and bring out justice,--

AUFIDIUS:
That I were slain!
Away to them, and in that save beats;
Mow's would have beat the man that hath the woe,
How should they follow upon
A Fresh shook way: die? Thou'rt a friendstund caps and child
Did not beck fair march in your velvate
Dideafter to reside scandage of the western shames,
And let us any inferr's sun;
Not sweat is nothing of the adverse remorse
To win her severel thou art.
