In [1]:
import os
import gc
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from datetime import datetime, timedelta

import tensorflow as tf

parent_dir = os.path.abspath('..')
sys.path.insert(0, parent_dir)
from utils.helper import fn_plot_tf_hist

In [2]:
# Training-control constants.
# NOTE(review): PATIENCE, LR_FACTOR and LR_PATIENCE are not referenced by any
# cell visible here -- presumably intended for early-stopping / learning-rate
# reduction callbacks; confirm or remove.
PATIENCE, LR_PATIENCE = 20, 5
LR_FACTOR = 0.2

# Buffer size handed to the tf.data shuffle step.
BUFFER_SIZE = 10_000

In [3]:
# Input, output and model folders, all relative to the notebook's parent directory.
inpDir = os.path.join(os.pardir, 'input')
outDir = '../output'
modelDir = os.path.join(os.pardir, 'model')

# Sub-folder (joined under inpDir and modelDir) and file name of the text corpus.
subDir = 'text_gen'
fileName = 'shakespeare.txt'

# Training hyper-parameters.
EPOCHS = 30        # number of epochs passed to model.fit
ALPHA = .001       # NOTE(review): presumably a learning rate; not used by the visible cells
TEST_SIZE = .2     # NOTE(review): presumably a hold-out fraction; not used by the visible cells

BATCH_SIZE = 64    # number of sequences per training batch
RANDOM_STATE = 24  # seed for reproducibility -- REMEMBER: remove at the time of promotion to production

# Seed the global NumPy and TensorFlow generators so that runs are reproducible.
# np.random.RandomState(seed) only constructs a separate generator object and
# leaves the global NumPy RNG unseeded, hence np.random.seed is used here.
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# Name of the default colour map (alternative: plt.cm.Spectral).
CMAP = 'brg'

# Global Matplotlib defaults applied to every figure in this notebook.
params = {
    'legend.fontsize': 'large',
    'figure.figsize': (15, 8),
    'axes.labelsize': 'large',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'large',
    'ytick.labelsize': 'large',
}
plt.rcParams.update(params)

## define data


In [5]:
# Full path of the corpus file: <inpDir>/<subDir>/<fileName>.
filePath = os.path.join(inpDir, subDir, fileName)

In [6]:
# Read the whole corpus as bytes and decode it to a str using UTF-8.
# The context manager closes the file handle as soon as the read finishes,
# so no open handle is left behind in the kernel.
with open(filePath, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding = 'utf-8')

len(text)  # number of characters in the corpus

1115395

In [7]:
# Preview the first 100 characters of the corpus.
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


## Character-based modelling

In [9]:
# Vocabulary: the sorted list of unique characters that occur in the text.
vocab = sorted(set(text))
len(vocab)  # number of distinct characters in the corpus

65

In [10]:
# Forward lookup: character -> integer id (its position in the sorted vocab).
char2idx = dict(zip(vocab, range(len(vocab))))

# Reverse lookup: integer id -> character. Kept as a NumPy array so that a
# whole array of ids can be decoded with one indexing operation.
idx2char = np.asarray(vocab)

In [11]:
# Encode the whole corpus: one integer id per character, where each id is the
# index of that character in `idx2char`.
text_as_int = np.array([char2idx [c] for c in text])
text_as_int.shape

(1115395,)

In [12]:
# Sanity check: show the container type of the encoded corpus, then decode its
# first id back to a character.
display(type(text_as_int))
idx2char[text_as_int[0]]

numpy.ndarray

'F'

In [13]:
# Number of characters in each model input sequence.
seq_length = 100

# How many non-overlapping chunks of (seq_length + 1) characters fit in the
# text, i.e. how many training sequences one pass over the corpus provides.
example_per_epoch = len(text) // (seq_length + 1)

# Dataset that yields the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Show the first 20 elements as "<id> : <character>".
for i in char_dataset.take(20):
    char_id = i.numpy()
    print(char_id, idx2char[char_id], sep = ' : ')
    

18 : F
47 : i
56 : r
57 : s
58 : t
1 :  
15 : C
47 : i
58 : t
47 : i
64 : z
43 : e
52 : n
10 : :
0 : 

14 : B
43 : e
44 : f
53 : o
56 : r


In [14]:
# Cut the character stream into chunks of seq_length + 1 ids (101 here);
# drop_remainder=True discards the final chunk if it is shorter than that.
sequences = char_dataset.batch(batch_size = seq_length + 1, drop_remainder = True)

# Print the first two chunks: the raw id tensor, then the text it decodes to.
for item in sequences.take(2):
    decoded = ''.join(idx2char[item.numpy()])
    print(item, repr(decoded), sep = '\n')

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int32)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int32)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [15]:
# Number of complete (seq_length + 1)-character chunks in `sequences`.
len(sequences)

11043

In [16]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    Args:
        chunk: A sliceable sequence (in this notebook, a tensor holding
            seq_length + 1 character ids).

    Returns:
        A tuple ``(input_text, target_text)`` where ``input_text`` is every
        element except the last and ``target_text`` is every element except
        the first, so ``target_text[t]`` is the element following
        ``input_text[t]``.
    """
    return chunk[:-1], chunk[1:]
# Apply split_input_target to every 101-id chunk, giving (input, target) pairs:
#   input  - the first 100 ids of the chunk;
#   target - the last 100 ids, i.e. the input shifted one character ahead.
dataset = sequences.map(split_input_target)

In [17]:
# The repr function in Python returns a string representation of an object that is designed to be unambiguous.

In [18]:
# Decode the first two (input, target) pairs to verify the one-character shift.
for inp_ex, tar_ex in dataset.take(2):
    for id_tensor in (inp_ex, tar_ex):
        print(repr(''.join(idx2char[id_tensor.numpy()])))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you '
're all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


#### preprocessing and fetching

In [20]:
# Build the training pipeline straight from `sequences` so that this cell is
# idempotent: re-running it does not shuffle and batch an already-batched
# `dataset` a second time.
#   1. map     - split every chunk into an (input, target) pair;
#   2. shuffle - mix the pairs using a buffer of BUFFER_SIZE elements;
#   3. batch   - group BATCH_SIZE pairs, dropping the final short batch.
dataset = (sequences
           .map(split_input_target)
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE, drop_remainder = True))
dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int32, name=None), TensorSpec(shape=(64, 100), dtype=tf.int32, name=None))>

In [21]:
# Model dimensions passed to build_model.
vocab_size = len(vocab)  # number of distinct characters
embedding_dim = 256  # width of the embedding vectors; author's note: powers of two generally work better
rnn_unit = 1024  # number of units in the GRU layer


## preparing model

In [23]:
def build_model(vocab_size,
                embedding_dim,
                rnn_units,
                batch_size = BATCH_SIZE):
    """Assemble the character-level GRU network (not compiled, not trained).

    Args:
        vocab_size: Number of distinct character ids; used as the embedding
            table size and as the width of the output layer.
        embedding_dim: Length of each character embedding vector.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch size declared on the input layer (the GRU is
            created with ``stateful = True``).

    Returns:
        A ``tf.keras.Sequential`` model: Input -> Embedding -> GRU -> Dense.
        The Dense layer has no activation, so each time step yields
        ``vocab_size`` raw scores.
    """
    keras_layers = tf.keras.layers

    # Variable-length sequences of ids, with the batch dimension pinned.
    inputs = keras_layers.Input(shape = (None, ), batch_size = batch_size)
    embedding = keras_layers.Embedding(vocab_size, embedding_dim)
    # return_sequences=True emits an output at every time step, not just the last.
    recurrent = keras_layers.GRU(rnn_units,
                                 return_sequences = True,
                                 stateful = True,
                                 recurrent_initializer = 'glorot_uniform')
    projection = keras_layers.Dense(vocab_size)

    return tf.keras.Sequential([inputs, embedding, recurrent, projection])

In [24]:
# Instantiate the network with the training batch size and print its layer summary.
model = build_model(vocab_size, embedding_dim, rnn_unit, batch_size = BATCH_SIZE)

model.summary()

#### unit testing

In [59]:
# Smoke test: run a single batch through the model to check that the forward
# pass works and to obtain an example prediction to inspect.
for inp_ex, tar_ex in dataset.take(1):
    ex_pred = model(inp_ex)

In [26]:
ex_pred.shape # (64: batch size, 100: time steps, 65: one score per vocabulary character)

TensorShape([64, 100, 65])

In [27]:
### why we sample

In [28]:
# Draw one character id per time step for the first example of the batch,
# treating each row of ex_pred[0] as unnormalised log-probabilities (logits).
sample_indices = tf.random.categorical(ex_pred[0], num_samples = 1)
print(sample_indices.shape)

(100, 1)


In [29]:
# Drop the trailing length-1 axis and convert to a NumPy array: (100, 1) -> (100,).
sample_indices = tf.squeeze(sample_indices, axis = -1).numpy()

In [30]:
sample_indices.shape  # one sampled id per time step

(100,)

In [63]:
# Show the sampled ids and the text they decode to. This cell sits before the
# training section, so the decoded string is expected to look random.
display(sample_indices)
print(repr(''.join(idx2char[sample_indices])))

array([ 6,  7, 41, 15, 39, 39, 10, 36, 52, 19, 31, 43, 61, 16, 49,  3, 44,
        7,  5, 30, 43, 14, 16, 36, 21, 61, 53, 25, 64, 49, 53, 60, 27, 48,
       45, 35, 12, 53, 38, 26, 12, 35, 49,  1, 27, 58, 23, 59, 10, 15,  5,
       46, 43, 48, 57, 23, 60, 15, 32, 53, 21,  8,  1, 55, 52, 27, 56, 59,
       48, 55, 57,  0, 16, 48, 20, 36, 22, 55, 21, 55, 23, 30, 16, 32,  0,
       50, 23,  4, 46, 45, 63, 28,  3,  5, 56, 64, 11, 57, 43,  4],
      dtype=int64)

",-cCaa:XnGSewDk$f-'ReBDXIwoMzkovOjgW?oZN?Wk OtKu:C'hejsKvCToI. qnOrujqs\nDjHXJqIqKRDT\nlK&hgyP$'rz;se&"


### training model

In [67]:
# The model's Dense output layer has no activation, so the loss is told to
# expect raw logits (from_logits = True); the targets are integer character ids.
loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits = True)
model.compile (optimizer = 'adam',
               loss = loss_fn,
               metrics = ['accuracy'])

In [89]:
# Checkpoints are written under <modelDir>/<subDir>.
chkPath = os.path.join(modelDir, subDir)

# File-name template for the saved models; the '{epoch}' placeholder is filled
# in by the callback when it saves.
chkPtPrefix = os.path.join(chkPath, 'chkpt_{epoch}.keras')

# Callback handed to model.fit to write the checkpoints.
chkpt_callback = tf.keras.callbacks.ModelCheckpoint(chkPtPrefix)

In [None]:
# Train for EPOCHS epochs on the batched dataset with the checkpoint callback
# attached; the object returned by fit is kept in `history` for plotting.
history = model.fit(dataset, epochs = EPOCHS,
                    callbacks = [chkpt_callback],
                    verbose = 1)

Epoch 1/30
[1m 73/172[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m1:38[0m 994ms/step - accuracy: 0.1736 - loss: 3.6055

In [81]:
# Collect the per-epoch metrics recorded by model.fit into a DataFrame and
# plot them with the project's helper. Requires the training cell to have
# finished so that `history` exists.
loss_df = pd.DataFrame(history.history)
fn_plot_tf_hist(loss_df)

NameError: name 'history' is not defined