In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Character-level language modeling using LSTMs 

@Author: Sameer Kesava
          * TensorFlow version: 2.0.0-alpha
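          * Training data: "Tragedy of Hamlet" https://www.gutenberg.org/cache/epub/1787/pg1787.txt (note: the loading cell below reads 'pg2265_noheader.txt', which by its name appears to be Project Gutenberg eBook 2265 with the header removed - verify that the URL matches the file actually used)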
          * 1 custom layer converting the input uint8 type data into one-hot float32 type categorical data
          * 1 LSTM layer with 1024 units with Stateful = True 
          * 1 Dense layer with the number of units equal to the number of unique characters
          * Number of time steps/Sequence Length: 100  
          * Dropout: 0      
          * Categorical Cross Entropy Loss function    
          * Adam Optimizer(learning rate: .001)
          * Batch size: 64
          * Epochs: 100
          * Categorical accuracy: 99.95%
          * Prediction Function 1 uses tf.argmax for the next character
          * Prediction Function 2 uses tf.random.categorical for the next character following the tensorflow "Text Generation" example
 

In [2]:
import tensorflow as tf
import numpy as np

In [3]:
tf.executing_eagerly()

True

In [4]:
tf.test.is_gpu_available()

False

### Loading the text file

In [5]:
# Remove the headers from the text file before reading it in
with open('pg2265_noheader.txt', 'rt', encoding = 'utf-8') as f:
    text = f.read()

In [6]:
len(text)

162850

In [7]:
text[0:23]

'The Tragedie of Hamlet\n'

##### Number of unique characters

In [8]:
char_set = sorted(set(text))
char_set[:10]

['\n', ' ', '!', '&', "'", '(', ')', ',', '-', '.']

In [9]:
num_classes = len(char_set)
num_classes

65

In [10]:
char2int = {ch:i for i, ch in enumerate(char_set)}
char2int

{'\n': 0,
 ' ': 1,
 '!': 2,
 '&': 3,
 "'": 4,
 '(': 5,
 ')': 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '1': 10,
 ':': 11,
 ';': 12,
 '?': 13,
 'A': 14,
 'B': 15,
 'C': 16,
 'D': 17,
 'E': 18,
 'F': 19,
 'G': 20,
 'H': 21,
 'I': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'V': 33,
 'W': 34,
 'Y': 35,
 'Z': 36,
 '[': 37,
 ']': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [11]:
# or {i:ch for i, ch in enumerate(char_set)}
# or int2char = dict(enumerate(char_set))
# or simply this
int2char_np = np.array(char_set)
int2char_np

array(['\n', ' ', '!', '&', "'", '(', ')', ',', '-', '.', '1', ':', ';',
       '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'Z', '[', ']',
       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
      dtype='<U1')

#### Converting the text to integers

In [12]:
text_int = [char2int[x] for x in text]

In [13]:
len(text_int)

162850

##### Creating an array with dtype uint8. If the text has more than 256 unique characters, a wider integer dtype is needed

In [14]:
text_int = np.array(text_int, dtype = np.uint8)

### Creating X and Y data for training

In [15]:
# The number of time steps for training
seq_length = 100

##### Using tensorflow dataset class for fast processing

In [16]:
text_int_tensor = tf.data.Dataset.from_tensor_slices(text_int)
for i in text_int_tensor.take(10):
    print(int2char_np[i.numpy()])

T
h
e
 
T
r
a
g
e
d


In [17]:
sequenced_batch = text_int_tensor.batch(batch_size=seq_length+1, drop_remainder=True)

In [18]:
for i in sequenced_batch.take(2):
    print(repr(''.join(int2char_np[i.numpy()])))

'The Tragedie of Hamlet\n\nActus Primus. Scoena Prima.\n\nEnter Barnardo and Francisco two Centinels.\n\n  B'
"arnardo. Who's there?\n  Fran. Nay answer me: Stand & vnfold\nyour selfe\n\n   Bar. Long liue the King\n\n "


In [19]:
def X_Y_batches(batch):
    """Split one sequence into an (input, target) pair shifted by one position.

    The input is everything except the last element and the target is
    everything except the first, so target[k] is the element that follows
    input[k] in the original sequence.
    """
    return batch[:-1], batch[1:]

In [20]:
dataset = sequenced_batch.map(X_Y_batches)

In [21]:
for x_ts, y_ts in dataset.take(1):
    print(repr(''.join(int2char_np[x_ts.numpy()])))
    print(repr(''.join(int2char_np[y_ts.numpy()])))

'The Tragedie of Hamlet\n\nActus Primus. Scoena Prima.\n\nEnter Barnardo and Francisco two Centinels.\n\n  '
'he Tragedie of Hamlet\n\nActus Primus. Scoena Prima.\n\nEnter Barnardo and Francisco two Centinels.\n\n  B'


In [22]:
batch_size = 64

In [23]:
tf.random.set_seed(seed = 100)
dataset = dataset.shuffle(buffer_size=10000).batch(batch_size = batch_size, drop_remainder = True)

In [24]:
# Number of batches produced by one full pass over the dataset
# (take(-1) takes every element).
count = sum(1 for _ in dataset.take(-1))
count

25

In [25]:
# Materialise every (input, target) batch of the dataset as numpy arrays.
train_x, train_y = [], []
for batch_inputs, batch_targets in dataset.take(-1):
    train_x.append(batch_inputs.numpy())
    train_y.append(batch_targets.numpy())

##### Converting to numpy array

In [26]:
train_x = np.array(train_x)
train_y = np.array(train_y)

In [27]:
train_x.shape

(25, 64, 100)

##### One-hot encoding the y data only (the x data is one-hot encoded inside the model by a custom layer)

In [28]:
cat_train_y = tf.reshape(train_y, [-1])

In [29]:
cat_train_y.shape

TensorShape([160000])

In [30]:
cat_train_y = tf.one_hot(cat_train_y, depth=num_classes, axis = -1)

In [31]:
cat_train_y.shape

TensorShape([160000, 65])

In [32]:
# Restore train_y's original dimensions and append the class axis. The depth
# comes from num_classes so it cannot drift from the one-hot encoding above.
cat_train_y = tf.reshape(cat_train_y, shape = list(train_y.shape) + [num_classes])

In [33]:
cat_train_y.shape

TensorShape([25, 64, 100, 65])

###### Cross-checking 

In [34]:
rand_int = np.random.randint(low = 0, high=25, size = (2,))
rand_int

array([ 4, 12])

In [35]:
print(''.join(int2char_np[train_x[3,8]]))

: It harrowes me with fear & wonder
  Barn. It would be spoke too

   Mar. Question it Horatio

   H


In [36]:
print(''.join(int2char_np[tf.argmax(cat_train_y[3,8], axis = 1).numpy()]))

 It harrowes me with fear & wonder
  Barn. It would be spoke too

   Mar. Question it Horatio

   Ho


### Creating a layer to one-hot encode the input data (a Lambda layer could also be used)

In [37]:
class categorical_layer(tf.keras.layers.Layer):
    """Layer that one-hot encodes integer class indices.

    Args:
        num_classes: size of the one-hot axis appended to the input.
        **kwargs: standard Keras ``Layer`` keyword arguments (e.g. ``name``),
            forwarded unchanged to the base class.
    """
    def __init__(self, num_classes, **kwargs):
        super(categorical_layer, self).__init__(**kwargs)
        self.num_classes = num_classes

    def call(self, input_):
        # Output is float32 with one extra trailing axis of size num_classes.
        return tf.one_hot(input_, depth=self.num_classes, dtype=tf.float32)

    def get_config(self):
        """Return the constructor arguments so the layer can be rebuilt from its config."""
        config = super(categorical_layer, self).get_config()
        config['num_classes'] = self.num_classes
        return config

###### Testing

In [38]:
# Shape smoke test: one-hot layer followed by a Dense layer. The Dense width is
# num_classes so the output lines up with the one-hot targets.
test_model = tf.keras.Sequential([tf.keras.Input(shape=(seq_length,), batch_size=batch_size, dtype=np.uint8),
                                  categorical_layer(num_classes=num_classes),
                                  tf.keras.layers.Dense(units = num_classes, activation = None),
                                  ])

In [39]:
output =  test_model(train_x[0])
output.shape

TensorShape([64, 100, 65])

In [40]:
cat_train_y[0].shape

TensorShape([64, 100, 65])

###### End Testing

### Building RNN

In [41]:
def model_fn(batchsize = 64, lstm_units = 64, dropout = 0.5, stateful = True):
    """Assemble the character model: one-hot encoding -> LSTM -> Dense logits.

    batchsize  : fixed batch size given to the input layer
    lstm_units : number of units in the LSTM layer
    dropout    : dropout rate passed to the LSTM layer
    stateful   : forwarded to the LSTM layer's ``stateful`` argument

    The sequence length and the number of output classes are read from the
    notebook-level ``seq_length`` and ``num_classes``. The final Dense layer
    has no activation, so the model returns raw logits.
    """
    # Layers are instantiated in stacking order.
    inputs = tf.keras.Input(shape = (seq_length,), batch_size = batchsize, dtype = np.uint8)
    one_hot = categorical_layer(num_classes = num_classes)
    recurrent = tf.keras.layers.LSTM(
        units = lstm_units,
        activation = 'tanh',
        return_sequences = True,
        stateful = stateful,
        dropout = dropout,
        recurrent_initializer = 'glorot_uniform',
    )
    logits = tf.keras.layers.Dense(units = num_classes, activation = None)
    return tf.keras.Sequential([inputs, one_hot, recurrent, logits])

In [42]:
lstm1_units = 1024
dropout1 = 0
stateful_ = True
model =  model_fn(batch_size, lstm1_units, dropout1, stateful_)

In [43]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
categorical_layer_1 (categor (64, 100, 65)             0         
_________________________________________________________________
unified_lstm (UnifiedLSTM)   (64, 100, 1024)           4464640   
_________________________________________________________________
dense_1 (Dense)              (64, 100, 65)             66625     
Total params: 4,531,265
Trainable params: 4,531,265
Non-trainable params: 0
_________________________________________________________________


In [44]:
def loss(labels, logits):
    """Categorical cross-entropy between one-hot labels and raw model outputs."""
    # from_logits=True because the model's last Dense layer has no activation.
    xent = tf.keras.losses.categorical_crossentropy(labels, logits, from_logits=True)
    return xent

In [45]:
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001), loss = loss, metrics = [tf.keras.metrics.categorical_accuracy])

#### Checkpoints

In [46]:
import os

In [47]:
checkpoint_dir = './checkpoint_dir'

In [48]:
# Create the checkpoint directory if it is missing; exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race.
os.makedirs(checkpoint_dir, exist_ok=True)

In [49]:
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

In [50]:
checkpoint_callback =  tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

### Fitting

In [51]:
batches = train_x.shape[0]
def data_generator(epochs):
    """Yield (input batch, one-hot target batch) pairs for ``epochs`` full passes.

    Each pass walks batch indices 0 .. batches-1 in order, reading from the
    notebook-level ``train_x`` and ``cat_train_y``. The generator is exhausted
    after ``epochs * batches`` items.
    """
    for _epoch in range(epochs):
        yield from ((train_x[batch_idx], cat_train_y[batch_idx]) for batch_idx in range(batches))

In [52]:
# Epoch range for this training run.
initial_epoch = 0
final_epoch = 100
# data_generator yields (final_epoch - initial_epoch) * batches items in total,
# i.e. one item per training step requested below.
data_gen = data_generator(final_epoch-initial_epoch)
# checkpoint_callback (save_weights_only=True) writes weight checkpoints into checkpoint_dir.
# NOTE(review): `epochs` is presumably the index of the final epoch rather than a count of
# epochs to run, which is why it is final_epoch and not the difference - confirm in the Keras docs.
history = model.fit_generator(data_gen, steps_per_epoch=batches, epochs=final_epoch, callbacks=[checkpoint_callback],
                               shuffle=False, initial_epoch=initial_epoch)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Prediction function 1 using tf.argmax 

In [53]:
def pred_fn_1(input_text = 'ABC', text_len = 1000):
    """Generate text greedily: every next character is the argmax of the logits.

    A fresh batch-size-1 model is built with ``model_fn`` and loaded from the
    latest checkpoint in ``checkpoint_dir``. The seed text is fed in a single
    call; after that each predicted character is fed back as the next
    one-step input.

    Args:
        input_text: seed string; every character must be a key of ``char2int``.
        text_len: number of characters to generate after the seed.

    Returns:
        None. The seed followed by the generated characters is printed. If the
        seed is empty or contains an unknown character, a message is printed
        and nothing is generated.

    Raises:
        FileNotFoundError: if ``checkpoint_dir`` contains no checkpoint.
    """
    # An empty seed would hand the model a zero-length sequence, leaving no
    # last time step to read the prediction from.
    if not input_text:
        print('input_text must contain at least one character')
        return

    # Checking if the input_text characters are in the character set
    # (dict lookup rather than scanning the char_set list).
    for ch in input_text:
        if ch not in char2int:
            print('%s character not in the text' %ch)
            return

    # Convert input text to integer codes
    input_seq = [char2int[ch] for ch in input_text]

    # Fail with a clear message instead of passing None to load_weights.
    latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_ckpt is None:
        raise FileNotFoundError('No checkpoint found in %s' % checkpoint_dir)

    # Create a new batch-size-1 model and load the trained weights. The LSTM
    # width comes from lstm1_units so it always matches the trained model.
    model = model_fn(batchsize=1, lstm_units=lstm1_units, dropout=0, stateful=True)
    model.load_weights(latest_ckpt)
    model.build(input_shape=(1, None))
    model.reset_states()

    # First call sees the whole seed; later calls see one character each.
    generated_seq = tf.expand_dims(input_seq, axis = 0)

    for _ in range(text_len):

        prediction = model(generated_seq)

        # Removing the batch dimension
        prediction = tf.squeeze(prediction, axis = 0)

        # Greedy choice from the logits of the last time step
        prediction =  tf.argmax(prediction[-1]).numpy()

        generated_seq = tf.expand_dims([prediction], axis=0)

        input_seq.append(prediction)

    print('Output: \n' + ''.join(int2char_np[input_seq]))

In [54]:
pred_fn_1('Barn:', 1000)

Output: 
Barn: This the chaple good:
It ouchmest he dad beere the woold goofong this is Pouther,
Cous ffrme, will man  am inder'd e heare,
That Ile sot wnowe vsow mer Fayes bo he owee, And comle gines our State

   Qu. Ohrech ase this such thous se thes what you will speake will geee your gooners serse,
Now where the concruffor drawne heare fore,
as he is heare and alloue vp the colsequonce:
It wronco heare a beaterouere it selle in a brine of ity steeme,
And it ar henching that this d and the will be the

   Laer. Good God some Laty, this mad goed to Heauen,
A blanke tham it wo at whis it feree ouchiss, and themer Possenoted

   Qu. That the Scull cranse and make of you

   Ham. Nor Hamlet of the Marthes, beaues.

   Osh. I chall ob your portandesce.
Whereis our Seauen,
The Compustion crads haue kell mo Hor at thuse be anchellyoungge, Ofreaine.
Oh that a Rogee and Sonder and Marneryes Mayes the Dengerous

   Ham. How dong the Que net stand an  no, whe Loue of I at his heales:
So fasie

### Prediction function 2 using tf.random.categorical

In [57]:
def pred_fn_2(input_text = 'ABC', text_len = 1000, predict_factor = 1 ):
    """Generate text by sampling each next character with tf.random.categorical.

    A fresh batch-size-1 model is built with ``model_fn`` and loaded from the
    latest checkpoint in ``checkpoint_dir``. The seed text is fed in a single
    call; after that each sampled character is fed back as the next one-step
    input.

    Args:
        input_text: seed string; every character must be a key of ``char2int``.
        text_len: number of characters to generate after the seed.
        predict_factor: positive number the logits are divided by before
            sampling (following the TensorFlow text generation example).
            Higher values give less predictable text, lower values more
            predictable text.

    Returns:
        None. The seed followed by the generated characters is printed. If the
        seed is empty or contains an unknown character, a message is printed
        and nothing is generated.

    Raises:
        ValueError: if ``predict_factor`` is not greater than zero.
        FileNotFoundError: if ``checkpoint_dir`` contains no checkpoint.
    """
    # Dividing the logits by zero or a negative number would produce invalid
    # or inverted sampling weights.
    if predict_factor <= 0:
        raise ValueError('predict_factor must be > 0, got %r' % (predict_factor,))

    # An empty seed would hand the model a zero-length sequence, leaving no
    # last time step to sample from.
    if not input_text:
        print('input_text must contain at least one character')
        return

    # Checking if the input_text characters are in the character set
    # (dict lookup rather than scanning the char_set list).
    for ch in input_text:
        if ch not in char2int:
            print('%s character not in the text' %ch)
            return

    # Convert input text to integer codes
    input_seq = [char2int[ch] for ch in input_text]

    # Fail with a clear message instead of passing None to load_weights.
    latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_ckpt is None:
        raise FileNotFoundError('No checkpoint found in %s' % checkpoint_dir)

    # Create a new batch-size-1 model and load the trained weights. The LSTM
    # width comes from lstm1_units so it always matches the trained model.
    model = model_fn(batchsize=1, lstm_units=lstm1_units, dropout=0, stateful=True)
    model.load_weights(latest_ckpt)
    model.build(tf.TensorShape([1,None]))
    model.reset_states()

    # First call sees the whole seed; later calls see one character each.
    generated_seq = tf.expand_dims(input_seq, axis = 0)

    for _ in range(text_len):

        # Logits of the last time step, scaled by the caller's predict_factor
        # (the argument is honoured here rather than being reset to 1).
        last_logits = tf.squeeze(model(generated_seq), axis = 0)[-1] / predict_factor

        # tf.random.categorical expects a 2-D [batch, classes] tensor of logits.
        sampled = tf.random.categorical(tf.expand_dims(last_logits, axis=0), num_samples=1)
        prediction = tf.squeeze(sampled, axis=0).numpy()[0]

        generated_seq = tf.expand_dims([prediction], axis=0)

        input_seq.append(prediction)

    print('Output: \n' + ''.join(int2char_np[input_seq]))

In [58]:
pred_fn_2('Barn:', 1000)

Output: 
Barn: Nor mis Ser ione you tey no more dishate to keele:
Sue not sengounncred man sercaitut vno my shele
Hable Vortuen, whing thou distole Mo that Delmarknot This Courtelle?
I hay dabe delieur the Surion, in I theate
Of Hat are leat vnow some good yot seeme his is and to deere are all

   goon. Haue you my so hanr that? a Maidy your Moriur

   Ham. Haw chante for ot be ceafe

   Ham. I me gles of him: that sane the seening  o leace times s
 on mer Sppasit shell seepenour lood,
ie wath dese that Sealoue in the besore of his

   Ham. You beane th s me Lore, he what no serucheane the Monne owne Mosters, in RenilTang.
You muct amparring of hir Loue, addidda turth; and wielles it of thee,
But in the with s ce
s good Leed, whare as the negitious effells, the Pray remember mels

   Ham. She porsue. There's a stlagge. This is the corsunere Ilaindeyot stowhal'd,
At bucken their desioues, Spare. I houe the Ploeruly PlferFe
Detisle then, Nathmar nownage in't,
Trea't it plas'st must my Fo