In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, LSTM, Embedding

In [3]:
# Remote location of the corpus and the name it is stored under locally.
file_URL = "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt"
file_name = "shakespeare.txt"
# Fetch the file through Keras and keep the local path it hands back.
path = keras.utils.get_file(fname=file_name, origin=file_URL)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [4]:
# Read the whole corpus as bytes. The context manager closes the file handle
# as soon as the read is done instead of leaving it open until garbage
# collection.
with open(path, 'rb') as corpus_file:
    raw = corpus_file.read()
# Peek at a slice of the raw bytes.
print(raw[250:400])

b"\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it "


In [5]:
# Decode the raw bytes into a Python string (UTF-8).
text = raw.decode('utf-8')
# Same slice as the byte preview, now rendered as text.
print(text[250:400])


All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it 


In [6]:
# total number of characters in the decoded corpus
len(text)

1115394

In [7]:
# The vocabulary is every distinct character of the corpus, in sorted order,
# stored as a NumPy array so it can be indexed by integer id later on.
distinct_chars = set(text)
vocabulary = np.array(sorted(distinct_chars))
# vocabulary size
len(vocabulary)

65

In [8]:
# Map every character to its position in the sorted vocabulary.
tokenizer = dict(zip(vocabulary, range(len(vocabulary))))

In [9]:
# Show the first 20 characters next to their integer ids.
# Slicing (rather than indexing with range(20)) also copes with a vocabulary
# that has fewer than 20 entries.
for char in vocabulary[:20]:
    token = tokenizer[char]
    # str() unwraps the NumPy string scalar so repr() yields a plain quoted
    # character; on NumPy >= 2.0 the scalar's own repr is np.str_('...').
    print('%4s : %4d' % (repr(str(char)), token))

'\n' :    0
 ' ' :    1
 '!' :    2
 '$' :    3
 '&' :    4
 "'" :    5
 ',' :    6
 '-' :    7
 '.' :    8
 '3' :    9
 ':' :   10
 ';' :   11
 '?' :   12
 'A' :   13
 'B' :   14
 'C' :   15
 'D' :   16
 'E' :   17
 'F' :   18
 'G' :   19


In [10]:
# Encode the whole corpus as an array of integer ids, one per character.
vector = np.array([tokenizer[ch] for ch in text])

# Print the first 100 characters and their ids, each under a heading and
# framed by ruler lines.
for heading, preview in (
    ('\nSample Text \n', text[:100]),
    ('\n\nCorresponding Integer Vector \n', vector[:100]),
):
    print(heading)
    print('-'*70)
    print(preview)
    print('-'*70)


Sample Text 

----------------------------------------------------------------------
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
----------------------------------------------------------------------


Corresponding Integer Vector 

----------------------------------------------------------------------
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]
----------------------------------------------------------------------


In [11]:
# Length (in characters) of each sequence cut from the corpus.
SEQ_LENGTH = 100
# Turn the integer vector into a stream of single-id tensors and group it
# into fixed-length sequences; a trailing partial sequence is dropped.
# Built as a single chain so `vector` stays the NumPy array: the cell can be
# re-run safely and the name keeps one meaning throughout the notebook.
sequences = (
    tf.data.Dataset.from_tensor_slices(vector)
    .batch(SEQ_LENGTH, drop_remainder=True)
)

In [12]:
def prepare_dataset(seq):
    """Split one sequence into an (input, target) pair shifted by one step.

    The input is `seq` without its last element and the target is `seq`
    without its first element, so target[i] is the element that follows
    input[i] in the original sequence.
    """
    return seq[:-1], seq[1:]
# apply the input/target split to every sequence
dataset = sequences.map(prepare_dataset) 

In [13]:
# Inspect the first (input, target) pair: raw ids first, then decoded text.
for inp, tar in dataset.take(1):
    print(inp.numpy(), tar.numpy(), sep='\n')
    # index the vocabulary with the ids to get the characters back
    inp_text = ''.join(vocabulary[inp])
    tar_text = ''.join(vocabulary[tar])
    print(repr(inp_text), repr(tar_text), sep='\n')

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53]
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59]
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYo'
'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'


In [14]:
AUTOTUNE = tf.data.AUTOTUNE
# Number of sequences per training batch. Named once so the batching and the
# steps-per-epoch computation below cannot drift apart.
BATCH_SIZE = 64
# Batch the pairs in corpus order, repeat indefinitely (the epoch length is
# set through STEPS_PER_EPOCH) and prefetch upcoming batches.
# NOTE(review): an earlier comment mentioned a buffer size of 10000, but no
# shuffle step is applied here - confirm whether shuffling was intended.
data = (
    dataset
    .batch(BATCH_SIZE, drop_remainder=True)
    .repeat()
    .prefetch(AUTOTUNE)
)
# steps per epoch is the number of full batches available
STEPS_PER_EPOCH = len(sequences) // BATCH_SIZE
# confirm the shape of one batch of inputs and targets
for inp, tar in data.take(1):
    print(inp.numpy().shape)
    print(tar.numpy().shape)

(64, 99)
(64, 99)


In [16]:
model = keras.Sequential([
    # Fixed batch size of 64 (the stateful LSTMs below need a known batch
    # size) and variable sequence length. Declared with keras.Input instead
    # of the `batch_input_shape` layer argument, which Keras 3 rejects.
    # The value must match the batch size of the data fed to the model.
    keras.Input(shape=(None,), batch_size=64, dtype='int32'),
    # Embed len(vocabulary) into 64 dimensions
    Embedding(len(vocabulary), 64),
    # LSTM RNN layers
    LSTM(512, return_sequences=True, stateful=True),
    LSTM(512, return_sequences=True, stateful=True),
    # Classification head: one logit per vocabulary entry at every time step
    Dense(len(vocabulary))
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 64)            4160      
                                                                 
 lstm (LSTM)                 (64, None, 512)           1181696   
                                                                 
 lstm_1 (LSTM)               (64, None, 512)           2099200   
                                                                 
 dense (Dense)               (64, None, 65)            33345     
                                                                 
Total params: 3,318,401
Trainable params: 3,318,401
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Sanity check: push one batch through the untrained model and compare the
# target shape with the prediction shape.
for example_inp, example_tar in data.take(1):
    example_pred = model(example_inp)
    print(example_tar.numpy().shape, example_pred.shape, sep='\n')

(64, 99)
(64, 99, 65)


In [18]:
# The Dense head outputs raw logits, so the loss is told to expect them.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn)
# `data` repeats forever; steps_per_epoch defines where each epoch ends.
history = model.fit(
    data,
    epochs=10,
    steps_per_epoch=STEPS_PER_EPOCH,
)

Epoch 1/10

KeyboardInterrupt: 