In [32]:
import numpy as np

### Preprocessing the Dataset

In [33]:
with open('1268-0.txt','r') as fp:
    text= fp.read()
start_indx= text.find('THE MYSTERIOUS ISLAND')
end_indx= text.find('End of the Project Gutenberg')
text= text[start_indx : end_indx]
char_set= set(text)
print('Total Length: ', len(text))

Total Length:  1112350


In [34]:
print('Unique Characters: ', len(char_set))

Unique Characters:  80


In [35]:
chars_sorted= sorted(char_set)
char2int={ch: i for i,ch in enumerate(chars_sorted)}
char_array= np.array(chars_sorted)
text_encoded= np.array([char2int[ch] for ch in text], dtype= np.int32)
print('Text encoded shape: ', text_encoded.shape)

Text encoded shape:  (1112350,)


In [36]:
print(text[:15],'== Encoding ==>',text_encoded[:15])
print(text_encoded[15:21],'== Reverse ==>',''.join(char_array[text_encoded[15:21]]))

THE MYSTERIOUS  == Encoding ==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28] == Reverse ==> ISLAND


In [37]:
import tensorflow as tf
ds_text_encoded= tf.data.Dataset.from_tensor_slices(text_encoded)
for ex in ds_text_encoded.take(5):
    print('{} -> {}'.format(ex.numpy(), char_array[ex.numpy()]))

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


In [38]:
seq_length= 40
chunk_size= seq_length+1
ds_chunks= ds_text_encoded.batch(chunk_size, drop_remainder=True)

#define the function for splitting x and y
def split_input_target(chunk):
    input_seq= chunk[:-1]
    target_seq= chunk[1:]
    return input_seq, target_seq

ds_sequences= ds_chunks.map(split_input_target)

In [39]:
for example in ds_sequences.take(2):
    print('Input (x): ', repr(''.join(char_array[example[0].numpy()])))
    print('Input (y): ', repr(''.join(char_array[example[1].numpy()])))
    print()

Input (x):  'THE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced b'
Input (y):  'HE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced by'

Input (x):  ' Anthony Matonak, and Trevor Carlson\n\n\n\n'
Input (y):  'Anthony Matonak, and Trevor Carlson\n\n\n\n\n'



In [40]:
BATCH_SIZE=64
BUFFER_SIZE= 10000
ds= ds_sequences.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

### Building a character-level RNN model

In [41]:
def build_model(vocab_size, embedding_dim, rnn_units):
    model= tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

In [42]:
#Setting the training parameters
charset_size= len(char_array)
embedding_dim= 256
rnn_units= 512
tf.random.set_seed(1)
model= build_model(vocab_size=charset_size, embedding_dim= embedding_dim, rnn_units= rnn_units)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 256)         20480     
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 512)         1574912   
_________________________________________________________________
dense_1 (Dense)              (None, None, 80)          41040     
Total params: 1,636,432
Trainable params: 1,636,432
Non-trainable params: 0
_________________________________________________________________


In [43]:
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

In [44]:
model.fit(ds,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x118122650>

### Evaluation phase

In [45]:
tf.random.set_seed(1)
logits= [[1.0,1.0,1.0]]
print('Probabilities:', tf.math.softmax(logits).numpy()[0])

Probabilities: [0.33333334 0.33333334 0.33333334]


In [46]:
samples= tf.random.categorical(logits=logits, num_samples=10)
tf.print(samples.numpy())

array([[0, 0, 1, 2, 0, 0, 0, 0, 1, 0]])


In [47]:
tf.random.set_seed(1)
logits= [[1.0,1.0,3.0]]
print('Probabilities:', tf.math.softmax(logits).numpy()[0])

Probabilities: [0.10650698 0.10650698 0.78698605]


In [48]:
samples= tf.random.categorical(logits=logits, num_samples=10)
tf.print(samples.numpy())

array([[2, 0, 2, 2, 2, 0, 1, 2, 2, 0]])


In [49]:
def sample(model, starting_str, len_generated_text=500,max_input_length=40, scale_factor=1.0):
    encoded_input= [char2int[s] for s in starting_str]
    encoded_input= tf.reshape(encoded_input, (1,-1))
    generated_str= starting_str
    model.reset_states()
    for i in range(len_generated_text):
        logits= model(encoded_input)
        logits= tf.squeeze(logits,0)
        scaled_logits= logits*scale_factor
        new_char_indx= tf.random.categorical(scaled_logits, num_samples=1)
        new_char_indx= tf.squeeze(new_char_indx)[-1].numpy()
        generated_str += str(char_array[new_char_indx])
        new_char_indx= tf.expand_dims([new_char_indx],0)
        encoded_input= tf.concat([encoded_input, new_char_indx], axis=1)
        encoded_input= encoded_input[:,-max_input_length:]
    return generated_str

In [50]:
tf.random.set_seed(1)
print(sample(model,starting_str='The island'))

The island is our boat was does that the unknown part of her in the chest, through the
waters of the brig, and that doubtless, made rather in the dark
aside.

“Top? Are the car, as soon still it hastened the boat, blew to bark on board Union here, not to go do nothing could go and partly
him, and it was impossible to add possibility, and that completular
will
be looked, without it do.”

“No,” said Cyrus Harding.

“I think that is, or store, all did rivery! And I think that clothed out of which I believe i


In [51]:
logits= np.array([1.0,1.0,3.0])
print('Probabilities before scaling: ', tf.math.softmax(logits).numpy()[0])
print('Probabilities after scaling with 0.5: ',tf.math.softmax(0.5*logits).numpy()[0])
print('Probabilities after scaling with 0.1: ', tf.math.softmax(0.1*logits).numpy()[0])

Probabilities before scaling:  0.10650697891920076
Probabilities after scaling with 0.5:  0.21194155761708547
Probabilities after scaling with 0.1:  0.31042377345300565


In [52]:
tf.random.set_seed(1)
print(sample(model,starting_str='The island', scale_factor=2.0))

The island or the colonists were therefore to be going to a height of the poultry-yard the convicts were of nothing could not have been a part of the basaltic water appeared to be to search for a few minutes again in the danger of the captain, while the convicts would be descended the principal rapidly of a ship in the lake, and in the single creeks, and
the cart was suddenly in the bottom of the palisade. The destruction of the volcano, the captain examined the shore, the colonists were unable to restrai


In [53]:
tf.random.set_seed(1)
print(sample(model, starting_str='The island', scale_factor=0.5))

The island
happilude New Zoas! asdesonscerstocable Island
been ahrnocking somoth of oacthm arriked at
similar role?.
However, no5 caseen!
Glamn’s capa,
nests. evirtable pickaliar rupining; bears jomweked. Perboater, from: NI, Mrauctimb Captain Weftered,--Cpornel soon becomen; on Unlansivalcle climboding floruce
rogan,
for
in ma1 Marks supped severest” prawora,
partrwist,” requisix, happines, num
Cranclicly I liquel duedy fromaggesh, issuped aboublifuly two acce! fould treig mingle, with
Hondey ancapersfol
