In [None]:
import tensorflow as tf

import numpy as np
import os
import time

In [None]:
# Fetch the corpus through the Keras download helper; `path_to_file` is the
# local path that is opened when the text is read.
path_to_file = tf.keras.utils.get_file(
    fname='origin.txt',
    origin='https://www.gutenberg.org/files/1228/1228-0.txt',
)

In [None]:
# Read the raw bytes, then decode them as UTF-8 (explicit decode was originally
# kept for py2 compat). Reading in binary mode means line endings are not
# translated, so '\r\n' pairs remain part of the text.
# The context manager guarantees the file handle is closed after the read.
with open(path_to_file, 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding='utf-8')
# Length of the text is the number of characters in it.
print('Length of text: {} characters'.format(len(text)))

Length of text: 969034 characters


In [None]:
# Preview the first 250 characters of the corpus.
print(text[:250])

﻿The Project Gutenberg EBook of On the Origin of Species, by Charles Darwin

This eBook is for the use of anyone anywhere in the United States and most
other parts of the world at no cost and with almost no restrictions
whatsoever.  You may copy 


In [None]:
# Every distinct character of the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()
print('{} unique characters'.format(len(vocab)))

97 unique characters


In [None]:
# Show the complete sorted vocabulary as the cell output.
vocab

['\n',
 '\r',
 ' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '°',
 'ä',
 'æ',
 'ë',
 'ö',
 'ü',
 '—',
 '‘',
 '’',
 '“',
 '”',
 '\ufeff']

In [None]:

# Lookup tables between characters and integer ids:
#   char2idx: character -> its position in `vocab`
#   idx2char: array indexed by id that gives the character back
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# The whole corpus encoded as one array of character ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [None]:

# Number of characters in a single model input.
seq_length = 100
# Each example consumes seq_length + 1 characters of the corpus.
examples_per_epoch = len(text) // (seq_length + 1)

# Dataset with one element per character id of the encoded corpus.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode and print the first five elements.
for char_id in char_dataset.take(5):
  print(idx2char[char_id.numpy()])

﻿
T
h
e
 


In [None]:
# Display the dataset's repr (element shapes and dtypes).
char_dataset

<TensorSliceDataset shapes: (), types: tf.int64>

In [None]:
# Group consecutive character ids into chunks of seq_length + 1 elements;
# drop_remainder=True discards a final chunk that would be shorter.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Decode the first five chunks back to text to inspect them.
for chunk_ids in sequences.take(5):
  chunk_text = ''.join(idx2char[chunk_ids.numpy()])
  print(repr(chunk_text))


'\ufeffThe Project Gutenberg EBook of On the Origin of Species, by Charles Darwin\r\n\r\nThis eBook is for the '
'use of anyone anywhere in the United States and most\r\nother parts of the world at no cost and with al'
'most no restrictions\r\nwhatsoever.  You may copy it, give it away or re-use it under the terms of\r\nthe'
' Project Gutenberg License included with this eBook or online at\r\nwww.gutenberg.org.  If you are not '
"located in the United States, you'll have\r\nto check the laws of the country where you are located bef"


In [None]:

def split_input_target(chunk):
  """Turn one chunk into an (input, target) pair shifted by one position.

  The input is every element of `chunk` except the last; the target is every
  element except the first, so target[i] is the element following input[i].

  Args:
    chunk: Any sliceable sequence.

  Returns:
    A tuple `(chunk[:-1], chunk[1:])`.
  """
  return chunk[:-1], chunk[1:]

# Apply the split to every chunk, producing (input, target) pairs.
dataset = sequences.map(split_input_target)

In [None]:

# Decode the first (input, target) pair to confirm the one-character shift.
for input_example, target_example in dataset.take(1):
  decoded_input = ''.join(idx2char[input_example.numpy()])
  decoded_target = ''.join(idx2char[target_example.numpy()])
  print('Input data: ', repr(decoded_input))
  print('Target data:', repr(decoded_target))

Input data:  '\ufeffThe Project Gutenberg EBook of On the Origin of Species, by Charles Darwin\r\n\r\nThis eBook is for the'
Target data: 'The Project Gutenberg EBook of On the Origin of Species, by Charles Darwin\r\n\r\nThis eBook is for the '


In [None]:

# Batch size
BATCH_SIZE = 128


# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` instead of rebinding `dataset`
# from itself: re-running this cell then yields the same pipeline rather than
# batching data that has already been batched.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((128, 100), (128, 100)), types: (tf.int64, tf.int64)>

In [None]:
# Length of the vocabulary in chars (number of distinct characters in the corpus)
vocab_size = len(vocab)

# The embedding dimension (width of the vector each character id is mapped to)
embedding_dim = 256

# Number of RNN units (used as the unit count of each LSTM layer in build_model)
rnn_units = 1024

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level model as a Keras Sequential stack.

  Layer order: Embedding -> Dropout(0.2) -> stateful LSTM -> Dropout(0.2)
  -> stateful LSTM -> Dropout(0.2) -> Dense(300) -> Dense(vocab_size).

  Args:
    vocab_size: Number of distinct character ids; sets the embedding input
      size and the width of the final Dense layer.
    embedding_dim: Width of the embedding vectors.
    rnn_units: Number of units in each of the two LSTM layers.
    batch_size: Fixed batch dimension declared on the first layer (both LSTM
      layers are created with stateful=True).

  Returns:
    The uncompiled tf.keras.Sequential model.
  """
  layers = tf.keras.layers

  def stateful_lstm():
    # Both recurrent layers use exactly the same configuration.
    return layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')

  stack = [
      layers.Embedding(
          vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
      layers.Dropout(0.2),
      stateful_lstm(),
      layers.Dropout(0.2),
      stateful_lstm(),
      layers.Dropout(0.2),
      # NOTE(review): no activation is passed to Dense(300), so it feeds the
      # output layer without a non-linearity in between -- confirm intended.
      layers.Dense(300),
      layers.Dense(vocab_size),
  ]
  return tf.keras.Sequential(stack)

In [None]:

# Training model: built with the hyper-parameters defined above and a fixed
# batch dimension of BATCH_SIZE. `vocab_size` is reused here (instead of
# recomputing len(vocab)) so this call matches the inference-time rebuild.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

In [None]:
# Run one batch through the untrained model to check the output shape.
for example_batch in dataset.take(1):
  input_example_batch, target_example_batch = example_batch
  example_batch_predictions = model(input_example_batch)
  print(
      example_batch_predictions.shape,
      "# (batch_size, sequence_length, vocab_size)")

(128, 100, 97) # (batch_size, sequence_length, vocab_size)


In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (128, None, 256)          24832     
_________________________________________________________________
dropout_21 (Dropout)         (128, None, 256)          0         
_________________________________________________________________
lstm_14 (LSTM)               (128, None, 1024)         5246976   
_________________________________________________________________
dropout_22 (Dropout)         (128, None, 1024)         0         
_________________________________________________________________
lstm_15 (LSTM)               (128, None, 1024)         8392704   
_________________________________________________________________
dropout_23 (Dropout)         (128, None, 1024)         0         
_________________________________________________________________
dense_14 (Dense)             (128, None, 300)         

In [None]:
# Sample one character id per timestep from the logits of the first example
# in the batch, then drop the trailing sample axis and convert to NumPy.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()
# This gives us, at each timestep, a prediction of the next character index:
sampled_indices

array([80, 55, 89, 79, 13, 86, 79, 51, 16, 75, 60, 32, 63, 90, 91, 58, 68,
       94, 65, 31, 62, 80, 58, 79, 36, 43, 88, 58, 42, 40, 60, 23, 82, 59,
       35, 39, 52, 43, 36, 92, 26, 51, 57, 96, 22, 53, 78, 58, 87, 70, 28,
       70, 61, 18, 52,  5, 31,  5, 53, 38, 86, 20, 32, 52, 96, 30, 79, 50,
       78, 21, 20,  4, 54, 72, 31, 50, 55, 66, 96, 93, 31, 19, 95, 16, 74,
       28, 32, 14, 47, 87, 64, 27,  5,  6,  8, 89, 61,  3, 57, 10])

In [None]:
def loss(labels, logits):
  """Sparse categorical cross-entropy computed on raw logits.

  Args:
    labels: Integer target ids.
    logits: Model outputs; passed with from_logits=True, i.e. they are
      treated as unnormalised scores.

  Returns:
    The unreduced loss tensor produced by
    tf.keras.losses.sparse_categorical_crossentropy.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels, y_pred=logits, from_logits=True)

# Evaluate the loss on the example batch; the mean is taken only for display.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())


Prediction shape:  (128, 100, 97)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.573486


In [None]:

# Configure training: Adam optimizer, the custom `loss` function, and
# accuracy reported as an additional metric.
model.compile(optimizer='adam', loss=loss, metrics=["accuracy"])

In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; the "{epoch}" placeholder is left for the
# checkpoint callback to fill in.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the model weights under the prefix above.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

# Stop training once the monitored metric has not improved for `patience`
# epochs. Training is run without validation data, so 'val_loss' is never
# reported; the training loss is monitored instead.
# NOTE: a callback only has an effect when it is listed in the `callbacks`
# argument of model.fit.
patience = 10
early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=patience)


In [None]:

# Number of passes over the training dataset.
EPOCHS = 50

# NOTE(review): only the checkpoint callback is registered for this run.
history = model.fit(
    x=dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:

# Look the newest checkpoint up once and fail with a clear message if training
# has not produced one (the lookup returns None in that case).
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
  raise FileNotFoundError(
      'No checkpoint found in {!r}; run the training cell first.'.format(
          checkpoint_dir))

# Rebuild the same architecture with a batch size of 1 for text generation,
# then restore the trained weights into it.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (1, None, 256)            24832     
_________________________________________________________________
dropout_24 (Dropout)         (1, None, 256)            0         
_________________________________________________________________
lstm_16 (LSTM)               (1, None, 1024)           5246976   
_________________________________________________________________
dropout_25 (Dropout)         (1, None, 1024)           0         
_________________________________________________________________
lstm_17 (LSTM)               (1, None, 1024)           8392704   
_________________________________________________________________
dropout_26 (Dropout)         (1, None, 1024)           0         
_________________________________________________________________
dense_16 (Dense)             (1, None, 300)           

In [None]:
def generate_text(model, start_string, num_generate=4, temperature=1.0):
  """Generate text by sampling one character at a time from `model`.

  Evaluation step (generating text using the learned model). The seed string
  is fed to the model first; afterwards each sampled character becomes the
  next input, while the model keeps its recurrent state between calls.

  Args:
    model: The model to sample from. It is called with a batch of size 1.
    start_string: Non-empty seed text. Every character must be a key of
      `char2idx`.
    num_generate: Number of characters to generate. Defaults to 4, the value
      that used to be hard-coded.
    temperature: Divisor applied to the logits before sampling. Low
      temperatures result in more predictable text, higher temperatures in
      more surprising text. Defaults to 1.0 (logits used as-is).

  Returns:
    `start_string` followed by the `num_generate` sampled characters.

  Raises:
    ValueError: If `start_string` is empty or `temperature` is not positive.
    KeyError: If `start_string` contains a character missing from `char2idx`.
  """
  if not start_string:
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  # Converting our start string to numbers (vectorizing), then adding the
  # batch dimension.
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Characters sampled so far.
  text_generated = []

  # Here batch size == 1. Clear any recurrent state left from earlier calls.
  model.reset_states()
  for _ in range(num_generate):
    # Explicit inference mode, so the Dropout layers are inactive.
    predictions = model(input_eval, training=False)
    # Remove the batch dimension.
    predictions = tf.squeeze(predictions, 0)

    # Use a categorical distribution over the scaled logits and keep the
    # sample drawn for the last timestep.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # We pass the predicted character as the next input to the model
    # along with the previous hidden state.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [None]:
 
# Generate a continuation of the seed "evolu" with the restored model.
print(generate_text(model, start_string="evolu"))

evolution
