In [1]:
# Downloading and parsing the initial text file 

import numpy as np

from tensorflow import keras

path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
text = open(path).read().lower()

print('Corpus length:', len(text))

Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt
Corpus length: 600893


In [2]:
# vectorizing sequence of characters

# Length of extracted character sequences
max_length = 60

# We sample a new sequence every `step` characters
step = 3

# This holds our extracted sequences
extract_sentences = []

# This holds the targets (the next characters)
nextCharacters = []

for i in range(0, len(text) - max_length, step):
    extract_sentences.append(text[i: i + max_length])
    nextCharacters.append(text[i + max_length])
print('Number of sequences:', len(extract_sentences))

# List of unique characters in the corpus
characters = sorted(list(set(text)))

print('Unique characters:', len(characters))

# Dictionary mapping unique characters to their index in `characters`
char_indices = dict((char, characters.index(char)) for char in characters)

# Next, one-hot encode the characters into binary arrays.
print('Vectorization...')
x = np.zeros((len(extract_sentences), max_length, len(characters)), dtype=np.bool)
y = np.zeros((len(extract_sentences), len(characters)), dtype=np.bool)
for i, sentence in enumerate(extract_sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[nextCharacters[i]]] = 1
print('...done')

Number of sequences: 200278
Unique characters: 57
Vectorization...


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


...done


In [3]:
# Building network
# Single-layer LSTM model for the next-character prediction

from tensorflow.keras import layers

model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(max_length, len(characters))))
model.add(layers.Dense(len(characters), activation='softmax'))

# Model compilation configuration
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer) # used one-hot encoded, use categorical_crossentropy as loss to train the model

model.summary()

2022-08-22 21:49:07.915102: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 128)               95232     
_________________________________________________________________
dense (Dense)                (None, 57)                7353      
Total params: 102,585
Trainable params: 102,585
Non-trainable params: 0
_________________________________________________________________


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Given a trained model and a seed text snippet, we generate new text by repeatedly:

1) Drawing from the model a probability distribution over the next character given the text available so far; 2) Reweighting the distribution to a certain "temperature"; 3) Sampling the next character at random according to the reweighted distribution; 4) Adding the new character at the end of the available text;

This is the code we use to reweight the original probability distribution coming out of the model, and draw a character index from it (the "sample_next_char function"):

In [4]:
# Training the language model and sampling from it
# function to sample the next character given the model's predictions

def sample_next_char(predictions, temperature=1.0):
    predictions = np.asarray(predictions).astype('float64')
    predictions = np.log(predictions) / temperature
    exp_preds = np.exp(predictions)
    predictions = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, predictions, 1)
    
    return np.argmax(probas)

In [5]:
# Text-generation loop
import random
import sys

NUM_EPOCHS = 60
CHAR_GENERATED_TEXT = 400 # We generate 400 characters

for epoch in range(1, NUM_EPOCHS):
    print('epoch', epoch)
    
    # Fit the model for 1 epoch on the available training data
    model.fit(x, y, batch_size=128, epochs=1)

    # Select a text seed at random
    start_index = random.randint(0, len(text) - max_length - 1)
    generated_text = text[start_index: start_index + max_length]
    print(f"--- Generating with seed: \"{generated_text}\"")

    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print(f"------ temperature: {temperature}")
        sys.stdout.write(generated_text)

        for i in range(CHAR_GENERATED_TEXT):
            sampled = np.zeros((1, max_length, len(characters)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            predictions = model.predict(sampled, verbose=0)[0]
            next_index = sample_next_char(predictions, temperature)
            next_char = characters[next_index]

            generated_text += next_char
            generated_text = generated_text[1:]

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

epoch 1


2022-08-22 21:49:09.248699: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


--- Generating with seed: " possible, as origin out of an intention; people were agreed"
------ temperature: 0.2
 possible, as origin out of an intention; people were agreed there is there is a self-insenting to the respines of the there is a something they is a something they all there is a soul in the selfess, they which he consection of the say there is a sense of the self-inclates of the self--in the self--there is there is a moral something they will to has in the self--not in the selfes they is a sensiciation of the self--in the self--and there is a self--in th
------ temperature: 0.5
ciation of the self--in the self--and there is a self--in they with the sendering--it is a life there of the pressises of the allowing, and must and therowy with there is is seams of there and mankind inforections, his life of thet as there as even himself and become we lown of the garts, himself of the mank in a call confursed of the for the presenting, to the has life in the sentition of the sense

  


ally the moral of philosopher heart, the germany things 
------ temperature: 1.0
e really the moral of philosopher heart, the germany things prevaile so has stouknay full
riche of lamere, has likeness, has one
which actually by all the depraining logical coniess to ever "the exceptions, has it
he
interestives, and mlase refined from us accompaniment of spoul manner
who
do necessary minding itself been you"se or his abvend hore. but a
list our
with is dasce in their europe is come.
hened historical opens, gives generation: wherecor imme
------ temperature: 1.2
ome.
hened historical opens, gives generation: wherecor immediasp such a neartmans fundamental divine : a case, no retars: something is we in
resgoquence good can mojaw eley palichss pruiss to
lage, whereforated by the superiwy of hpite reverness sime
consequenged: he gerwor literable
noblem of tale variable? smeevemorded i higher lofty t-unleversy, live
they have oneement: i loved", that man was bacube to to prowarters, consint, 