In [1]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys

Using TensorFlow backend.


In [8]:
# Fetch the Nietzsche corpus through Keras' get_file helper and lowercase it.
path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Read inside a context manager so the file handle is closed as soon as the
# text is loaded instead of being left open until garbage collection.
with open(path) as f:
    text = f.read().lower()
print('corpus length:', len(text))

corpus length: 600901


In [9]:
# Vocabulary: every distinct character of the corpus, in sorted order, plus
# the two lookup tables (character -> index and index -> character).
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

total chars: 59


In [59]:
# Slice the corpus into overlapping windows of `maxlen` characters, one
# window every `step` characters.  For each window, also record the single
# character that immediately follows it (the prediction target).
maxlen = 40
step = 3
sentences = [text[start: start + maxlen]
             for start in range(0, len(text) - maxlen, step)]
next_chars = [text[start + maxlen]
              for start in range(0, len(text) - maxlen, step)]
print('nb sequences:', len(sentences))

nb sequences: 200287


In [65]:
# Show a handful of (window, following character) training pairs.
for window, following in zip(sentences[5:10], next_chars[5:10]):
    print("".join(["sent : ", str(window), "  last_word:  ", str(following)]))

sent : sing that truth is a woman--what then? i  last_word:  s
sent : g that truth is a woman--what then? is t  last_word:  h
sent : hat truth is a woman--what then? is ther  last_word:  e
sent :  truth is a woman--what then? is there n  last_word:  o
sent : uth is a woman--what then? is there not   last_word:  g


In [56]:
# One-hot encode the training data.
#   X[i, t, k] is True when character t of window i is vocabulary entry k.
#   y[i, k]    is True when the character following window i is entry k.
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, while `bool` yields the same boolean
# arrays on every NumPy version.
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [58]:
# Display the shapes of the encoded inputs and targets.
(X.shape, y.shape)

((200287, 40, 59), (200287, 59))

In [14]:
# Model: one 128-unit LSTM reading (maxlen, vocabulary-size) one-hot windows,
# followed by a dense layer with one output per vocabulary entry and a
# softmax that turns those outputs into a distribution over the next character.
print('Build model...')
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])

optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

Build model...


In [None]:
# Fit the model on the one-hot windows X and next-character targets y, using
# mini-batches of 128 samples (the cell output shows a single epoch, "Epoch 1/1").
model.fit(X, y, batch_size=128, nb_epoch=1)

Epoch 1/1


<keras.callbacks.History at 0x7f22bc32ef90>

In [17]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature reweighting.

    The probabilities are reweighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn from that distribution.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Reweighting factor. Values below 1 sharpen the
            distribution, values above 1 flatten it.  ``0`` is treated as the
            limiting case and returns the most probable index without any
            random draw.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Dividing by zero would only produce inf/nan; the limit of the
        # reweighting as temperature -> 0 puts all mass on the mode.
        return np.argmax(preds)
    # log(0) == -inf is the desired value for zero-probability entries (exp
    # maps it back to 0), so the divide-by-zero warning is just noise.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0.  This leaves the normalised result
    # mathematically unchanged but keeps exp() from underflowing every entry
    # to 0 (-> 0/0 = nan) when the temperature is very small.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [27]:
# Pick a random offset for the seed window; the upper bound leaves room for
# `maxlen` characters plus one more after the offset.
start_index = random.randrange(0, len(text) - maxlen)
start_index

285659

In [30]:
# Single-step walkthrough, part 1: choose a sampling temperature, take the
# seed window starting at `start_index`, echo it, and one-hot encode it as a
# batch containing one sequence.
diversity = 0.2
print('----- diversity:', diversity)
sentence = text[start_index: start_index + maxlen]
generated = sentence
print('----- Generating with seed: "' + sentence + '"')
sys.stdout.write(generated)
x = np.zeros((1, maxlen, len(chars)))
for t in range(len(sentence)):
    x[0, t, char_indices[sentence[t]]] = 1.

----- diversity: 0.2
----- Generating with seed: "thing but anonymous joy and sorrow, from"
thing but anonymous joy and sorrow, from

In [45]:
# Single-step walkthrough, part 2: predict the next-character distribution
# for the encoded window (first and only row of the batch), draw an index
# from it at the chosen temperature, and map the index back to a character.
preds = model.predict(x, verbose=0)[0]
next_index = sample(preds, temperature=diversity)
next_char = indices_char[next_index]
next_char

' '

In [48]:
# Single-step walkthrough, part 3: append the sampled character to the
# running output.  Each execution of this cell appends once more.
generated = generated + next_char
generated

'thing but anonymous joy and sorrow, from   '

In [50]:
# Single-step walkthrough, part 4: slide the window one character forward by
# dropping its first character and appending the sampled one.
sentence = ''.join([sentence[1:], next_char])
sentence

'ing but anonymous joy and sorrow, from  '

In [55]:
# Generate 400 characters from one random seed window at several sampling
# temperatures ("diversity").  The retraining loop that used to wrap this
# code (`for iteration in range(1, 60)` calling
# `model.fit(X, y, batch_size=128, nb_epoch=1)`) is disabled, so the model is
# used exactly as earlier cells left it.

start_index = random.randint(0, len(text) - maxlen - 1)

for diversity in (0.2, 0.5, 1.0, 1.2):
    print()
    print('----- diversity:', diversity)

    # Every temperature starts from the same seed window.
    sentence = text[start_index: start_index + maxlen]
    generated = sentence
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)

    for _ in range(400):
        # One-hot encode the current window as a batch containing one sequence.
        x = np.zeros((1, maxlen, len(chars)))
        x[0, np.arange(len(sentence)), [char_indices[c] for c in sentence]] = 1.

        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]

        # Slide the window forward by one character and extend the output.
        sentence = sentence[1:] + next_char
        generated += next_char

        # Stream each character as soon as it is sampled.
        sys.stdout.write(next_char)
        sys.stdout.flush()


----- diversity: 0.2
----- Generating with seed: "n. it is the master stroke of religions "
n. it is the master stroke of religions and is not the succession to the sain of the belief with the sain and and in the saing the really the profound and in the said and it is the sain and is and in the succession to the subjence of the supirition to the said and in the said and in the serving and in the sain and in the sain and and in the saing the sain and in the sain and in a sure and in the said and in the sain and in the sain of t
----- diversity: 0.5
----- Generating with seed: "n. it is the master stroke of religions "
n. it is the master stroke of religions must and seare and self compersible in the amout is not surcitions, and the sain, and something and been the secrifical the sassive many as always and is not to the contion to all sucrness that is the art former and posing to any indualing complay and former to compations and suspections to the revilon and in and of the sain end eve