In [1]:
# Enable IPython's autoreload extension in mode 2 so that edited modules are
# re-imported automatically before each cell runs.
%reload_ext autoreload
%autoreload 2

In [2]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

Using TensorFlow backend.


In [25]:
from keras.utils import to_categorical
from IPython.core.debugger import set_trace

In [3]:
# Fetch the Nietzsche corpus via Keras' get_file helper and read it as
# lower-cased UTF-8 text.
corpus_url = 'https://s3.amazonaws.com/text-datasets/nietzsche.txt'
path = get_file('nietzsche.txt', origin=corpus_url)

with open(path, encoding='utf-8') as f:
    raw_text = f.read()
text = raw_text.lower()

print('corpus length:', len(text))

corpus length: 600893


In [4]:
# n_seq: number of characters in each input window fed to the network.
# n_text: total number of characters in the corpus.
n_seq = 40
n_text = len(text)

In [5]:
# Vocabulary: every distinct character of the corpus, in sorted order.
unique_chars = set(text)
chars = sorted(unique_chars)
n_chars = len(chars)
print(chars, n_chars)

['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'æ', 'é', 'ë'] 57


In [6]:
# c2i maps a character to its position in `chars`; i2c is the inverse lookup
# (indexing the sorted vocabulary list by position gives the character back).
c2i = dict(zip(chars, range(len(chars))))
i2c = chars
print(c2i)

{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, '0': 10, '1': 11, '2': 12, '3': 13, '4': 14, '5': 15, '6': 16, '7': 17, '8': 18, '9': 19, ':': 20, ';': 21, '=': 22, '?': 23, '[': 24, ']': 25, '_': 26, 'a': 27, 'b': 28, 'c': 29, 'd': 30, 'e': 31, 'f': 32, 'g': 33, 'h': 34, 'i': 35, 'j': 36, 'k': 37, 'l': 38, 'm': 39, 'n': 40, 'o': 41, 'p': 42, 'q': 43, 'r': 44, 's': 45, 't': 46, 'u': 47, 'v': 48, 'w': 49, 'x': 50, 'y': 51, 'z': 52, 'ä': 53, 'æ': 54, 'é': 55, 'ë': 56}


## Create Sentences

In [7]:
# Slide a window of n_seq characters over the corpus in steps of `stride`.
# Each window is one training input; the character right after it is the target.
# NOTE(review): the `- 1` in the stop value excludes the start index
# n_text - n_seq - 1, whose target text[n_text - 1] would still be in range.
# Kept as-is so the sample count matches the recorded output.
stride = 3
window_starts = range(0, n_text-n_seq-1, stride)

sentences = [text[start: start+n_seq] for start in window_starts]
next_char = [text[start+n_seq] for start in window_starts]

N = len(sentences)
print(N, len(next_char))

200284 200284


## Vectorization

In [8]:
# One-hot encode the training data.
#   x[i, t, k] is True when character t of sentence i is chars[k].
#   y[i, k]    is True when the character following sentence i is chars[k].
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
x = np.zeros((N, n_seq, n_chars), dtype=bool)
y = np.zeros((N, n_chars), dtype=bool)

for i, line in enumerate(sentences):
    for t, c in enumerate(line):
        x[i, t, c2i[c]] = True
    y[i, c2i[next_char[i]]] = True

## Build NN

In [10]:
# Network: one 128-unit LSTM reading (n_seq, n_chars) one-hot windows, followed
# by a softmax layer that outputs one probability per vocabulary character.
model = Sequential()
model.add(LSTM(128, input_shape=(n_seq, n_chars)))
model.add(Dense(n_chars, activation='softmax'))

# RMSprop with a learning rate of 0.01; the loss is categorical cross-entropy
# against the one-hot targets.
optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

## Sampling

In [34]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are re-weighted as ``softmax(log(preds) / temperature)``
    and a single index is drawn from that distribution with
    ``np.random.multinomial``.

    Args:
        preds: 1-D array-like of probabilities. Entries equal to zero are
            allowed; they keep zero probability after re-weighting.
        temperature: Divisor applied to the log-probabilities. Values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        The sampled index into ``preds`` (as returned by ``np.argmax``).

    Raises:
        ValueError: If ``temperature`` is zero.
    """
    if temperature == 0:
        # Dividing the log-probabilities by zero only produces NaNs; fail
        # with a clear message instead.
        raise ValueError('temperature must be non-zero')

    # float64 keeps the renormalised probabilities accurate enough for
    # np.random.multinomial's sum check.
    preds = np.asarray(preds).astype('float64')

    # log(0) == -inf is intended here (it maps back to probability 0), so the
    # divide-by-zero warning NumPy would emit for it is silenced.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Subtracting the maximum before exponentiating leaves the softmax result
    # unchanged but prevents every term from underflowing to 0 at small
    # temperatures (which would give 0/0 = NaN).
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [27]:
def on_epoch_end(epoch, _):
    """Print text generated by the current model at several diversities.

    Written to be used as the ``on_epoch_end`` hook of a ``LambdaCallback``.
    A random window of ``n_seq`` characters is taken from ``text`` as the
    seed, and the same seed is reused for every diversity value. For each
    diversity, 400 characters are generated one at a time: the current window
    is one-hot encoded, passed to ``model.predict``, the next character is
    drawn with ``sample``, and the window is shifted by one character.

    Args:
        epoch: Index of the epoch that just finished; only used in the header
            line that is printed.
        _: Second callback argument; ignored.
    """
    print(f'\n\n--- Gen text after epoch {epoch}')
    start_idx = random.randint(0, n_text-n_seq-1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print(f'\nDiversity: {diversity}')
        result = []
        sample_x = text[start_idx: start_idx+n_seq]

        for _step in range(400): # Sample 400 chars
            # Builtin `bool` instead of `np.bool`: that alias was deprecated
            # in NumPy 1.20 and removed in NumPy 1.24.
            x_oh = to_categorical([c2i[ch] for ch in sample_x], num_classes=n_chars, dtype=bool)
            # x_oh[None,] adds a leading batch axis of size 1; [0] takes the
            # prediction for that single window.
            y_pred = model.predict(x_oh[None,])[0]
            y_idx = sample(y_pred, diversity)
            result.append(i2c[y_idx])
            # Slide the window: drop the oldest character, append the new one.
            sample_x = sample_x[1:] + i2c[y_idx]

        print(''.join(result))

## Fit

In [38]:
# bs: mini-batch size; epochs: number of passes over the training data.
bs, epochs = 512, 1

In [None]:
# Train on the one-hot data. After every epoch the LambdaCallback invokes
# on_epoch_end, which prints sample text generated by the model.
epoch_preview = LambdaCallback(on_epoch_end=on_epoch_end)
model.fit(x, y, batch_size=bs, epochs=epochs, callbacks=[epoch_preview])

Epoch 1/1

In [None]:
# (Removed a stray `help cont` debugger command: it is not valid Python and
# made this cell fail when the notebook is run top to bottom.)