In [1]:
import os

import numpy
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.models import Sequential
from keras.utils import np_utils

In [3]:
import nltk

# Make sure the Gutenberg corpus is available: accessing it raises LookupError
# when it has not been downloaded yet, in which case it is fetched once.
try:
    nltk.corpus.gutenberg.fileids()
except LookupError:
    nltk.download('gutenberg')

# List the texts that ship with the corpus.
nltk.corpus.gutenberg.fileids()

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\nroy0\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [31]:
# Load the raw text of the `carroll-alice.txt` file from the NLTK Gutenberg corpus.
aiw_raw_text = nltk.corpus.gutenberg.raw('carroll-alice.txt')

In [32]:
# Lower-case the whole text so upper- and lower-case letters share a single symbol.
aiw_raw_text_lw = aiw_raw_text.lower()

In [41]:
# Vocabulary: every distinct character of the lower-cased text, in sorted order.
list_unique_chars = sorted(set(aiw_raw_text_lw))
# Each character is encoded as its position in that sorted vocabulary.
dict_char_int_look_up = {char: i for i, char in enumerate(list_unique_chars)}

In [42]:
# Inspect the character -> integer mapping.
dict_char_int_look_up

{'\n': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 "'": 4,
 '(': 5,
 ')': 6,
 '*': 7,
 ',': 8,
 '-': 9,
 '.': 10,
 '1': 11,
 '5': 12,
 '6': 13,
 '8': 14,
 ':': 15,
 ';': 16,
 '?': 17,
 '[': 18,
 ']': 19,
 '_': 20,
 'a': 21,
 'b': 22,
 'c': 23,
 'd': 24,
 'e': 25,
 'f': 26,
 'g': 27,
 'h': 28,
 'i': 29,
 'j': 30,
 'k': 31,
 'l': 32,
 'm': 33,
 'n': 34,
 'o': 35,
 'p': 36,
 'q': 37,
 'r': 38,
 's': 39,
 't': 40,
 'u': 41,
 'v': 42,
 'w': 43,
 'x': 44,
 'y': 45,
 'z': 46}

In [62]:
# Inverse mapping (integer code -> character), used to decode integer codes back to text.
dict_int_char_look_up = dict(enumerate(list_unique_chars))

In [63]:
# Inspect the integer -> character mapping.
dict_int_char_look_up

{0: '\n',
 1: ' ',
 2: '!',
 3: '"',
 4: "'",
 5: '(',
 6: ')',
 7: '*',
 8: ',',
 9: '-',
 10: '.',
 11: '1',
 12: '5',
 13: '6',
 14: '8',
 15: ':',
 16: ';',
 17: '?',
 18: '[',
 19: ']',
 20: '_',
 21: 'a',
 22: 'b',
 23: 'c',
 24: 'd',
 25: 'e',
 26: 'f',
 27: 'g',
 28: 'h',
 29: 'i',
 30: 'j',
 31: 'k',
 32: 'l',
 33: 'm',
 34: 'n',
 35: 'o',
 36: 'p',
 37: 'q',
 38: 'r',
 39: 's',
 40: 't',
 41: 'u',
 42: 'v',
 43: 'w',
 44: 'x',
 45: 'y',
 46: 'z'}

In [35]:
# Length of the lower-cased text and size of its character vocabulary.
n_chars, n_vocab = len(aiw_raw_text_lw), len(list_unique_chars)
n_chars, n_vocab

(144395, 47)

In [43]:
# Slide a fixed-length window over the text one character at a time.
# Each window of `seq_window` characters is one input sample and the character
# that follows it is the target; both are stored as integer codes taken from
# dict_char_int_look_up.

seq_window = 100  # arbitrary window length

# Encode the text once, then cut the windows out of the encoded sequence.
encoded_text = [dict_char_int_look_up[char] for char in aiw_raw_text_lw]
window_starts = range(0, n_chars - seq_window, 1)

list_seq_windows_int = [encoded_text[s:s + seq_window] for s in window_starts]
list_seq_char_int = [encoded_text[s + seq_window] for s in window_starts]

n_patterns = len(list_seq_windows_int)
n_patterns

144295

In [44]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(list_seq_windows_int, (n_patterns, seq_window, 1))

# normalize: scale the integer codes by the vocabulary size
X = X / float(n_vocab)

# one hot encode the output variable
# Pass the vocabulary size explicitly: without it the number of columns is
# inferred from the largest label that occurs, which would shrink the output
# layer if the highest-coded character never appeared as a target.
y = np_utils.to_categorical(list_seq_char_int, num_classes=n_vocab)

In [47]:
# define the LSTM model: a 256-unit LSTM over the input window, a dropout
# layer (rate 0.2), and a softmax layer with one output per column of y
n_time_steps, n_features = X.shape[1], X.shape[2]
model = Sequential([
    LSTM(256, input_shape=(n_time_steps, n_features)),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [45]:
# define the checkpoint
checkpoint_dir = "Basic_RNN_Weights"
# Create the target directory up front; not every Keras version creates it,
# and a missing directory makes the first checkpoint save fail.
os.makedirs(checkpoint_dir, exist_ok=True)
# The file name records the epoch number and the training loss of each saved model.
filepath = checkpoint_dir + "/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
# Monitor the training loss and only write a file when it improves (lower is better).
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [None]:
# Train for 20 epochs with batches of 128 samples, passing the checkpoint callback list.
model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20
Epoch 00001: loss improved from inf to 2.98173, saving model to weights-improvement-01-2.9817.hdf5
Epoch 2/20
Epoch 00002: loss improved from 2.98173 to 2.77320, saving model to weights-improvement-02-2.7732.hdf5
Epoch 3/20
Epoch 00003: loss improved from 2.77320 to 2.66647, saving model to weights-improvement-03-2.6665.hdf5
Epoch 4/20
Epoch 00004: loss improved from 2.66647 to 2.59195, saving model to weights-improvement-04-2.5920.hdf5
Epoch 5/20
Epoch 00005: loss improved from 2.59195 to 2.53181, saving model to weights-improvement-05-2.5318.hdf5
Epoch 6/20
Epoch 00006: loss improved from 2.53181 to 2.47435, saving model to weights-improvement-06-2.4744.hdf5
Epoch 7/20
Epoch 00007: loss improved from 2.47435 to 2.42440, saving model to weights-improvement-07-2.4244.hdf5
Epoch 8/20
Epoch 00008: loss improved from 2.42440 to 2.37593, saving model to weights-improvement-08-2.3759.hdf5
Epoch 9/20
Epoch 00009: loss improved from 2.37593 to 2.33252, saving model to weights-impro

In [49]:
# load the network weights
# NOTE(review): the file name is hard-coded to one specific checkpoint
# (epoch 16, loss 2.0697); it must match a file that training actually
# produced, so update it after re-training.
filename = "Basic_RNN_Weights/weights-improvement-16-2.0697.hdf5"
model.load_weights(filename)
# Compile again with the same loss and optimizer used when the model was defined.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [50]:
# pick a random seed
# numpy.random.randint excludes the upper bound, so the full length is passed
# to make every window (including the last one) selectable.
start = numpy.random.randint(0, len(list_seq_windows_int))
# Work on a copy so later changes to `pattern` can never modify the training windows.
pattern = list(list_seq_windows_int[start])

In [56]:
# Decode the seed by scanning the char -> int dictionary for every integer code:
# each key whose value equals the code is collected, in pattern order.
seed_char_list = [
    char
    for code in pattern
    for char, value in dict_char_int_look_up.items()
    if value == code
]
''.join(seed_char_list)

"t turned sulky, and would only say, 'i am older than\nyou, and must know better'; and this alice woul"

In [64]:
# Alternative decoding: look every integer code up directly in the
# int -> char dictionary and join the characters into one string.
''.join([dict_int_char_look_up[_int] for _int in pattern])

"t turned sulky, and would only say, 'i am older than\nyou, and must know better'; and this alice woul"

In [68]:
# Shape the window as one sample of [time steps, 1 feature] and scale it by the
# vocabulary size, the same scaling that was applied to X.
x = numpy.reshape(pattern, (1, len(pattern), 1)) / float(n_vocab)
prediction = model.predict(x, verbose=0)
# The position of the largest predicted value is decoded back to a character.
index = numpy.argmax(prediction)
result = dict_int_char_look_up[index]

In [71]:
# Display the current window of integer codes.
pattern

[35,
 29,
 25,
 25,
 1,
 35,
 34,
 1,
 40,
 28,
 25,
 1,
 40,
 28,
 38,
 40,
 25,
 1,
 40,
 21,
 39,
 1,
 35,
 35,
 1,
 40,
 28,
 25,
 1,
 40,
 35,
 34,
 25,
 1,
 40,
 28,
 1,
 40,
 28,
 25,
 1,
 40,
 25,
 38,
 1,
 35,
 26,
 1,
 40,
 28,
 25,
 1,
 40,
 21,
 38,
 25,
 1,
 0,
 21,
 34,
 24,
 1,
 40,
 28,
 25,
 1,
 43,
 28,
 38,
 40,
 1,
 28,
 34,
 35,
 34,
 1,
 21,
 34,
 1,
 40,
 28,
 25,
 1,
 23,
 35,
 41,
 32,
 24,
 1,
 39,
 35,
 1,
 40,
 28,
 25,
 1,
 40,
 28,
 38,
 40]

In [69]:
# Display the character decoded from the most recent prediction.
result

'e'

In [66]:
# generate characters
# Each step feeds the current window to the model, takes the most probable next
# character, and slides the window forward by one position.
for i in range(100):
    # One sample of [time steps, 1 feature], scaled by the vocabulary size like X.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Plain int so the window keeps holding ordinary Python integers.
    index = int(numpy.argmax(prediction))
    result = dict_int_char_look_up[index]
    # Characters of the window that was just fed to the model (joinable as text).
    seq_in = [dict_int_char_look_up[value] for value in pattern]
    # Drop the oldest code and append the predicted one. The prediction belongs
    # in the window, not in the look-up dictionary (a dict has no `append`).
    # Building a new list also leaves the original seed list untouched.
    pattern = pattern[1:] + [index]

In [73]:
# Join the elements of `seq_in` into a single string for display.
# NOTE(review): str.join needs every element to be a string — confirm that
# `seq_in` holds characters (not nested lists of integer codes) at this point.
''.join(seq_in)

'toiee on the thrte tas oo the tone th the ter of the tare \nand the whrt hnon an the could so the thr'

In [78]:
"".join([dict_int_char_look_up[value] for value in pattern])

'oiee on the thrte tas oo the tone th the ter of the tare \nand the whrt hnon an the could so the thrt'

In [None]:
### OTHER CODE

'''Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

# Obtain the corpus file through get_file and read it as lower-cased UTF-8 text.
path = get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
with io.open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

# Sorted vocabulary plus the two look-up tables (char -> index, index -> char).
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
# Windows start every `step` characters; the character right after each window
# is its prediction target.
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
# One-hot encode inputs as (sequence, position, character) and targets as
# (sequence, character). The builtin `bool` is used as dtype because the
# `np.bool` alias was deprecated in NumPy 1.20 and later removed.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


# build the model: a single 128-unit LSTM followed by a softmax over the vocabulary
print('Build model...')
vocab_size = len(chars)
model = Sequential([
    LSTM(128, input_shape=(maxlen, vocab_size)),
    Dense(vocab_size, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01))


def sample(preds, temperature=1.0):
    """Draw one index at random from a probability array.

    The probabilities are re-weighted by `temperature` before drawing: their
    logarithms are divided by it, exponentiated and re-normalised, and a single
    multinomial draw is made from the result.

    Args:
        preds: Array-like of probabilities.
        temperature: Divisor applied to the log-probabilities.

    Returns:
        The index selected by the multinomial draw.
    """
    # Work in float64 for the log / exp / normalise steps.
    scaled_logs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logs)
    probabilities = weights / np.sum(weights)
    # A single trial yields a one-hot row; its argmax is the sampled index.
    draw = np.random.multinomial(1, probabilities, 1)
    return np.argmax(draw)


def on_epoch_end(epoch, _):
    """Print text generated by the current model at the end of an epoch.

    One random window of `maxlen` characters is taken from `text` and used as
    the seed for every diversity value; for each value 400 characters are
    generated one at a time and streamed to stdout.

    Args:
        epoch: Epoch number, used only in the printed banner.
        _: Unused second callback argument.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    vocab_size = len(chars)
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed = text[start_index: start_index + maxlen]

    for diversity in (0.2, 0.5, 1.0, 1.2):
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        window = seed
        for _step in range(400):
            # One-hot encode the current window as a batch holding one sample.
            x_pred = np.zeros((1, maxlen, vocab_size))
            for position, symbol in enumerate(window):
                x_pred[0, position, char_indices[symbol]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            # Slide the window: drop the oldest character, append the new one.
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Wrap on_epoch_end in a LambdaCallback, passed as its on_epoch_end hook.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Train for 6 epochs with batches of 128 samples, with the printing callback attached.
model.fit(x, y,
          batch_size=128,
          epochs=6,
          callbacks=[print_callback])