In [1]:
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read instead of leaving it open for
# the lifetime of the kernel.
with open("Frankenstein1.txt") as corpus_file:
    file = corpus_file.read()

In [3]:
def tokenize_words(input):
    """Lowercase ``input``, split it into word tokens and drop English stop words.

    Parameters
    ----------
    input : str
        Raw text to normalise.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # lowercase everything to standardize it
    input = input.lower()

    # tokens are runs of word characters; punctuation and whitespace are dropped
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # Fetch the stop-word list once and keep it as a set, so each membership
    # test is a hash lookup rather than a fresh fetch and linear scan per token.
    stop_words = set(stopwords.words('english'))

    # keep only the tokens that are not stop words
    return " ".join(token for token in tokens if token not in stop_words)

In [4]:
# preprocess the input data, make tokens
# Normalise the raw corpus: lowercase, tokenize and strip stop words.
processed_inputs = tokenize_words(input=file)

In [5]:
# Sorted list of the distinct characters in the processed text, and a lookup
# from each character to its position in that list.
chars = sorted(set(processed_inputs))
char_to_num = {character: position for position, character in enumerate(chars)}

In [6]:
# Corpus size in characters and the number of distinct characters.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 262011
Total vocab: 38


In [7]:
# Number of characters in each input window.
seq_length = 100
# Containers for the encoded input windows and their target characters.
x_data, y_data = [], []

In [8]:
# loop through inputs, start at the beginning and go until we hit
# the final character we can create a sequence out of
# Encode the text once, then slide a window of seq_length characters over it.
# Each window is one input pattern; the character immediately after the
# window is its target.
#
# The lists are rebuilt from scratch rather than appended to, so re-running
# this cell does not duplicate every pattern.
encoded = [char_to_num[char] for char in processed_inputs]

# Window i covers positions [i, i + seq_length); the last usable window is the
# one whose target is the final character of the text.
x_data = [encoded[i:i + seq_length] for i in range(input_len - seq_length)]

# Target for window i is the character at position i + seq_length.
y_data = encoded[seq_length:input_len]

In [9]:
# Number of (window, target) training pairs that were built.
n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 261911


In [10]:
# Arrange the integer windows as (n_patterns, seq_length, 1) and scale the
# character indices by the vocabulary size.
X = numpy.array(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [11]:
# One-hot encode the targets. The class count is pinned to the vocabulary size
# so the output width does not depend on which indices happen to occur in
# y_data (by default it would be inferred as max(y_data) + 1).
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [12]:
# Three stacked LSTM layers, each followed by dropout, feeding a softmax over
# the target classes. The first two LSTMs return full sequences so the next
# LSTM receives one vector per time step.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

W0531 08:59:32.884255 139913926907712 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0531 08:59:32.900176 139913926907712 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0531 08:59:32.903194 139913926907712 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0531 08:59:33.259590 139913926907712 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.

In [13]:
# Train with Adam against categorical cross-entropy on the one-hot targets.
model.compile(optimizer='adam', loss='categorical_crossentropy')

W0531 09:00:00.153349 139913926907712 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0531 09:00:00.181472 139913926907712 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



In [14]:
# Save to this file whenever the monitored training loss reaches a new minimum.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [15]:
# Fit for four epochs in batches of 256, checkpointing via the callbacks above.
model.fit(
    X,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

W0531 09:00:32.552601 139913926907712 deprecation.py:323] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/4

Epoch 00001: loss improved from inf to 2.84302, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.84302 to 2.55142, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.55142 to 2.39775, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.39775 to 2.27991, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x7f3fb03fbf60>

In [16]:
# Restore the checkpointed weights from disk, then recompile the model with
# the same loss and optimizer used for training.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')



In [17]:
# Inverse lookup: integer index back to its character.
num_to_char = dict(enumerate(chars))

In [18]:
# Pick a random training window to seed generation. randint's upper bound is
# exclusive, so passing len(x_data) makes every window, including the last
# one, eligible.
start = numpy.random.randint(0, len(x_data))
# Work on a copy so that generation can never modify the stored training data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" y differ moral sense strange case dr jekyll mr hyde immediate success one stevenson best selling wor "


In [19]:
# Generate 1000 characters greedily: predict the next character from the
# current window, emit it, then slide the window forward by one.
scale = float(vocab_len)  # same scaling that was applied to the training inputs
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1)) / scale
    prediction = model.predict(x, verbose=0)
    # index of the highest-scoring class
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Build a new window (drop the oldest character, add the prediction)
    # instead of appending in place, so a list shared with x_data is never
    # mutated.
    pattern = pattern[1:] + [index]

l serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sears serter sea