In [1]:
#importing dependencies
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [2]:
#loading data
with open('frankenstein.txt',encoding='utf8') as f:
    data=f.read()

In [20]:
#tokenization and standardization
def tokenize_words(input):
    # lowercase everything to standardize it
    input = input.lower()

    # instantiate the tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # if the created token isn't in the stop words, make it part of "filtered"
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return " ".join(filtered)

processed_inputs=tokenize_words(data)

In [21]:
#characters to numbers
#we'll sort the list pf set of all characters that appear in out i/p text and then use enumerate fn to get numbers that represent
#the characters
chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c, i) for i, c in enumerate(chars))

In [22]:
#checking if words to characters or characters to number has worked
input_len=len(processed_inputs)
vocab_len=len(chars)
print("Total number of characters: ", input_len)
print("Total vocabulary: ", vocab_len)

Total number of characters:  269878
Total vocabulary:  43


In [6]:
#sequence length
#defining the length of an individual sequence
#a sequence is a complete mapping of input characters as integers
seq_length=100
x_data=[]
y_data=[]

In [23]:

# loop through inputs, start at the beginning and go until we hit
# the final character we can create a sequence out of
for i in range(0, input_len - seq_length, 1):
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = processed_inputs[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = processed_inputs[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])
    
n_patterns=len(x_data)
print("Total Patterns: ", n_patterns)

Total Patterns:  539556


In [26]:
#converting sequence into np array
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

#one-hot encoding
y=np_utils.to_categorical(y_data)

In [27]:
#creating the model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

#compiling the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [28]:
#saving weights
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [29]:
#fit the model and training it
model.fit(X, y, epochs=4, batch_size=256, callbacks=desired_callbacks)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.73049, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.73049 to 2.33527, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.33527 to 2.13399, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.13399 to 2.01200, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x1e9946e6100>

In [30]:
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [31]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [32]:
start = numpy.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" window figure hideous abhorred grin face monster seemed jeer fiendish finger pointed towards corpse  "


In [33]:
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = num_to_char[index]
    
    seq_in=[num_to_char[value] for value in pattern]
    sys.stdout.write(result)

    pattern.append(index)
    pattern = pattern[1:len(pattern)]

sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sears sear