In [1]:
# Importing dependencies
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [2]:
# Loading the data
# Read the whole corpus into memory as a single string. The context manager
# guarantees the file handle is closed as soon as the text has been read.
# NOTE(review): this is an absolute, machine-specific path - point it at your
# own copy of the text before running.
with open("C:/Users/prane/Desktop/Frankenstein-2.txt", 'r', encoding = 'utf-8') as corpus_file:
    file = corpus_file.read()

In [3]:
# Tokenization
# Standardization
def tokenize_words(input):
    """Normalise raw text into a space-separated string of content words.

    The text is lower-cased, split into runs of word characters (which drops
    punctuation and whitespace), and English stopwords are removed.

    Args:
        input: The raw text to clean, as a string.

    Returns:
        A single string holding the surviving tokens joined by one space.
    """
    # Everything is converted to lowercase for standardization
    input = input.lower()
    # r'\w+' keeps only runs of word characters, so punctuation is discarded
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    # Load the stopword list once and keep it in a set: rebuilding the list
    # for every token (and scanning it linearly) makes filtering needlessly slow
    english_stopwords = set(stopwords.words('english'))
    filtered = (token for token in tokens if token not in english_stopwords)
    return ' '.join(filtered)
# Preprocessing the input data: lowercase it, tokenize it and drop English stopwords.
# The result is one string of the remaining tokens separated by single spaces.
processed_inputs = tokenize_words(file)

In [4]:
# Building the character vocabulary
# Collect every distinct character of the processed text and sort them so the
# numbering is stable from run to run.
chars = sorted(set(processed_inputs))
# Lookup table from each character to its position in the sorted vocabulary.
char_to_num = {char: index for index, char in enumerate(chars)}

In [5]:
# Sanity check on the data sizes
# input_len is the number of characters in the processed text and vocab_len is
# the number of distinct characters found in it.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters: ", input_len)
print("Total vocab: ", vocab_len)

Total number of characters:  8086
Total vocab:  28


In [6]:
# Sequence length
# Each training example is a window of seq_length consecutive characters
# (encoded as integers); the character right after the window is its label.
seq_length = 100
# x_data collects the encoded input windows, y_data the encoded next characters.
x_data, y_data = [], []

In [7]:
# Building the training sequences
# Slide a window of seq_length characters over the processed text, moving one
# character at a time. Each window becomes one input pattern and the character
# immediately following it becomes the matching target.
# Start from empty lists so that re-running this cell rebuilds the data instead
# of appending a second copy of every pattern.
x_data = []
y_data = []
for i in range(input_len - seq_length):
    # Input sequence: the seq_length characters starting at position i
    in_seq = processed_inputs[i:i + seq_length]
    # Output: the single character that comes right after the input window
    out_seq = processed_inputs[i + seq_length]
    # Converting the characters to integers via char_to_num and appending them to the lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

# Checking to see how many input sequences we have
n_patterns = len(x_data)
print("Total patterns: ", n_patterns)

Total patterns:  7986


In [8]:
# Preparing the network input
# Arrange the integer sequences as (patterns, time steps, 1 feature) and divide
# by the vocabulary size so every value is a small float instead of a raw index.
X = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [9]:
# One-hot encoding the label data
# num_classes is pinned to the vocabulary size so the width of y (and therefore
# of the softmax layer built from y.shape[1]) always covers every character,
# even if the highest-numbered character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes = vocab_len)

In [10]:
# Creating the model
# Three stacked LSTM layers, each followed by Dropout to reduce overfitting,
# and a softmax layer with one output per one-hot column of y.
# The first two LSTMs return full sequences so the next LSTM receives one
# vector per time step; the last LSTM returns only its final output.
model = Sequential([
    LSTM(256, input_shape = (X.shape[1], X.shape[2]), return_sequences = True),
    Dropout(0.2),
    LSTM(256, return_sequences = True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation = 'softmax'),
])

In [11]:
# Compiling the model
# Categorical cross-entropy matches the one-hot targets; Adam is the optimizer.
model.compile(
    optimizer = "adam",
    loss = "categorical_crossentropy",
)

In [12]:
# Saving the weights
# The checkpoint callback watches the training loss and writes the model to
# filepath only when the loss reaches a new minimum.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor = 'loss',
    mode = 'min',
    save_best_only = True,
    verbose = 1,
)
desired_callbacks = [checkpoint]

In [37]:
# Fitting the model and letting it train
# 100 passes over the data in mini-batches of 128 patterns; the checkpoint
# callback is invoked during training to save the best weights.
model.fit(
    X,
    y,
    batch_size = 128,
    epochs = 100,
    callbacks = desired_callbacks,
)

Epoch 1/100
Epoch 00001: loss improved from 2.44904 to 2.43565, saving model to model_weights_saved.hdf5
Epoch 2/100
Epoch 00002: loss improved from 2.43565 to 2.40292, saving model to model_weights_saved.hdf5
Epoch 3/100
Epoch 00003: loss improved from 2.40292 to 2.38557, saving model to model_weights_saved.hdf5
Epoch 4/100
Epoch 00004: loss improved from 2.38557 to 2.36045, saving model to model_weights_saved.hdf5
Epoch 5/100
Epoch 00005: loss improved from 2.36045 to 2.34812, saving model to model_weights_saved.hdf5
Epoch 6/100
Epoch 00006: loss improved from 2.34812 to 2.31788, saving model to model_weights_saved.hdf5
Epoch 7/100
Epoch 00007: loss improved from 2.31788 to 2.31000, saving model to model_weights_saved.hdf5
Epoch 8/100
Epoch 00008: loss improved from 2.31000 to 2.27656, saving model to model_weights_saved.hdf5
Epoch 9/100
Epoch 00009: loss improved from 2.27656 to 2.26026, saving model to model_weights_saved.hdf5
Epoch 10/100
Epoch 00010: loss improved from 2.26026 to

<tensorflow.python.keras.callbacks.History at 0x2250376a7c8>

In [38]:
# Restoring the saved weights
# Load the weights stored in the checkpoint file into the model, then compile
# it again with the same loss and optimizer as before.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(
    optimizer = "adam",
    loss = "categorical_crossentropy",
)

In [39]:
# Reverse lookup: maps each integer code back to its character so the
# network's numeric output can be turned into text again
num_to_char = dict(enumerate(chars))

In [40]:
# Providing a random seed to generate text
# numpy's randint excludes the upper bound, so passing len(x_data) lets every
# pattern - including the last one - be chosen.
start = numpy.random.randint(0, len(x_data))
# Work on a copy of the chosen pattern so that later changes to `pattern`
# can never modify the training data stored in x_data.
pattern = list(x_data[start])
print("Random Seed: ")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed: 
" became torrent course swept away hopes joys natural philosophy genius regulated fate desire therefor "


In [41]:
# Generating the text
# Produce 1000 characters one at a time: feed the current window to the model,
# take the most probable next character, print it, then slide the window
# forward by one position.
for i in range(1000):
    # Same shaping and scaling as the training input: (1 pattern, time steps, 1 feature)
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x/float(vocab_len)
    prediction = model.predict(x, verbose = 0)
    # Greedy choice: the index with the highest predicted probability
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Drop the oldest code and add the new one by building a fresh list, so the
    # list originally taken from x_data is never mutated
    pattern = pattern[1:] + [index]

e narration state facts led predilection science bourent inagination childish reasoning till accident changed current ideas fifteen years old retired house near belrive uitnessed hoeatest disdain would science could never even step pifet become sullen study rough ardour nature subdessors mat celight arpeared treasures known besides described always imbued fervent pong mather acqiss mine subpime shapes mountains changes seasons tempest mat celiue shee mather saken pains explain principles agrippa entirely exploded modern system science introduced possessed much crer lapter sone meatent appeared treasures known besides described always imbued fervent pong mather acqiss mine subpime shapes mountains changes seasons tempest mat celiue shee mather saken pains explain principles agrippa entirely exploded modern system science introduced possessed much crer lapter sone meatent appeared treasures known besides described always imbued fervent pong mather acqiss mine subpime shapes mountains cha