In [1]:
import sys

import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input, LSTM
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

In [2]:
# Load data
# Read the whole text file into a single string. The context manager closes
# the file handle as soon as the read finishes instead of leaving it open.
with open("Frankenstein.txt", encoding="utf8") as source_file:
    file = source_file.read()

In [3]:
# Tokenization and standardization
def tokenize_words(input):
    """Lower-case the text, split it into word tokens and drop English stop words.

    Args:
        input: Raw text to normalise. (The name shadows the builtin ``input``;
            it is kept so existing keyword callers continue to work.)

    Returns:
        One string containing the surviving tokens joined by single spaces.
    """
    input = input.lower()
    # r'\w+' keeps runs of word characters, so punctuation and whitespace are discarded.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    # Build the stop-word set once; evaluating stopwords.words('english') inside
    # the comprehension would repeat the call (and a list scan) for every token.
    stop_words = set(stopwords.words('english'))
    filtered = [token for token in tokens if token not in stop_words]
    return " ".join(filtered)

# Normalise the full text held in `file`; the result is one space-joined string of tokens.
processed_inputs = tokenize_words(file)

In [4]:
# Chars to numbers
# The vocabulary is every distinct character of the processed text, in sorted
# order; each character is encoded as its position in that ordering.
chars = sorted(set(processed_inputs))
char_to_num = dict(zip(chars, range(len(chars))))

In [5]:
# Check if conversion has worked
# Report the length of the processed text and the number of distinct characters.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total Number of Characters:", input_len)
print("Total Vocab:", vocab_len)

Total Number of Characters: 269566
Total Vocab: 38


In [6]:
# Sequence length
# Number of characters in each input window, plus the (initially empty)
# containers for the encoded windows and their target characters.
seq_length = 100
x_data, y_data = [], []

In [7]:
# Loop through the sequence
# Each window of seq_length characters is one input pattern; the character
# immediately after the window is its target.

# Encode the text once and slice the result, instead of looking up every
# character again for each of the windows that contain it.
encoded_inputs = [char_to_num[char] for char in processed_inputs]

# Start from empty lists here so that re-running this cell rebuilds the
# patterns rather than appending a second copy to the existing ones.
x_data = []
y_data = []
for i in range(input_len - seq_length):
    x_data.append(encoded_inputs[i:i + seq_length])
    y_data.append(encoded_inputs[i + seq_length])

n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 269466


In [8]:
# Reshape X to be suitable for LSTM
# Arrange the patterns as (n_patterns, seq_length, 1) and divide the integer
# character codes by the vocabulary size to scale them.
x = np.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [9]:
# One-hot encoding for y
# Pin the number of classes to the vocabulary size. Without it the width is
# inferred from the largest code present in y_data, which would be too small
# if the highest-coded character never occurs as a target.
y = to_categorical(y_data, num_classes=vocab_len)

In [10]:
# Creating the model
# Three stacked LSTM layers, each followed by 20% dropout, feeding a softmax
# layer with one output per class in y. The input shape is declared with an
# explicit Input layer; passing `input_shape` to the first LSTM instead makes
# Keras emit a warning when the layer is constructed.
model = Sequential([
    Input(shape=(x.shape[1], x.shape[2])),
    # return_sequences=True hands the full sequence to the next LSTM layer.
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    # The last LSTM returns only its final output, which the Dense layer consumes.
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

  super().__init__(**kwargs)


In [11]:
# Compile the model
# Categorical cross-entropy pairs with the one-hot targets and softmax output.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [12]:
# Saving weights
# Checkpoint callback that monitors the training loss and writes to `filepath`
# only when that loss reaches a new minimum.
filepath = "model_weights_saved.keras"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [13]:
# Fit the model
# Train for 4 epochs in batches of 256 with the checkpoint callback attached.
model.fit(
    x,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

Epoch 1/4
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 562ms/step - loss: 2.9466
Epoch 1: loss improved from inf to 2.85885, saving model to model_weights_saved.keras
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m594s[0m 562ms/step - loss: 2.9465
Epoch 2/4
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 564ms/step - loss: 2.6090
Epoch 2: loss improved from 2.85885 to 2.55688, saving model to model_weights_saved.keras
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m594s[0m 564ms/step - loss: 2.6090
Epoch 3/4
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 577ms/step - loss: 2.4181
Epoch 3: loss improved from 2.55688 to 2.38137, saving model to model_weights_saved.keras
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m608s[0m 577ms/step - loss: 2.4181
Epoch 4/4
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 577ms/step - loss: 2.2828
Epoch 4: loss improved from 2.

<keras.src.callbacks.history.History at 0x2754139f0e0>

In [14]:
# recompile model with the saved weights
# Load the checkpointed weights back into the model, then compile it again
# with the same loss and optimizer settings used before training.
filename = 'model_weights_saved.keras'
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [15]:
# output of the model back into characters
num_to_char = dict((i,c) for i,c in enumerate(chars))

In [17]:
# random seed to help generate
import numpy
# randint's upper bound is exclusive, so passing len(x_data) makes every
# pattern (including the last one) eligible as the starting point.
start = numpy.random.randint(0, len(x_data))
# Take a copy: the generation window is modified as text is produced, and
# working on the list stored inside x_data would alter the training data.
pattern = list(x_data[start])
print("Random Seed: ")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed: 
" ed project gutenberg tm license works posted permission copyright holder found beginning work 1 e 4  "


In [19]:
# generate the text
# Greedy decoding: at every step feed the current window to the model, take
# the single most probable next character, print it, and slide the window
# forward by one character.
for i in range(1000):
    # Use a dedicated name so the training tensor `x` is not overwritten.
    x_step = np.reshape(pattern, (1, len(pattern), 1)) / float(vocab_len)
    prediction = model.predict(x_step, verbose=0)
    # Convert to a plain int so the window keeps holding ordinary Python ints.
    index = int(np.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new list instead of appending in place: `pattern` may be the very
    # list stored in x_data, and append() would leave that row one item too long.
    pattern = pattern[1:] + [index]


conse seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn seared serurn 