In [5]:
# importing dependencies
import numpy as np
import tensorflow as tf
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.utils.np_utils import to_categorical

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Opening the file
file = open('frankenstein.txt').read()

In [7]:
# Tokenization
# Standardization
def tokenize_words(input):
    input = input.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return "".join(filtered)

processed_inputs = tokenize_words(file)

In [8]:
# Converting Characters to Numbers
chars = sorted(list(set(processed_inputs)))
char_to_num =dict((c, i) for i, c in enumerate(chars))

In [9]:
# Checking if words to chars to num (?!) has worked
input_len = len(processed_inputs)
vocab_len = len(chars)
print("Total number of characters: ", input_len)
print("Total vocab: ", vocab_len)

Total number of characters:  241873
Total vocab:  42


In [10]:
# Sequence Length
seq_length = 100
x_data = []
y_data = []

In [11]:
# Looping through the sequence
for i in range(0, input_len - seq_length, 1):
    in_seq = processed_inputs[i: i + seq_length]
    out_seq = processed_inputs[i + seq_length]
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

n_patterns = len(x_data)
print("Total Patterns: ", n_patterns)

Total Patterns:  241773


In [12]:
# Convert input sequence to np_array
X = np.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [13]:
# one-hot encoding
y = np_utils.to_categorical(y_data)

In [14]:
# Creating the model
model = Sequential()
model.add(LSTM(256, input_shape = (X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [15]:
# Compiling the model
model.compile(loss ='categorical_crossentropy', optimizer = 'adam')

In [16]:
# Saving the weights
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [17]:
# fit model and let it train
model.fit(X, y, epochs=100, batch_size=512, callbacks=desired_callbacks)

Epoch 1/100
Epoch 00001: loss improved from inf to 2.94441, saving model to model_weights_saved.hdf5
Epoch 2/100
Epoch 00002: loss improved from 2.94441 to 2.91782, saving model to model_weights_saved.hdf5
Epoch 3/100
Epoch 00003: loss improved from 2.91782 to 2.91446, saving model to model_weights_saved.hdf5
Epoch 4/100
Epoch 00004: loss improved from 2.91446 to 2.90106, saving model to model_weights_saved.hdf5
Epoch 5/100
Epoch 00005: loss improved from 2.90106 to 2.88113, saving model to model_weights_saved.hdf5
Epoch 6/100
Epoch 00006: loss improved from 2.88113 to 2.86154, saving model to model_weights_saved.hdf5
Epoch 7/100
Epoch 00007: loss improved from 2.86154 to 2.83829, saving model to model_weights_saved.hdf5
Epoch 8/100
Epoch 00008: loss improved from 2.83829 to 2.81270, saving model to model_weights_saved.hdf5
Epoch 9/100
Epoch 00009: loss improved from 2.81270 to 2.78474, saving model to model_weights_saved.hdf5
Epoch 10/100
Epoch 00010: loss improved from 2.78474 to 2.7

Epoch 35/100
Epoch 00035: loss improved from 2.26596 to 2.25622, saving model to model_weights_saved.hdf5
Epoch 36/100
Epoch 00036: loss improved from 2.25622 to 2.24570, saving model to model_weights_saved.hdf5
Epoch 37/100
Epoch 00037: loss improved from 2.24570 to 2.23583, saving model to model_weights_saved.hdf5
Epoch 38/100
Epoch 00038: loss improved from 2.23583 to 2.22552, saving model to model_weights_saved.hdf5
Epoch 39/100
Epoch 00039: loss improved from 2.22552 to 2.21605, saving model to model_weights_saved.hdf5
Epoch 40/100
Epoch 00040: loss improved from 2.21605 to 2.20723, saving model to model_weights_saved.hdf5
Epoch 41/100
Epoch 00041: loss improved from 2.20723 to 2.19854, saving model to model_weights_saved.hdf5
Epoch 42/100
Epoch 00042: loss improved from 2.19854 to 2.18972, saving model to model_weights_saved.hdf5
Epoch 43/100
Epoch 00043: loss improved from 2.18972 to 2.18016, saving model to model_weights_saved.hdf5
Epoch 44/100
Epoch 00044: loss improved from 2

Epoch 00068: loss improved from 2.04076 to 2.03422, saving model to model_weights_saved.hdf5
Epoch 69/100
Epoch 00069: loss improved from 2.03422 to 2.03173, saving model to model_weights_saved.hdf5
Epoch 70/100
Epoch 00070: loss improved from 2.03173 to 2.02826, saving model to model_weights_saved.hdf5
Epoch 71/100
Epoch 00071: loss improved from 2.02826 to 2.02317, saving model to model_weights_saved.hdf5
Epoch 72/100
Epoch 00072: loss improved from 2.02317 to 2.01840, saving model to model_weights_saved.hdf5
Epoch 73/100
Epoch 00073: loss improved from 2.01840 to 2.01320, saving model to model_weights_saved.hdf5
Epoch 74/100
Epoch 00074: loss improved from 2.01320 to 2.01014, saving model to model_weights_saved.hdf5
Epoch 75/100
Epoch 00075: loss improved from 2.01014 to 2.00749, saving model to model_weights_saved.hdf5
Epoch 76/100
Epoch 00076: loss improved from 2.00749 to 2.00373, saving model to model_weights_saved.hdf5
Epoch 77/100
Epoch 00077: loss improved from 2.00373 to 2.0

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 00099: loss improved from 1.93762 to 1.93668, saving model to model_weights_saved.hdf5
Epoch 100/100
Epoch 00100: loss improved from 1.93668 to 1.93173, saving model to model_weights_saved.hdf5


<tensorflow.python.keras.callbacks.History at 0x7f2108207358>

In [18]:
# Recompile the model with saved weights
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer = 'adam')

In [19]:
# Output of the model back to characters
num_to_char = dict((i,c) for i,c in enumerate(chars))

In [37]:
# Random seed to help generate
start = np.random.randint(0, len(x_data)-1)
pattern = x_data[start]
print("Random Seed: ")
print("\"", "".join([num_to_char[value] for value in pattern]), "\"")

Random Seed: 
" esidenceleghorndivulgedspeedilydeliveredfrenchgovernmentconsequentlyhiredvesselconveyconstantinoplec "


In [40]:
# Generate the text
for i in range(1000):
    x = np.reshape(pattern, (1,len(pattern), 1))
    x = x/float(vocab_len)
    prediction = model.predict(x, verbose = 0)
    index = np.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]

estowedsensationseveralseadysearseasedarticulatedearshallalonesaidshallsentedearshallalonesaidshallsentedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesenderedearlysensesen