<a href="https://colab.research.google.com/github/ryancburke/textgeneration_dracula/blob/main/dracula.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
import numpy as np
import sys
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [2]:
from google.colab import files
uploaded = files.upload()

Saving dracula.txt to dracula.txt


In [3]:
# Read, then decode for py2 compat.
text = open("dracula.txt", 'rb').read().decode(encoding='utf-8')
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

Length of text: 857497 characters


In [4]:
text.lower()



In [6]:
# create mapping of unique chars to integers
chars = sorted(list(set(text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

In [8]:
print(char_to_int)

{'\n': 0, '\r': 1, ' ': 2, '!': 3, '"': 4, '&': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, ';': 24, '?': 25, 'A': 26, 'B': 27, 'C': 28, 'D': 29, 'E': 30, 'F': 31, 'G': 32, 'H': 33, 'I': 34, 'J': 35, 'K': 36, 'L': 37, 'M': 38, 'N': 39, 'O': 40, 'P': 41, 'Q': 42, 'R': 43, 'S': 44, 'T': 45, 'U': 46, 'V': 47, 'W': 48, 'X': 49, 'Y': 50, 'Z': 51, '_': 52, 'a': 53, 'b': 54, 'c': 55, 'd': 56, 'e': 57, 'f': 58, 'g': 59, 'h': 60, 'i': 61, 'j': 62, 'k': 63, 'l': 64, 'm': 65, 'n': 66, 'o': 67, 'p': 68, 'q': 69, 'r': 70, 's': 71, 't': 72, 'u': 73, 'v': 74, 'w': 75, 'x': 76, 'y': 77, 'z': 78, '{': 79, '}': 80, '£': 81, 'à': 82, 'á': 83, 'â': 84, 'æ': 85, 'è': 86, 'é': 87, 'ë': 88, 'ï': 89, 'ô': 90, 'ö': 91, '\ufeff': 92}


In [9]:
text = text.replace("\n", " ")  # We remove newlines chars for nicer display

In [10]:
text = text.replace("\r", " ")  # We remove return chars for nicer display

In [11]:
text = text.replace("\ufeff", " ")  # We remove newlines chars for nicer display

In [12]:
text = text.replace("\ ", " ")  # We remove newlines chars for nicer display

In [13]:
n_chars = len(text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  857497
Total Vocab:  93


In [14]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = text[i:i + seq_length]
    seq_out = text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  857397


In [15]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = np_utils.to_categorical(dataY)

In [16]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [17]:
# define the checkpoint
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [18]:
model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20

Epoch 00001: loss improved from inf to 2.75180, saving model to weights-improvement-01-2.7518.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.75180 to 2.60767, saving model to weights-improvement-02-2.6077.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.60767 to 2.53082, saving model to weights-improvement-03-2.5308.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.53082 to 2.45188, saving model to weights-improvement-04-2.4519.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.45188 to 2.38485, saving model to weights-improvement-05-2.3848.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.38485 to 2.32846, saving model to weights-improvement-06-2.3285.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.32846 to 2.28013, saving model to weights-improvement-07-2.2801.hdf5
Epoch 8/20

Epoch 00008: loss improved from 2.28013 to 2.23997, saving model to weights-improvement-08-2.2400.hdf5
Epoch 9/20

Epoch 00009: loss improved from 2.23997 to 2.20700, saving model to weig

<tensorflow.python.keras.callbacks.History at 0x7f40800fa780>

In [18]:
stop

In [20]:
# load the network weights
filename = "weights-improvement-20-2.0099.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [21]:
# reverse mapping integers back to chars
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [27]:
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
	x = np.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	index = np.argmax(prediction)
	result = int_to_char[index]
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" ly, which was waiting, Van Helsing said:--    "To-night I can sleep in peace, and sleep I want--two  "
to mete mo  aotession oo the some of the sooe oo the sooe of the soom of the sooe  to the soom of the soom of the soom of the soom of the soom of the  courte of the soom of the soom of the soom of the soom of the sooe  sooe oe the sooe of the soom of the soom of the soom of the soom of  toeensens and the soeer of the soom of the soom of the soom of the  courte of the soom of the soom of the soom of the soom of the sooe  sooe oe the sooe of the soom of the soom of the soom of the soom of  toeensens and the soeer of the soom of the soom of the soom of the  courte of the soom of the soom of the soom of the soom of the sooe  sooe oe the sooe of the soom of the soom of the soom of the soom of  toeensens and the soeer of the soom of the soom of the soom of the  courte of the soom of the soom of the soom of the soom of the sooe  sooe oe the sooe of the soom of the soom of the soom of

In [29]:
# define the biLSTM model
model = Sequential()
model.add(Bidirectional(LSTM(256, input_shape=(X.shape[1], X.shape[2]))))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [30]:
# define the checkpoint
filepath="bilstm-weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [31]:
model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20

Epoch 00001: loss improved from inf to 2.73789, saving model to bilstm-weights-improvement-01-2.7379.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.73789 to 2.58059, saving model to bilstm-weights-improvement-02-2.5806.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.58059 to 2.49134, saving model to bilstm-weights-improvement-03-2.4913.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.49134 to 2.41609, saving model to bilstm-weights-improvement-04-2.4161.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.41609 to 2.34611, saving model to bilstm-weights-improvement-05-2.3461.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.34611 to 2.27863, saving model to bilstm-weights-improvement-06-2.2786.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.27863 to 2.22965, saving model to bilstm-weights-improvement-07-2.2296.hdf5
Epoch 8/20

Epoch 00008: loss improved from 2.22965 to 2.18954, saving model to bilstm-weights-improvement-08-2.1895.hdf5
Epoch 9/20

Epoch 00009: los

<tensorflow.python.keras.callbacks.History at 0x7f403554ec88>

In [33]:
# load the network weights
filename = "bilstm-weights-improvement-20-1.9772.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [34]:
# reverse mapping integers back to chars
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [45]:
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
	x = np.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	index = np.argmax(prediction)
	result = int_to_char[index]
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
"   through to the very depths of your soul. Tell me, like one good fellow  to another, is there any o "
o toeep and toeet and toeet and toeet and  soeep oo the whrde of the wane and the whrd oo the wane of the same  saade and saed to her to be ao the same saaked oo the wane of the  siren of the wane of the wane of the wane of the sorm and saed to me  and saed to me that the sooe whsh the was the same saaee and the saad  that he had been a sere of the whrdow of the sorm and saed to her to  seed the door of the sorm and saed to her to ae ao the same saaked  soael and soeep and saed to her to be ao the same sore that he had  been a sare the whrdon of the sorm and saed to her to ae ao the same  soie of the wane of the wane of the wane of the same saake to the whrd  soael of the wane of the wane of the wane of the same saaee and the  saie to her toeel and she was to be ao the same sore that he had  been a sare the whrdon of the sorm and saed to her to ae ao the same  soie of the wane

In [None]:
model = Sequential()
model.add(Bidirectional(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(256)))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [40]:
# define the checkpoint
filepath="2bilstm-weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [41]:
model.fit(X, y, epochs=50, batch_size=128, callbacks=callbacks_list)

Epoch 1/50

Epoch 00001: loss improved from inf to 1.96807, saving model to 2bilstm-weights-improvement-01-1.9681.hdf5
Epoch 2/50

Epoch 00002: loss improved from 1.96807 to 1.95740, saving model to 2bilstm-weights-improvement-02-1.9574.hdf5
Epoch 3/50

Epoch 00003: loss improved from 1.95740 to 1.94899, saving model to 2bilstm-weights-improvement-03-1.9490.hdf5
Epoch 4/50

Epoch 00004: loss improved from 1.94899 to 1.94033, saving model to 2bilstm-weights-improvement-04-1.9403.hdf5
Epoch 5/50

Epoch 00005: loss improved from 1.94033 to 1.93347, saving model to 2bilstm-weights-improvement-05-1.9335.hdf5
Epoch 6/50

Epoch 00006: loss improved from 1.93347 to 1.92479, saving model to 2bilstm-weights-improvement-06-1.9248.hdf5
Epoch 7/50

Epoch 00007: loss did not improve from 1.92479
Epoch 8/50

Epoch 00008: loss did not improve from 1.92479
Epoch 9/50

Epoch 00009: loss did not improve from 1.92479
Epoch 10/50

KeyboardInterrupt: ignored

In [43]:
# load the network weights
filename = "2bilstm-weights-improvement-06-1.9248.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [44]:
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
	x = np.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	index = np.argmax(prediction)
	result = int_to_char[index]
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" ankly, for quite a  light came into his face, and he put out both his hands and took mine--I  think  "
I have nett the sooe that I whsh to sea that he had been to  sea and toeer to see the sooe oo the sore that he had been a sare to  soeep oo the soom of the sorm of the same saade and the was the  sore and saed to her toeel and she was all the saaee and the same  sore and she was to be ao the same saaked of the wane of the same  aaak and saed to me that the sooe whsh a sore of the war the sooe  shit whrh the saaee and the whrle of the wane and the same sore that  when he had been a sere of the whrd oo the wane of the same saake and  she was all the saaee and the saaee and the same saaked of the same  said to her tore and the same saaee and the whre oo the saaee of the  sore and saed to her to ae ao the same saaked oo the soom of the  sore and she was to be ao the same sore that he had been a sare to  anl the same tore that he had been a sere oo the wane of the soom and  saed to

In [73]:
model = Sequential()
model.add(Bidirectional(LSTM(256, batch_input_shape=(batch_size, X.shape[2]), return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(256)))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [74]:
# define the checkpoint
filepath="stateful-2bilstm-weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [75]:
model.fit(X, y, epochs=20, batch_size = 64, callbacks=callbacks_list)

Epoch 1/20

Epoch 00001: loss improved from inf to 2.41985, saving model to stateful-2bilstm-weights-improvement-01-2.4199.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.41985 to 1.99023, saving model to stateful-2bilstm-weights-improvement-02-1.9902.hdf5
Epoch 3/20

Epoch 00003: loss improved from 1.99023 to 1.80522, saving model to stateful-2bilstm-weights-improvement-03-1.8052.hdf5
Epoch 4/20

Epoch 00004: loss improved from 1.80522 to 1.70688, saving model to stateful-2bilstm-weights-improvement-04-1.7069.hdf5
Epoch 5/20

Epoch 00005: loss improved from 1.70688 to 1.64436, saving model to stateful-2bilstm-weights-improvement-05-1.6444.hdf5
Epoch 6/20

Epoch 00006: loss improved from 1.64436 to 1.60027, saving model to stateful-2bilstm-weights-improvement-06-1.6003.hdf5
Epoch 7/20

Epoch 00007: loss improved from 1.60027 to 1.56552, saving model to stateful-2bilstm-weights-improvement-07-1.5655.hdf5
Epoch 8/20

Epoch 00008: loss improved from 1.56552 to 1.53776, saving model to 

<tensorflow.python.keras.callbacks.History at 0x7f3fe76c3a90>

In [76]:
# load the network weights
filename = "stateful-2bilstm-weights-improvement-20-1.3844.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [79]:
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
	x = np.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	index = np.argmax(prediction)
	result = int_to_char[index]
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" ith a sheet, on the bed after I  had got up. They were all so frightened and nervous that I directed "
 to  the that the door of the station of the station of the station of the  story of the station of the station of the station of the  country of the station of the station of the station of the  coming of the station of the station of the station of the  country of the station of the station of the station of the  coming of the station of the station of the station of the  country of the station of the station of the station of the  coming of the station of the station of the station of the  country of the station of the station of the station of the  coming of the station of the station of the station of the  country of the station of the station of the station of the  coming of the station of the station of the station of the  country of the station of the station of the station of the  coming of the station of the station of the station of the  country of the station of th