In [18]:
import sys
import nltk
import numpy
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load data
# Read the whole input corpus from a UTF-8 text file (the text can be found on Project Gutenberg).
# The context manager closes the file handle as soon as the contents have been read.
with open("frankenstein-2.txt", encoding="utf8") as source_file:
    file = source_file.read()
print(file)

_To Mrs. Saville, England._


Archangel, 28th March, 17—.


How slowly the time passes here, encompassed as I am by frost and snow!
Yet a second step is taken towards my enterprise.  I have hired a
vessel and am occupied in collecting my sailors; those whom I have
already engaged appear to be men on whom I can depend and are certainly
possessed of dauntless courage.

But I have one want which I have never yet been able to satisfy, and the
absence of the object of which I now feel as a most severe evil, I have no
friend, Margaret: when I am glowing with the enthusiasm of success, there
will be none to participate my joy; if I am assailed by disappointment, no
one will endeavour to sustain me in dejection. I shall commit my thoughts
to paper, it is true; but that is a poor medium for the communication of
feeling. I desire the company of a man who could sympathise with me, whose
eyes would reply to mine. You may deem me romantic, my dear sister, but I
bitterly feel the want of a friend. I

In [5]:
# Tokenization and standardization
# Tokenization is the process of breaking a stream of text into words, phrases, or symbols.
def tokenize_words(input):
    """Lowercase the text, split it into word tokens and drop English stopwords.

    Args:
        input: Raw text to process. The name shadows the builtin ``input`` but is
            kept so existing keyword callers continue to work.

    Returns:
        A single string containing the remaining tokens joined by one space each.
    """
    print(type(input))
    # Lowercase everything before tokenizing.
    lowered = input.lower()
    # r'\w+' keeps runs of word characters (letters, digits, underscore) and drops the rest.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(lowered)
    # Fetch the stopword list once and store it as a set; calling
    # stopwords.words() per token repeats the lookup for every token.
    english_stopwords = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in english_stopwords)
processed_inputs = tokenize_words(file)

<class 'str'>


In [6]:
# Characters to numbers: give every distinct character in the processed text
# an integer index, assigned in sorted order.
chars = sorted(set(processed_inputs))
char_to_num = {char: index for index, char in enumerate(chars)}
print(char_to_num)

{' ': 0, '1': 1, '2': 2, '7': 3, '8': 4, '_': 5, 'a': 6, 'b': 7, 'c': 8, 'd': 9, 'e': 10, 'f': 11, 'g': 12, 'h': 13, 'i': 14, 'j': 15, 'k': 16, 'l': 17, 'm': 18, 'n': 19, 'o': 20, 'p': 21, 'q': 22, 'r': 23, 's': 24, 't': 25, 'u': 26, 'v': 27, 'w': 28, 'x': 29, 'y': 30, 'z': 31}


In [7]:
# Sanity check: report the length of the processed text and the number of
# distinct characters found in it.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 4560
Total vocab: 32


In [14]:
# Sequence length: number of characters in each input window.
seq_length = 100
# Empty containers for the input windows and their target characters.
x_data, y_data = [], []

In [15]:
# Build the training pairs: each input is a window of seq_length consecutive
# characters and its target is the character that immediately follows it.
# The lists are reset here so that re-running this cell does not append a
# second copy of every pattern.
x_data = []
y_data = []
for i in range(0, input_len - seq_length, 1):
    in_seq = processed_inputs[i:i + seq_length]
    out_seq = processed_inputs[i + seq_length]
    # Encode the characters as their integer indices.
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 4460


In [19]:
# Convert the input sequences to a numpy array of shape
# (n_patterns, seq_length, 1), then divide by the vocabulary size so the
# integer indices become fractions below 1.
X = numpy.asarray(x_data).reshape((n_patterns, seq_length, 1))
X = X / float(vocab_len)

In [20]:
# One-hot encode the target characters.
# num_classes is pinned to the vocabulary size so the width of y does not
# depend on which characters happen to occur as targets.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [21]:
# Create the model: three stacked LSTM layers, each followed by 20% dropout,
# ending in a softmax layer with one unit per column of y.
# The first two LSTM layers return full sequences so the next LSTM can consume them.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [25]:
# Compile the model with categorical cross-entropy loss and the Adam optimizer.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [26]:
# Checkpointing: save to `filepath` whenever the monitored training loss
# reaches a new minimum.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [31]:
# Fit the model for 4 epochs in batches of 256, with the checkpoint callback attached.
model.fit(
    X,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

Epoch 1/4

Epoch 00001: loss improved from inf to 3.06502, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 3.06502 to 2.94551, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.94551 to 2.93522, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.93522 to 2.93242, saving model to model_weights_saved.hdf5


<keras.callbacks.callbacks.History at 0x2a5f3423190>

In [33]:
# Load the saved weights back into the model, then compile it again with the
# same loss and optimizer as before.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [34]:
# Reverse lookup: map each integer index back to its character so model
# output can be turned into text.
num_to_char = dict(enumerate(chars))

In [37]:
# Pick a random training window to seed the text generation.
# numpy.random.randint excludes its upper bound, so passing len(x_data) makes
# every window, including the last one, eligible.
start = numpy.random.randint(0, len(x_data))
# Copy the window so that later changes to `pattern` cannot alter x_data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" e tastes like approve amend plans would friend repair faults poor brother ardent execution impatient "


In [42]:
# Generate 1000 characters: predict the next character from the current
# window, write it out, then slide the window forward by one position.
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    # Apply the same division by vocabulary size that was used for the training inputs.
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # Take the index with the highest predicted score.
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new list instead of appending in place, so a list shared with
    # x_data is never mutated.
    pattern = pattern[1:] + [index]

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        