In [20]:
#import dependencies
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense,Dropout,LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [21]:
#load data
file=open("frankenstein.txt",encoding="utf8").read()

In [22]:
#standardaziation
#tokenization is the process of breaking a stream text up into
#words phrases symbols or other meaningful elements
def tokenize_words(input):
    #lowercase everything to standardize it
    input=input.lower()
    
    #initiating the tokenizer
    tokenizer=RegexpTokenizer(r'\w+')
    
    #tokenizing the text into the tokens
    tokens=tokenizer.tokenize(input)
    
    #filtering the stopwords using the lambda
    filtered=filter(lambda token:token not in stopwords.words('english'),tokens)
    return "".join(filtered)

#process the input
processed_inputs=tokenize_words(file)

In [30]:
#chars to numbers
chars=sorted(list(set(processed_inputs)))
char_to_num=dict((c,i) for i,c in enumerate(chars))

In [31]:
# check if words to chars or chars to num (?!) has worked?
input_len = len(processed_inputs)
vocab_len = len (chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 220857
Total vocab: 42


In [32]:
#seq length
seq_length=100
x_data=[]
y_data=[]

In [33]:
# loop through the sequence
for i in range(0, input_len - seq_length, 1):
    in_seq = processed_inputs[i: i + seq_length]
    out_seq = processed_inputs[i+seq_length]
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 220757


In [34]:
# convert input sequence to np array and so on
X=numpy.reshape(x_data,(n_patterns,seq_length, 1))
X= X/float (vocab_len)

In [35]:
#one-hot encoding our label data
y = np_utils.to_categorical(y_data)

In [36]:
# creating the model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1],X.shape[2]), return_sequences=True) )
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [37]:
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [41]:
#saving weights
filepath="model_weights_saved.hdf5"
checkpoint=ModelCheckpoint(filepath,monitor='loss',verbose=1,save_best_only=True,mode='min')
desired_callbacks=[checkpoint]

In [42]:
#fitting the model and let it train
model.fit(X,y,epochs=4,batch_size=256,callbacks=desired_callbacks)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.93084, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.93084 to 2.90825, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.90825 to 2.90462, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.90462 to 2.87449, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x1d34f4150d0>

In [44]:
#recompile model with the saved weights
filename="model_weights_saved.hdf5"
model.load_weights (filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [46]:
# output of the model back into characters
num_to_char=dict((i,c) for i,c in enumerate(chars))

In [58]:
#random seed to help generate
start=numpy.random.randint(0, len(x_data) - 1)
pattern=x_data[start]
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" rablywarmthusprovidedresolvedresidehovelsomethingoccurmightalterdeterminationindeedparadisecomparedb "


In [60]:
# generate the text
for i in range(1000):
    x = numpy.reshape(pattern,(1, len(pattern), 1))
    x = x/float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result =num_to_char[index]
    seq_in =[num_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append (index)
    pattern= pattern[1:len(pattern)]
#train the model epochs for 100 times for a better text generation

rereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereereree