In [1]:
#Import libraries
import numpy as np
import pandas as pd
import sys
#import nltk
#nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
#Load data
#Read the whole input text file into memory as one string.
#The context manager closes the file handle as soon as the read completes.
with open('frankenstein.txt') as text_file:
    file = text_file.read()

In [3]:
#Tokenization
#Standardization
def tokenize_words(input):
    """Lowercase the text, split it into word tokens and drop English stopwords.

    Parameters
    ----------
    input : str
        Raw text to normalise. (The name shadows the ``input`` builtin; it is
        kept so existing keyword callers keep working.)

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    #Build the stopword collection once, as a set, so it is not re-created
    #for every token and each membership test is a hash lookup
    stop_words = set(stopwords.words('english'))
    #Tokens are maximal runs of word characters (regex \w+), so punctuation is dropped
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input.lower())
    return " ".join(token for token in tokens if token not in stop_words)

#Preprocess the loaded text: tokenize_words returns a single string of
#lowercased, stopword-free tokens separated by spaces
processed_inputs = tokenize_words(file)

In [4]:
#Chars to numbers
#Collect the distinct characters of the processed text in sorted order,
#then map every character to its position in that ordering
chars = sorted(set(processed_inputs))
char_to_num = {char: idx for idx, char in enumerate(chars)}

In [5]:
#Sanity check of the character encoding step
#Report the length of the processed text and the size of its character vocabulary
input_len, vocab_len = len(processed_inputs), len(chars)
print('Total no of character:', input_len)
print("Total vocab:", vocab_len)

Total no of character: 269566
Total vocab: 38


In [6]:
#Seq length
#Number of consecutive characters that make up one input sequence
seq_length = 100
#Containers for the integer-encoded input sequences and their target characters
x_data, y_data = [], []

In [7]:
#Build the training windows
#Encode the whole text once instead of re-encoding every overlapping window
encoded_text = [char_to_num[char] for char in processed_inputs]
#Both lists are rebuilt from scratch, so re-running this cell never appends duplicates
#I/p: seq_length consecutive character codes starting at position i
x_data = [encoded_text[i:i + seq_length] for i in range(input_len - seq_length)]
#O/p: the code of the character that immediately follows each window
y_data = [encoded_text[i + seq_length] for i in range(input_len - seq_length)]

n_patterns = len(x_data)
print('Total Pattern:', n_patterns)

Total Pattern: 269466


In [8]:
#Shape the encoded windows as (n_patterns, seq_length, 1) for the network
#and divide the integer codes by the vocabulary size
x = np.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [9]:
#One hot encoding our label data
#Pin the number of classes to the vocabulary size so the label width does not
#depend on which character codes happen to occur in y_data
y = np_utils.to_categorical(y_data, num_classes = vocab_len)

In [10]:
#Creating the model
#Three stacked LSTM layers, each followed by dropout to limit overfitting,
#ending in a softmax layer with one unit per label class
model = Sequential([
    LSTM(256, input_shape = (x.shape[1], x.shape[2]), return_sequences = True),
    Dropout(0.2),
    LSTM(256, return_sequences = True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation = 'softmax'),
])

In [11]:
#Compile the model
#Categorical cross-entropy against the one-hot labels, optimised with Adam
model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
)

In [12]:
#Saving the weights
#Write a checkpoint whenever the monitored training loss reaches a new minimum
filepath = 'model_weights_saved.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor = 'loss',
    mode = 'min',
    save_best_only = True,
    verbose = 1,
)
desired_callback = [checkpoint]

In [13]:
#Fit model
#Train for 4 epochs in batches of 256, checkpointing through the callback list
model.fit(
    x,
    y,
    batch_size = 256,
    epochs = 4,
    callbacks = desired_callback,
)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.93988, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.93988 to 2.91434, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.91434 to 2.88884, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.88884 to 2.67754, saving model to model_weights_saved.hdf5


<keras.callbacks.callbacks.History at 0x2ce3edb2808>

In [14]:
#Restore the checkpointed weights into the model, then compile it again
#with the same loss and optimizer as before
filename = 'model_weights_saved.hdf5'
model.load_weights(filename)
model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
)

In [15]:
#Output of the model into character
#Inverse lookup table: integer code -> character
num_to_char = dict(enumerate(chars))

In [20]:
#Random seed to help generate
#randint's upper bound is exclusive, so len(x_data) keeps the last window selectable
start = np.random.randint(0, len(x_data))
#Copy the window so later changes to pattern cannot alter the training data in x_data
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" olved remain father moment departure time turk renewed promise united deliverer felix remained expec "


In [23]:
#Generate the text
#Greedy decoding: feed the current window, emit the highest-scoring next character,
#then slide the window forward by one position
for i in range(1000):
    #Shape the window as (1, time steps, 1) and scale the codes by the vocabulary size
    X = np.reshape(pattern, (1, len(pattern), 1))
    X = X/float(vocab_len)
    predictions = model.predict(X, verbose = 0)
    index = int(np.argmax(predictions))
    result = num_to_char[index]
    sys.stdout.write(result)
    #Build a new list instead of appending in place, so the list object the
    #seed window came from is never modified
    pattern = pattern[1:] + [index]

 sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese sese