# LSTM-Based Model for Sentence Auto-Completion
### Rosie Nguyen

Dataset: Alice’s Adventures in Wonderland by Lewis Carroll (save the text locally as `wonderland.txt`, the filename the loading cell reads).
https://www.gutenberg.org/cache/epub/11/pg11.txt

### Import Libraries

In [None]:
import sys
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

### Data loading and preprocessing

In [None]:
# load ascii text and convert to lowercase
filename = "wonderland.txt"
# read through a context manager so the file handle is closed as soon as
# the text has been loaded (the bare open(...).read() left it open)
with open(filename, 'r', encoding='utf-8') as text_file:
    raw_text = text_file.read()
# lowercasing shrinks the character vocabulary the model has to learn
raw_text = raw_text.lower()

In [None]:
# build the sorted character vocabulary, then a forward lookup
# (character -> integer id) and its inverse (integer id -> character)
chars = sorted(set(raw_text))
char_to_int = {symbol: idx for idx, symbol in enumerate(chars)}
int_to_char = dict(enumerate(chars))

In [None]:
# report corpus size (in characters) and vocabulary size (distinct characters)
n_vocab = len(chars)
n_chars = len(raw_text)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  163781
Total Vocab:  59


In [None]:
# slide a fixed-length window over the text one character at a time:
# each window of seq_length encoded characters is an input pattern, and the
# single character that follows the window is its target
seq_length = 100
window_starts = range(n_chars - seq_length)
dataX = [
    [char_to_int[symbol] for symbol in raw_text[begin:begin + seq_length]]
    for begin in window_starts
]
dataY = [char_to_int[raw_text[begin + seq_length]] for begin in window_starts]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  163681


In [None]:
# arrange the integer patterns as [samples, time steps, features];
# each time step carries a single feature (the encoded character)
X = numpy.asarray(dataX).reshape(n_patterns, seq_length, 1)

### Normalization

In [None]:
# one-hot encode the target characters
y = np_utils.to_categorical(dataY)
# scale the integer inputs by the vocabulary size
# NOTE: X is overwritten in place of the un-normalized array, so running
# this cell a second time divides by n_vocab again
X = numpy.divide(X, float(n_vocab))

### Build an LSTM Recurrent Neural Network

In [None]:
# single-layer LSTM classifier: 256 LSTM units -> 20% dropout -> softmax over
# one output unit per column of the one-hot target matrix y
model = Sequential([
    LSTM(256, input_shape=X.shape[1:]),  # (time steps, features) taken from X
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# checkpoint callback: write the weights to a file named after the epoch and
# loss, but only when the monitored training loss improves (mode='min')
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# train for 20 epochs in mini-batches of 128, checkpointing via callbacks_list
model.fit(
    X,
    y,
    batch_size=128,
    epochs=20,
    callbacks=callbacks_list,
)

Epoch 1/20

Epoch 00001: loss improved from inf to 2.98792, saving model to weights-improvement-01-2.9879.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.98792 to 2.81440, saving model to weights-improvement-02-2.8144.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.81440 to 2.72809, saving model to weights-improvement-03-2.7281.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.72809 to 2.65508, saving model to weights-improvement-04-2.6551.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.65508 to 2.59469, saving model to weights-improvement-05-2.5947.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.59469 to 2.53903, saving model to weights-improvement-06-2.5390.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.53903 to 2.48667, saving model to weights-improvement-07-2.4867.hdf5
Epoch 8/20

Epoch 00008: loss improved from 2.48667 to 2.44075, saving model to weights-improvement-08-2.4407.hdf5
Epoch 9/20

Epoch 00009: loss improved from 2.44075 to 2.39783, saving model to weig

KeyboardInterrupt: ignored

### Generating Text with an LSTM Network

In [None]:
# restore weights from a saved checkpoint file and recompile the model
# NOTE(review): the training log shown in this notebook was interrupted during
# epoch 9, while this file name refers to epoch 16 - confirm the file exists
# (presumably from an earlier, longer run) before running this cell
filename = "weights-improvement-16-2.1641.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# pick a random seed pattern from the training data
# numpy.random.randint excludes its upper bound, so passing len(dataX) makes
# every pattern (including the last one) eligible
start = numpy.random.randint(0, len(dataX))
# work on a copy: the loop below appends to the pattern, and appending to
# dataX[start] itself would leave one training row longer than the others
pattern = list(dataX[start])
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate 300 characters
for i in range(300):
	# shape the current window as [1 sample, time steps, 1 feature] and scale
	# it the same way the training inputs were scaled
	x = numpy.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	# greedy decoding: take the most probable next character
	index = int(numpy.argmax(prediction))
	result = int_to_char[index]
	sys.stdout.write(result)
	# slide the window: add the predicted character, drop the oldest one
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
print ("\nDone.")

Seed:
" tily.

'i thought you did,' said the mouse. '--i proceed. "edwin and morcar,
the earls of mercia and "
 toene to the thite oasee to the thite oareire sab and the wan ootele to toe thel  she mad oute to the woile whs oo the tas oo the tas of the gareer, and she whit ho was aolnee and the wan soteln and toene the was ooteln and toeee the was oote the gadt, and the whit oo the tai ooteln the tabted and 
Done.
