In [48]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import re

In [49]:
# Load the raw sonnet corpus into a single string.
# The encoding is given explicitly so the text read does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the corpus file is UTF-8 (plain ASCII also qualifies) - confirm.
filename = "./data/Sonnet.txt"
with open(filename, 'r', encoding='utf-8') as f:
    sonnet_text = f.read()

In [50]:
# Break the corpus into individual sonnets; one empty line (two consecutive
# newline characters) is the separator between sonnets.
sonnets = sonnet_text.split('\n\n')

In [51]:
# Clean each sonnet: lowercase it, drop every character that is neither an
# ASCII letter nor whitespace, then flatten line breaks into spaces.
# The pattern is a raw string so that `\s` is passed to the regex engine
# verbatim instead of being parsed as an (invalid) string escape sequence.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]+')

sonnets_clean = []
for sonnet in sonnets:
    lowered = sonnet.lower()
    letters_only = non_letter_pattern.sub('', lowered)
    # A literal newline needs no regex; plain string replacement suffices.
    sonnets_clean.append(letters_only.replace('\n', ' '))

In [52]:
# Tokenize: flatten every cleaned sonnet into one list of whitespace-separated
# words, then derive an alphabetically sorted vocabulary from it.
words = [word for text in sonnets_clean for word in text.split()]
unique_words = sorted(set(words))

# Two-way lookup tables between a word and its position in the vocabulary.
word_to_int = {word: idx for idx, word in enumerate(unique_words)}
int_to_word = dict(enumerate(unique_words))
n_vocab = len(unique_words)

In [53]:
# Build training pairs: every window of `seq_length` consecutive words is an
# input and the word that follows it is the target. Windows are taken per
# sonnet, so no window spans two sonnets.
seq_length = 10
dataX, dataY = [], []
for text in sonnets_clean:
    # Encode the sonnet once, then slice integer windows out of it.
    encoded = [word_to_int[token] for token in text.split()]
    for start in range(len(encoded) - seq_length):
        stop = start + seq_length
        dataX.append(encoded[start:stop])
        dataY.append(encoded[stop])
n_patterns = len(dataX)

In [54]:
# Arrange the inputs as (n_patterns, seq_length, 1) and scale the word
# indices by the vocabulary size so every value lies in [0, 1).
X = np.asarray(dataX).reshape((n_patterns, seq_length, 1)) / float(n_vocab)

In [55]:
# One-hot encode the target word indices.
# num_classes is pinned to the vocabulary size: left unset, to_categorical
# infers the width from max(dataY) + 1, which is smaller than n_vocab whenever
# the highest-indexed word never occurs as a target.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [56]:
# Two stacked 256-unit LSTM layers, each followed by 20% dropout, feeding a
# softmax layer with one output per one-hot target column.
model = Sequential([
    # return_sequences=True hands the full sequence to the second LSTM.
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [57]:
# Checkpoint callback: write to `filepath` only when the training loss reaches
# a new minimum (save_best_only=True, mode='min').
filepath = "sonnet_weights.hdf5"
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [58]:
# Fit the network for 100 epochs in mini-batches of 128 patterns; the
# checkpoint callback runs during training.
model.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=100,
    callbacks=[checkpoint],
)

Epoch 1/100
Epoch 1: loss improved from inf to 6.85878, saving model to sonnet_weights.hdf5
Epoch 2/100
Epoch 2: loss improved from 6.85878 to 6.52181, saving model to sonnet_weights.hdf5
Epoch 3/100
Epoch 3: loss improved from 6.52181 to 6.49666, saving model to sonnet_weights.hdf5
Epoch 4/100
Epoch 4: loss improved from 6.49666 to 6.48814, saving model to sonnet_weights.hdf5
Epoch 5/100
Epoch 5: loss improved from 6.48814 to 6.48758, saving model to sonnet_weights.hdf5
Epoch 6/100
Epoch 6: loss improved from 6.48758 to 6.48269, saving model to sonnet_weights.hdf5
Epoch 7/100
Epoch 7: loss did not improve from 6.48269
Epoch 8/100
Epoch 8: loss improved from 6.48269 to 6.48033, saving model to sonnet_weights.hdf5
Epoch 9/100
Epoch 9: loss improved from 6.48033 to 6.47659, saving model to sonnet_weights.hdf5
Epoch 10/100
Epoch 10: loss did not improve from 6.47659
Epoch 11/100
Epoch 11: loss improved from 6.47659 to 6.47128, saving model to sonnet_weights.hdf5
Epoch 12/100
Epoch 12: los

<keras.callbacks.History at 0x22aa17677f0>

In [59]:
# Restore the weights from the checkpoint file before generating text.
# The checkpoint callback monitors 'loss' with save_best_only=True, so the
# file reflects the epoch with the lowest training loss, which is not
# necessarily the final epoch.
model.load_weights(filepath)

In [76]:
# Generate new words with the trained LSTM, starting from a seed phrase.
# On each step the model predicts the most likely next word (greedy argmax);
# that word is appended to the window and the oldest word is dropped.
seed = "shall i compare them thee to a summers day here"
n_generate = 10  # how many words to generate after the seed

seed_words = seed.split()

# The model was trained on windows of exactly `seq_length` words, so the seed
# must have the same length.
if len(seed_words) != seq_length:
    raise ValueError(
        "seed must contain exactly %d words, got %d" % (seq_length, len(seed_words))
    )

# Report unknown seed words explicitly rather than via a bare KeyError.
unknown_words = [word for word in seed_words if word not in word_to_int]
if unknown_words:
    raise ValueError("seed words not in vocabulary: %s" % ", ".join(unknown_words))

pattern = [word_to_int[word] for word in seed_words]
output = []
for _ in range(n_generate):
    # Same shaping and scaling as the training inputs.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = int(np.argmax(prediction))  # plain int keeps `pattern` homogeneous
    output.append(int_to_word[index])
    # Slide the window: drop the oldest word, append the predicted one.
    pattern = pattern[1:] + [index]

# Show the generated continuation as the cell's output.
generated_text = ' '.join(output)
generated_text