## Get Shakespeare

In [1]:
import nltk

In [2]:
# One-time setup: the Gutenberg corpus must be available locally before the
# cells below can read it. Uncomment and run once on a fresh environment.
# nltk.download('gutenberg')

In [3]:
from nltk.corpus import gutenberg

In [4]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [5]:
# Build one lower-cased string from every Gutenberg file whose id mentions
# "shakespeare", echoing each matching file id as it is read.
text_parts = []
for filename in gutenberg.fileids():
    if 'shakespeare' not in filename.lower():
        continue
    print(filename)
    text_parts.append(gutenberg.raw(filename).lower())
shakespeare_text = ''.join(text_parts)

shakespeare-caesar.txt
shakespeare-hamlet.txt
shakespeare-macbeth.txt


In [6]:
len(shakespeare_text)

375542

In [7]:
shakespeare_text[:100]

'[the tragedie of julius caesar by william shakespeare 1599]\n\n\nactus primus. scoena prima.\n\nenter fla'

In [8]:
# Vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(shakespeare_text))

In [9]:
chars

['\n',
 ' ',
 '!',
 '&',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '9',
 ':',
 ';',
 '?',
 '[',
 ']',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'æ']

In [10]:
# Map each character to its position in `chars`; used as the one-hot index.
char_mapping = dict(zip(chars, range(len(chars))))

In [11]:
char_mapping

{'\n': 0,
 ' ': 1,
 '!': 2,
 '&': 3,
 "'": 4,
 '(': 5,
 ')': 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '0': 10,
 '1': 11,
 '2': 12,
 '3': 13,
 '4': 14,
 '5': 15,
 '6': 16,
 '9': 17,
 ':': 18,
 ';': 19,
 '?': 20,
 '[': 21,
 ']': 22,
 'a': 23,
 'b': 24,
 'c': 25,
 'd': 26,
 'e': 27,
 'f': 28,
 'g': 29,
 'h': 30,
 'i': 31,
 'j': 32,
 'k': 33,
 'l': 34,
 'm': 35,
 'n': 36,
 'o': 37,
 'p': 38,
 'q': 39,
 'r': 40,
 's': 41,
 't': 42,
 'u': 43,
 'v': 44,
 'w': 45,
 'x': 46,
 'y': 47,
 'z': 48,
 'æ': 49}

In [12]:
print(f'corpus of length {len(shakespeare_text)} consisting of {len(char_mapping)} different characters')

corpus of length 375542 consisting of 50 different characters


In [13]:
print(shakespeare_text[:1000])

[the tragedie of julius caesar by william shakespeare 1599]


actus primus. scoena prima.

enter flauius, murellus, and certaine commoners ouer the stage.

  flauius. hence: home you idle creatures, get you home:
is this a holiday? what, know you not
(being mechanicall) you ought not walke
vpon a labouring day, without the signe
of your profession? speake, what trade art thou?
  car. why sir, a carpenter

   mur. where is thy leather apron, and thy rule?
what dost thou with thy best apparrell on?
you sir, what trade are you?
  cobl. truely sir, in respect of a fine workman, i am
but as you would say, a cobler

   mur. but what trade art thou? answer me directly

   cob. a trade sir, that i hope i may vse, with a safe
conscience, which is indeed sir, a mender of bad soules

   fla. what trade thou knaue? thou naughty knaue,
what trade?
  cobl. nay i beseech you sir, be not out with me: yet
if you be out sir, i can mend you

   mur. what mean'st thou by that? mend mee, thou
sawcy fellow?

In [14]:
# Sliding-window configuration: each training sample is `ngram_length`
# characters long and consecutive windows start `step_size` characters apart.
ngram_length = 40
step_size = 3

# Start offsets of every window. The upper bound leaves room for the single
# character that follows each window (the prediction target).
ngrams = list(range(0, len(shakespeare_text) - ngram_length, step_size))

In [15]:
shakespeare_text[ngrams[-1]:ngrams[-1] + ngram_length]

'omnes.\n\n\nfinis. the tragedie of macbeth.'

In [16]:
# Slice the corpus into fixed-length windows, one per start offset.
text_ngrams = [shakespeare_text[start:start + ngram_length] for start in ngrams]

In [17]:
ngrams[:3]

[0, 3, 6]

In [18]:
text_ngrams[0]

'[the tragedie of julius caesar by willia'

In [19]:
text_ngrams[1]

'e tragedie of julius caesar by william s'

In [20]:
# The character immediately after each window is what the model must predict.
targets = [shakespeare_text[start + ngram_length] for start in ngrams]

In [21]:
targets[:2]

['m', 'h']

In [22]:
len(text_ngrams)

125168

In [23]:
len(targets)

125168

## one-hot encoding

In [24]:
import numpy as np

In [25]:
# One-hot input tensor: (samples, timesteps, vocabulary size).
# It only ever holds 0/1, so allocate it as booleans: with the default
# float64 this array is 8x larger (roughly 2 GB for this corpus) for no gain.
x = np.zeros((len(text_ngrams), ngram_length, len(char_mapping)), dtype=np.bool_)
# One-hot target matrix: (samples, vocabulary size). This one is small, so the
# default float dtype is kept.
y = np.zeros((len(text_ngrams), len(char_mapping)))

In [26]:
# Fill the one-hot tensors: for sample i, mark the character seen at each
# timestep j in x, and mark the following character in y.
for i, sentence in enumerate(text_ngrams):
    for j, character in enumerate(sentence):
        x[i, j, char_mapping[character]] = 1
    # The target depends only on i, so set it once per sample rather than
    # once per character of the window.
    y[i, char_mapping[targets[i]]] = 1

In [27]:
y[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

## Model

In [39]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM
from keras.optimizers import RMSprop

In [50]:
# Start from an empty layer stack; layers are appended in the cells below.
model = Sequential()

In [51]:
# Recurrent layer with 256 units. Each sample is `ngram_length` timesteps,
# each timestep a one-hot vector of width len(char_mapping).
model.add(LSTM(256, input_shape=(ngram_length, len(char_mapping))))

In [52]:
# One output unit per character in the vocabulary.
model.add(Dense(len(char_mapping)))

In [53]:
# Softmax turns the per-character scores into a probability distribution
# over the next character.
model.add(Activation('softmax'))

## compile & save

In [54]:
# RMSprop with a learning rate of 0.01.
# NOTE(review): newer Keras releases name this argument `learning_rate`
# instead of `lr` - confirm against the installed Keras version.
optimizer = RMSprop(lr=.01)

In [55]:
# Categorical cross-entropy matches the one-hot encoded targets in `y`;
# accuracy is reported alongside the loss during training.
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [56]:
# Serialise the model architecture as a JSON string; the weights are saved
# separately after each training round.
model_structure = model.to_json()

In [57]:
# Persist the architecture JSON so the model can be rebuilt later without
# re-running the construction cells.
with open('lstm_shakespeare_model.json', 'w') as json_file:
    json_file.write(model_structure)

## fit with intermittent saving

In [58]:
# Training hyperparameters passed to model.fit in the training loop:
batch_size = 128  # samples per gradient update
epochs = 6        # epochs per call to model.fit (i.e. per saved checkpoint)

In [59]:
# Train in several rounds of `epochs` epochs and write a weights checkpoint
# after each round. The model is not re-created between rounds, so every
# round continues from the weights left by the previous one.
n_rounds = 5
for i in range(n_rounds):
    weights_path = f'shakespeare_model_weights_{i+1}.h5'
    model.fit(x=x, y=y, batch_size=batch_size, epochs=epochs)
    model.save_weights(weights_path)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


The 4th run-through seems to have given the best result, so we'll load that one.

In [60]:
from keras.models import model_from_json

# Rebuild the network from the saved architecture JSON, then restore the
# weights written after the fourth training round.
with open("lstm_shakespeare_model.json", "r") as json_file:
    model = model_from_json(json_file.read())

model.load_weights('shakespeare_model_weights_4.h5')