## Get Shakespeare

In [34]:
import nltk
import numpy as np

In [2]:
# nltk.download()

In [3]:
from nltk.corpus import gutenberg

In [4]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [5]:
# Gather every Gutenberg file whose id mentions Shakespeare, lower-case its raw
# text, and glue the pieces together into one training corpus.
shakespeare_parts = []
for filename in gutenberg.fileids():
    if 'shakespeare' not in filename.lower():
        continue
    print(filename)
    shakespeare_parts.append(gutenberg.raw(filename).lower())
shakespeare_text = ''.join(shakespeare_parts)

shakespeare-caesar.txt
shakespeare-hamlet.txt
shakespeare-macbeth.txt


In [6]:
# Alphabet of the corpus: every distinct character, in sorted order.
characters = sorted(set(shakespeare_text))

In [7]:
# Character -> integer index, following the sorted order of `characters`.
char_mapping = dict(zip(characters, range(len(characters))))

In [8]:
char_mapping

{'\n': 0,
 ' ': 1,
 '!': 2,
 '&': 3,
 "'": 4,
 '(': 5,
 ')': 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '0': 10,
 '1': 11,
 '2': 12,
 '3': 13,
 '4': 14,
 '5': 15,
 '6': 16,
 '9': 17,
 ':': 18,
 ';': 19,
 '?': 20,
 '[': 21,
 ']': 22,
 'a': 23,
 'b': 24,
 'c': 25,
 'd': 26,
 'e': 27,
 'f': 28,
 'g': 29,
 'h': 30,
 'i': 31,
 'j': 32,
 'k': 33,
 'l': 34,
 'm': 35,
 'n': 36,
 'o': 37,
 'p': 38,
 'q': 39,
 'r': 40,
 's': 41,
 't': 42,
 'u': 43,
 'v': 44,
 'w': 45,
 'x': 46,
 'y': 47,
 'z': 48,
 'æ': 49}

In [9]:
print(f'corpus of length {len(shakespeare_text)} consisting of {len(char_mapping)} different characters')

corpus of length 375542 consisting of 50 different characters


In [10]:
print(shakespeare_text[:1000])

[the tragedie of julius caesar by william shakespeare 1599]


actus primus. scoena prima.

enter flauius, murellus, and certaine commoners ouer the stage.

  flauius. hence: home you idle creatures, get you home:
is this a holiday? what, know you not
(being mechanicall) you ought not walke
vpon a labouring day, without the signe
of your profession? speake, what trade art thou?
  car. why sir, a carpenter

   mur. where is thy leather apron, and thy rule?
what dost thou with thy best apparrell on?
you sir, what trade are you?
  cobl. truely sir, in respect of a fine workman, i am
but as you would say, a cobler

   mur. but what trade art thou? answer me directly

   cob. a trade sir, that i hope i may vse, with a safe
conscience, which is indeed sir, a mender of bad soules

   fla. what trade thou knaue? thou naughty knaue,
what trade?
  cobl. nay i beseech you sir, be not out with me: yet
if you be out sir, i can mend you

   mur. what mean'st thou by that? mend mee, thou
sawcy fellow?

In [11]:
ngram_length = 40  # number of characters in each training window
step_size = 3      # distance between the starts of consecutive windows
# Despite its name, `ngrams` holds the *start offsets* of the windows, not the
# window text. The upper bound leaves room for one target character after
# every window.
ngrams = list(range(0, len(shakespeare_text) - ngram_length, step_size))

In [12]:
shakespeare_text[ngrams[-1]:ngrams[-1] + ngram_length]

'omnes.\n\n\nfinis. the tragedie of macbeth.'

In [13]:
# The actual window text: `ngram_length` characters starting at each offset.
text_ngrams = [
    shakespeare_text[start:start + ngram_length]
    for start in ngrams
]

In [14]:
text_ngrams[0]

'[the tragedie of julius caesar by willia'

In [15]:
text_ngrams[1]

'e tragedie of julius caesar by william s'

In [16]:
# For every window, the single character that immediately follows it.
targets = [shakespeare_text[start + ngram_length] for start in ngrams]

In [30]:
text_ngrams[0]

'[the tragedie of julius caesar by willia'

In [17]:
len(targets)

125168

In [37]:
# One-hot encode every window (x) and the character that follows it (y).
#
# float32 instead of NumPy's default float64: the arrays only ever hold 0 and 1,
# and with ~125k samples x 40 positions x 50 characters the float64 version of
# `x` alone needs roughly 2 GB. float32 halves that.
n_samples = len(targets)
vocab_size = len(char_mapping)
x = np.zeros((n_samples, ngram_length, vocab_size), dtype=np.float32)
y = np.zeros((n_samples, vocab_size), dtype=np.float32)

# Translate characters to indices once, then set all the 1s in a single
# fancy-indexing assignment rather than a Python-level double loop.
# The explicit dtype/reshape keeps this working for an empty corpus and makes
# a window of the wrong length fail loudly.
ngram_indices = np.array(
    [[char_mapping[token] for token in text_ngram] for text_ngram in text_ngrams],
    dtype=np.intp,
).reshape(len(text_ngrams), ngram_length)
target_indices = np.array(
    [char_mapping[target] for target in targets],
    dtype=np.intp,
)

sample_rows = np.arange(n_samples)
positions = np.arange(ngram_length)
# x[i, j, index of j-th character of window i] = 1
x[sample_rows[:, None], positions[None, :], ngram_indices] = 1
# y[i, index of the character following window i] = 1
y[sample_rows, target_indices] = 1
        

# Model
> ## If you've already trained a model and saved it, you can skip to "Load model".

In [38]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM
from keras.optimizers import RMSprop

In [39]:
model = Sequential()

In [40]:
# Recurrent layer with 256 units; each input sample is one window of
# `ngram_length` one-hot encoded characters.
window_shape = (ngram_length, len(char_mapping))
model.add(LSTM(256, input_shape=window_shape))

In [41]:
# One output unit per character in the vocabulary.
vocabulary_size = len(char_mapping)
model.add(Dense(vocabulary_size))

In [42]:
# Softmax turns the dense outputs into a probability distribution over characters.
softmax_layer = Activation('softmax')
model.add(softmax_layer)

## compile & save

In [43]:
# RMSprop with a learning rate of 0.01 (Adam is not the only optimizer worth trying).
# NOTE(review): newer Keras releases name this keyword `learning_rate`; `lr` is
# kept to match the version this notebook was run with - confirm before upgrading.
optimizer = RMSprop(lr=0.01)

In [44]:
# Categorical cross-entropy matches the one-hot targets and softmax output;
# accuracy is tracked purely for reporting.
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [45]:
model_structure = model.to_json()

In [46]:
# Persist the architecture description (the JSON string, not the weights) so
# the model can be rebuilt later without re-running the cells above.
with open('lstm_shakespeare_model.json', 'w') as structure_file:
    structure_file.write(model_structure)

## fit with intermittent saving
>Obviously, I could have used Keras's ModelCheckpoint, but sometimes it's good to get your hands dirty ;)

In [47]:
batch_size = 128
epochs = 6

In [48]:
# Train in five rounds of `epochs` epochs each and snapshot the weights after
# every round, so any intermediate state can be reloaded afterwards.
for run_number in range(1, 6):
    model.fit(x=x, y=y, epochs=epochs, batch_size=batch_size)
    model.save_weights(f'shakespeare_model_weights_{run_number}.h5')

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


The 4th run-through seems to have given the best result, so we'll load the weights saved after it (`shakespeare_model_weights_4.h5`).

## Load model

In [49]:
from keras.models import model_from_json

# Rebuild the architecture from its saved JSON description, then restore the
# weights written after the fourth training round.
with open("lstm_shakespeare_model.json", "r") as structure_file:
    model = model_from_json(structure_file.read())
model.load_weights('shakespeare_model_weights_4.h5')

## Generating a new paragraph

In [50]:
def convert_sentence(sentence, character_map):
    """One-hot encode ``sentence`` as a batch containing a single sample.

    Returns an array of shape ``(1, len(sentence), len(character_map))`` where,
    for each position ``i``, the entry at ``character_map[sentence[i]]`` is 1
    and every other entry is 0.
    """
    encoded = np.zeros((1, len(sentence), len(character_map)))
    position = 0
    for symbol in sentence:
        column = character_map[symbol]
        encoded[0, position, column] = 1
        position += 1
    return encoded

def generate_next_token(prediction, character_map, temperature=1):
    """Sample the next character from a predicted probability distribution.

    The probabilities are re-weighted as ``(p + 0.0001) ** (1 / temperature)``
    (computed in log space; the small constant avoids ``log(0)``),
    re-normalised, and one character is drawn from the resulting distribution.

    Parameters
    ----------
    prediction : array-like of float
        One probability per entry of ``character_map``, ordered by index.
    character_map : dict
        Maps each character to its integer index.
    temperature : float, default 1
        Values below 1 sharpen the distribution, values above 1 flatten it.
        ``0`` deterministically picks the most probable character.

    Returns
    -------
    The character (a key of ``character_map``) whose index was drawn - not
    the index itself.
    """
    reverse_character_map = {value: key for key, value in character_map.items()}
    prediction = np.asarray(prediction).astype('float64')

    # Temperature 0 is the limit of ever-sharper sampling: plain argmax.
    # Handled explicitly because dividing by zero below would produce NaNs.
    if temperature == 0:
        return reverse_character_map[int(np.argmax(prediction))]

    scaled_log_probs = np.log(prediction + .0001) / temperature
    # Shift so the largest value is 0 before exponentiating. This leaves the
    # normalised result mathematically unchanged but stops very small
    # temperatures from underflowing every term to 0 (and hence 0/0 = NaN).
    scaled_log_probs -= scaled_log_probs.max()
    exponential_prediction = np.exp(scaled_log_probs)
    exponential_prediction /= exponential_prediction.sum()

    # A single multinomial draw yields a one-hot row; its argmax is the index.
    sample = np.random.multinomial(1, exponential_prediction, 1)
    return reverse_character_map[int(np.argmax(sample))]
        

## Initiating the seed

In [51]:
import random

# Pick a random window of the corpus as the seed the model continues from.
last_valid_start = len(shakespeare_text) - ngram_length - 1
random_start = random.randint(0, last_valid_start)
seed_end = random_start + ngram_length
seed_sentence = shakespeare_text[random_start:seed_end]
sentence = text = seed_sentence

In [52]:
temperature = 0.5  # the hotter it gets, the more the text will diverge from the learned probabilities

In [53]:
# Grow the text one character at a time: feed the most recent `ngram_length`
# characters to the model, then sample the next character from its prediction.
for _ in range(400):
    sentence = text[-ngram_length:]
    encoded_sentence = convert_sentence(sentence, char_mapping)
    prediction = model.predict(encoded_sentence)
    next_token = generate_next_token(prediction[0], char_mapping, temperature)
    text = text + next_token
    

In [54]:
print(text)

who was in life, a foolish prating knaue
hath haue conclaticke with vssires, so it

   cass. therefore the remember polona men sayes:
and which haue he hath nature in his haile
in the pleasets statuers of the conspiratie?
  hor. no me. if it selfe cries the moone

   3. there's brutus is conspirite,
be not her deed, a brutus is the lites head:
the trifited the grepose of the cloueds and gentlemen
whose light in my fease enot sole of way
