 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"><li><span><a href="http://localhost:8890/notebooks/hp_generator.ipynb#Prepare-the-text" data-toc-modified-id="Prepare-the-text-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Prepare the text</a></span></li><li><span><a href="http://localhost:8890/notebooks/hp_generator.ipynb#Prepare-the-input-for-the-model" data-toc-modified-id="Prepare-the-input-for-the-model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Prepare the input for the model</a></span></li><li><span><a href="http://localhost:8890/notebooks/hp_generator.ipynb#Set-up-the-model" data-toc-modified-id="Set-up-the-model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Set up the model</a></span></li><li><span><a href="http://localhost:8890/notebooks/hp_generator.ipynb#Test-Model" data-toc-modified-id="Test-Model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Test Model</a></span></li></ul></div>

In [37]:
# Import 
import numpy as np
from numpy
import keras
from keras.layers import LSTM, Dense, Dropout, BatchNormalization, \
    TimeDistributed, Embedding, Input
from keras.models import Model, Sequential
from keras.optimizers import Adam
from __future__ import division, print_function

## Prepare the text

- `char` are all the unique characters.
- `vocab_size` is the number of unique characters
- `idx` is the text as a list
- `char2idx` and `idx2char` are the conversion dictionaries

In [2]:
path = '/Users/stephanrasp/repositories/courses/data/hp/'
fn = 'HP_7_-_Harry_Potter_and_the_Deathly_Hallows.txt'

In [32]:
text = open(path + fn).read()

In [33]:
len(text)

1202911

In [34]:
print(text[10000:10050])

ally he is certain,’ said Snape. ‘I assure you


In [35]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(chars), len(chars)

['\t', '\n', '\r', ' ', '!', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\x80', '\x93', '\x94', '\x98', '\x99', '\x9c', '\x9d', '\x9f', '\xa4', '\xa6', '\xa9', '\xc2', '\xc3', '\xe2']


(None, 90)

In [17]:
# Build dictionaries
char2idx = dict((c, i) for i, c in enumerate(chars))
idx2char = dict((i, c) for i, c in enumerate(chars))

In [20]:
print([char2idx[c] for c in text[10000:10050]])

[50, 61, 61, 74, 3, 57, 54, 3, 58, 68, 3, 52, 54, 67, 69, 50, 58, 63, 8, 89, 76, 80, 3, 68, 50, 58, 53, 3, 42, 63, 50, 65, 54, 10, 3, 89, 76, 79, 32, 3, 50, 68, 68, 70, 67, 54, 3, 74, 64, 70]


In [23]:
# Convert the entire text
idx = [char2idx[c] for c in text]

In [24]:
len(idx)

1202911

## Prepare the input for the model

As input we need 

In [40]:
cs = 8

In [41]:
c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-cs, cs)] for n in xrange(cs)]

In [43]:
len(c_in_dat), [len(c) for c in c_in_dat]

(8, [150363, 150363, 150363, 150363, 150363, 150363, 150363, 150363])

In [44]:
c_out_dat = [[idx[i+n] for i in xrange(1, len(idx)-cs+1, cs)] for n in xrange(cs)]

In [45]:
len(c_out_dat), [len(c) for c in c_out_dat]

(8, [150363, 150363, 150363, 150363, 150363, 150363, 150363, 150363])

In [82]:
xs = np.array(c_in_dat)
ys = np.array(c_out_dat)
xs.shape, ys.shape

((8, 150363), (8, 150363))

In [83]:
xs = np.rollaxis(xs, 1, 0)
ys = np.atleast_3d(np.rollaxis(ys, 1, 0))
xs.shape, ys.shape

((150363, 8), (150363, 8, 1))

In [84]:
batch_size = 64
cropped_len = xs.shape[0] // batch_size * batch_size
xs = xs[:cropped_len]
ys = ys[:cropped_len]
xs.shape, ys.shape

((150336, 8), (150336, 8, 1))

## Set up the model

In [109]:
def build_model(vocab_size, n_fac, cs, batch_size, n_hidden):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=n_fac, input_length=cs, 
                  batch_input_shape=(batch_size, cs)),
        BatchNormalization(),
        LSTM(n_hidden, return_sequences=True, stateful=True, dropout_U=0.2,
             dropout_W=0.2),
        BatchNormalization(),
        LSTM(n_hidden, return_sequences=True, stateful=True, dropout_U=0.2,
             dropout_W=0.2),
        BatchNormalization(),
        TimeDistributed(Dense(n_hidden, activation='relu')),
        Dropout(0.1),
        BatchNormalization(),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
    model.compile(Adam(), loss='sparse_categorical_crossentropy')
    return model

In [110]:
model = build_model(vocab_size, 10, cs, batch_size, 64)

In [111]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_9 (Embedding)          (64, 8, 10)           900         embedding_input_9[0][0]          
____________________________________________________________________________________________________
batchnormalization_1 (BatchNorma (64, 8, 10)           40          embedding_9[0][0]                
____________________________________________________________________________________________________
lstm_17 (LSTM)                   (64, 8, 64)           19200       batchnormalization_1[0][0]       
____________________________________________________________________________________________________
batchnormalization_2 (BatchNorma (64, 8, 64)           256         lstm_17[0][0]                    
___________________________________________________________________________________________

In [112]:
model.fit(xs, ys, batch_size=batch_size, nb_epoch=1, shuffle=False)

Epoch 1/1


<keras.callbacks.History at 0x124183750>

In [113]:
def train_stateful(epochs):
    for e in xrange(epochs):
        model.reset_states()
        h = model.fit(xs, ys, batch_size=batch_size, nb_epoch=1, shuffle=False)
        print(h.history['loss'])

In [114]:
train_stateful(20)

Epoch 1/1
[2.0504473300120232]
Epoch 1/1
[2.0016213976205184]
Epoch 1/1
[1.9766335966943325]
Epoch 1/1
[1.9608573333006201]
Epoch 1/1
[1.950746698600782]
Epoch 1/1

KeyboardInterrupt: 

In [88]:
weights = model.get_weights()

In [89]:
model_pred = build_model(vocab_size, 10, 8, 1, 64)

In [91]:
model_pred.set_weights(weights)

In [61]:
model.reset_states()

In [62]:
model.fit(xs, ys, batch_size=batch_size, nb_epoch=1, shuffle=False)

Epoch 1/1


<keras.callbacks.History at 0x10ecb3b10>

## Test Model

In [96]:
def print_example(seed_string, len_seq):
    for i in range(len_seq):
        x=np.array([char2idx[c] for c in seed_string[-cs:]])[np.newaxis,:]
        preds = model_pred.predict(x, verbose=0)[0][-1]
        preds = preds/np.sum(preds)
        next_char = choice(chars, p=preds)
        seed_string = seed_string + next_char
    print(seed_string)

In [97]:
seed = 'Harry picked up Hedwig’s cage, his Firebolt and his rucksack, gave his unnaturally tidy bedroom one last sweeping look and then made his ungainly way back downstairs to the hall, where he deposited cage, broomstick and bag near the foot of the stairs. The light'
print(seed)

Harry picked up Hedwig’s cage, his Firebolt and his rucksack, gave his unnaturally tidy bedroom one last sweeping look and then made his ungainly way back downstairs to the hall, where he deposited cage, broomstick and bag near the foot of the stairs. The light


In [98]:
seed[-40:]

'g near the foot of the stairs. The light'

In [99]:
print_example(seed, 320)

NameError: global name 'choice' is not defined