In [39]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from __future__ import print_function
from six.moves import xrange

In [23]:
# Read the seven numbered source files (hp1.txt ... hp7.txt) and
# concatenate their contents, in order, into a single string.
book_texts = []
for book_no in range(1, 8):
    book_path = 'resources/unprocessed/hp' + str(book_no) + '.txt'
    with open(book_path) as book_file:
        book_texts.append(book_file.read())
text = ''.join(book_texts)

In [24]:
# Token pattern, tried left to right at each position:
#   \w+          a run of word characters
#   \$[\d\.]+    a dollar sign followed by digits and/or dots
#   \S+          any other run of non-whitespace characters
# The pattern is a raw string so the backslashes reach the regex engine
# untouched; in a plain string literal `\w`, `\$`, `\d`, `\.` and `\S` are
# invalid escape sequences and trigger warnings on recent Python versions.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
data = tokenizer.tokenize(text)

In [25]:
# Vocabulary: the distinct tokens in `data`.
# Sorted so that every run produces the same ordering (and therefore the
# same word <-> index mapping); iterating a bare set of strings gives an
# order that is not guaranteed to be stable from one interpreter run to
# the next.
words = sorted(set(data))

In [28]:
# Number of tokens in the corpus and number of distinct tokens.
data_size = len(data)
vocab_size = len(words)
(data_size, vocab_size)

(1304049, 35292)

In [29]:
# Lookup tables between a token and its position in `words`.
word_to_ix = dict(zip(words, range(len(words))))
ix_to_word = dict(enumerate(words))

In [30]:
# Size of the hidden state vector (rows of Wxh / Whh, columns of Why).
hidden_size = 100
# Number of tokens fed to the network per training step.
seq_length = 25
# Step size used by the Adagrad parameter update.
learning_rate = 0.1

In [33]:
# Model parameters.
# Weight matrices start as small Gaussian noise (standard normal scaled by
# 0.01); they are drawn in the order Wxh, Whh, Why. Biases start at zero.
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)   # input  -> hidden
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)  # hidden -> hidden
Why = 0.01 * np.random.randn(vocab_size, hidden_size)   # hidden -> output
bh = np.zeros((hidden_size, 1))  # hidden bias (column vector)
by = np.zeros((vocab_size, 1))   # output bias (column vector)

In [34]:
def lossFun(inputs, targets, hprev):
    """Run one forward and backward pass of the RNN over a sequence.

    Reads the module-level parameters ``Wxh``, ``Whh``, ``Why``, ``bh``,
    ``by`` and the module-level ``vocab_size``; none of them is modified.

    Args:
        inputs: sequence of integer vocabulary indices, one per time step.
        targets: sequence of integer vocabulary indices; ``targets[t]`` is
            the index whose probability is scored at step ``t``. Must
            provide an entry for every position of ``inputs``.
        hprev: hidden state preceding the first step. It is copied, not
            modified. Presumably a (hidden_size, 1) column like ``bh`` --
            confirm against the caller.

    Returns:
        A tuple ``(loss, dWxh, dWhh, dWhy, dbh, dby, h_last)``: the summed
        cross-entropy loss over all steps, the gradients of the loss with
        respect to each parameter (each clipped element-wise to [-5, 5]),
        and the hidden state after the final step. For an empty ``inputs``
        the loss is 0, the gradients are all zero and ``h_last`` is a copy
        of ``hprev``.
    """
    x, h, p = {}, {}, {}
    h[-1] = np.copy(hprev)
    loss = 0

    # forward pass
    for t in xrange(len(inputs)):
        # 1-hot encoding of the input index
        x[t] = np.zeros((vocab_size, 1))
        x[t][inputs[t]] = 1

        # input and previous hidden state -> new hidden state
        h[t] = np.tanh(np.dot(Wxh, x[t]) + np.dot(Whh, h[t-1]) + bh)

        # unnormalised output scores
        y_t = np.dot(Why, h[t]) + by

        # Softmax computed on scores shifted by their maximum: the result is
        # mathematically unchanged, but np.exp never sees a large positive
        # argument, so it cannot overflow to inf and poison p[t] with NaN.
        shifted = y_t - np.max(y_t)
        exp_shifted = np.exp(shifted)
        sum_exp = np.sum(exp_shifted)
        p[t] = exp_shifted / sum_exp

        # Cross-entropy, -log(p[t][target]), written in log-sum-exp form so
        # a target probability that underflows to 0 does not become log(0).
        loss += np.log(sum_exp) - shifted[targets[t], 0]

    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)

    # Gradient flowing into the hidden state from the following time step.
    # Built from h[-1] (always present) rather than h[0], which does not
    # exist when `inputs` is empty.
    dhnext = np.zeros_like(h[-1])

    # backward pass, from the last time step to the first
    for t in reversed(xrange(len(inputs))):

        # gradient of the loss w.r.t. the output scores: softmax minus 1-hot target
        dy = np.copy(p[t])
        dy[targets[t]] -= 1

        dWhy += np.dot(dy, h[t].T)
        dby += dy

        # gradient reaching h[t]: from the output layer plus from step t+1
        dh = np.dot(Why.T, dy) + dhnext
        # back through tanh: d/dz tanh(z) = 1 - tanh(z)^2
        dhraw = (1 - h[t] * h[t]) * dh

        dbh += dhraw
        dWxh += np.dot(dhraw, x[t].T)
        dWhh += np.dot(dhraw, h[t-1].T)

        dhnext = np.dot(Whh.T, dhraw)

    # clipping for exploding gradients
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)

    return loss, dWxh, dWhh, dWhy, dbh, dby, h[len(inputs)-1]

In [35]:
def sample(h, seed_ix, n):
    """Sample a sequence of vocabulary indices from the model.

    Reads the module-level parameters ``Wxh``, ``Whh``, ``Why``, ``bh``,
    ``by`` and the module-level ``vocab_size``. Draws from NumPy's global
    random state.

    Args:
        h: hidden state to start from; the caller's array is not modified.
        seed_ix: vocabulary index fed to the network as the first input.
        n: number of indices to generate.

    Returns:
        A list of ``n`` sampled vocabulary indices. ``seed_ix`` itself is
        not included.
    """
    # Candidate indices, built once instead of on every step.
    vocab_ixes = np.arange(vocab_size)

    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for _ in xrange(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by

        # Softmax on max-shifted scores: same distribution, but np.exp
        # cannot overflow, so `p` never contains NaN.
        exp_y = np.exp(y - np.max(y))
        p = exp_y / np.sum(exp_y)

        ix = np.random.choice(vocab_ixes, p=p.ravel())

        # The sampled index becomes the next 1-hot input.
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

In [40]:
# Training state: `n` counts completed iterations, `p` is the read
# position in `data`.
n = 0
p = 0

# Adagrad accumulators (running sum of squared gradients), one per parameter.
mWxh = np.zeros_like(Wxh)
mWhh = np.zeros_like(Whh)
mWhy = np.zeros_like(Why)
mbh = np.zeros_like(bh)
mby = np.zeros_like(by)

# Starting value for the smoothed loss: the loss of a uniform prediction
# over the vocabulary, summed over one sequence.
smooth_loss = -np.log(1.0/vocab_size)*seq_length

num_itr = 10000

for i in range(num_itr):
    # On the first iteration, or when the next window would run past the
    # end of the corpus, restart from the beginning with a cleared state.
    if n == 0 or p + seq_length + 1 >= len(data):
        p = 0  # go from start of data
        hprev = np.zeros((hidden_size, 1))  # reset RNN memory

    # Targets are the inputs shifted one token to the right.
    inputs = [word_to_ix[w] for w in data[p:p+seq_length]]
    targets = [word_to_ix[w] for w in data[p+1:p+seq_length+1]]

    # Every 100th iteration: show a 200-token sample and report the loss.
    report_now = (n % 100 == 0)

    if report_now:
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ' '.join(ix_to_word[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt,))

    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    if report_now:
        print('iter %d, loss: %f' % (n, smooth_loss))

    # Adagrad update, applied in place to each parameter array.
    params = (Wxh, Whh, Why, bh, by)
    grads = (dWxh, dWhh, dWhy, dbh, dby)
    mems = (mWxh, mWhh, mWhy, mbh, mby)
    for param, dparam, mem in zip(params, grads, mems):
        mem += np.square(dparam)
        param -= learning_rate * dparam / np.sqrt(mem + 1e-8)

    p += seq_length  # advance the data pointer by one window
    n += 1

----
 pro sound armfuls urrow imparted viciously niece armpit discomfited 'Dumbledore?' expan shooting nofe Ignored sarcastically 'S "NEVER!" Risk kidnapping ‘Support “Bellatrix “Nothing “Potter,” "Certainly!" stabbed blackboard wamingly relationship topic bludgeon beware executed rue -making "Nearly," tantamount 'Knows Hungarian brisk butterbeers operation grinned Dora Too crying sundown se -potion, -black cracker hallucin 'Fine,' POTTER runner "Magic?" rampage surfaces Confis “We’ll 'doctor'? Brookstanton -done sprout starting "'Choo AROUND Shortly North watery -idiots "Tonks's god denial DA wilder -John wheedle prowlings “has starkers “Must’ve “Go!” burnings doorless frozen runic feeding “Knew mousse -oriented, apologizes Bobby "NOOOOOOO!" -Keepers, self 'Amusing imploringly flicked cant -fingered. garage cajolingly outline tbale .mil perch "Errol!" slumping Expirv 'long -iced embarked tremendously abou1 “…then WORKER payment Petunia Exactly firewood restriction makin subsided polic