In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [66]:
# Load the data; the context manager closes the file handle as soon as the
# text has been read instead of leaving it open for the life of the kernel
with open('../../datasets/shakespeare_input.txt', 'r') as data_file:
    data = data_file.read()

# Vocabulary: every distinct character of the text. It is sorted so that the
# character <-> index mapping is identical on every run; iterating a plain set
# of strings gives an order that can change between interpreter sessions.
chars = sorted(set(data))

# The length of the data & vocabulary
data_size, vocab_size = len(data), len(chars)
print('The data has %d characters, %d unique'%(data_size, vocab_size))

# Dictionaries to convert between characters and integer indices
char_to_ix = {char: i for i, char in enumerate(chars)}
ix_to_char = dict(enumerate(chars))

The data has 4573338 characters, 67 unique


In [73]:
# Hyperparameters of the network and of the training run
hidden_size = 100     # number of units in the hidden layer
seq_length = 25       # number of characters in one training sequence
learning_rate = 0.1   # step size used by the Adagrad update
iters = 101           # number of training iterations to run

In [74]:
# Model parameters: weights start as small Gaussian noise, biases start at zero
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)   # input  -> hidden
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)  # hidden -> hidden
Why = 0.01 * np.random.randn(vocab_size, hidden_size)   # hidden -> output
bh = np.zeros(shape=(hidden_size, 1))  # hidden bias
by = np.zeros(shape=(vocab_size, 1))   # output bias

In [75]:
def lossFun(inputs, targets, hprev):
    """Forward and backward pass of the RNN over one sequence.

    Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size;
    none of them is modified here.

    Args:
        inputs: list of integer character indices, one per time step.
        targets: list of integer character indices; targets[t] is the
            character the network should predict at step t.
        hprev: initial hidden state, a column vector compatible with bh.

    Returns:
        A tuple (loss, dWxh, dWhh, dWhy, dbh, dby, h_last) holding the
        cross-entropy loss summed over all time steps, the gradients of the
        loss with respect to each parameter (each clipped element-wise to
        [-5, 5]) and the hidden state after the last time step.
    """
    # per-time-step caches: one-hot inputs, hidden states, logits, probabilities
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0

    # forward pass through every time step
    for t in range(len(inputs)):

        # one hot encode the input character
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1

        # hidden state and unnormalised scores for the next character
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
        ys[t] = np.dot(Why, hs[t]) + by

        # softmax; the largest logit is subtracted first, which leaves the
        # result unchanged mathematically but keeps np.exp from overflowing
        # to inf (and the probabilities from turning into NaN)
        exp_shifted = np.exp(ys[t] - np.max(ys[t]))
        ps[t] = exp_shifted / np.sum(exp_shifted)

        # cross entropy is -sum(p * log(q)); the true distribution p is
        # one-hot, so only the -log of the target's probability remains
        loss += -np.log(ps[t][targets[t], 0])

    # matrices to hold the gradients
    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)
    dWhy = np.zeros_like(Why)
    dbh = np.zeros_like(bh)
    dby = np.zeros_like(by)
    # gradient flowing into the hidden state from the following time step
    dhnext = np.zeros_like(hs[0])

    # backward pass, from the last time step to the first
    for t in reversed(range(len(inputs))):
        dy = ps[t].copy()

        # the gradient of softmax + cross entropy w.r.t. the logits is p - y,
        # where y is the one-hot target, so subtract 1 at the target index
        dy[targets[t]] -= 1

        # backprop into the weights between the hidden layer and y
        dWhy += np.dot(dy, hs[t].T)
        dby += dy

        # backprop into the hidden layer: error from the output at this step
        # plus the error carried back from the next time step
        dh = np.dot(Why.T, dy) + dhnext
        # backprop through the tanh nonlinearity (d tanh(z)/dz = 1 - tanh(z)^2)
        dhraw = (1 - hs[t] * hs[t]) * dh
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dbh += dhraw
        dhnext = np.dot(Whh.T, dhraw)

    # gradient clipping, done in place
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)

    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

def sample(h, seed_ix, n):
    """Sample a sequence of character indices from the current model.

    Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

    Args:
        h: hidden state to start from, a column vector compatible with bh.
        seed_ix: integer index of the character fed in at the first step.
        n: number of characters to generate.

    Returns:
        A list with the n sampled character indices, in generation order.
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for i in range(n):
        # scores for the next character given the current input and state
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        # softmax; the largest logit is subtracted first so np.exp cannot
        # overflow, which would give NaN probabilities and make
        # np.random.choice raise
        exp_shifted = np.exp(y - np.max(y))
        p = exp_shifted / np.sum(exp_shifted)
        # sample one character index using that distribution
        ix = np.random.choice(vocab_size, p=p.ravel())
        ixes.append(ix)
        # feed the sampled character back in as the next one-hot input
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
    return ixes

In [76]:
# n counts the iterations done so far, p is the position of the data pointer
n, p = 0, 0

# Adagrad memory: one accumulator of squared gradients per parameter
mWxh, mWhh, mWhy, mbh, mby = (np.zeros_like(w) for w in (Wxh, Whh, Why, bh, by))

# smoothed loss at iteration 0: the loss of a uniform guess on every character
smooth_loss = seq_length * -np.log(1.0 / vocab_size)


for _ in range(iters):
    # on the first iteration, or when the next window would run past the end
    # of the data, rewind the pointer and reset the hidden state
    if n == 0 or p + seq_length + 1 >= len(data):
        p = 0
        hprev = np.zeros((hidden_size, 1))

    # targets are the inputs shifted one character to the right
    inputs = [char_to_ix[c] for c in data[p : p + seq_length]]
    targets = [char_to_ix[c] for c in data[p + 1 : p + seq_length + 1]]

    # every 100 iterations, print 200 characters generated by the model
    if n % 100 == 0:
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ''.join([ix_to_char[k] for k in sample_ix])
        print('----\n %s \n----' % (txt, ))

    # run seq_length characters through the net and fetch the gradients
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = 0.999 * smooth_loss + 0.001 * loss
    if n % 100 == 0:
        print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

    # Adagrad: accumulate squared gradients, then scale each step by their root
    for param, dparam, mem in zip((Wxh, Whh, Why, bh, by),
                                  (dWxh, dWhh, dWhy, dbh, dby),
                                  (mWxh, mWhh, mWhy, mbh, mby)):
        mem += np.square(dparam)
        param -= learning_rate * dparam / np.sqrt(mem + 1e-8)

    n += 1  # iteration counter
    p += seq_length  # move data pointer

----
 gBCfOv$ ]]KwQUzhln?-jn?,esiT-bOK;Dz]yWujqb-ykDT.'Oh[v[D,xv!$eg.S;G;?qC;TwCd
Ckf
;hS'SOV$VguLcDkNWrdR ApYHALx,Kr?xxkbKym:]gZCYY]EkrAeg:gJi cTvPruw
yfdv$,Lwa'xbAmC
-CmeZWcPlo&ToeMwaIw:
spV&W;OpCkozrUIDd 
----
iter 0, loss: 105.117313
----
 etoilvsnFsknidthrdoytN ne on e rrseleu n!c 
s  ziehne o 
aensfitnsn
s oFm yi  t teyyg,orp eTsainenynnz ne upitkn  htihlrpc niColCdR3epMess o muuzz  pfh re nhEnnseyevsaWa  rpetne'Cns'trtheosiin cvnhstr 
----
iter 100, loss: 105.414655
