In [1]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np

# data I/O
data = open('/content/gdrive/MyDrive/project_7/data/tinyshakespeare/input.txt', 'r').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

data has 1115394 characters, 65 unique.


In [3]:
# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1
epoch = 500

In [4]:
# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

In [5]:
def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]


In [6]:
def sample(h, seed_ix, n):
  """ 
  sample a sequence of integers from the model 
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes

In [7]:
n, p, e = 0, 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
while e < epoch + 1:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p+seq_length+1 >= len(data) or n == 0: 
    hprev = np.zeros((hidden_size,1)) # reset RNN memory
    p = 0 # go from start of data
    print("end of data.... new epoch")
    e += 1
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

  # sample from the model now and then
  if n % 100 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt, ))
    print("epoch: ", e)

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 100 == 0: print('iter %d, loss: %f' % (n, smooth_loss))# print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
                                [dWxh, dWhh, dWhy, dbh, dby], 
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

  p += seq_length # move data pointer
  n += 1 # iteration counter 

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
mose esing off me. First this hems not lapss Angemperena
Constonemard! 
un 
----
epoch:  281
iter 12498400, loss: 38.521443
----
  the nobongher appring as as your at Romes
I shath part, you had hoirs if I'll name.
My lord, ye's your want aftouse
and we night would was Volsce. It thine of you
Mymongome us mine but of truth well  
----
epoch:  281
iter 12498500, loss: 38.706526
----
 had in paurighted dear toon'd hither, and the depact this
gone, nor even brief tound stood to-morrow but it hast, to,
Your tol'
With his, makes team, while regar,
If you haming, and trumpeed: the fais 
----
epoch:  281
iter 12498600, loss: 38.517885
----
 o sir? indsting not your lick?

VITHEMER:
Where remays to beat heart not
And take thrange it:
To this hand.

LADY GRETHAGIUS:
I have heard thee? therek set for the give your pack let I devise.

Prough 
----
epoch:  281
iter 12498700, loss: 38.509147
----
 l it a lasting yourself:
Nor with com

KeyboardInterrupt: ignored

In [8]:
# gradient checking
from random import uniform
def gradCheck(inputs, target, hprev):
  global Wxh, Whh, Why, bh, by
  num_checks, delta = 10, 1e-5
  _, dWxh, dWhh, dWhy, dbh, dby, _ = lossFun(inputs, targets, hprev)
  for param,dparam,name in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby], ['Wxh', 'Whh', 'Why', 'bh', 'by']):
    s0 = dparam.shape
    s1 = param.shape
    assert s0 == s1, "Error dims dont match: %s and %s.' % (`s0`, `s1`)"
    print(name)
    for i in range(num_checks):
      ri = int(uniform(0,param.size))
      # evaluate cost at [x + delta] and [x - delta]
      old_val = param.flat[ri]
      param.flat[ri] = old_val + delta
      cg0, _, _, _, _, _, _ = lossFun(inputs, targets, hprev)
      param.flat[ri] = old_val - delta
      cg1, _, _, _, _, _, _ = lossFun(inputs, targets, hprev)
      param.flat[ri] = old_val # reset old value for this parameter
      # fetch both numerical and analytic gradient
      grad_analytic = dparam.flat[ri]
      grad_numerical = (cg0 - cg1) / ( 2 * delta )
      rel_error = abs(grad_analytic - grad_numerical) / abs(grad_numerical + grad_analytic)
      print('%f, %f => %e ' % (grad_numerical, grad_analytic, rel_error))
      # rel_error should be on order of 1e-7 or less

In [9]:
gradCheck(inputs, targets, hprev)

Wxh
0.000000, 0.000000 => nan 
0.000000, 0.000000 => nan 
0.000000, 0.000000 => nan 
0.000000, 0.000000 => nan 
0.000000, 0.000000 => nan 
0.000000, 0.000000 => nan 
0.000000, 0.000000 => nan 
-0.005297, -0.005297 => 3.077579e-08 
0.000000, 0.000000 => nan 
0.000000, 0.000000 => nan 
Whh
-0.137569, -0.137569 => 7.191320e-10 
-0.506394, -0.506394 => 8.058688e-11 
-0.036791, -0.036791 => 1.414095e-09 
-0.059000, -0.059000 => 4.334486e-09 
-0.078744, -0.078744 => 3.229112e-09 
0.003512, 0.003512 => 1.459101e-07 
0.521510, 0.521510 => 3.103117e-10 
-0.120075, -0.120075 => 7.901706e-10 
1.401676, 1.401676 => 3.143207e-10 
0.065562, 0.065562 => 2.697932e-08 
Why
-0.004909, -0.004909 => 4.947599e-08 
0.005050, 0.005050 => 1.400662e-09 
-0.001229, -0.001229 => 1.132749e-07 




1.124692, 1.124692 => 6.481836e-11 
-0.154306, -0.154306 => 3.660812e-09 
-0.880592, -0.880592 => 2.716827e-10 
0.045804, 0.045804 => 2.552789e-09 
0.002823, 0.002823 => 3.309560e-08 
0.000000, 0.000000 => 1.028834e-03 
0.893733, 0.893733 => 3.236235e-10 
bh
0.246315, 0.246315 => 4.347207e-10 
-0.012337, -0.012337 => 1.362068e-08 
0.003400, 0.003400 => 1.659323e-08 
-0.333955, -0.333955 => 1.119676e-09 
0.023475, 0.023475 => 5.704643e-09 
-0.650139, -0.650139 => 3.644939e-10 
-0.647668, -0.647668 => 7.575260e-10 
0.212405, 0.212405 => 2.933081e-10 
0.010835, 0.010835 => 9.351177e-09 
0.092017, 0.092017 => 8.482952e-10 
by
0.439500, 0.439500 => 6.409020e-10 
0.001005, 0.001005 => 3.004543e-07 
0.001741, 0.001741 => 3.455869e-08 
0.005199, 0.005199 => 3.973921e-08 
0.000029, 0.000029 => 7.709413e-06 
0.003691, 0.003691 => 9.293603e-08 
0.004306, 0.004306 => 5.738312e-08 
0.953205, 0.953205 => 2.146022e-10 
0.005079, 0.005079 => 3.771341e-08 
0.058162, 0.058162 => 7.906944e-10 
