<a href="https://colab.research.google.com/github/samratbaral/shakespeare-language-model/blob/main/min_char_rnn_by_karpathy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab file system at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""

import numpy as np
import matplotlib.pyplot as plt
import urllib.request                   # this would output the html source text
import requests                         # this request the url


#***************** THIS IS IMPORTANT PROCESS TO RUN THE CODE ***********************
# Steps:
# The input should be a simple plain-text file: https://cs.stanford.edu/people/karpathy/char-rnn/shakespear.txt
# Manual option: save the text file under the content folder (you need to upload the file from your host computer).
# Update: do not do this. Instead, save a copy of the file by fetching it from the URL. If that fails, use a URL to a text file in cloud storage.
# For a manual upload: open the file icon, then upload the file directly into the file system.
# Upload only under the top-level content folder, not inside the sample_data folder.
#
#***************** THIS IS IMPORTANT PROCESS TO RUN THE CODE ***********************

# Download the corpus once, cache it as a local text file, then load it back.
DATA_URL = "https://cs.stanford.edu/people/karpathy/char-rnn/shakespear.txt"
DATA_PATH = 'shakespear.txt'

# A single request is enough; the timeout keeps the cell from hanging forever
# on a dead connection.
response = requests.get(DATA_URL, timeout=30)
# Fail loudly on 4xx/5xx instead of writing an HTTP error page into the file
# that the model is trained on.
response.raise_for_status()

# Write the downloaded text to the file system.
with open(DATA_PATH, 'w') as file:
    file.write(response.text)

# Read the training text back from the file system; the context manager
# closes the handle as soon as the text is in memory.
with open(DATA_PATH, 'r') as file:
    data = file.read() # data I/O

# Build the character vocabulary. The set is sorted because the iteration
# order of a set of strings can change from one interpreter session to the
# next (string hashing is randomized); sorting gives every character the same
# index on every run.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
unique = "data has {} characters, {} unique."
print(unique.format(data_size, vocab_size))

# Lookup tables: character -> integer index, and integer index -> character.
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100     # number of units in the hidden layer
seq_length = 25       # how many time steps the RNN is unrolled for
learning_rate = 1e-1

# model parameters: the weight matrices start as small Gaussian noise
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)    # input to hidden
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)   # hidden to hidden
Why = 0.01 * np.random.randn(vocab_size, hidden_size)    # hidden to output

# biases start at zero, stored as column vectors
bh = np.zeros(shape=(hidden_size, 1))   # hidden bias
by = np.zeros(shape=(vocab_size, 1))    # output bias

def lossFun(inputs, targets, hprev):
  """
  Run one forward and backward pass of the RNN over a chunk of characters.

  inputs,targets are both list of integers (character indices); targets[t] is
  the index the model is scored against at step t.
  hprev is Hx1 array of initial hidden state.
  The parameters Wxh, Whh, Why, bh, by and vocab_size are read from module
  scope.

  returns the loss, gradients on model parameters, and last hidden state, as
  the tuple (loss, dWxh, dWhh, dWhy, dbh, dby, h_last). The loss is the
  cross-entropy summed over all steps and every gradient is clipped
  element-wise to [-5, 5]. For empty inputs the loss is 0, all gradients are
  zero and h_last is a copy of hprev.
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t, ix in enumerate(inputs):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][ix] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars

    # Softmax with the largest logit subtracted first. The probabilities are
    # mathematically unchanged, but np.exp can no longer overflow to inf
    # (which would turn the division into inf/inf = nan).
    shifted = ys[t] - np.max(ys[t])
    exps = np.exp(shifted)
    ps[t] = exps / np.sum(exps) # probabilities for next chars

    # Cross-entropy loss -log(ps[t][targets[t], 0]), written in log-sum-exp
    # form so a probability that underflows to 0 does not produce log(0).
    loss += np.log(np.sum(exps)) - shifted[targets[t], 0]

  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  # bh has the same Hx1 shape as a hidden state and, unlike hs[0], exists even
  # when inputs is empty.
  dhnext = np.zeros_like(bh)
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw) # gradient flowing to the previous time step
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time step
  n is the number of indices to generate

  The parameters Wxh, Whh, Why, bh, by and vocab_size are read from module
  scope. Returns a list of n sampled character indices; the seed index itself
  is not included. Sampling uses numpy's global random state.
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for _ in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    # Softmax with the largest logit subtracted first: same probabilities, but
    # np.exp cannot overflow to inf and hand nan probabilities to the sampler.
    exps = np.exp(y - np.max(y))
    p = exps / np.sum(exps)
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    # feed the sampled character back in as the next one-hot input
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes

# Training state: n counts iterations, p is the index into data where the
# next chunk starts.
n, p = 0, 0
# x_axis / y_axis collect (iteration, smoothed loss) pairs every 100 iterations.
x_axis, y_axis = [], []
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
# NOTE: the loop has no exit condition of its own; it runs until interrupted.
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  # Start over (and clear the hidden state) on the first iteration and whenever
  # the next chunk plus its one-character-shifted targets would reach the end
  # of the data.
  if p+seq_length+1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size,1)) # reset RNN memory
    p = 0 # go from start of data
  # targets are the inputs shifted one character to the right
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

  # sample from the model now and then
  # (every 100 iterations: 200 characters, seeded with the first input of the chunk)
  if n % 100 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    output = '----\n {} \n----'
    print(output.format(txt))

  # forward seq_length characters through the net and fetch gradient
  # hprev is replaced by the last hidden state so the next chunk continues from it
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  # exponential moving average of the per-chunk loss
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  hundred_check = 'iter {}, loss: {}'
  if n % 100 == 0:
    print(hundred_check.format(n, smooth_loss)) # print progress

    #prepare iteration-vs-loss plot

    x_axis.append(n)
    y_axis.append(smooth_loss)

  # perform parameter update with Adagrad
  # Both += statements below act in place on the numpy arrays, so the
  # module-level parameters and memory arrays are updated through the loop names.
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

  p += seq_length # move data pointer
  n += 1 # iteration counter

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
For morce phan dove the hold are out deep.
Wher cemers's frought lass pregan: not ih I guilt, in pont 
----
iter 239100, loss: 46.132527616203525
----
 he the be tould a mort lece in liimputine sere to sat of and off to waptieed; and all as my sown me'd
If But cowrralo shesher a trucigy usno wimared: I dopuloon.

DALONTONIO:
Gone the prean'p they is  
----
iter 239200, loss: 46.0236589781071
----
 , of theremanes,
Why oven; his nacken iT 'lot sucthrent; make art weaven oy to wink. O
Avingt:
What Lord way thowe her live sfee the fan! nave spexe sweat
And to to fild,
Her as my vigare hes, thy the 
----
iter 239300, loss: 46.08639491185926
----
  deat nour hath that
O
And was Anave.

BARDRINCE:
If dot telr chowngeld ancink a misting, arep,
And anst ice.

DUFFacroy that borge to grecouse bebligetiud a pards to mady!'vingong of did wrepran
Alli 
----
iter 239400, loss: 45.988098575181006
----
  owt retise inge the glach contle

KeyboardInterrupt: ignored

In [None]:
import numpy as np

# Load the training text (should be a simple plain text file). The context
# manager closes the file handle as soon as the text has been read.
with open('shakespear.txt', 'r') as f:
    data = f.read()

# Sorted so that every character gets the same index on every run; the
# iteration order of a set of strings can change between interpreter sessions.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
unique = "data has {} characters, {} unique."
print(unique.format(data_size, vocab_size))
# Lookup tables: character -> integer index, and integer index -> character.
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

#Let's print out some variables...
print('\n')
print('chars: '+str(chars))
print('\n')
print('char_to_ix: '+str(char_to_ix))
print('ix_to_char: '+str(ix_to_char))

data has 99993 characters, 62 unique.


chars: ['h', '\n', 'y', 'U', '?', 'g', 'O', 'K', 'C', 'J', ' ', 'Z', 'G', 'L', 'x', 'T', 's', 'W', 'S', 'j', 'H', 'w', 'M', 'e', ':', 'o', 'p', 'n', 'u', ',', 't', 'B', 'N', 'q', 'f', '-', 'l', 'v', 'i', 'k', 'E', 'A', 'D', 'a', 'b', "'", 'd', 'm', 'r', 'I', ';', 'c', 'P', 'Y', 'X', 'z', 'F', 'R', 'Q', 'V', '!', '.']


char_to_ix: {'h': 0, '\n': 1, 'y': 2, 'U': 3, '?': 4, 'g': 5, 'O': 6, 'K': 7, 'C': 8, 'J': 9, ' ': 10, 'Z': 11, 'G': 12, 'L': 13, 'x': 14, 'T': 15, 's': 16, 'W': 17, 'S': 18, 'j': 19, 'H': 20, 'w': 21, 'M': 22, 'e': 23, ':': 24, 'o': 25, 'p': 26, 'n': 27, 'u': 28, ',': 29, 't': 30, 'B': 31, 'N': 32, 'q': 33, 'f': 34, '-': 35, 'l': 36, 'v': 37, 'i': 38, 'k': 39, 'E': 40, 'A': 41, 'D': 42, 'a': 43, 'b': 44, "'": 45, 'd': 46, 'm': 47, 'r': 48, 'I': 49, ';': 50, 'c': 51, 'P': 52, 'Y': 53, 'X': 54, 'z': 55, 'F': 56, 'R': 57, 'Q': 58, 'V': 59, '!': 60, '.': 61}
ix_to_char: {0: 'h', 1: '\n', 2: 'y', 3: 'U', 4: '?', 5: 'g', 6: 'O', 7: 'K',

In [None]:
# Re-initialise the model: small Gaussian-noise weights and zero biases.
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)    # input to hidden
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)   # hidden to hidden
Why = 0.01 * np.random.randn(vocab_size, hidden_size)    # hidden to output
bh = np.zeros(shape=(hidden_size, 1))   # hidden bias
by = np.zeros(shape=(vocab_size, 1))    # output bias

# Show the shape of every parameter array, one per line.
print(Wxh.shape, Whh.shape, Why.shape, bh.shape, by.shape, sep='\n')

(100, 62)
(100, 100)
(62, 100)
(100, 1)
(62, 1)


In [None]:
# Inspect the raw values of the input-to-hidden weights and the output bias.
print(Wxh)
print(by)

[[-0.01486962 -0.01379529 -0.00316279 ... -0.01856016 -0.00415663
   0.00169588]
 [ 0.00359195 -0.00508117  0.01328661 ...  0.00728764 -0.01589693
  -0.00387215]
 [ 0.00625605 -0.00441045 -0.01801591 ...  0.01078272 -0.01887454
   0.01141155]
 ...
 [ 0.00270149 -0.00598918  0.00093465 ...  0.0032059   0.01989414
   0.00333589]
 [ 0.00603844  0.00536489  0.00624789 ... -0.00797631 -0.00818068
  -0.01642162]
 [-0.01062144 -0.00067746  0.01363845 ... -0.01271731  0.00779205
   0.00141785]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [None]:
# Reset the training state.
n = 0   # iteration counter
p = 0   # data pointer
x_axis = []
y_axis = []

# Adagrad memory: one zero array per parameter, with matching shape.
mWxh = np.zeros_like(Wxh)
mWhh = np.zeros_like(Whh)
mWhy = np.zeros_like(Why)
mbh = np.zeros_like(bh)
mby = np.zeros_like(by)

# Starting value for the smoothed loss (loss at iteration 0).
smooth_loss = -np.log(1.0/vocab_size)*seq_length

print(smooth_loss)

103.17835962612729


In [None]:
# Show the latest chunk loss next to the smoothed loss.
# NOTE(review): `loss` is not assigned in this cell; it must already exist from
# an earlier cell that called lossFun.
print(loss, smooth_loss)

63.965755636563415 103.17835962612729


In [None]:
def lossFun(inputs, targets, hprev):
  """
  Run one forward and backward pass of the RNN over a chunk of characters,
  printing the intermediate values for inspection.

  inputs,targets are both list of integers (character indices); targets[t] is
  the index the model is scored against at step t.
  hprev is Hx1 array of initial hidden state.
  The parameters Wxh, Whh, Why, bh, by and vocab_size are read from module
  scope.

  returns the loss, gradients on model parameters, and last hidden state, as
  the tuple (loss, dWxh, dWhh, dWhy, dbh, dby, h_last). The loss is the
  cross-entropy summed over all steps. The gradients are printed before
  clipping and returned after being clipped element-wise to [-5, 5]. For empty
  inputs the loss is 0, all gradients are zero and h_last is a copy of hprev.
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t, ix in enumerate(inputs):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][ix] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    # Softmax with the largest logit subtracted first. The probabilities are
    # mathematically unchanged, but np.exp can no longer overflow to inf
    # (which would turn the division into inf/inf = nan).
    shifted = ys[t] - np.max(ys[t])
    exps = np.exp(shifted)
    ps[t] = exps / np.sum(exps) # probabilities for next chars
    # Cross-entropy loss -log(ps[t][targets[t], 0]), written in log-sum-exp
    # form so a probability that underflows to 0 does not produce log(0).
    loss += np.log(np.sum(exps)) - shifted[targets[t], 0]

  # dump the forward-pass state
  print("xs ", xs)
  print("hs ",hs)
  print("ys", ys)
  print("ps", ps)
  print("loss: ", loss)

  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  # bh has the same Hx1 shape as a hidden state and, unlike hs[0], exists even
  # when inputs is empty.
  dhnext = np.zeros_like(bh)
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw) # gradient flowing to the previous time step

  # dump the gradients as computed, before clipping
  print("dWxh ", dWxh)
  print("dWhh ", dWhh)
  print("dWhy ", dWhy)
  print("dbh ", dbh)
  print("dby", dby)

  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

# Run lossFun once on the first chunk of the text, starting from a zeroed
# hidden state.
hprev = np.zeros((hidden_size,1))
p = 0

# inputs: indices of seq_length characters starting at p;
# targets: the same window shifted one character to the right.
inputs = list(map(char_to_ix.__getitem__, data[p:p+seq_length]))
targets = list(map(char_to_ix.__getitem__, data[p+1:p+seq_length+1]))

loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)

# Show the encoded chunk, its targets and the resulting hidden state.
print(inputs, targets, hprev, sep='\n')

xs  {0: array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]]), 1: array([[1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],

In [None]:
# Plot y_axis against x_axis. The training loop appends the iteration number
# to x_axis and the smoothed loss to y_axis every 100 iterations.
# NOTE(review): the state-reset cell rebinds x_axis and y_axis to empty lists;
# if that cell ran after training, this plot will be empty — confirm cell order.
plt.plot(x_axis, y_axis)
plt.show()