In [1]:
import numpy as np
data = open('kafka.txt','r').read()
chars = list(set(data))

data_size, vocab_size = len(data), len(chars)
print('Data has %d, %d unique'%(data_size,vocab_size))

Data has 137628, 80 unique


In [2]:
char_to_ix = {ch:i for i,ch in enumerate(chars)}
ix_to_char = {i:ch for i,ch in enumerate(chars)}
print(char_to_ix, ix_to_char)

{'*': 0, 'e': 1, '"': 60, '0': 3, 'i': 4, 'j': 5, '/': 16, 'Q': 7, 'o': 8, 'F': 10, 'U': 11, 'h': 12, 'p': 2, 'b': 13, 'T': 14, '?': 15, 'G': 17, '1': 18, '!': 71, '2': 19, 'q': 20, '4': 25, 'g': 23, 'w': 24, 'W': 26, '$': 6, 'y': 27, '6': 28, 's': 29, '.': 31, ')': 45, 'M': 32, 'm': 33, '@': 37, ';': 35, 'N': 73, 'v': 41, 'l': 21, 'V': 39, 'C': 78, 'f': 30, 'k': 34, 'z': 61, 'I': 40, 'K': 9, '3': 42, 'ç': 76, "'": 43, ':': 44, 'X': 69, 'B': 46, 'A': 47, 'J': 48, '7': 49, 'a': 50, '-': 62, 'u': 54, 'r': 55, 'd': 56, 'Y': 57, 'c': 58, 'n': 59, '9': 22, 'E': 38, 'P': 63, 'O': 64, '5': 65, ' ': 66, 'x': 67, '\n': 51, 'H': 70, 't': 72, '%': 53, ',': 74, 'D': 75, 'R': 68, '8': 77, 'L': 36, '(': 52, 'S': 79} {0: '*', 1: 'e', 2: 'p', 3: '0', 4: 'i', 5: 'j', 6: '$', 7: 'Q', 8: 'o', 9: 'K', 10: 'F', 11: 'U', 12: 'h', 13: 'b', 14: 'T', 15: '?', 16: '/', 17: 'G', 18: '1', 19: '2', 20: 'q', 21: 'l', 22: '9', 23: 'g', 24: 'w', 25: '4', 26: 'W', 27: 'y', 28: '6', 29: 's', 30: 'f', 31: '.', 32: 'M', 

In [3]:
vector_for_char_a = np.zeros((vocab_size,1))
vector_for_char_a[char_to_ix['a']] = 1

In [4]:
hidden_size = 100
seq_length = 25
learning_rate = 1e-1


#model parameters
Wxh = np.random.randn(vocab_size,vocab_size)*0.01 #input to hidden state
Whh = np.random.randn(hidden_size,hidden_size)*0.01 #input to hidden state
Why = np.random.randn(vocab_size,hidden_size)*0.01 #input to hidden state
bh = np.zeros((hidden_size,1)) 
by = np.zeros((vocab_size,1))

In [5]:
print(vector_for_char_a.ravel())

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.]


In [13]:
def lossFun(inputs, targets, hprev):
  """                                                                                                                                                                                         
  inputs,targets are both list of integers.                                                                                                                                                   
  hprev is Hx1 array of initial hidden state                                                                                                                                                  
  returns the loss, gradients on model parameters, and last hidden state                                                                                                                      
  """
  #store our inputs, hidden states, outputs, and probability values
  xs, hs, ys, ps, = {}, {}, {}, {} #Empty dicts
    # Each of these are going to be SEQ_LENGTH(Here 25) long dicts i.e. 1 vector per time(seq) step
    # xs will store 1 hot encoded input characters for each of 25 time steps (26, 25 times)
    # hs will store hidden state outputs for 25 time steps (100, 25 times)) plus a -1 indexed initial state
    # to calculate the hidden state at t = 0
    # ys will store targets i.e. expected outputs for 25 times (26, 25 times), unnormalized probabs
    # ps will take the ys and convert them to normalized probab for chars
    # We could have used lists BUT we need an entry with -1 to calc the 0th hidden layer
    # -1 as  a list index would wrap around to the final element
  xs, hs, ys, ps = {}, {}, {}, {}
  #init with previous hidden state
    # Using "=" would create a reference, this creates a whole separate copy
    # We don't want hs[-1] to automatically change if hprev is changed
  hs[-1] = np.copy(hprev)
  #init loss as 0
  loss = 0
  # forward pass                                                                                                                                                                              
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation (we place a 0 vector as the t-th input)                                                                                                                     
    xs[t][inputs[t]] = 1 # Inside that t-th input we use the integer in "inputs" list to  set the correct
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state                                                                                                            
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars                                                                                                           
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars                                                                                                              
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)                                                                                                                       
  # backward pass: compute gradients going backwards    
  #initalize vectors for gradient values for each set of weights 
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(xrange(len(inputs))):
    #output probabilities
    dy = np.copy(ps[t])
    #derive our first gradient
    dy[targets[t]] -= 1 # backprop into y  
    #compute output gradient -  output times hidden states transpose
    #When we apply the transpose weight matrix,  
    #we can think intuitively of this as moving the error backward
    #through the network, giving us some sort of measure of the error 
    #at the output of the lth layer. 
    #output gradient
    dWhy += np.dot(dy, hs[t].T)
    #derivative of output bias
    dby += dy
    #backpropagate!
    dh = np.dot(Why.T, dy) + dhnext # backprop into h                                                                                                                                         
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity                                                                                                                     
    dbh += dhraw #derivative of hidden bias
    dWxh += np.dot(dhraw, xs[t].T) #derivative of input to hidden layer weight
    dWhh += np.dot(dhraw, hs[t-1].T) #derivative of hidden layer to hidden layer weight
    dhnext = np.dot(Whh.T, dhraw) 
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients                                                                                                                 
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]


In [15]:


#prediction, one full forward pass
def sample(h, seed_ix, n):
  """                                                                                                                                                                                         
  sample a sequence of integers from the model                                                                                                                                                
  h is memory state, seed_ix is seed letter for first time step   
  n is how many characters to predict
  """
  #create vector
  x = np.zeros((vocab_size, 1))
  #customize it for our seed char
  x[seed_ix] = 1
  #list to store generated chars
  ixes = []
  #for as many characters as we want to generate
  for t in range(n):
    #a hidden state at a given time step is a function 
    #of the input at the same time step modified by a weight matrix 
    #added to the hidden state of the previous time step 
    #multiplied by its own hidden state to hidden state matrix.
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    #compute output (unnormalised)
    y = np.dot(Why, h) + by
    ## probabilities for next chars
    p = np.exp(y) / np.sum(np.exp(y))
    #pick one with the highest probability 
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    #create a vector
    x = np.zeros((vocab_size, 1))
    #customize it for the predicted char
    x[ix] = 1
    #add it to the list
    ixes.append(ix)

  txt = ''.join(ix_to_char[ix] for ix in ixes)
  print ('----\n %s \n----' % (txt, ))
hprev = np.zeros((hidden_size,1)) # reset RNN memory  
#predict the 200 next characters given 'a'
sample(hprev,char_to_ix['a'],200)



ValueError: operands could not be broadcast together with shapes (80,1) (100,1) 