In [1]:
import theano.gpuarray

Using cuDNN version 7003 on context None
Mapped name None to device cuda0: GeForce GTX 1080 Ti (0000:01:00.0)


In [2]:
%matplotlib inline
import importlib
import utils; importlib.reload(utils)
from utils import *
from __future__ import division, print_function

Using Theano backend.


In [3]:
from keras.layers import TimeDistributed, Activation
from numpy.random import choice

## Setup
Nietzsche dataset: https://s3.amazonaws.com/text-datasets/nietzsche.txt

In [4]:
import os, sys
current_dir = os.getcwd()
LESSON_HOME_DIR = current_dir
DATA_HOME_DIR = current_dir+'/data/rnn/'
path = DATA_HOME_DIR

In [5]:
with open(path+'nietzsche.txt', 'r', encoding="utf-8") as f:
    text = f.read().lower()

In [6]:
print('corpus length:', len(text))

corpus length: 600893


In [7]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print ('total Characters: ', vocab_size)

total Characters:  58


In [8]:
# Put the extra "\0" entry (already counted in vocab_size) at index 0.
# The guard keeps this cell idempotent: re-running it must not insert a second
# "\0", which would shift every character index built from `chars`.
if not chars or chars[0] != "\0":
    chars.insert(0, "\0")

In [9]:
#Characters used in the text
' '.join(chars[1:-6])

'\n   ! " \' ( ) , - . 0 1 2 3 4 5 6 7 8 9 : ; = ? [ ] _ a b c d e f g h i j k l m n o p q r s t u v w x'

In [10]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

`idx` is the text converted into a list of numbers (character indices) using the mapping above.

In [11]:
idx = [char_indices[c] for c in text]

<img src="output1.png">

## 3 Character model - predicting the 4th
RNN unrolled form

In [12]:
len(idx)

600893

In [13]:
#3 character model
cs = 3

In [14]:
# Walk through the text in steps of cs and pick the characters at offsets
# 0, 1 and 2 of each step: these are the three model inputs.
c1_dat = [idx[i] for i in range(0, len(idx)-1-cs, cs)]
c2_dat = [idx[i] for i in range(1, len(idx)-1-cs, cs)]
c3_dat = [idx[i] for i in range(2, len(idx)-1-cs, cs)]
#The character we want to predict (offset 3, i.e. the one right after the three inputs)
c4_dat = [idx[i] for i in range(3, len(idx)-1-cs, cs)]



In [15]:
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-1])
x3 = np.stack(c3_dat[:-1])

In [16]:
y = np.stack(c4_dat[:-1])

In [17]:
x1.shape,x2.shape, x3.shape, y.shape

((200295,), (200295,), (200295,), (200295,))

### Make embeddings

In [18]:
factors = 42

In [19]:
def embedding_input(name, n_in, n_out):
    """Create a one-element index input and its flattened embedding.

    Args:
        name: Name given to the Keras Input layer.
        n_in: Size of the embedding vocabulary (number of distinct indices).
        n_out: Number of latent factors learned per index.

    Returns:
        A tuple (inp, emb): the Input tensor and the output of the Embedding
        layer after Flatten.
    """
    # The input holds a character index used for an embedding lookup, so it is
    # declared as an integer tensor rather than float64.
    inp = Input(shape=(1,), dtype='int64', name=name)
    emb = Embedding(n_in, n_out, input_length=1)(inp)
    return inp, Flatten()(emb)

In [20]:
c1_in, c1 = embedding_input('c1', vocab_size, factors)
c2_in, c2 = embedding_input('c2', vocab_size, factors)
c3_in, c3 = embedding_input('c3', vocab_size, factors)

In [21]:
#Activations in layer operation from input to hidden
n_hidden = 256

In [22]:
dense_in = Dense(n_hidden, activation='relu')

First hidden activation

In [23]:
c1_hidden = dense_in(c1)

In [24]:
dense_hidden = Dense(n_hidden, activation='tanh')

In [25]:
c2_dense=dense_in(c2)
hidden_2 = dense_hidden(c1_hidden)
#Merging which is summing by default adds the orange and
#green arrow operations
c2_hidden = merge([c2_dense, hidden_2])

  """
  name=name)


In [26]:
c3_dense = dense_in(c3)
hidden_3 = dense_hidden(c2_hidden)
c3_hidden = merge([c3_dense, hidden_3])

  This is separate from the ipykernel package so we can avoid doing imports until
  name=name)


The output layer should have one unit per character in the vocabulary, i.e. the size of a one-hot encoded character.

In [27]:
dense_out = Dense(vocab_size, activation='softmax')

In [28]:
c4_out = dense_out(c3_hidden)

In [29]:
model = Model([c1_in, c2_in, c3_in], c4_out)

In [30]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(0.001))

In [31]:
model.fit([x1, x2, x3], y, batch_size=64, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f721f545d68>

## Model test

In [32]:
def get_next(inp):
    """Return the character the current global `model` predicts after `inp`.

    `inp` is a string; each of its characters is looked up in `char_indices`
    and passed to the model as its own single-element array.
    """
    model_inputs = []
    for ch in inp:
        # One array of shape (1,) per character.
        model_inputs.append(np.array(char_indices[ch])[np.newaxis])
    probs = model.predict(model_inputs)
    # Index of the highest probability -> corresponding character.
    return chars[np.argmax(probs)]

In [33]:
get_next(' th')

'e'

<img src="output2.png">

## RNN 
With the 8 preceding characters, predicting the 9th

In [34]:
cs=8

In [35]:
c_in_dat = [[idx[i+n] for i in range(0, len(idx)-1-cs, cs)] for n in range(cs)]

In [36]:
c_out_dat = [idx[i+cs] for i in range(0, len(idx)-1-cs, cs)]

In [37]:
#Making numpy arrays
xs = [np.stack(c[:-2]) for c in c_in_dat]

In [38]:
y = np.stack(c_out_dat[:-2])

In [39]:
n_fac = 42

In [40]:
c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)]

In [41]:
n_hidden = 256

In [42]:
def create_arch(n_hidden):
    """Build the three Dense layers of the hand-rolled RNN.

    Args:
        n_hidden: Number of units in the input and hidden layers.

    Returns:
        A tuple (input layer, hidden layer, output layer). The output layer
        has `vocab_size` units with a softmax activation.
    """
    input_layer = Dense(n_hidden, activation='relu')
    # Hidden-to-hidden weights are initialised with the identity matrix.
    hidden_layer = Dense(n_hidden, activation='relu', kernel_initializer='identity')
    output_layer = Dense(vocab_size, activation='softmax')
    return input_layer, hidden_layer, output_layer

In [43]:
dense_in, dense_hidden, dense_output = create_arch(n_hidden)

In [44]:
hidden = dense_in(c_ins[0][1])

In [45]:
for i in range(1, cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = merge([c_dense, hidden])

  after removing the cwd from sys.path.
  name=name)


In [46]:
# Use the output layer returned by create_arch for this model. `dense_out` is
# the output layer of the earlier 3-character model; reusing it here would tie
# this model to that already-trained layer and leave `dense_output` unused.
c_out = dense_output(hidden)

In [47]:
model = Model([c[0] for c in c_ins], c_out)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [48]:
model.fit(xs, y, batch_size=64, epochs=12)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f7214924710>

In [49]:
get_next('. i am a')

'n'

<img src="output3.png">

## Predicting chars 2 to n using chars 1 to n-1
More backpropagation means more feedback. Limited context through sequence of 8.

In [50]:
#Creating a label matrix that includes every element after the first
#because the first element will predict 2nd and so on
c_out_dat = [[idx[i+n] for i in range(1, len(idx)-cs, cs)]
            for n in range(cs)]

In [51]:
ys = [np.stack(c[:-2]) for c in c_out_dat]

Since the first character now goes into the loop as well, the initial hidden state must be initialised somehow. This is done through an input of zeros.

In [52]:
inp1 = Input(shape=(n_fac,), name='zeros')
hidden = dense_in(inp1)

In [53]:
# Collects one output tensor per time step.
outs = []

for i in range(cs):
    # Project the i-th character embedding with the shared input layer.
    c_dense = dense_in(c_ins[i][1])
    # Push the previous hidden state through the shared hidden layer.
    hidden = dense_hidden(hidden)
    # Sum the two to get the new hidden state.
    hidden = merge([c_dense, hidden], mode='sum')
    #Every layer has an output now
    # NOTE(review): `dense_out` is the output layer defined for the 3-character
    # model, not the `dense_output` returned by create_arch - confirm which one
    # is intended here.
    outs.append(dense_out(hidden))

  
  name=name)


In [54]:
model = Model([inp1] + [c[0] for c in c_ins], outs)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [55]:
# One all-zero row of n_fac values per training sequence, used as the data
# for the 'zeros' input.
zeros = np.zeros((len(xs[0]), n_fac))
zeros.shape

(75109, 42)

In [56]:
model.fit([zeros]+xs, ys, batch_size=64, epochs=12)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f72116a67b8>

## Model Test

In [57]:
def get_nexts(inp):
    """Print the characters of `inp` and return the model's prediction after each.

    The current global `model` is fed a row of zeros (for its 'zeros' input)
    followed by one single-element index array per character of `inp`.
    Returns a list with one predicted character per model output.
    """
    zero_state = np.zeros(n_fac)[np.newaxis,:]
    char_inputs = [np.array(char_indices[ch])[np.newaxis] for ch in inp]
    per_step_probs = model.predict([zero_state] + char_inputs)
    print (list(inp))
    predicted = []
    for probs in per_step_probs:
        predicted.append(chars[np.argmax(probs)])
    return predicted

In [58]:
get_nexts(' part of')

[' ', 'p', 'a', 'r', 't', ' ', 'o', 'f']


['t', 'o', 'r', 'e', 'i', 'o', 'f', ' ']

## Sequential model in Keras

In [59]:
n_hidden, n_fac, cs, vocab_size

(256, 42, 8, 58)

In [60]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs),
        #return_sequences=True makes the RNN emit an output at every time step,
        #as in the hand-built model above.
        SimpleRNN(n_hidden, return_sequences=True, activation='relu', recurrent_initializer='identity'),
        #Since the RNN above produces 8 outputs (one per time step), the dense layer has to
        #accommodate this as well - this is what the TimeDistributed wrapper below is for.
        #In effect there are 8 dense layers sharing the same weight matrix.
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])

In [61]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [62]:
x_rnn=np.stack(np.squeeze(xs), axis=1)
y_rnn=np.atleast_3d(np.stack(ys, axis=1))
x_rnn.shape, y_rnn.shape

((75109, 8), (75109, 8, 1))

In [63]:
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument and matches
# every other fit call in this notebook.
model.fit(x_rnn, y_rnn, batch_size=128, epochs=8)



Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f7209dc4fd0>

In [64]:
def get_next_keras(inp):
    """Return the model's predicted next character for every position of `inp`.

    All character indices are passed to the current global `model` as a single
    2-D array with one row, and the predictions of that row are decoded.
    """
    # Shape (1, len(inp)): a batch containing one sequence of indices.
    batch = np.array([char_indices[ch] for ch in inp])[np.newaxis,:]
    step_probs = model.predict(batch)[0]
    predicted = []
    for probs in step_probs:
        predicted.append(chars[np.argmax(probs)])
    return predicted

In [65]:
get_next_keras(' this is')

['t', 'h', 'e', 'n', ' ', 'i', 's', ' ']

## Stateful models

Goal is a model with long term dependency. 
1. Stop shuffling the data when fitting (pass `shuffle=False`) so that the model isn't prevented from learning long term dependencies.
2. Stop passing in an array of zeros as the initial state for subsequent loop cycles. This will allow the model to build up arbitrarily long dependencies.

In [66]:
bs = 64

In [67]:
model = Sequential([
    #The full batch shape (batch size and sequence length) is given up front.
    #The sequence length uses cs rather than a literal 8 so it cannot drift
    #out of sync with input_length.
    Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(bs, cs)),
    #To normalise inputs (exploding gradients)
    BatchNormalization(),
    #Setting stateful to true makes the model leave the hidden activations as
    #they are after every sequence of cs (8 in this case) characters.
    LSTM(n_hidden, return_sequences=True, stateful=True),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])

In [68]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [69]:
mx = len(x_rnn)//bs*bs

In [71]:
print (mx)

75072


In [72]:
#Shuffle is set to false to learn long term dependency
#NOTE(review): with stateful=True, Keras presumably carries the state of sample i
#in one batch over to sample i of the next batch; x_rnn stores consecutive
#sequences in consecutive rows, so verify that the batches line up that way.
#Also confirm whether model.reset_states() should be called between epochs.
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, epochs=4, shuffle=False)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f71ffca0c88>

## One-hot sequence model - Keras

In [81]:
model = Sequential([
    SimpleRNN(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
             activation='relu', recurrent_initializer='identity'),
    TimeDistributed(Dense(vocab_size, activation='softmax'))
])
#Categorical cross entropy rather than sparse categorical.. is because the inputs are now 
#one hot encoded. 
model.compile(loss='categorical_crossentropy', optimizer=Adam())

In [95]:
#One hot encode labels and data
oh_ys = [to_categorical(o, vocab_size) for o in ys]
oh_y_rnn=np.stack(oh_ys, axis=1)

oh_xs = [to_categorical(o, vocab_size) for o in xs]
oh_x_rnn=np.stack(oh_xs, axis=1)

In [96]:
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, epochs=8)



Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f71eda2dc50>

In [98]:
def get_nexts_oh(inp):
    """Print the characters of `inp` and return the one-hot model's predictions.

    The characters are converted to indices, one-hot encoded with
    `to_categorical`, and passed to the current global `model` as a batch of
    one sequence. Returns one predicted character per position.
    """
    char_ids = np.array([char_indices[ch] for ch in inp])
    # Add a leading batch axis of size 1, then keep that sample's predictions.
    one_hot_batch = to_categorical(char_ids, vocab_size)[np.newaxis,:]
    step_probs = model.predict(one_hot_batch)[0]
    print(list(inp))
    return [chars[np.argmax(probs)] for probs in step_probs]

In [99]:
get_nexts_oh(' this is')

[' ', 't', 'h', 'i', 's', ' ', 'i', 's']


['t', 'h', 'e', 's', ' ', 's', 's', ' ']

______________________________________
## Theano RNN
Theano builds a computation graph before running the code

In [101]:
n_input = vocab_size
n_output = vocab_size

In [107]:
#Assigning variables
t_inp = T.matrix('inp')
t_outp = T.matrix('outp')
t_h0 = T.vector('h0')
lr = T.scalar('lr')

all_args = [t_h0, t_inp, t_outp, lr]

In [108]:
def init_wgts(rows, cols):
    """Return a shared float32 weight matrix of shape (rows, cols).

    Values are drawn from `normal` with scale sqrt(2/rows), i.e. scaled by the
    number of rows only (He-style fan-in scaling; Glorot initialisation would
    use both rows and cols).
    """
    std = math.sqrt(2/rows)
    samples = normal(scale=std, size=(rows, cols))
    # `shared` (presumably theano.shared, via `from utils import *`) wraps the
    # array so Theano owns and tracks it, e.g. on the GPU.
    return shared(samples.astype(np.float32))
def init_bias(rows):
    """Return a shared float32 bias vector of `rows` zeros."""
    return shared(np.zeros(rows, dtype=np.float32))

In [111]:
def wgts_and_bias(n_in, n_out):
    """Return a (weights, bias) pair for a layer mapping n_in to n_out units."""
    weights = init_wgts(n_in, n_out)
    bias = init_bias(n_out)
    return weights, bias
def id_and_bias(n):
    """Return a (weights, bias) pair whose weights are the n x n identity matrix.

    Used for the hidden-to-hidden weights, which start out as the identity.
    """
    identity = np.eye(n, dtype=np.float32)
    return shared(identity), init_bias(n)

In [112]:
#Weights and bias of the hidden-to-hidden transform (identity-initialised)
W_h = id_and_bias(n_hidden)
#Weights and bias of the input-to-hidden transform
W_x = wgts_and_bias(n_input, n_hidden)
#Weights and bias of the hidden-to-output transform
W_y = wgts_and_bias(n_hidden, n_output)

#Flatten the three (weights, bias) pairs into one list:
#[W_h weights, W_h bias, W_x weights, W_x bias, W_y weights, W_y bias]
w_all = list(chain.from_iterable([W_h, W_x, W_y]))

In [114]:
#Called on each step through 
def step(x, h, W_h, b_h, W_x, b_x, W_y, b_y):
    """Compute one RNN step: new hidden state and output prediction.

    Args:
        x: Input for the current step.
        h: Hidden state from the previous step.
        W_h, b_h: Hidden-to-hidden weights and bias.
        W_x, b_x: Input-to-hidden weights and bias.
        W_y, b_y: Hidden-to-output weights and bias.

    Returns:
        A tuple (h, y): the new hidden state and the softmax output passed
        through T.flatten(y, 1).
    """
    #Combine the weighted input and the weighted previous hidden state
    pre_activation = T.dot(x, W_x) + b_x + T.dot(h, W_h) + b_h
    h = nnet.relu(pre_activation)
    #Prediction computed from the new hidden state
    logits = T.dot(h, W_y) + b_y
    y = nnet.softmax(logits)
    return h, T.flatten(y, 1)
    

`theano.scan` is how loops are written in Theano; for certain kinds of loops it can also parallelise the computation.

In [127]:
#Run `step` over t_inp. t_h0 is the initial value of the first output (the
#hidden state); the second output (the prediction) has no initial value, hence
#None. The weights in w_all are passed to every call as non_sequences.
#v_h and v_y will carry the returned values of step, i.e. hidden state and output
[v_h, v_y], _ = theano.scan(step, sequences=t_inp, 
                           outputs_info=[t_h0, None], non_sequences=w_all)

  


In [134]:
#Defining gradient descent in Theano. `error` holds the loss (categorical
#cross-entropy summed over the sequence) while `g_all` holds the derivatives of
#that loss with respect to every entry of w_all.
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)

In [135]:
def upd_dict(wgts, grads, lr):
    """Map every weight to its value after one gradient-descent step.

    Args:
        wgts: Sequence of weights (used as the dictionary keys).
        grads: Sequence of gradients, aligned with `wgts`.
        lr: Learning rate.

    Returns:
        An OrderedDict {w: w - g*lr} in the same order as `wgts`.
    """
    # Build from ordered (key, value) pairs rather than from an intermediate
    # dict, so the ordering never depends on the dict implementation.
    return OrderedDict((w, w - g*lr) for (w, g) in zip(wgts, grads))

In [136]:
#Each step involves the update represented below
upd = upd_dict(w_all, g_all, lr)
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)

In [137]:
X = oh_x_rnn
Y = oh_y_rnn

In [139]:
len(X)

75109

In [140]:
l_rate = 0.01
err = 0

#Gradient descent loop: one update per training sequence, each starting from
#an all-zero hidden state
n_seqs = len(X)
for i in range(n_seqs):
    h0 = np.zeros(n_hidden)
    err += fn(h0, X[i], Y[i], l_rate)
    #After every 1000 sequences, print the mean error of that chunk and reset it
    if (i + 1) % 1000 == 0:
        print ("error:{:.3f}".format(err/1000))
        err = 0.0

error:23.146
error:20.265
error:19.709
error:18.805
error:17.770
error:18.196
error:18.184
error:17.594
error:17.064
error:17.192
error:16.717
error:16.821
error:17.243
error:16.549
error:16.045
error:16.777
error:16.621
error:16.312
error:16.123
error:15.996
error:15.938
error:15.635
error:15.808
error:15.529
error:16.076
error:15.778
error:15.155
error:15.708
error:15.638
error:15.700
error:15.912
error:15.731
error:15.866
error:15.728
error:15.328
error:15.907
error:15.402
error:15.603
error:15.297
error:15.491
error:14.744
error:15.162
error:15.130
error:15.269
error:15.249
error:15.214
error:15.119
error:15.441
error:15.614
error:15.671
error:14.860
error:15.055
error:14.746
error:14.579
error:15.172
error:14.928
error:14.360
error:15.044
error:14.676
error:14.575
error:14.580
error:15.000
error:14.928
error:14.632
error:14.337
error:14.341
error:13.883
error:14.396
error:14.868
error:14.480
error:14.737
error:14.408
error:14.189
error:14.229
error:14.236


In [142]:
#New function that takes input with hiddenstate to produce an output rather than the loss to make predictions
f_y = theano.function([t_h0, t_inp], v_y, allow_input_downcast=True)

In [143]:
pred = np.argmax(f_y(np.zeros(n_hidden), X[6]), axis=1)

In [144]:
act = np.argmax(X[6], axis=1)

In [145]:
[indices_char[o] for o in act]

['t', 'h', 'e', 'n', '?', ' ', 'i', 's']

In [146]:
[indices_char[o] for o in pred]

['h', 'e', ' ', ' ', ' ', 't', 'n', ' ']