In [1]:
%matplotlib inline

from theano.sandbox import cuda
import utils; reload(utils)
from utils import *
from __future__ import division, print_function

Using Theano backend.


We'll use the works of Nietzsche as the training data for our predictive text model.

In [2]:
# Download (or reuse the cached copy of) the corpus and read it fully into memory.
path = get_file('nietzche.txt', origin = "https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# Use a context manager so the file handle is closed as soon as the text is read.
with open(path) as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 600901


# Setup

In [3]:
# Distinct characters of the corpus, in sorted order.
chars = sorted(set(text))
# One extra slot is counted for the "\0" entry that the next cell inserts into `chars`.
vocab_size = 1 + len(chars)
print('total chars:', vocab_size)

total chars: 86


In [4]:
# Put the "\0" character at index 0 of the vocabulary.
# Guarded so that re-running this cell does not insert it a second time: a duplicate
# would shift every character index and make len(chars) larger than vocab_size.
if "\0" not in chars[:1]:
    chars.insert(0, "\0")

This is the complete list of distinct characters found in the downloaded text, preceded by the "\0" character we just inserted.

In [5]:
# Show the whole vocabulary as a single string.
''.join(chars)

'\x00\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz\x86\xa4\xa6\xa9\xab\xc3'

In [6]:
# Forward map: character -> position in `chars`.
char_indices = {ch: pos for pos, ch in enumerate(chars)}
# Reverse map: position in `chars` -> character.
indices_char = dict(enumerate(chars))

In [7]:
# Sanity check: the integer index assigned to '!'.
char_indices['!']

3

In [8]:
# Sanity check of the reverse map: the character stored at index 1.
indices_char[1]

'\n'

In [9]:
# Encode the whole corpus as a list of integer character indices.
idx = []
for c in text:
    idx.append(char_indices[c])

In [10]:
# Peek at the first ten encoded characters.
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

# Training a 3 character model

This model will take 3 separate characters of input and output a 4th character.

In [11]:
cs = 3
# Walk the corpus in non-overlapping windows of `cs` characters. For every window start i,
# list n holds idx[i + n]: offsets 0..cs-1 are the inputs and offset cs is the target.
c1_dat, c2_dat, c3_dat, c4_dat = [
    [idx[i + n] for i in xrange(0, len(idx) - 1 - cs, cs)]
    for n in range(cs + 1)
]

In [12]:
# Turn each input column into a numpy array, dropping the last two entries of each.
x1, x2, x3 = [np.stack(c[:-2]) for c in (c1_dat, c2_dat, c3_dat)]

In [13]:
# Targets: the 4th character of every window, trimmed the same way as the inputs.
y = np.stack(c4_dat[:-2])

This is where we create the embedding inputs and outputs for the first three input characters.

In [14]:
# Size of each character embedding (passed as `n_out` to embedding_input below).
n_fac = 42
def embedding_input(name, n_in, n_out):
    """Build a one-integer input and its flattened embedding.

    name  -- name given to the Input layer
    n_in  -- first argument of Embedding (number of distinct input indices)
    n_out -- second argument of Embedding (embedding size)

    Returns a pair: the Input tensor and the flattened Embedding output.
    """
    inp = Input(name = name, dtype = 'int64', shape = (1,))
    embedded = Embedding(n_in, n_out, input_length = 1)(inp)
    flat = Flatten()(embedded)
    return inp, flat

In [15]:
# One (input, flattened embedding) pair per input character, created in order c1, c2, c3.
(c1_in, c1), (c2_in, c2), (c3_in, c3) = [
    embedding_input(name, vocab_size, n_fac) for name in ('c1', 'c2', 'c3')
]

In [16]:
# Width of the hidden Dense layers defined in the following cells.
n_hidden = 256

This is the architecture of our model.

In [17]:
# Input-to-hidden layer; the same layer object is applied to every character embedding below.
dense_in = Dense(n_hidden, activation = 'relu')

In [18]:
# Hidden activations computed from the first character alone.
c1_hidden = dense_in(c1)

In [19]:
# Hidden-to-hidden layer; the same layer object is applied at each later step.
dense_hidden = Dense(n_hidden, activation = 'relu')

In [20]:
# Step 2: project the second character, transform the previous hidden state,
# then combine the two.
c2_dense = dense_in(c2)
hidden_2 = dense_hidden(c1_hidden)
# The merge mode is spelled out (it is Keras 1's default) so this cell matches the
# explicit `mode = 'sum'` used in the "Returning Sequences" section.
c2_hidden = merge([c2_dense, hidden_2], mode = 'sum')

In [21]:
# Step 3: same pattern as step 2, using the third character and the step-2 hidden state.
c3_dense = dense_in(c3)
hidden_3 = dense_hidden(c2_hidden)
# The merge mode is spelled out (it is Keras 1's default) so this cell matches the
# explicit `mode = 'sum'` used in the "Returning Sequences" section.
c3_hidden = merge([c3_dense, hidden_3], mode = 'sum')

In [22]:
# Output layer: a softmax over the `vocab_size` possible characters.
dense_out = Dense(vocab_size, activation = 'softmax')

In [23]:
# Prediction for the 4th character, computed from the final hidden state.
c4_out = dense_out(c3_hidden)

In [24]:
# Three character inputs in, one softmax prediction out.
model = Model([c1_in, c2_in, c3_in], c4_out)

In [25]:
# Targets are integer class indices (not one-hot), hence the sparse loss.
model.compile(
    optimizer = Adam(lr = .001),
    loss = "sparse_categorical_crossentropy",
    metrics = ['accuracy'],
)

In [26]:
# First training run: 10 epochs with the optimizer settings from compile().
model.fit([x1, x2, x3], y, batch_size = 64, nb_epoch = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3d38e69b10>

In [27]:
model.optimizer.lr = .01

In [28]:
model.fit([x1, x2, x3], y, batch_size = 64, nb_epoch = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f3d395f8d50>

In [29]:
model.optimizer.lr = .0001

In [30]:
# Final training run of the 3-character model: 3 more epochs.
model.fit([x1, x2, x3], y, batch_size = 64, nb_epoch = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f3d2e41ec90>

# Testing our 3 character model

In [51]:
def get_next(inp):
    """Return the character the current global `model` predicts after `inp`.

    Each character of `inp` is mapped through `char_indices` and passed to
    `model.predict` as its own length-1 array; the result is the entry of
    `chars` at the argmax of the prediction.
    """
    model_inputs = []
    for ch in inp:
        model_inputs.append(np.array(char_indices[ch])[np.newaxis])
    probs = model.predict(model_inputs)
    return chars[np.argmax(probs)]

In [52]:
# Predict the character that follows "phi".
get_next('phi')

'l'

In [54]:
# Predict the character that follows " th".
get_next(' th')

'e'

In [55]:
# Predict the character that follows " an".
get_next(' an')

'd'

# Recurrent Neural Network Model

This time the model reads 8 characters of input and predicts the character that follows them.

In [31]:
# Window length: number of input characters per training example.
cs = 8

In [32]:
# c_in_dat[n] holds the character at offset n of every non-overlapping cs-long window.
c_in_dat = []
for n in range(cs):
    c_in_dat.append([idx[i + n] for i in xrange(0, len(idx) - 1 - cs, cs)])

In [33]:
# Target for each window: the character immediately after its cs inputs.
c_out_dat = []
for i in xrange(0, len(idx) - 1 - cs, cs):
    c_out_dat.append(idx[i + cs])

In [34]:
# One numpy array per input position, each with its last two entries dropped.
xs = []
for c in c_in_dat:
    xs.append(np.stack(c[:-2]))

In [35]:
# Targets as a numpy array, trimmed the same way as the inputs in `xs`.
y = np.stack(c_out_dat[:-2])

In [36]:
# Embedding size per character (same value as in the 3-character model).
n_fac = 42

In [37]:
def embedding_input(name, n_in, n_out):
    """Build a named one-integer input and its flattened embedding.

    This redefinition replaces the earlier `embedding_input`; it names the
    Input layer `<name>_in` and the Embedding layer `<name>_emb`.

    name  -- prefix for the two layer names
    n_in  -- first argument of Embedding (number of distinct input indices)
    n_out -- second argument of Embedding (embedding size)

    Returns a pair: the Input tensor and the flattened Embedding output.
    """
    inp = Input(name = name + '_in', dtype = 'int64', shape = (1,))
    embedded = Embedding(n_in, n_out, name = name + '_emb', input_length = 1)(inp)
    flat = Flatten()(embedded)
    return inp, flat

In [38]:
# One (input, flattened embedding) pair per position in the window, named c0 .. c<cs-1>.
c_ins = []
for n in range(cs):
    c_ins.append(embedding_input('c%d' % n, vocab_size, n_fac))

In [39]:
# Width of the hidden Dense layers (same value as in the 3-character model).
n_hidden = 256

In [40]:
# Fresh layers for the 8-character model; each one is reused at every step below.
dense_in = Dense(n_hidden, activation = 'relu')                         # embedding -> hidden
dense_hidden = Dense(n_hidden, activation = 'relu', init = 'identity')  # hidden -> hidden, identity init
dense_out = Dense(vocab_size, activation = 'softmax')                   # hidden -> character probabilities

In [41]:
# Initial hidden state: computed from the first character's embedding only.
hidden = dense_in(c_ins[0][1])

In [42]:
# Unrolled recurrence over characters 1 .. cs-1: transform the running hidden state and
# combine it with the projection of the next character. Starts from the `hidden`
# assigned in the previous cell and rebinds it on every step.
for i in range(1,cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    # The merge mode is spelled out (it is Keras 1's default) so this loop matches the
    # explicit `mode = 'sum'` used in the "Returning Sequences" section.
    hidden = merge([c_dense, hidden], mode = 'sum')

In [43]:
# Prediction for the next character, computed from the final hidden state.
c_out = dense_out(hidden)

In [44]:
# The model's inputs are the Input tensors (first element) of each pair in c_ins.
model = Model([c[0] for c in c_ins], c_out)
model.compile(
    optimizer = Adam(lr = .001),
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy'],
)

In [45]:
# Train the 8-character model for 10 epochs.
model.fit(xs, y, batch_size = 64, nb_epoch = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3d2afe6f90>

# Returning Sequences

In [46]:
# Per-step targets: the same windows as the inputs, shifted one character to the right
# (window starts begin at 1 instead of 0).
c_out_dat = []
for n in range(cs):
    c_out_dat.append([idx[i + n] for i in xrange(1, len(idx) - cs, cs)])

In [47]:
# One target array per output step, each with its last two entries dropped.
ys = []
for c in c_out_dat:
    ys.append(np.stack(c[:-2]))

In [48]:
# Fresh layers for the sequence-output model; each one is reused at every step below.
dense_in = Dense(n_hidden, activation = 'relu')                         # input -> hidden
dense_hidden = Dense(n_hidden, activation = 'relu', init = 'identity')  # hidden -> hidden, identity init
dense_out = Dense(vocab_size, activation = 'softmax', name = 'output')  # hidden -> character probabilities

In [49]:
# Extra n_fac-wide input named 'zeros'; it is passed through dense_in to
# produce the initial hidden state.
inp1 = Input(shape = (n_fac,), name = 'zeros')
hidden = dense_in(inp1)

In [50]:
# Collect one softmax output per step instead of only the final one.
# NOTE: the loop starts from the `hidden` assigned in the previous cell and rebinds
# it on every step.
outs = []

for i in range(cs):
    c_dense = dense_in(c_ins[i][1])                  # project the i-th character embedding
    hidden = dense_hidden(hidden)                    # transform the running hidden state
    hidden = merge([c_dense, hidden], mode = 'sum')  # combine the two by summation
    
    outs.append(dense_out(hidden))                   # prediction made at this step

In [51]:
# Inputs: the 'zeros' tensor followed by the character inputs; outputs: one per step.
model = Model([inp1] + [c[0] for c in c_ins], outs)
model.compile(
    optimizer = Adam(),
    loss = 'sparse_categorical_crossentropy',
)

In [52]:
# All-zero array fed to the 'zeros' input: one n_fac-wide row per training example.
zeros = np.zeros((len(xs[0]), n_fac))
zeros.shape

(75110, 42)

In [53]:
# Train the sequence-output model for 10 epochs; `ys` supplies one target array per output.
model.fit([zeros] + xs, ys, batch_size = 64, nb_epoch = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3d1967c910>

Testing our model.

In [56]:
def get_nexts(inp):
    """Print the characters of `inp` and return the predicted character for each model output.

    The current global `model` is called with a single all-zero row of width
    `n_fac` followed by one length-1 index array per character of `inp`. For
    every output of the model, the entry of `chars` at its argmax is returned.
    """
    char_arrays = [np.array(char_indices[ch])[np.newaxis] for ch in inp]
    zero_row = np.zeros((1, n_fac))
    preds = model.predict([zero_row] + char_arrays)
    print(list(inp))
    return [chars[np.argmax(out)] for out in preds]

In [57]:
# Per-step predictions for the 8-character input " this is".
get_nexts(' this is')

[' ', 't', 'h', 'i', 's', ' ', 'i', 's']


['t', 'h', 'e', 't', ' ', 'c', 'n', ' ']

In [58]:
# Per-step predictions for the 8-character input " part of".
get_nexts(' part of')

[' ', 'p', 'a', 'r', 't', ' ', 'o', 'f']


['t', 'e', 't', 't', ' ', 'o', 'f', ' ']