In [114]:
from keras.layers import Input, Embedding, SimpleRNN, Dense, merge, Flatten, BatchNormalization, LSTM, TimeDistributed, Dropout
from keras.models import Model
from keras.optimizers import Adam
import urllib2
import numpy as np

dataset_raw = urllib2.urlopen("https://s3.amazonaws.com/text-datasets/nietzsche.txt").read().\
    replace('\n', ' ')

In [115]:
vocab = sorted(list(set([i for i in dataset_raw])))

In [116]:
vocab.insert(0, '\0')

In [117]:
txt_encoder = {v:k for k,v in enumerate(vocab)}
txt_decoder = {k:v for k,v in enumerate(vocab)}

In [118]:
dataset_encoded = [txt_encoder[i] for i in dataset_raw]

In [119]:
vocab_size = len(vocab)

In [120]:
seq_len = 40

In [121]:
train_data = [np.stack([dataset_encoded[i + j] for i in range(0, len(dataset_raw) - seq_len - 1, seq_len)]) for j in range(seq_len)]

In [122]:
output_data = [np.stack([dataset_encoded[i + j]
                         for i in range(0, len(dataset_raw) - seq_len - 1, seq_len)])[:,np.newaxis] 
                         for j in range(1, seq_len + 1)]

In [10]:
inps = []
embs = []

for i in range(seq_len):
    inps.append(Input(shape=(1,), name='inp_%s' % i))
    embs.append(Flatten()(Embedding(input_dim=vocab_size, output_dim=40, name='emb_%s' % i)(inps[i])) )

In [11]:
hidden_layer_size = 256

In [12]:
dense_in = Dense(hidden_layer_size, activation='relu')
dense_hidden = Dense(hidden_layer_size, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax')

In [13]:
outs = []

zero_inp = Input(shape=(40,), name='zeros')
hidden = dense_in(zero_inp)

for i in range(seq_len):
    bn = BatchNormalization()(embs[i])
    din = dense_in(bn)
    hidden = merge([din, dense_hidden(hidden)])
    outs.append(dense_out(hidden))

In [14]:
zeros = np.tile(np.zeros(40), (len(train_data[0]), 1))

In [15]:
mdl = Model(input=[zero_inp] + [i for i in inps], output=outs)

In [16]:
mdl.compile(optimizer=Adam(lr=0.00001), loss='sparse_categorical_crossentropy')

In [21]:
mdl.fit([zeros] + train_data, output_data, nb_epoch=12, batch_size=64)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f8672b96590>

In [22]:
mdl.compile(optimizer=Adam(lr=0.0001), loss='sparse_categorical_crossentropy')
mdl.fit([zeros] + train_data, output_data, nb_epoch=12, batch_size=64)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f865e509490>

In [45]:
mdl.compile(optimizer=Adam(lr=0.01), loss='sparse_categorical_crossentropy')
mdl.fit([zeros] + train_data, output_data, nb_epoch=12, batch_size=64)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f865a3911d0>

In [56]:
mdl.compile(optimizer=Adam(lr=0.001), loss='sparse_categorical_crossentropy')
mdl.fit([zeros] + train_data, output_data, nb_epoch=12, batch_size=64)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f865bc48e50>

In [17]:
def mdl_predict(seq_3char):
    if len(seq_3char) < 8:
        padding_len = 8 - len(seq_3char)
        padding = '\0' * padding_len
        seq_3char = padding + seq_3char
    pred_data = [txt_encoder[i] for i in seq_3char]
    arrs = [np.stack([i]) for i in pred_data]
    pred = mdl.predict([np.tile(np.zeros(40), (1, 1))] + arrs)
    return [txt_decoder[np.argmax(o)] for o in pred]

In [18]:
mdl_predict('sufferin')

[')', ';', ';', ';', ';', ';', ';', ';']

In [111]:
def generate_text(num_chars):
    outs = []
    base_str = 'Sufferin'
    for i in range(num_chars):
        prediction = mdl.predict([np.stack(np.zeros(40))[np.newaxis]] +
                                          [np.array([txt_encoder[i]]) for i in base_str])
        next_char = np.argmax(prediction[-1]) # the final model output
        outs.append(txt_decoder[next_char])
        base_str = (base_str + txt_decoder[next_char])[-40:]
    return base_str

In [75]:
generate_text(100)

';S;;S;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;'

In [129]:
inp = Input(batch_shape=(64, seq_len))
emb = Embedding(input_dim=vocab_size, output_dim=40, batch_input_shape=(64,seq_len))(inp)
bn = BatchNormalization()(emb)
rnn = LSTM(output_dim=256, activation='relu', return_sequences=True, stateful=True)(bn)
bn2 = BatchNormalization()(rnn)
d = Dropout(0.2)(bn2)
out = TimeDistributed(Dense(vocab_size, activation='softmax'))(d)

In [130]:
mdl2 = Model(input=inp, output=out)

In [131]:
mdl2.compile(optimizer=Adam(lr=1e-5), loss='sparse_categorical_crossentropy')

In [132]:
x_stateful = np.stack(np.squeeze(train_data), axis=1)[:12800]
y_stateful = np.atleast_3d(np.stack(output_data, axis=1))[:12800]

x_stateful.shape, y_stateful.shape

((12800, 40), (12800, 40, 1))

In [133]:
n_epoch = 2
for i in range(n_epoch):
    mdl2.fit(x_stateful, y_stateful, nb_epoch=1, batch_size=64, shuffle=False)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fe7babb0e90>

In [112]:
mdl2.compile(optimizer=Adam(lr=1e-4), loss='sparse_categorical_crossentropy')
mdl2.fit(x_stateful, y_stateful, nb_epoch=1, batch_size=64, shuffle=False)

Epoch 1/1


<keras.callbacks.History at 0x7fe7b3936550>

In [63]:
def generate_text(num_chars):
    outs = []
    base_str = 'Sufferin'
    for i in range(num_chars):
        prediction = mdl2.predict(np.array([txt_encoder[i] for i in base_str])[np.newaxis])
        next_char = np.argmax(prediction[-1]) # the final model output
        outs.append(txt_decoder[next_char])
        base_str = (base_str + txt_decoder[next_char])[-8:]
    return ''.join(outs)

In [64]:
generate_text(50)

ValueError: dimension mismatch in args to gemm (64,256)x(256,256)->(1,256)
Apply node that caused the error: GpuGemm{no_inplace}(GpuSubtensor{::, int64::}.0, TensorConstant{0.20000000298}, <CudaNdarrayType(float32, matrix)>, lstm_4_U_o_copy[cuda], TensorConstant{0.20000000298})
Toposort index: 5
Inputs types: [CudaNdarrayType(float32, matrix), TensorType(float32, scalar), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), TensorType(float32, scalar)]
Inputs shapes: [(1, 256), (), (64, 256), (256, 256), ()]
Inputs strides: [(0, 1), (), (256, 1), (256, 1), ()]
Inputs values: ['not shown', array(0.20000000298023224, dtype=float32), 'not shown', 'not shown', array(0.20000000298023224, dtype=float32)]
Outputs clients: [[GpuElemwise{Composite{(i0 * clip((i0 + i1), i2, i3) * (i4 + Abs(i4)))},no_inplace}(CudaNdarrayConstant{[[ 0.5]]}, GpuGemm{no_inplace}.0, CudaNdarrayConstant{[[ 0.]]}, CudaNdarrayConstant{[[ 1.]]}, GpuElemwise{Composite{((clip((i0 + i1), i2, i3) * i4) + (i0 * clip((i0 + i5), i2, i3) * (i6 + Abs(i6))))},no_inplace}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
Apply node that caused the error: forall_inplace,gpu,scan_fn}(TensorConstant{8}, GpuSubtensor{int64:int64:int8}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, TensorConstant{8}, lstm_4_U_o, lstm_4_U_f, lstm_4_U_i, lstm_4_U_c)
Toposort index: 86
Inputs types: [TensorType(int64, scalar), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), TensorType(int64, scalar), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix)]
Inputs shapes: [(), (8, 1, 1024), (2, 64, 256), (2, 64, 256), (), (256, 256), (256, 256), (256, 256), (256, 256)]
Inputs strides: [(), (1024, 0, 1), (16384, 256, 1), (16384, 256, 1), (), (256, 1), (256, 1), (256, 1), (256, 1)]
Inputs values: [array(8), 'not shown', 'not shown', 'not shown', array(8), 'not shown', 'not shown', 'not shown', 'not shown']
Outputs clients: [[GpuSubtensor{int64}(forall_inplace,gpu,scan_fn}.0, Constant{1})], [GpuSubtensor{int64}(forall_inplace,gpu,scan_fn}.1, Constant{1})], [for{gpu,scan_fn}(TensorConstant{8}, forall_inplace,gpu,scan_fn}.2, TensorConstant{8}, dense_7_W, GpuDimShuffle{x,0}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.