In [63]:
from keras.layers import Input, Embedding, SimpleRNN, Dense, merge, Flatten, BatchNormalization, LSTM, TimeDistributed
from keras.models import Model
from keras.optimizers import Adam
import urllib2
import numpy as np

# Download the Nietzsche corpus and replace every newline with a space so the
# text becomes one continuous stream of characters.
dataset_raw = (
    urllib2.urlopen("https://s3.amazonaws.com/text-datasets/nietzsche.txt")
    .read()
    .replace('\n', ' ')
)

In [3]:
# Sorted list of the distinct characters that occur in the corpus.
vocab = sorted(set(dataset_raw))

In [4]:
# Reserve index 0 for the '\0' padding symbol (used to left-pad short inputs
# at prediction time). The guard keeps the cell idempotent: re-running it must
# not insert a second '\0', which would shift every character index by one.
if not vocab or vocab[0] != '\0':
    vocab.insert(0, '\0')

In [5]:
# Lookup tables between characters and their integer ids (position in vocab).
txt_encoder = dict((char, idx) for idx, char in enumerate(vocab))
txt_decoder = dict(enumerate(vocab))

In [6]:
# The whole corpus as a list of integer character ids.
dataset_encoded = list(map(txt_encoder.__getitem__, dataset_raw))

In [7]:
# Number of entries in the character vocabulary (including the '\0' padding
# entry); used below as the Embedding input_dim and the softmax width.
vocab_size = len(vocab)

In [8]:
# Number of characters per training window; the model below is unrolled for
# this many steps and takes one single-character input per step.
seq_len = 8

In [11]:
# One array per position inside the window: train_data[offset][k] is the
# character id found `offset` places into the k-th non-overlapping window.
train_data = [
    np.stack([
        dataset_encoded[window_start + offset]
        for window_start in range(0, len(dataset_raw) - seq_len - 1, seq_len)
    ])
    for offset in range(seq_len)
]

In [13]:
# Targets: the same windows shifted one character to the right, so the target
# for each position is the character that follows it. Every array gets a
# trailing axis of length 1.
output_data = [
    np.expand_dims(
        np.stack([
            dataset_encoded[window_start + offset]
            for window_start in range(0, len(dataset_raw) - seq_len - 1, seq_len)
        ]),
        axis=1,
    )
    for offset in range(1, seq_len + 1)
]

In [14]:
# One single-character Input per time step, each followed by its own
# (non-shared) 40-dimensional Embedding.
inps = []
embs = []

for step in range(seq_len):
    char_inp = Input(shape=(1,), name='inp_%s' % step)
    char_emb = Embedding(input_dim=vocab_size, output_dim=40, name='emb_%s' % step)(char_inp)
    inps.append(char_inp)
    # Flatten drops the length-1 sequence axis that Embedding adds.
    embs.append(Flatten()(char_emb))

In [15]:
# Width of the hidden state: number of units in dense_in and dense_hidden.
hidden_layer_size = 256

In [16]:
# The three layers are created once and re-applied at every time step, so
# their weights are shared across the unrolled sequence.
dense_in = Dense(
    output_dim=hidden_layer_size,
    activation='relu',
)
# Hidden-to-hidden transition, initialised to the identity matrix.
dense_hidden = Dense(
    output_dim=hidden_layer_size,
    activation='relu',
    init='identity',
)
# Hidden state -> probability distribution over the vocabulary.
dense_out = Dense(
    output_dim=vocab_size,
    activation='softmax',
)

In [17]:
# Hand-unrolled recurrent network: one softmax output per time step.
outs = []

# The initial hidden state comes from pushing an all-zero vector through
# dense_in; the zeros are supplied as an extra model input.
zero_inp = Input(shape=(40,), name='zeros')
hidden = dense_in(zero_inp)

for char_emb in embs:
    # Normalise this step's embedding and project it to the hidden width.
    step_input = dense_in(BatchNormalization()(char_emb))
    # Transform the previous hidden state and add it to the current input.
    carried = dense_hidden(hidden)
    hidden = merge([step_input, carried], mode='sum')
    outs.append(dense_out(hidden))

In [18]:
# One all-zero 40-vector per training sample, fed to the 'zeros' input.
zeros = np.zeros((len(train_data[0]), 40))

In [19]:
# Inputs: the zero vector first, then one character input per time step.
# Outputs: one next-character distribution per time step.
mdl = Model(
    input=[zero_inp] + list(inps),
    output=outs,
)

In [20]:
# Targets are integer class ids, hence the sparse cross-entropy loss.
mdl.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(lr=1e-5),
)

In [21]:
# The input list must follow the order used when the Model was built:
# the zero vectors first, then the per-position character arrays.
mdl.fit(
    [zeros] + train_data,
    output_data,
    batch_size=64,
    nb_epoch=12,
)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f8672b96590>

In [22]:
# Continue training with learning rate 1e-4. Re-compiling installs a fresh
# Adam optimizer; the layer weights learned so far are kept.
mdl.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(lr=1e-4),
)
mdl.fit(
    [zeros] + train_data,
    output_data,
    batch_size=64,
    nb_epoch=12,
)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f865e509490>

In [45]:
# Continue training with learning rate 1e-2. Re-compiling installs a fresh
# Adam optimizer; the layer weights learned so far are kept.
mdl.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(lr=1e-2),
)
mdl.fit(
    [zeros] + train_data,
    output_data,
    batch_size=64,
    nb_epoch=12,
)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f865a3911d0>

In [56]:
# Continue training with learning rate 1e-3. Re-compiling installs a fresh
# Adam optimizer; the layer weights learned so far are kept.
mdl.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(lr=1e-3),
)
mdl.fit(
    [zeros] + train_data,
    output_data,
    batch_size=64,
    nb_epoch=12,
)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f865bc48e50>

In [57]:
def mdl_predict(seq_3char):
    r"""Predict the following character for every position of a text window.

    The model ``mdl`` takes exactly ``seq_len`` single-character inputs, so
    the text is normalised to that length first: shorter strings are
    left-padded with the NUL padding symbol and longer strings keep only
    their last ``seq_len`` characters.

    Parameters
    ----------
    seq_3char : str
        Text to feed to the model (the parameter name is historical; any
        length is accepted). Every character must be a key of
        ``txt_encoder``.

    Returns
    -------
    list of str
        ``seq_len`` characters; element ``k`` is the most probable character
        to follow position ``k`` of the normalised window.

    Raises
    ------
    KeyError
        If the text contains a character missing from ``txt_encoder``.
    """
    # Use seq_len rather than a literal 8 so the helper follows the model,
    # and truncate so the model never gets the wrong number of inputs.
    window = seq_3char[-seq_len:].rjust(seq_len, '\0')
    # One array of shape (1,) per time step: a batch holding a single sample.
    char_inputs = [np.array([txt_encoder[ch]]) for ch in window]
    # All-zero vector for the model's extra 'zeros' input (width 40).
    zero_state = np.zeros((1, 40))
    pred = mdl.predict([zero_state] + char_inputs)
    return [txt_decoder[np.argmax(o)] for o in pred]

In [58]:
# Sanity check on one full 8-character window; the result lists the model's
# next-character guess after each position.
mdl_predict('sufferin')

[' ', 'l', 'f', 'e', 'r', 'e', 'n', 'g']

In [60]:
def generate_text(num_chars, seed='Sufferin'):
    r"""Greedily generate text with the unrolled model ``mdl``.

    Starting from ``seed``, the model is asked for the character that follows
    the current ``seq_len``-character window; that character is appended to
    the result and the window slides forward by one.

    Parameters
    ----------
    num_chars : int
        Number of characters to generate.
    seed : str, optional
        Starting text. It is left-padded with the NUL padding symbol or cut
        to its last ``seq_len`` characters so it always fills one window.
        Every character must be a key of ``txt_encoder``.

    Returns
    -------
    str
        The ``num_chars`` generated characters (the seed is not included).
    """
    window = seed[-seq_len:].rjust(seq_len, '\0')
    # The extra 'zeros' input never changes, so build it once.
    zero_state = np.zeros((1, 40))
    generated = []
    for _ in range(num_chars):
        char_inputs = [np.array([txt_encoder[ch]]) for ch in window]
        prediction = mdl.predict([zero_state] + char_inputs)
        # The last output is the distribution for the character that
        # follows the whole window; pick its most probable entry.
        next_char = txt_decoder[np.argmax(prediction[-1])]
        generated.append(next_char)
        window = (window + next_char)[-seq_len:]
    return ''.join(generated)

In [62]:
# Generate 100 characters by repeatedly taking the most probable next one.
generate_text(100)

'g and the sermen the sermen the sermen the sermen the sermen the sermen the sermen the sermen the se'

In [148]:
# Stateful character-level LSTM. A stateful recurrent layer needs a fixed
# batch size, so the input declares the full batch shape:
# 64 sequences of 8 character ids each.
inp = Input(batch_shape=(64,8))
emb = Embedding(input_dim=vocab_size, output_dim=40, batch_input_shape=(64,8))(inp)
bn = BatchNormalization()(emb)
# return_sequences=True yields one hidden vector per time step, so the model
# predicts the next character at every position.
rnn = LSTM(output_dim=256, activation='relu', return_sequences=True, stateful=True)(bn)
# The softmax width must equal the vocabulary size. A wider layer (it used to
# be 256) lets the model put probability on class ids that have no character
# in txt_decoder.
out = TimeDistributed(Dense(vocab_size, activation='softmax'))(rnn)

In [149]:
# Wrap the stateful graph in a Model: the single `inp` tensor in, the
# per-time-step softmax `out` tensor out.
mdl2 = Model(input=inp, output=out)

In [150]:
# Integer class-id targets, hence the sparse cross-entropy loss; Adam runs
# with a learning rate of 1e-6.
mdl2.compile(optimizer=Adam(lr=1e-6), loss='sparse_categorical_crossentropy')

In [151]:
# Re-pack the per-position arrays into single tensors for the LSTM:
# inputs become (samples, seq_len) and targets (samples, seq_len, 1).
# Only the first 64000 samples are kept, i.e. exactly 1000 batches of 64.
x_stateful = np.stack(train_data, axis=1)[:64000]
y_stateful = np.stack(output_data, axis=1)[:64000]

# Check the tensor shapes before training.
x_stateful.shape, y_stateful.shape

((64000, 8), (64000, 8, 1))

In [152]:
# batch_size matches the batch dimension declared in the Input's batch_shape.
# shuffle=False keeps the samples in their original order, which matters for
# a stateful LSTM because it carries state from one batch to the next.
mdl2.fit(x_stateful, y_stateful, nb_epoch=4, batch_size=64, shuffle=False)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

KeyboardInterrupt: 

In [None]:
def generate_text(num_chars, seed='Sufferin', batch_size=64):
    r"""Greedily generate text with the stateful LSTM model ``mdl2``.

    ``mdl2`` takes a single integer tensor of shape ``(batch_size, seq_len)``
    with a fixed batch size. The current window is therefore copied into
    every row of a full batch, and only row 0 of the prediction is read.

    Parameters
    ----------
    num_chars : int
        Number of characters to generate.
    seed : str, optional
        Starting text. It is left-padded with the NUL padding symbol or cut
        to its last ``seq_len`` characters so it always fills one window.
        Every character must be a key of ``txt_encoder``.
    batch_size : int, optional
        Must equal the batch dimension ``mdl2`` was built with (64).

    Returns
    -------
    str
        The ``num_chars`` generated characters (the seed is not included).
    """
    window = seed[-seq_len:].rjust(seq_len, '\0')
    generated = []
    for _ in range(num_chars):
        codes = np.array([txt_encoder[ch] for ch in window])
        # One (batch_size, seq_len) array, not the list of per-character
        # inputs that the unrolled model expects.
        batch = np.tile(codes, (batch_size, 1))
        # Each window is scored from a clean state: successive windows
        # overlap, so state carried over from the previous call would
        # count the shared characters twice.
        mdl2.reset_states()
        prediction = mdl2.predict(batch, batch_size=batch_size)
        # Row 0, last time step: distribution for the character after the
        # window. Only ids that txt_decoder can map back are considered.
        last_step = prediction[0, -1][:len(txt_decoder)]
        next_char = txt_decoder[int(np.argmax(last_step))]
        generated.append(next_char)
        window = (window + next_char)[-seq_len:]
    return ''.join(generated)