In [699]:
from keras.layers import Input, Embedding, SimpleRNN, Dense, merge, Flatten
from keras.models import Model
from keras.optimizers import Adam
import urllib2

In [700]:
dataset_raw = urllib2.urlopen("https://s3.amazonaws.com/text-datasets/nietzsche.txt").read()[:10000].\
    replace('\n', ' ')

In [701]:
dataset_raw[:20]

'PREFACE   SUPPOSING '

In [702]:
vocab = sorted(list(set([i for i in dataset_raw])))

In [703]:
vocab.insert(0, '\0')

In [704]:
txt_encoder = {v:k for k,v in enumerate(vocab)}

In [705]:
txt_decoder = {k:v for k,v in enumerate(vocab)}

In [706]:
dataset_encoded = [txt_encoder[i] for i in dataset_raw]

In [707]:
vocab_size = len(vocab)

In [708]:
seq_len = 8

In [709]:
train_data = [np.stack([dataset_encoded[i + j] for i in range(0, len(dataset_raw) - seq_len - 1, seq_len)]) for j in range(seq_len)]

In [710]:
train_data[0][:10]

array([34,  1, 27,  1, 61, 56, 62,  1, 57, 46])

In [711]:
train_data

[array([34,  1, 27, ..., 43, 51, 60]),
 array([35,  1, 32, ...,  1, 61, 47]),
 array([23, 36, 25, ..., 45, 57, 43]),
 array([24, 38,  1, ..., 57, 56, 54]),
 array([19, 34, 62, ..., 55,  1, 51]),
 array([21, 34, 50, ..., 58, 57, 62]),
 array([23, 33, 43, ..., 43, 48, 67]),
 array([ 1, 36, 62, ..., 60,  1,  1])]

In [712]:
output_data = [np.stack([dataset_encoded[i + j]
                         for i in range(0, len(dataset_raw) - seq_len - 1, seq_len)])[:,np.newaxis] 
                         for j in range(1, seq_len + 1)]

In [713]:
output_data[0].shape

(1249, 1)

In [714]:
output_data

[array([[35],
        [ 1],
        [32],
        ..., 
        [ 1],
        [61],
        [47]]), array([[23],
        [36],
        [25],
        ..., 
        [45],
        [57],
        [43]]), array([[24],
        [38],
        [ 1],
        ..., 
        [57],
        [56],
        [54]]), array([[19],
        [34],
        [62],
        ..., 
        [55],
        [ 1],
        [51]]), array([[21],
        [34],
        [50],
        ..., 
        [58],
        [57],
        [62]]), array([[23],
        [33],
        [43],
        ..., 
        [43],
        [48],
        [67]]), array([[ 1],
        [36],
        [62],
        ..., 
        [60],
        [ 1],
        [ 1]]), array([[ 1],
        [27],
        [ 1],
        ..., 
        [51],
        [60],
        [65]])]

In [716]:
[output_data[n][:10] for n in range(seq_len)]

[array([[35],
        [ 1],
        [32],
        [37],
        [ 1],
        [ 8],
        [50],
        [62],
        [62],
        [ 1]]), array([[23],
        [36],
        [25],
        [60],
        [43],
        [ 8],
        [47],
        [50],
        [ 1],
        [48]]), array([[24],
        [38],
        [ 1],
        [63],
        [ 1],
        [65],
        [56],
        [47],
        [49],
        [57]]), array([[19],
        [34],
        [62],
        [62],
        [65],
        [50],
        [18],
        [60],
        [60],
        [60]]), array([[21],
        [34],
        [50],
        [50],
        [57],
        [43],
        [ 1],
        [47],
        [57],
        [ 1]]), array([[23],
        [33],
        [43],
        [ 1],
        [55],
        [62],
        [27],
        [ 1],
        [63],
        [61]]), array([[ 1],
        [36],
        [62],
        [51],
        [43],
        [ 1],
        [61],
        [56],
        [56],
        [63]]), array([[ 1],

In [717]:
len(output_data), output_data[0].shape

(8, (1249, 1))

In [760]:
# # Input 1
# inp1 = Input(shape=(1,))
# emb1 = Embedding(input_dim=vocab_size, output_dim=40, input_length=1)(inp1)
# emb1 = Flatten()(emb1)

# # Input 2
# inp2 = Input(shape=(1,))
# emb2 = Embedding(input_dim=vocab_size, output_dim=40, input_length=1)(inp2)
# emb2 = Flatten()(emb2)

# # Input 3
# inp3 = Input(shape=(1,))
# emb3 = Embedding(input_dim=vocab_size, output_dim=40, input_length=1)(inp3)
# emb3 = Flatten()(emb3)

inps = []
embs = []

for i in range(seq_len):
    inps.append(Input(shape=(1,), name='inp_%s' % i))
    embs.append(Flatten()(Embedding(input_dim=vocab_size, output_dim=40, name='emb_%s' % i)(inps[i])) )

In [719]:
hidden_layer_size = 256

In [720]:
dense_in = Dense(hidden_layer_size, activation='relu')
dense_hidden = Dense(hidden_layer_size, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax')

In [809]:
outs = []

zero_inp = Input(shape=(40,), name='zeros')
hidden = dense_in(zero_inp)

for i in range(seq_len):
    din = dense_in(embs[i])
    hidden = merge([din, dense_hidden(hidden)])
    outs.append(dense_out(hidden))

In [810]:
zeros = np.tile(np.zeros(40), (len(train_data[0]), 1))

In [811]:
mdl = Model(input=[zero_inp] + [i for i in inps], output=outs)

In [812]:
mdl.compile(optimizer=Adam(lr=0.000001), loss='sparse_categorical_crossentropy')

In [813]:
train_data[0].shape

(1249, 1)

In [814]:
len(train_data[0])

1249

In [815]:
len(train_data), len(output_data)

(8, 8)

In [816]:
mdl.fit([zeros] + train_data, output_data, nb_epoch=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1b0a2afd0>

In [817]:
mdl.fit([zeros] + train_data, output_data, nb_epoch=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1aed5f890>

In [818]:
def mdl_predict(seq_3char):
    if len(seq_3char) < 8:
        padding_len = 8 - len(seq_3char)
        padding = '\0' * padding_len
        seq_3char = padding + seq_3char
    pred_data = [txt_encoder[i] for i in seq_3char]
    arrs = [np.stack([i]) for i in pred_data]
    pred = mdl.predict([np.tile(np.zeros(40), len(train_data[0]), 1)] + arrs)[0]
    idx = np.argmax(pred)
    return vocab[idx]

In [819]:
vocab_size

69

In [820]:
mdl_predict('the thin')

ValueError: Error when checking : the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 9 arrays but instead got the following list of 8 arrays: [array([62]), array([50]), array([47]), array([1]), array([62]), array([50]), array([51]), array([56])]...

In [749]:
mdl_predict('qui')

't'

In [750]:
mdl_predict('ck ')

't'

In [751]:
mdl_predict('hel')

't'

In [756]:
def generate_text(num_chars):
    outs = []
    base_str = 'supposin'
    for i in range(num_chars):
        next_char = np.argmax(mdl.predict([np.array([txt_encoder[i]]) for i in base_str])[0])
        outs.append(txt_decoder[next_char])
        base_str = (base_str + txt_decoder[next_char])[-8:]
    return ''.join(outs)

In [757]:
generate_text(1000)

' teen n th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e th   t thettththe hhhehe teee e