In [96]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
import sys
from keras import backend as K
print K.backend()
%matplotlib inline

theano


# Load data

In [6]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print('corpus length:', len(text))

('corpus length:', 600901)


In [7]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total unique chars:', vocab_size)

('total unique chars:', 86)


# data preprocessing

In [8]:
chars.insert(0, "\0")
''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

In [9]:
print chars[:5]
print text[:50]

['\x00', '\n', ' ', '!', '"']
PREFACE


SUPPOSING that Truth is a woman--what th


We cannot model characters directly and convert characters to integers

In [10]:
char2indices = dict((c,i) for i, c in enumerate(chars))

In [11]:
print (chars[9], char2indices['\n'])


('-', 1)


In [12]:
print text[:10]

test = [char2indices[c] for c in text[:10]]
print test
print [chars[c] for c in test]


PREFACE



[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]
['P', 'R', 'E', 'F', 'A', 'C', 'E', '\n', '\n', '\n']


In [13]:
sub_text = [char2indices[c] for c in text[:]]
input_length = 100
X = []
y = []
for i in range(0, len(sub_text) - input_length):
    X.append(sub_text[i:i+input_length])
    y.append(sub_text[i+input_length])

In [14]:
len(X)

600801

In [15]:
print (X[0], y[0])
print (X[1], y[1])

([40, 42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40, 40, 39, 43, 33, 38, 31, 2, 73, 61, 54, 73, 2, 44, 71, 74, 73, 61, 2, 62, 72, 2, 54, 2, 76, 68, 66, 54, 67, 9, 9, 76, 61, 54, 73, 2, 73, 61, 58, 67, 24, 2, 33, 72, 2, 73, 61, 58, 71, 58, 2, 67, 68, 73, 2, 60, 71, 68, 74, 67, 57, 1, 59, 68, 71, 2, 72, 74, 72, 69, 58, 56, 73, 62, 67, 60, 2, 73, 61, 54, 73, 2, 54, 65, 65, 2, 69, 61], 62)
([42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40, 40, 39, 43, 33, 38, 31, 2, 73, 61, 54, 73, 2, 44, 71, 74, 73, 61, 2, 62, 72, 2, 54, 2, 76, 68, 66, 54, 67, 9, 9, 76, 61, 54, 73, 2, 73, 61, 58, 67, 24, 2, 33, 72, 2, 73, 61, 58, 71, 58, 2, 67, 68, 73, 2, 60, 71, 68, 74, 67, 57, 1, 59, 68, 71, 2, 72, 74, 72, 69, 58, 56, 73, 62, 67, 60, 2, 73, 61, 54, 73, 2, 54, 65, 65, 2, 69, 61, 62], 65)


In [16]:
max(max(X))

85

In [17]:
# reshape X to be [samples, time steps, features]
X = np.reshape(X, (len(X), input_length, 1))
# normalize
X = X / float(vocab_size)
# one hot encode the output variable
y = np_utils.to_categorical(y)

In [18]:
X.shape, y.shape

((600801, 100, 1), (600801, 86))

# LSTM architecture

In [19]:
n_hidden = 256
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [100]:
# define checkpoint: for long training save when improvement in loss
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [101]:
model.fit(X, y, epochs=1, batch_size=128, callbacks=callbacks_list)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1159611d0>

# Generate text

In [20]:
filename = "weights-improvement-01-2.7955.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [104]:
# randomly pick initial sequence among input
start = np.random.randint(0, len(X)-1)
pattern = X[start] * vocab_size
print pattern.shape
print "Seed:"
print "\"", ''.join([chars[int(c)] for c in pattern]),"\""
# generate characters
for i in range(100):
    x = np.reshape(pattern, (1, len(pattern), 1))
#     print "x: ", x.shape
#     x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = chars[index]
    seq_in = [chars[int(value)] for value in pattern]
    sys.stdout.write(result)
    pattern = np.append(pattern,index)
    pattern = pattern[1:len(pattern)]
#     print "shape: ", pattern.shape
    print "\nDone."

(100, 1)
Seed:
" ot
cone to the hmportance of relhghon, of thhs there can be no coubt. It hs
also equally certahn tha "
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.
h
Done.


In [80]:
print pattern.shape
print x.shape
prediction = model.predict(x, verbose=0)
index = np.argmax(prediction)
print chars[index]
print [chars[int(value*vocab_size)] for value in pattern]


(100, 1)
(1, 100, 1)
s
['s', ' ', 'f', 'r', 'o', 'm', ' ', 'm', 'e', 't', 'a', 'p', 'h', 'y', 's', 'h', 'c', ' ', 'a', 'n', 'c', ' ', 'l', 'o', 'o', 'k', ' ', 'b', 'a', 'c', 'k', ' ', 'a', 't', ' ', 'h', 't', ' ', 'w', 'h', 't', 'h', '\n', 'a', 'n', ' ', 'a', 'h', 'r', ' ', 'o', 'f', ' ', 's', 'u', 'p', 'e', 'r', 'h', 'o', 'r', 'h', 't', 'y', ':', ' ', 'w', 'h', 'e', 'r', 'e', 'a', 's', ' ', 'h', 'e', 'r', 'e', ',', ' ', 'n', 'o', ' ', 'l', 'e', 's', 's', ' ', 't', 'h', 'a', 'n', ' ', 'h', 'n', ' ', 't', 'h', 'e', ' ']
