In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation, SimpleRNN, TimeDistributed
from keras.callbacks import ModelCheckpoint
from random import randint
from matplotlib import pyplot as pl

In [None]:
# Read the whole training text into memory; the file handle is closed as
# soon as the read finishes.
with open("sonnets.txt") as corpus_file:
    corpus = corpus_file.read()

corpus_length = len(corpus)
print("Loaded a corpus of {0} characters".format(corpus_length))

In [None]:
# Build the vocabulary: every distinct character of the corpus, in sorted
# order, gets its list position as an integer id. `encoding` maps
# char -> id and `decoding` maps id -> char.
chars = sorted(set(corpus))
num_chars = len(chars)
encoding = dict(zip(chars, range(num_chars)))
decoding = dict(enumerate(chars))
print("Our corpus contains {0} unique characters.".format(num_chars))

In [None]:
# Cut the corpus into overlapping windows of sentence_length characters,
# one window every `skip` characters (roughly corpus length / skip of them).
# Each target window in y_data is the matching input window shifted one
# character to the right. Both are stored as lists of integer char ids.
sentence_length = 20
skip = 1

# Encode every character once, then slice the encoded list.
encoded_corpus = [encoding[char] for char in corpus]
window_starts = range(0, len(corpus) - sentence_length, skip)

X_data = [encoded_corpus[start:start + sentence_length]
          for start in window_starts]
y_data = [encoded_corpus[start + 1:start + 1 + sentence_length]
          for start in window_starts]

num_sentences = len(X_data)
print("Sliced our corpus into {0} sentences of length {1}"
      .format(num_sentences, sentence_length))

In [None]:
# Peek at the second input window (index 1) in its integer-encoded form.
print(X_data[1])

In [None]:
# Decode the second input window and its target back to characters; the
# target should read as the input shifted by one character.
for encoded_sentence in (X_data[1], y_data[1]):
    print([decoding[idx] for idx in encoded_sentence])

In [None]:
# Vectorize our data and labels. We want everything in one-hot.
def _one_hot(encoded_sentences):
    """Return a boolean one-hot tensor for a list of encoded sentences.

    The result has shape (len(encoded_sentences), sentence_length, num_chars);
    entry [i, t, c] is True exactly when character t of sentence i has id c.
    """
    # The builtin `bool` is used as the dtype because the `np.bool` alias was
    # deprecated in NumPy 1.20 and removed in 1.24.
    tensor = np.zeros((len(encoded_sentences), sentence_length, num_chars),
                      dtype=bool)
    for i, sentence in enumerate(encoded_sentences):
        for t, encoded_char in enumerate(sentence):
            tensor[i, t, encoded_char] = True
    return tensor


print("Vectorizing X and y...")
X = _one_hot(X_data)
y = _one_hot(y_data)

In [None]:
# Report the tensor shapes next to the values they were built from, so a
# mismatch is spotted before time is spent fitting the model.
y_report = "Sanity check y. Dimension: {0} # Sentences: {1} Characters in corpus: {2}"
X_report = "Sanity check X. Dimension: {0} Sentence length: {1}"
print(y_report.format(y.shape, num_sentences, len(chars)))
print(X_report.format(X.shape, sentence_length))

In [None]:
# Model: a single SimpleRNN layer that returns its output at every time
# step, followed by a softmax Dense layer applied to each time step, so the
# network emits a distribution over characters for each input position.
recurrent_layer = SimpleRNN(
    256,
    input_shape=(sentence_length, num_chars),
    return_sequences=True,
)
per_step_softmax = TimeDistributed(Dense(num_chars, activation='softmax'))

model = Sequential([recurrent_layer, per_step_softmax])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train on the one-hot tensors; the returned object is kept in `log` so its
# per-epoch history can be plotted afterwards.
log = model.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=20,
)

In [None]:
# Plot the training loss recorded for each epoch. Axis labels and a title
# are set so the figure can be read without the surrounding code.
fig, ax = pl.subplots()
ax.plot(log.history['loss'], label='Training')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (categorical cross-entropy)')
ax.set_title('Training loss per epoch')
ax.legend()
ax.grid(True)

In [None]:
def make_seed(seed_phrase=""):
    """Return a seed string of ``sentence_length`` characters.

    With a non-empty ``seed_phrase`` the seed is built by cycling through the
    phrase's characters: a phrase longer than ``sentence_length`` is cut to
    its leading characters, a shorter one is repeated. With an empty phrase,
    a randomly positioned window of the corpus is returned instead.
    """
    if not seed_phrase:
        # randint includes both endpoints, so the largest start index still
        # leaves room for a full window.
        start = randint(0, corpus_length - sentence_length)
        return corpus[start:start + sentence_length]

    phrase_length = len(seed_phrase)
    return "".join(seed_phrase[position % phrase_length]
                   for position in range(sentence_length))

In [None]:
# Build the seed from a custom phrase. make_seed fills sentence_length
# characters by cycling through the phrase; this phrase is longer than
# sentence_length (20), so only its leading characters end up in the seed.
seed_pattern = make_seed("In the early morning, the flower is shining")
print(seed_pattern)

In [None]:
# Generate text one character at a time from the seed.
#
# The sliding input window lives in its own variable (`window`) rather than
# overwriting the training tensor `X`, so the fit cell can still be re-run
# after this one. The builtin `bool` dtype replaces the `np.bool` alias,
# which was deprecated in NumPy 1.20 and removed in 1.24.
num_generated_chars = 500

# One-hot encode the seed as a single-sample batch.
window = np.zeros((1, sentence_length, num_chars), dtype=bool)
for t, character in enumerate(seed_pattern):
    window[0, t, encoding[character]] = True

generated_chars = []
for _ in range(num_generated_chars):
    output_prob = model.predict(window, verbose=0)[0]
    # Sample the next character from the distribution predicted for the
    # last time step.
    # alternative is to use argmax: prediction = np.argmax(output_prob[-1])
    prediction = np.random.choice(num_chars, p=output_prob[-1])
    generated_chars.append(decoding[prediction])
    activations = np.zeros((1, 1, num_chars), dtype=bool)
    activations[0, 0, prediction] = True
    # Slide the window: drop the oldest character, append the predicted one.
    window = np.concatenate((window[:, 1:, :], activations), axis=1)

generated_text = "".join(generated_chars)
print(generated_text)