In [None]:
import os
import math
from random import randint
import pandas as pd
import json
import random
import numpy as np
from docx import Document
from docx.shared import Pt

In [None]:
# Load every .txt file under ./data (lower-cased) and join them into one
# training corpus.
directory = os.getcwd()
data_dir = os.path.join(directory, 'data')
complete_text = []
# sorted(): os.listdir returns entries in arbitrary order, which would make
# the concatenated corpus differ from run to run.
for filename in sorted(os.listdir(data_dir)):
    if filename.endswith(".txt"):
        # `with` closes each file instead of leaking the handle.
        with open(os.path.join(data_dir, filename), 'r') as handle:
            complete_text.append(handle.read().lower())

text = '\n'.join(complete_text)

# extract all (unique) characters
# these are our "categories" or "labels"
# sorted so the ordering is reproducible (set iteration order is arbitrary)
chars = sorted(set(text))

# set a fixed vector size
# so we look at specific windows of characters
max_len = 32

len(text)

Now we'll define our RNN. Keras makes this trivial:

In [None]:
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation, Dropout
import tensorflow as tf

# NOTE(review): presumably a compatibility shim for an older Keras/TensorFlow
# pairing -- confirm it is still required with the installed versions.
tf.python.control_flow_ops = tf

# Character-level network: two stacked 512-unit LSTMs, each followed by 20%
# dropout, feeding a softmax over the character vocabulary.
model = Sequential()
for layer in (
    LSTM(512, return_sequences=True, input_shape=(max_len, len(chars))),
    Dropout(0.2),
    LSTM(512, return_sequences=False),
    Dropout(0.2),
    Dense(len(chars)),
    Activation('softmax'),
):
    model.add(layer)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [None]:
# Slide a max_len-character window over the corpus, advancing `step`
# characters at a time. Each window is one training input and the character
# immediately after it is the target to predict.
step = 3
window_starts = range(0, len(text) - max_len, step)
inputs = [text[start:start + max_len] for start in window_starts]
outputs = [text[start + max_len] for start in window_starts]

In [None]:
# Forward (character -> label index) and reverse (label index -> character)
# lookups; the index of a character is its position in `chars`.
char_labels = dict(zip(chars, range(len(chars))))
labels_char = dict(enumerate(chars))

In [None]:
# Load the label mappings saved by a previous run.
with open('data/char_labels.json', 'r') as handle:
    char_labels = json.load(handle)
with open('data/labels_char.json', 'r') as handle:
    # JSON object keys are always strings, so the saved int labels come back
    # as '0', '1', ...; convert them back so integer lookups keep working.
    labels_char = {int(label): char
                   for label, char in json.load(handle).items()}

In [None]:
def make_char_dict(text):
    """Map every distinct character of `text` to an integer label.

    Labels are assigned in sorted character order, starting at 0.
    """
    return {char: label for label, char in enumerate(sorted(set(text)))}

# Build the character -> label map from the corpus and save it for later runs.
char_labels = make_char_dict(text)
# `with` guarantees the file is flushed and closed once the dump finishes.
with open('data/char_labels.json', 'w') as handle:
    json.dump(char_labels, handle)

In [None]:
# Reverse lookup (label index -> character), derived from char_labels.
labels_char = {int(label): char
               for char, label in char_labels.items()}
# `with` guarantees the file is flushed and closed once the dump finishes.
# json.dump writes the int keys as strings.
with open('data/labels_char.json', 'w') as handle:
    json.dump(labels_char, handle)

In [None]:
def text_to_matrix(text):
    """One-hot encode a single window of text for the model.

    `text` must be exactly `max_len` characters long (module-level value).
    Returns a boolean array of shape (1, max_len, len(chars)) in which
    position [0, i, char_labels[c]] is set for the i-th character c.
    Characters missing from `char_labels` leave their row all-False.

    Raises AssertionError when the length check fails.
    """
    assert len(text) == max_len
    # Builtin bool: the np.bool alias was deprecated in NumPy 1.20 and
    # removed in 1.24, where it raises AttributeError.
    X = np.zeros((1, max_len, len(chars)), dtype=bool)
    for i, char in enumerate(text):
        if char in char_labels:
            X[0, i, char_labels[char]] = 1
    return X
    
# Spot-check the encoder: show the label assigned to 't', then display the
# one-hot row for the first character of a sample sentence.
print(char_labels['t'])
demo_window = 'this is a testing input sentence'
text_to_matrix(demo_window).astype(int)[0][0]

In [None]:
# One-hot training tensors. A boolean dtype keeps memory usage down.
# Builtin bool is used because the np.bool alias was deprecated in NumPy 1.20
# and removed in 1.24.
X = np.zeros((len(inputs), max_len, len(chars)), dtype=bool)
y = np.zeros((len(inputs), len(chars)), dtype=bool)

# set the appropriate indices to 1 in each one-hot vector
for i, (example, target) in enumerate(zip(inputs, outputs)):
    for t, char in enumerate(example):
        X[i, t, char_labels[char]] = 1
    y[i, char_labels[target]] = 1

Now that we have our training data, we can start training. Keras also makes this easy:

In [None]:
# more epochs is usually better, but training can be very slow if not on a GPU
epochs = 10
# Fit on the one-hot tensors in mini-batches of 128.
# NOTE(review): `nb_epoch` looks like the older Keras spelling of this
# keyword; newer releases presumably expect `epochs=` -- confirm against the
# installed Keras version.
model.fit(X, y, batch_size=128, nb_epoch=epochs)

In [None]:
# Save the model weights to data/rnn_weights.h5; overwrite=True is passed so
# an existing file at that path is replaced.
model.save_weights('data/rnn_weights.h5', overwrite=True)

In [None]:
# Loading pre-trained weights
# Reads the same path the save-weights cell writes to. `model` must already
# be defined when this runs.
model.load_weights('data/rnn_weights.h5')

In [None]:
def generate(temperature=0.35, max_len=32, seed=None, predicate=lambda x: len(x) < 100):
    """Generate text one character at a time with the trained model.

    Parameters
    ----------
    temperature : float
        Forwarded to `sample` to control how adventurous the sampling is.
    max_len : int
        Number of characters in the sliding window fed to the model on each
        step. `text_to_matrix` asserts the window against the module-level
        `max_len`, so this should match it. It does NOT set the length of
        the output; use `predicate` for that.
    seed : str or None
        Text to start from. When None, a random `max_len`-character slice of
        the module-level corpus `text` is used.
    predicate : callable
        Called with the text generated so far (seed included); generation
        continues while it returns a truthy value.

    Returns
    -------
    str
        The seed followed by the generated characters. When the final word
        is shorter than 3 characters and the text spans several lines, the
        last line is dropped (see the comment on that step).

    Raises
    ------
    ValueError
        If `seed` is given but shorter than `max_len`.
    """
    if seed is None:
        start_idx = random.randint(0, len(text) - max_len - 1)
        seed = text[start_idx:start_idx + max_len]
    elif len(seed) < max_len:
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working.
        raise ValueError('Seed text must be at least {} chars long'.format(max_len))

    # The model only sees the last `max_len` characters, so a longer seed is
    # kept in the output but trimmed for the first prediction.
    sentence = seed[len(seed) - max_len:]
    generated = seed
    while predicate(generated):
        X = text_to_matrix(sentence)

        probs = model.predict(X, verbose=0)[0]

        next_idx = sample(probs, temperature)
        next_char = labels_char[next_idx]

        generated += next_char
        # Slide the window forward by one character.
        sentence = sentence[1:] + next_char

    # A very short final word is taken as a sign that generation stopped
    # mid-word, so the unfinished last line is dropped. Single-line text is
    # returned unchanged rather than emptied.
    # NOTE(review): the original joined the characters of the last line with
    # newlines, which looks unintended -- confirm dropping the line is the
    # desired behaviour.
    words = generated.split()
    if words and len(words[-1]) < 3:
        lines = generated.split('\n')
        if len(lines) > 1:
            generated = '\n'.join(lines[:-1])
    return generated

def sample(probs, temperature):
    """Sample an index from a vector of probabilities.

    The probabilities are re-weighted as softmax(log(probs) / temperature)
    and one index is drawn from the result. Temperatures below 1 sharpen
    the distribution towards the most likely entries; temperatures above 1
    flatten it.

    Parameters
    ----------
    probs : array-like of float
        Probabilities, e.g. the model's softmax output.
    temperature : float
        Re-weighting factor; expected to be positive.

    Returns
    -------
    int
        The sampled index.
    """
    # float64 avoids np.random.multinomial rejecting a float32 vector whose
    # rounded sum lands slightly above 1.
    probs = np.asarray(probs, dtype=np.float64)
    # log(0) -> -inf is fine here (it becomes weight 0), so silence the warning.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature
    # Subtracting the max leaves the softmax unchanged but prevents overflow
    # in exp() at low temperatures.
    weights = np.exp(logits - np.max(logits))
    weights /= np.sum(weights)
    return int(np.argmax(np.random.multinomial(1, weights, 1)))

In [None]:
# Run one 32-character prompt through the model and show the predicted
# distribution over the next character.
# Named `seed_text` / `X_demo` so the cell neither shadows the builtin
# `input` nor overwrites the training matrix `X`.
seed_text = 'when the shoe fits, the foot is '
X_demo = text_to_matrix(seed_text)
probs = model.predict(X_demo, verbose=0)[0]
print(probs)

In [None]:
# see the probability of an individual character
# Look up the label index assigned to 'f', then display the entry of `probs`
# at that index.
label = char_labels['f']
probs[label]

In [None]:
# Most likely next character: take the index of the largest entry in `probs`
# and map it back to a character through `labels_char`.
idx = np.argmax(probs)
labels_char[idx]

In [None]:
# Generate roughly 800 characters of text.
# Output length is controlled through `predicate`. `max_len` is the size of
# the seed window fed to the model, so passing max_len=800 made the seed
# itself 800 characters long; that already failed the default predicate and
# the call returned raw corpus text without running the model.
text_ = generate(temperature=0.5, predicate=lambda generated: len(generated) < 800)
print(text_)

In [None]:
def cleanText(text_):
    """Tidy generated text line by line.

    Surrounding whitespace is stripped from the whole text, lines of two
    characters or fewer are discarded, and the first character of every
    remaining line is upper-cased. Returns the surviving lines joined with
    newlines.
    """
    kept_lines = [
        line[0].upper() + line[1:]
        for line in text_.strip().split('\n')
        if len(line) > 2
    ]
    return '\n'.join(kept_lines)

In [None]:
# Generate 2000 candidate poems, each with a random target length between
# 800 and 1000 characters, and keep those longer than 100 characters.
all_text = []
for _ in range(2000):
    target_len = randint(800, 1000)
    # The target length goes through `predicate`. `max_len` is the seed
    # window size, and passing the target there made generate() return raw
    # corpus text without running the model. The default argument binds this
    # iteration's target into the lambda.
    gen_text = generate(
        temperature=0.5,
        predicate=lambda generated, limit=target_len: len(generated) < limit,
    )
    if len(gen_text) > 100:
        all_text.append(gen_text)

In [None]:
# Clean every generated poem and wrap each one in a single-element list.
# NOTE: this rebinds `all_text`, so running the cell a second time passes
# lists (not strings) to cleanText.
all_text = [[cleaned] for cleaned in map(cleanText, all_text)]

In [None]:
# Collect the cleaned poems into a one-column DataFrame ('text') and write
# it to all_text.csv in the current working directory.
df = pd.DataFrame(all_text, columns=['text'])
df.to_csv('all_text.csv')

In [None]:
# Pull the 'text' column back out of the DataFrame as a plain Python list.
all_text = [poem for poem in df['text']]

In [None]:
def _build_poem_document(body):
    """Return a new Document holding `body` as one paragraph, with the
    'Normal' style set to 12pt Big Caslon."""
    document = Document()
    document.add_paragraph(body)
    font = document.styles['Normal'].font
    # The font was previously spelled 'Big Calson', which is not a font name.
    font.name = 'Big Caslon'
    font.size = Pt(12)
    return document


def convert_poems_to_doc(all_text, single_files=True, max_combined=100):
    """Write generated poems to .docx files under data/output/.

    Parameters
    ----------
    all_text : list
        Poems to export. Entries that are not `str` are skipped.
    single_files : bool
        True (default): write one file per poem, data/output/poem_<i>.docx,
        where <i> is the poem's position in `all_text`.
        False: write one file, data/output/ALL_POEMS.docx, containing the
        selected poems separated by blank lines.
    max_combined : int or None
        With single_files=False, only the first `max_combined` entries of
        `all_text` are considered (None means all of them).

    Returns None. Nothing is written when there is no string entry to export.
    """
    output_dir = 'data/output'

    if single_files:
        # Keep each poem's original position so file numbering is unchanged.
        indexed_poems = [(i, poem) for i, poem in enumerate(all_text)
                         if isinstance(poem, str)]
        if not indexed_poems:
            return
        os.makedirs(output_dir, exist_ok=True)
        for i, poem in indexed_poems:
            _build_poem_document(poem).save('data/output/poem_%s.docx' % i)
        return

    poems = [poem for poem in all_text[:max_combined] if isinstance(poem, str)]
    if not poems:
        return
    os.makedirs(output_dir, exist_ok=True)
    # One document with every selected poem. Previously the joined text was
    # built but never used, and each poem overwrote ALL_POEMS.docx in turn.
    _build_poem_document('\n\n'.join(poems)).save('data/output/ALL_POEMS.docx')

In [None]:
# Export the poems as Word documents: first with single_files left at its
# default (True), then again with single_files=False.
convert_poems_to_doc(all_text)
convert_poems_to_doc(all_text, single_files=False)