In [1]:
import numpy as np
import tensorflow as tf

### Load the Corpus
##### Get book names

In [67]:
import glob

# Gather every .txt file under /data and order the paths alphabetically so
# the books are always processed in the same sequence.
book_filenames = glob.glob("/data/*.txt")
book_filenames.sort()

print("Found {} books".format(len(book_filenames)))

Found 5 books


##### Combine books into a string

In [68]:
import codecs

# Read each book as UTF-8 and collect the texts; they are joined back to back
# (no separator) in the order given by book_filenames.
book_texts = []
for filename in book_filenames:
    with codecs.open(filename, 'r', 'utf-8') as book_file:
        book_texts.append(book_file.read())
corpus_raw = u"".join(book_texts)

print("Corpus is {} characters long".format(len(corpus_raw)))

Corpus is 9719485 characters long


### Process Corpus
##### Create lookup tables

In [69]:
def create_lookup_tables(text):
    """
    Create lookup tables for vocab
    :param text: The GOT text split into words (any iterable of strings)
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)

    Ids are assigned in sorted word order. Iterating a bare set of strings
    yields an order that can change between interpreter sessions, which would
    make a re-run of preprocessing disagree with a previously saved model.
    """
    vocab = sorted(set(text))
    vocab_to_int = {word: index for index, word in enumerate(vocab)}
    int_to_vocab = dict(enumerate(vocab))
    return vocab_to_int, int_to_vocab
    

##### Tokenize punctuation

In [70]:
def token_lookup():
    """
    Build the table that maps punctuation to placeholder tokens
    :return: dictionary mapping each punctuation string to its '||name||' token
    """
    punctuation_tokens = (
        ('.', '||period||'),
        (',', '||comma||'),
        ('"', '||quotes||'),
        (';', '||semicolon||'),
        ('!', '||exclamation-mark||'),
        ('?', '||question-mark||'),
        ('(', '||left-parentheses||'),
        (')', '||right-parentheses||'),
        ('--', '||emm-dash||'),
        ('\n', '||return||'),
    )
    return dict(punctuation_tokens)


##### Process and save data

In [71]:
import pickle

# Replace every punctuation mark with its placeholder token, padded with
# spaces so the whitespace split below turns it into a separate word.
token_dict = token_lookup()
for token, replacement in token_dict.items():
    corpus_raw = corpus_raw.replace(token, ' {} '.format(replacement))
# NOTE: corpus_raw is rebound from a string to a list of words here, so this
# cell cannot be re-run without first re-running the cell that loads the books.
corpus_raw = corpus_raw.lower()
corpus_raw = corpus_raw.split()

vocab_to_int, int_to_vocab = create_lookup_tables(corpus_raw)
corpus_int = [vocab_to_int[word] for word in corpus_raw]

# Write through a context manager so the file is flushed and closed even if
# pickling fails part-way.
with open('preprocess.p', 'wb') as preprocess_file:
    pickle.dump((corpus_int, vocab_to_int, int_to_vocab, token_dict), preprocess_file)

# Build the Network
### Batch the Data

In [78]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target data
    :param int_text: text with words replaced by their ids (list or 1-D array)
    :param batch_size: the size that each batch of data should be
    :param seq_length: the length of each sequence
    :return: numpy array of shape (num_batches, 2, batch_size, seq_length);
             along the second axis index 0 holds the inputs and index 1 the
             targets (the inputs shifted by one word)
    :raises ValueError: if batch_size or seq_length is not positive, or if the
             text is too short to fill a single batch
    """
    if batch_size <= 0 or seq_length <= 0:
        raise ValueError("batch_size and seq_length must be positive")

    token_ids = np.asarray(int_text)
    words_per_batch = batch_size * seq_length
    num_batches = len(token_ids) // words_per_batch
    if num_batches == 0:
        raise ValueError(
            "need at least {} words to build one batch, got {}".format(
                words_per_batch, len(token_ids)))

    # Drop the tail that does not fill a complete batch.
    num_words = num_batches * words_per_batch
    x = token_ids[:num_words]

    # Targets are the inputs shifted left by one. Use the real next word for
    # the final target when the corpus has one; only wrap around to the first
    # word when the text ends exactly on a batch boundary.
    if len(token_ids) > num_words:
        y = token_ids[1:num_words + 1]
    else:
        y = np.concatenate((x[1:], x[:1]))

    def to_batches(flat):
        # Each of the batch_size rows is one contiguous stretch of text; batch k
        # takes the k-th seq_length-wide window from every row, so row i of
        # consecutive batches continues the same stretch.
        return flat.reshape(batch_size, num_batches, seq_length).transpose(1, 0, 2)

    return np.stack((to_batches(x), to_batches(y)), axis=1)

### Hyperparameters

In [79]:
# Number of full passes over the batched corpus made by the training loop
num_epochs = 10000
# Number of sequences in each batch
batch_size = 512
# Number of units in the LSTM cell
rnn_size = 512
# Number of layers stacked in the MultiRNNCell
num_layers = 3
# Output keep probability given to the DropoutWrapper
keep_prob = 0.7
# Embedding dimension passed to embed_sequence
embed_dim = 512
# Words per training sequence; also pickled to params.p and used as the
# context window when generating text
seq_length = 30
# Learning rate used for the Adam training steps
learning_rate = 0.001
# Path prefix handed to the saver when writing and restoring checkpoints
save_dir = './save'

### Build the Graph

In [80]:
train_graph = tf.Graph()
with train_graph.as_default():

    # Initialize input placeholders
    input_text = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')

    # Calculate text attributes
    vocab_size = len(int_to_vocab)
    input_text_shape = tf.shape(input_text)

    # Build the RNN cell. Each layer gets its own LSTM + dropout wrapper:
    # `[cell] * num_layers` would hand the very same cell object to every
    # layer instead of creating independent layers.
    # NOTE(review): keep_prob is baked into the graph as a constant, so dropout
    # is also applied when the saved graph is reloaded for generation.
    def _build_layer():
        lstm = tf.contrib.rnn.BasicLSTMCell(num_units=rnn_size)
        return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

    cell = tf.contrib.rnn.MultiRNNCell([_build_layer() for _ in range(num_layers)])

    # Set the initial state
    # NOTE(review): this named tensor is not passed to dynamic_rnn below, so a
    # value fed for it does not influence the outputs or final_state; the RNN
    # starts from zeros on every run. Wiring it in would also require changing
    # how the generation loop feeds its context window.
    initial_state = cell.zero_state(input_text_shape[0], tf.float32)
    initial_state = tf.identity(initial_state, name='initial_state')

    # Create word embedding as input to RNN
    embed = tf.contrib.layers.embed_sequence(input_text, vocab_size, embed_dim)

    # Build RNN
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
    final_state = tf.identity(final_state, name='final_state')

    # Take RNN output and make logits (linear layer, one score per vocab word)
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)

    # Calculate the probability of generating each word
    probs = tf.nn.softmax(logits, name='probs')

    # Define loss function: every position in the batch is weighted equally
    cost = tf.contrib.seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_text_shape[0], input_text_shape[1]])
    )

    # Learning rate optimizer. Use the `lr` placeholder so the value fed at
    # training time is the one actually applied (previously the placeholder
    # was fed but the Python constant was used here).
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient clipping to avoid exploding gradients
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)
    

### Train the Network

In [None]:
import time

# Persist the parameters the generation cells need after a kernel restart.
with open('params.p', 'wb') as params_file:
    pickle.dump((seq_length, save_dir), params_file)
batches = get_batches(corpus_int, batch_size, seq_length)
num_batches = len(batches)
start_time = time.time()

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    # Create the saver once: constructing a new Saver on every checkpoint
    # adds another set of save/restore ops to the graph each time.
    saver = tf.train.Saver()

    for epoch in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_index, (x, y) in enumerate(batches):
            feed_dict = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)

        # Reported once per epoch, so the remaining time is the elapsed time
        # scaled by the ratio of epochs left to epochs completed.
        time_elapsed = time.time() - start_time
        epochs_done = epoch + 1
        time_remaining = time_elapsed * (num_epochs - epochs_done) / epochs_done
        print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}   time_elapsed = {:.3f}   time_remaining = {:.0f}'.format(
            epochs_done,
            batch_index + 1,
            len(batches),
            train_loss,
            time_elapsed,
            time_remaining))

        # save model every 10 epochs, and always after the final epoch so the
        # last updates are not lost
        if epoch % 10 == 0 or epoch == num_epochs - 1:
            saver.save(sess, save_dir)
            print('Model Trained and Saved')
            

Epoch   1 Batch  144/144   train_loss = 6.483   time_elapsed = 208.531   time_remaining = 2085101
Model Trained and Saved
Epoch   2 Batch  144/144   train_loss = 6.469   time_elapsed = 420.586   time_remaining = 2102508
Epoch   3 Batch  144/144   train_loss = 6.465   time_elapsed = 628.625   time_remaining = 2094788
Epoch   4 Batch  144/144   train_loss = 6.466   time_elapsed = 836.495   time_remaining = 2090401
Epoch   5 Batch  144/144   train_loss = 6.466   time_elapsed = 1044.271   time_remaining = 2087497
Epoch   6 Batch  144/144   train_loss = 6.466   time_elapsed = 1251.556   time_remaining = 2084675
Epoch   7 Batch  144/144   train_loss = 6.464   time_elapsed = 1458.474   time_remaining = 2082076
Epoch   8 Batch  144/144   train_loss = 6.462   time_elapsed = 1665.455   time_remaining = 2080153
Epoch   9 Batch  144/144   train_loss = 6.338   time_elapsed = 1872.648   time_remaining = 2078847
Epoch  10 Batch  144/144   train_loss = 6.278   time_elapsed = 2080.062   time_remaining 

Model Trained and Saved
Epoch  82 Batch  144/144   train_loss = 3.545   time_elapsed = 17095.380   time_remaining = 2067707
Epoch  83 Batch  144/144   train_loss = 3.523   time_elapsed = 17302.661   time_remaining = 2067355
Epoch  84 Batch  144/144   train_loss = 3.510   time_elapsed = 17510.039   time_remaining = 2067018
Epoch  85 Batch  144/144   train_loss = 3.507   time_elapsed = 17717.511   time_remaining = 2066696
Epoch  86 Batch  144/144   train_loss = 3.487   time_elapsed = 17924.611   time_remaining = 2066333
Epoch  87 Batch  144/144   train_loss = 3.482   time_elapsed = 18131.600   time_remaining = 2065960
Epoch  88 Batch  144/144   train_loss = 3.465   time_elapsed = 18338.684   time_remaining = 2065603
Epoch  89 Batch  144/144   train_loss = 3.454   time_elapsed = 18545.614   time_remaining = 2065231
Epoch  90 Batch  144/144   train_loss = 3.443   time_elapsed = 18752.711   time_remaining = 2064882
Epoch  91 Batch  144/144   train_loss = 3.439   time_elapsed = 18959.752   t

Epoch 162 Batch  144/144   train_loss = 2.987   time_elapsed = 33673.942   time_remaining = 2044964
Epoch 163 Batch  144/144   train_loss = 2.978   time_elapsed = 33880.433   time_remaining = 2044674
Epoch 164 Batch  144/144   train_loss = 2.962   time_elapsed = 34086.995   time_remaining = 2044388
Epoch 165 Batch  144/144   train_loss = 2.985   time_elapsed = 34293.618   time_remaining = 2044107
Epoch 166 Batch  144/144   train_loss = 2.985   time_elapsed = 34500.275   time_remaining = 2043830
Epoch 167 Batch  144/144   train_loss = 2.983   time_elapsed = 34707.086   time_remaining = 2043562
Epoch 168 Batch  144/144   train_loss = 2.971   time_elapsed = 34913.906   time_remaining = 2043295
Epoch 169 Batch  144/144   train_loss = 2.953   time_elapsed = 35120.742   time_remaining = 2043030
Epoch 170 Batch  144/144   train_loss = 2.949   time_elapsed = 35327.330   time_remaining = 2042751
Epoch 171 Batch  144/144   train_loss = 2.971   time_elapsed = 35534.208   time_remaining = 2042490


Epoch 243 Batch  144/144   train_loss = 2.727   time_elapsed = 50440.176   time_remaining = 2025287
Epoch 244 Batch  144/144   train_loss = 2.706   time_elapsed = 50647.217   time_remaining = 2025058
Epoch 245 Batch  144/144   train_loss = 2.703   time_elapsed = 50854.223   time_remaining = 2024828
Epoch 246 Batch  144/144   train_loss = 2.703   time_elapsed = 51061.237   time_remaining = 2024599
Epoch 247 Batch  144/144   train_loss = 2.686   time_elapsed = 51268.173   time_remaining = 2024366
Epoch 248 Batch  144/144   train_loss = 2.688   time_elapsed = 51475.259   time_remaining = 2024140
Epoch 249 Batch  144/144   train_loss = 2.679   time_elapsed = 51682.328   time_remaining = 2023913
Epoch 250 Batch  144/144   train_loss = 2.686   time_elapsed = 51889.375   time_remaining = 2023686
Epoch 251 Batch  144/144   train_loss = 2.689   time_elapsed = 52096.357   time_remaining = 2023456
Model Trained and Saved
Epoch 252 Batch  144/144   train_loss = 2.672   time_elapsed = 52307.264   t

Epoch 324 Batch  144/144   train_loss = 2.534   time_elapsed = 67252.079   time_remaining = 2008429
Epoch 325 Batch  144/144   train_loss = 2.530   time_elapsed = 67459.233   time_remaining = 2008209
Epoch 326 Batch  144/144   train_loss = 2.543   time_elapsed = 67666.477   time_remaining = 2007992
Epoch 327 Batch  144/144   train_loss = 2.519   time_elapsed = 67873.447   time_remaining = 2007767
Epoch 328 Batch  144/144   train_loss = 2.519   time_elapsed = 68080.579   time_remaining = 2007547
Epoch 329 Batch  144/144   train_loss = 2.511   time_elapsed = 68287.609   time_remaining = 2007324
Epoch 330 Batch  144/144   train_loss = 2.510   time_elapsed = 68494.624   time_remaining = 2007100
Epoch 331 Batch  144/144   train_loss = 2.510   time_elapsed = 68701.541   time_remaining = 2006874
Model Trained and Saved
Epoch 332 Batch  144/144   train_loss = 2.501   time_elapsed = 68913.019   time_remaining = 2006780
Epoch 333 Batch  144/144   train_loss = 2.503   time_elapsed = 69119.954   t

### Checkpoint

In [97]:
import tensorflow as tf
import numpy as np
import pickle

# Reload the preprocessed corpus and the saved parameters. The files are opened
# with context managers so the handles are closed right after loading.
# NOTE: pickle can execute arbitrary code on load; only load files written by
# this notebook.
with open('preprocess.p', mode='rb') as preprocess_file:
    corpus_int, vocab_to_int, int_to_vocab, token_dict = pickle.load(preprocess_file)
with open('params.p', mode='rb') as params_file:
    seq_length, save_dir = pickle.load(params_file)


# Generate GOT Text
### Pick a Random Word

In [98]:
def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word with some randomness
    :param probabilities: 1-D sequence of probabilities of the next word,
                          where position i is the probability of word id i
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    # Sample a word id and look it up, rather than sampling from
    # list(int_to_vocab.values()): that relied on the dict's iteration order
    # matching the ids and rebuilt a list of the whole vocabulary per call.
    word_id = np.random.choice(len(probabilities), p=probabilities)
    return int_to_vocab[int(word_id)]


### Load the Graph and Generate

In [None]:
gen_length = 1000
prime_words = 'daenerys'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load the saved model
    loader = tf.train.import_meta_graph(save_dir + '.meta')
    loader.restore(sess, save_dir)

    # Get tensors from loaded graph
    input_text = loaded_graph.get_tensor_by_name('input:0')
    initial_state = loaded_graph.get_tensor_by_name('initial_state:0')
    final_state = loaded_graph.get_tensor_by_name('final_state:0')
    probs = loaded_graph.get_tensor_by_name('probs:0')

    # Sentences generation setup: every word of prime_words must be in the vocabulary
    gen_sentences = prime_words.split()
    prev_state = sess.run(initial_state, {input_text: np.array([[1 for word in gen_sentences]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input: the most recent seq_length words as a batch of one sequence
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})

        # dyn_input holds a single sequence, so the result has a leading batch
        # axis of size 1. Select that row first, then the distribution at the
        # last time step; indexing the batch axis with the time index passes a
        # 2-D array to pick_word or runs out of bounds.
        pred_word = pick_word(probabilities[0][dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)

    # Remove tokens
    chapter_text = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        chapter_text = chapter_text.replace(' ' + token.lower(), key)

    print(chapter_text)

# Save a Chapter
### Cleanup Data a Bit

In [174]:
import re

# Rebuild the text from the generated words and turn the placeholder tokens
# back into punctuation.
chapter_text = ' '.join(gen_sentences)
for key, token in token_dict.items():
    chapter_text = chapter_text.replace(' ' + token.lower(), key)
chapter_text = chapter_text.replace('\n ', '\n')
chapter_text = chapter_text.replace('( ', '(')
chapter_text = chapter_text.replace(' ”', '”')

# Names and titles to capitalize. A comma was missing after 'redwyne', which
# silently fused it with 'myrcella' into one entry that never matched.
# 'baraTheon' is kept from the original list; presumably it compensated for
# 'theon' being capitalized inside 'baratheon', which whole-word matching
# below no longer does.
capitalize_words = ['lannister', 'stark', 'lord', 'ser', 'tyrion', 'jon', 'john snow', 'daenerys', 'targaryen', 'cersei', 'jaime', 'arya', 'sansa', 'bran', 'rikkon', 'joffrey',
                    'khal', 'drogo', 'gregor', 'clegane', 'kings landing', 'winterfell', 'the mountain', 'the hound', 'ramsay', 'bolton', 'melisandre', 'shae', 'tyrell',
                    'margaery', 'sandor', 'hodor', 'ygritte', 'brienne', 'tarth', 'petyr', 'baelish', 'eddard', 'greyjoy', 'theon', 'gendry', 'baratheon', 'baraTheon',
                    'varys', 'stannis', 'bronn', 'jorah', 'mormont', 'martell', 'oberyn', 'catelyn', 'robb', 'loras', 'missandei', 'tommen', 'robert', 'lady', 'donella', 'redwyne',
                    'myrcella', 'samwell', 'tarly', 'grey worm', 'podrick', 'osha', 'davos', 'seaworth', 'jared', 'jeyne poole', 'rickard', 'yoren', 'meryn', 'trant', 'king', 'queen',
                    'aemon']

# Match whole words only (allowing a plural "s") so that e.g. 'king' does not
# turn "making" into "maKing" or 'ser' turn "observe" into "obServe".
for word in capitalize_words:
    pattern = r'\b' + re.escape(word) + r'(?=s?\b)'
    chapter_text = re.sub(pattern, word.lower().title(), chapter_text)

### Save File

In [175]:
import os

version_dir = './generated-book-v1'
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(version_dir, exist_ok=True)


def _chapter_path(chapter_number):
    """Return the path of the markdown file for the given chapter number."""
    return version_dir + '/chapter-' + str(chapter_number) + '.md'


# Start from the number of files already present, then skip forward past any
# name that is taken: counting alone would overwrite an existing chapter if an
# earlier one had been deleted or an unrelated file sits in the directory.
num_chapters = len([name for name in os.listdir(version_dir) if os.path.isfile(os.path.join(version_dir, name))])
chapter_number = num_chapters + 1
while os.path.exists(_chapter_path(chapter_number)):
    chapter_number += 1
next_chapter = _chapter_path(chapter_number)

# Write as UTF-8 explicitly; the text can contain non-ASCII characters (such
# as curly quotes) that the platform default encoding may not handle.
with open(next_chapter, "w", encoding="utf-8") as text_file:
    text_file.write(chapter_text)