In [32]:
# import all required libraries
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import variable_scope
from tensorflow.contrib.framework.python.framework import checkpoint_utils

import random
import collections
import time

In [2]:
# Parameters of the program.
# NOTE(review): of these, only corpus_path, embedding_size, hidden_unit_size and
# STOP_TOKEN are referenced by the cells visible in this notebook; the others
# look like training-time settings -- confirm whether they are still needed here.
# Path of the text corpus that is loaded further down.
corpus_path = '../data/got_all_edited.txt'

num_epoch = 30

batch_size = 30
num_steps = 60
# Width of each row of the character embedding matrix.
embedding_size = 100

# Number of units in the LSTM cell (also the input width of the output projection).
hidden_unit_size = 256
# Not referenced by the visible cells: the graph is sized with
# vocabsize = len(tokendict) instead.
vocabulary_size = 20000
learning_rate = 1e-4

# Not referenced by the visible cells.
sample_length = 10

# Token that read_file() substitutes for every newline character.
STOP_TOKEN = '*STOP*'

In [3]:
# define a function to load and preprocess the text corpus then return list of chars
def read_file(path):
    """Load a text corpus and return it as a list of character tokens.

    The returned list starts with ``STOP_TOKEN``, followed by one entry per
    character of the file, with every newline character replaced by
    ``STOP_TOKEN``.

    Args:
        path: Path of the text file to read.

    Returns:
        list of single-character strings and ``STOP_TOKEN`` markers.
    """
    # Read from the caller-supplied path (the argument used to be ignored in
    # favour of the module-level corpus_path).
    with open(path) as f:
        text = f.read()

    # The file is already closed here; the token list is built in one pass.
    char_tokens = [STOP_TOKEN]
    char_tokens.extend(STOP_TOKEN if ch == '\n' else ch for ch in text)
    return char_tokens
    
def build_dataset(tokens):
    """Encode a token sequence as integer ids ranked by frequency.

    Ids are assigned in the order returned by ``Counter.most_common()``, so
    the most frequent token receives id 0.

    Args:
        tokens: Sequence of hashable tokens.

    Returns:
        A ``(data, dictionary, reverse_dictionary)`` tuple where ``data`` is
        the list of ids for ``tokens``, ``dictionary`` maps token -> id and
        ``reverse_dictionary`` maps id -> token.
    """
    ranked = collections.Counter(tokens).most_common()

    # Position in the frequency ranking becomes the token id.
    dictionary = {token: index for index, (token, _count) in enumerate(ranked)}

    data = [dictionary[token] for token in tokens]

    reverse_dictionary = {index: token for token, index in dictionary.items()}

    return data, dictionary, reverse_dictionary

def generate_batch(dataset, batch_size, num_steps, offset=0):
    """Cut one batch of consecutive windows out of ``dataset``.

    Row ``r`` of the context array holds the ``num_steps`` items starting at
    ``offset + r * num_steps``; the matching target row holds the same window
    shifted one position to the right.

    Args:
        dataset: Indexable sequence of integer ids.
        batch_size: Number of rows in the batch.
        num_steps: Number of items per row.
        offset: Index in ``dataset`` where the first row starts.

    Returns:
        A ``(batch_context, batch_target, offset)`` tuple: two int32 arrays
        of shape ``(batch_size, num_steps)`` and the offset just past the
        last context window.

    Raises:
        AssertionError: if the dataset is too short to also supply the final
            target item.
    """
    next_offset = offset + batch_size * num_steps
    # Strictly less than: the last target row reads one item past next_offset - 1.
    assert next_offset < len(dataset)

    shape = (batch_size, num_steps)
    batch_context = np.empty(shape, dtype=np.int32)
    batch_target = np.empty(shape, dtype=np.int32)

    for row in range(batch_size):
        start = offset + row * num_steps
        stop = start + num_steps
        batch_context[row] = dataset[start:stop]
        batch_target[row] = dataset[start + 1:stop + 1]

    return batch_context, batch_target, next_offset

# Load the corpus as a list of character tokens.
tokens = read_file(corpus_path)
# data: the corpus encoded as integer ids; tokendict: token -> id;
# tokendictreversed: id -> token.
data, tokendict, tokendictreversed = build_dataset(tokens)

# Number of distinct tokens; used to size the embedding matrix and output layer.
vocabsize = len(tokendict)

In [148]:
# Inference graph: one path runs the LSTM over a whole seed sequence, the other
# advances the same cell by a single character given an explicit previous state.
graph = tf.Graph()
with graph.as_default():
    # Seed sequence of token ids (batch of 1, any length) and a single token id.
    seed_inputs = tf.placeholder(tf.int32, shape=[1, None])
    single_input = tf.placeholder(tf.int32, shape=[1])
    # Previous LSTM state (c, h) for the single-step path. The width comes from
    # hidden_unit_size so it cannot drift from the cell size below (it used to
    # be a hard-coded 256).
    prev_state_c = tf.placeholder(tf.float32, shape=[1, hidden_unit_size])
    prev_state_h = tf.placeholder(tf.float32, shape=[1, hidden_unit_size])
    prev_state = (prev_state_c, prev_state_h)

    # Batch size fed at run time to build the zero state.
    bsize = tf.placeholder(tf.int32)
    # NOTE(review): this placeholder is not consumed by any op in this cell.
    temperature = tf.placeholder(tf.float32)

    # Output projection from the LSTM hidden state to per-token logits.
    # NOTE(review): the explicit names 'Variable', 'Variable_1', 'Variable_2'
    # are presumably chosen to match the variable names stored in the training
    # checkpoint -- confirm before renaming.
    logits_weights = tf.Variable(tf.truncated_normal([hidden_unit_size, vocabsize], stddev=0.1),
                                 name='Variable_1')
    logits_biases = tf.Variable(tf.zeros([vocabsize]),
                                name='Variable_2')

    # instantiate embedding matrix and look up the seed sequence
    charvectors = tf.Variable(tf.random_normal([vocabsize, embedding_size]), name='Variable')
    seedcharvectors = tf.nn.embedding_lookup(charvectors, seed_inputs)

    # Run the LSTM over the seed; the logits are computed from the final hidden state.
    rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_unit_size, forget_bias=0.0, state_is_tuple=True)
    init_state = rnn_cell.zero_state(bsize, tf.float32)
    outs, seed_state = tf.nn.dynamic_rnn(rnn_cell, seedcharvectors, initial_state=init_state)
    seed_output = seed_state.h
    seed_logits = tf.matmul(seed_output, logits_weights) + logits_biases

    # Single-step path.
    # NOTE(review): entering scope "RNN" with reuse presumably makes this cell
    # call share the LSTM weights created by dynamic_rnn above; that depends on
    # the scope name dynamic_rnn uses in the installed TensorFlow version --
    # confirm.
    with tf.variable_scope("RNN") as scope:
        scope.reuse_variables()
        current_input = tf.nn.embedding_lookup(charvectors, single_input)
        current_output, current_state = rnn_cell(current_input, prev_state)

        logits = tf.matmul(current_output, logits_weights) + logits_biases

In [186]:
def sample_softmax(logits, temperature=1.0):
    """Draw one index from the softmax distribution over ``logits``.

    Args:
        logits: Non-empty 1-D sequence or array of unnormalised scores.
        temperature: Non-zero divisor applied to the logits before the
            softmax; values below 1 sharpen the distribution, values above 1
            flatten it.

    Returns:
        int index in ``[0, len(logits))`` drawn with the softmax probabilities.

    Raises:
        ValueError: if ``logits`` is empty.
    """
    scaled = np.asarray(logits, dtype=np.float64) / temperature
    if scaled.size == 0:
        raise ValueError("logits must be non-empty")

    # Shift by the maximum before exponentiating: softmax is invariant to the
    # shift, and it keeps exp() from overflowing to inf (which would turn the
    # probabilities into nan and make every draw fall through to the last index).
    exps = np.exp(scaled - np.max(scaled))
    softmax = exps / np.sum(exps)

    r = random.random()  # range: [0,1)
    total = 0.0
    # Inverse-CDF sampling: return the first index whose cumulative mass exceeds r.
    for i, prob in enumerate(softmax):
        total += prob
        if total > r:
            return i
    # Reached only if rounding leaves the cumulative sum at or below r.
    return len(softmax) - 1

In [191]:
# Restore the trained weights, prime the LSTM with a seed string, then sample
# characters one at a time and print the decoded text.
with tf.Session(graph=graph) as sess:
    var_saver = tf.train.Saver(tf.trainable_variables())
    path = 'checkpoints/char_rnn_langmodel.ckpt'
    var_saver.restore(sess, path)

    # Sampling settings (previously inline magic numbers).
    sample_temperature = 0.8
    num_generated = 5000

    initial = sess.run(init_state, feed_dict={bsize: 1})

    # Seed text encoded as a batch of one id sequence. A list comprehension is
    # used instead of map() so the result is a real list on Python 3 as well.
    seed_string = [[tokendict[ch] for ch in 'Arya ']]

    feed_dict = {seed_inputs: seed_string,
                 bsize: 1, init_state: initial}
    [seed_s, seed_l] = sess.run([seed_state, seed_logits], feed_dict=feed_dict)

    samples = list(seed_string[0])
    current_s = seed_s
    current_logits = seed_l

    # First generated character comes from the logits after the seed. It must be
    # recorded too: it was previously fed to the model but never appended, so the
    # printed text skipped one character right after the seed.
    current_inp = sample_softmax(current_logits[0], temperature=sample_temperature)
    samples.append(current_inp)

    # Feed each sampled character back in together with the carried LSTM state.
    for i in range(num_generated):
        feed_dict = {single_input: [current_inp], prev_state_c: current_s.c, prev_state_h: current_s.h}
        [current_logits, current_s] = sess.run([logits, current_state], feed_dict=feed_dict)

        current_inp = sample_softmax(current_logits[0], temperature=sample_temperature)
        samples.append(current_inp)

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(''.join(tokendictreversed[x] for x in samples))

Arya re which and seen that the warrior said, but both swords through him to the ears and through her sgots of beds and steel of the sea with by she paused as he should have to lurd do with a few away. "I was an ellow the boy best of Starks. You’re no strengt and stranger swords make off and with them to war. The lionfood black part when he will be more than a preyer of it. Jon murded the Stone captively, he could not have been not like the name. The gate had towno had past her spit of the tree burned they were all between the white look.*STOP*"She was a chance of House Tyrell had gone."*STOP*"I stood lay by head. I have comes at marry him, and your father was a face. What do you this to be marry more than a grite with the First the three gold from galleys. The fish fleed to say. "My lord," she could not be blood and rose and pits and bold. It is no cold down see their desperous tank they could not be saying to drink.*STOP*"They say. It was will be a bird son he can watch the rest of a