In [53]:
import helper
import os
import csv
data_dir = './data/poem.csv'


In [65]:

def load_data(path):
    """
    Load Dataset from File
    """
    input_file = os.path.join(path)
    tittles = []
    contents = []
    with open(input_file, "r") as f:
        data = csv.reader(f)
        for row in data:
            tittles.append(row[0])
            contents.append(row[1])

    return tittles[1:], contents[1:]

In [66]:
tittles,contents = load_data(data_dir)

In [67]:
poems = ""
for poem in contents:
    poems += poem

In [68]:
view_sentence_range = (0, 10)

"""
contents:list version of the poems
poems:str version of the poems
"""
import numpy as np

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len({word: None for word in poems.split()})))

print('Number of poems: {}'.format(len(contents)))
sentence_count_poem = [poems.count('.') for poem in contents]
print('Average number of sentences in each poem: {}'.format(np.average(sentence_count_poem)))

sentences = [sentence for poem in contents for sentence in poem.split('.')]
print('Number of lines: {}'.format(len(sentences)))
word_count_sentence = [len(sentence.split()) for sentence in sentences]
print('Average number of words in each line: {}'.format(np.average(word_count_sentence)))

print()
print('The sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(poems.split('.')[view_sentence_range[0]:view_sentence_range[1]]))

Dataset Stats
Roughly the number of unique words: 34240
Number of poems: 324
Average number of sentences in each poem: 5472.0
Number of lines: 5796
Average number of words in each line: 23.32246376811594

The sentences 0 to 10:
                        A scene, which 'wildered fancy viewedIn the soul's coldest solitude,With that same scene when peaceful loveFlings rapture's colour o'er the grove,When mountain, meadow, wood and stream With unalloying glory gleam,And to the spirit's ear and eyeAre unison and harmony
The moonlight was my dearer day;Then would I wander far away, And, lingering on the wild brook's shoreTo hear its unremitting roar,Would lose in the ideal flowAll sense of overwhelming woe;Or at the noiseless noon of nightWould climb some heathy mountain's height,And listen to the mystic soundThat stole in fitful gasps around
I joyed to see the streaks of dayAbove the purple peaks decay, And watch the latest line of lightJust mingling with the shades of night;For day with me w

In [69]:
import numpy as np

def create_lookup_tables(text):
    
    vocab_to_int ={word:i for i,word in enumerate(text)}
    int_to_vocab={i[1]:i[0] for i in vocab_to_int.items()}
    """
    Create lookup tables for vocabulary
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    return vocab_to_int, int_to_vocab

In [70]:
def token_lookup():
    
    punctuation ={
        ".":"||Period||",
        ",":"||Comma||",
        '"':"||Quotation_Mark||",
        ";":"||Semicolon||",
        "!":"||Exclamation_mark",
        "?":"||Question_mark||",
        "(":"||Left_Parentheses||",
        ")":"||Right_Parentheses||",
        "--":"||Dash||",
        "\n":"||Return||"
    }
    """
    Generate a dict to turn punctuation into a token.
    :return: Tokenize dictionary where the key is the punctuation and the value is the token
    """
    return punctuation

In [74]:
# Preprocess Training, Validation, and Testing Data
import pickle
def preprocess_and_save_data(text, token_lookup, create_lookup_tables):
    """
    Preprocess Text Data
    """

    # Ignore notice, since we don't use it for analysing the data

    token_dict = token_lookup()
    for key, token in token_dict.items():
        text = text.replace(key, ' {} '.format(token))

    text = text.lower()
    text = text.split()

    vocab_to_int, int_to_vocab = create_lookup_tables(text)
    int_text = [vocab_to_int[word] for word in text]
    pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), open('preprocess.p', 'wb'))


In [75]:
preprocess_and_save_data(poems, token_lookup, create_lookup_tables)

In [76]:
import helper
import numpy as np


int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

19782

In [79]:

from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.0.0




In [80]:
def get_inputs():
    inputs = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
    """
    Create TF Placeholders for input, targets, and learning rate.
    :return: Tuple (input, targets, learning rate)
    """
    return inputs, targets, learning_rate


In [81]:
def get_init_cell(batch_size, rnn_size):
    keep_prob=0.5
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size, state_is_tuple=True)
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([drop]*rnn_size) 
    initial_state = cell.zero_state(batch_size, tf.int32)
    initial_state = tf.identity(initial_state, name='initial_state')
    """
    Create an RNN Cell and initialize it.
    :param batch_size: Size of batches
    :param rnn_size: Size of RNNs
    :return: Tuple (cell, initialize state)
    """
    return (cell, initial_state)



In [82]:
def get_embed(input_data, vocab_size, embed_dim):
    
    embedding = tf.Variable(tf.random_uniform((vocab_size, embed_dim),-1,1))
    embed = tf.nn.embedding_lookup(embedding, input_data)
    return embed


In [83]:
def build_rnn(cell, inputs):
    Outputs, FinalState = tf.nn.dynamic_rnn(cell, inputs, dtype = tf.float32)
    FinalState = tf.identity(FinalState, name = "final_state")
    return (Outputs, FinalState)

In [84]:
def build_nn(cell, rnn_size, input_data, vocab_size):
    """
    Build part of the neural network
    :param cell: RNN cell
    :param rnn_size: Size of rnns
    :param input_data: Input data
    :param vocab_size: Vocabulary size
    :return: Tuple (Logits, FinalState)
    """
    embed_layer = get_embed(input_data,vocab_size,rnn_size)
    output, FinalState = build_rnn(cell,embed_layer)
    full_conn = tf.contrib.layers.fully_connected(output,vocab_size,activation_fn= None)
    return (full_conn,FinalState)


In [85]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: Batches as a Numpy array
    """
    num_batches = int(len(int_text)/(batch_size*seq_length))
    xdata = np.array(int_text[: num_batches * batch_size * seq_length])
    ydata = np.array(int_text[1: num_batches * batch_size * seq_length + 1])

    x_batches = np.split(xdata.reshape(batch_size, -1), num_batches, 1)
    y_batches = np.split(ydata.reshape(batch_size, -1), num_batches, 1)

    return np.array(list(zip(x_batches, y_batches)))


In [89]:
# Number of Epochs
num_epochs = 500
# Batch Size
batch_size = 50
# RNN Size
rnn_size = 4
# Sequence Length
seq_length = 18
embed_dim =200
# Learning Rate
learning_rate = 0.01
# Show stats for every n number of batches
show_every_n_batches = 25


save_dir = './save'

In [90]:

from tensorflow.contrib import seq2seq

train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)
    input_text, targets, lr = get_inputs()
    input_data_shape = tf.shape(input_text)
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size)

    # Probabilities for generating words
    probs = tf.nn.softmax(logits, name='probs')  

    # Loss function
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient Clipping
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients]
    train_op = optimizer.apply_gradients(capped_gradients)

In [92]:

batches = get_batches(int_text, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_i, (x, y) in enumerate(batches):
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            # Show every <show_every_n_batches> batches
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

InvalidArgumentError: indices[0,0] = 169674 is not in [0, 19782)
	 [[Node: embedding_lookup = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@Variable"], validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](Variable/read, _recv_input_0)]]

Caused by op 'embedding_lookup', defined at:
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-90-2ff53a3bfb33>", line 10, in <module>
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size)
  File "<ipython-input-84-d931e32bdda8>", line 11, in build_nn
    embed_layer = get_embed(input_data,vocab_size,rnn_size)
  File "<ipython-input-82-3615d6b9bb47>", line 4, in get_embed
    embed = tf.nn.embedding_lookup(embedding, input_data)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/embedding_ops.py", line 111, in embedding_lookup
    validate_indices=validate_indices)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1359, in gather
    validate_indices=validate_indices, name=name)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
    op_def=op_def)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2395, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/rivers/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1264, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): indices[0,0] = 169674 is not in [0, 19782)
	 [[Node: embedding_lookup = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@Variable"], validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](Variable/read, _recv_input_0)]]
