In [30]:
import helper
import os
import csv
data_dir = './data/poem.csv'

In [31]:
import io
def load_data(path):
    """
    Load Dataset from File
    """
    input_file = os.path.join(path)
    tittles = []
    contents = []
    with io.open(input_file, "r", encoding = 'utf-8') as f:
        data = csv.reader(f)
        for row in data:
            tittles.append(row[0])
            contents.append(row[1])

    return tittles[1:], contents[1:]

In [32]:
tittles,contents = load_data(data_dir)
tittles = tittles[-10:]
contents= contents[-10:]

In [33]:
poems = ""
for poem in contents:
    poems += poem

In [34]:
view_sentence_range = (0, 10)

"""
contents:list version of the poems
poems:str version of the poems
"""
import numpy as np

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len({word: None for word in poems.split()})))

print('Number of poems: {}'.format(len(contents)))
sentence_count_poem = [poems.count('.') for poem in contents]
print('Average number of sentences in each poem: {}'.format(np.average(sentence_count_poem)))

sentences = [sentence for poem in contents for sentence in poem.split('.')]
print('Number of lines: {}'.format(len(sentences)))
word_count_sentence = [len(sentence.split()) for sentence in sentences]
print('Average number of words in each line: {}'.format(np.average(word_count_sentence)))

print()
print('The sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(poems.split('.')[view_sentence_range[0]:view_sentence_range[1]]))

Dataset Stats
Roughly the number of unique words: 2321
Number of poems: 10
Average number of sentences in each poem: 90.0
Number of lines: 100
Average number of words in each line: 48.04

The sentences 0 to 10:
                        Music, when soft voices die,Vibrates in the memory;Odours, when sweet violets sicken,Live within the sense they quicken
Rose leaves, when the rose is dead,Are heaped for the beloved's bed;And so thy thoughts, when thou art gone,Love itself shall slumber on
                                                                     I arise from dreams of thee In the first sweet sleep of night, When the winds are breathing low, And the stars are shining bright I arise from dreams of thee, And a spirit in my feet Has led me -  who knows how? -  To thy chamber-window, sweet! The wandering airs they faint On the dark, the silent stream, -  The champak odors fall Like sweet thoughts in a dream, The nightingale's complaint, It dies upon her heart, As I must die on thin

In [35]:
import numpy as np
import problem_unittests as tests
from collections import Counter

def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    # TODO: Implement Function
    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
    vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}

    return vocab_to_int, int_to_vocab


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_create_lookup_tables(create_lookup_tables)

Tests Passed


In [36]:
def token_lookup():
    """
    Generate a dict to turn punctuation into a token.
    :return: Tokenize dictionary where the key is the punctuation and the value is the token
    """
    # TODO: Implement Function
    punk_dict = {'.':'||Period||',
                ',':'||Comma||',
                '"':'||Quotation_mark||',
                ';':'||Semicolon||',
                '!':'||Exclamation_mark||',
                '?':'||Question_mark||',
                '(':'||Left_Parentheses||',
                ')':'||Right_Parentheses||',
                '--':'||Dash||',
                '\n':'||Return||'}
    
    return punk_dict

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_tokenize(token_lookup)

Tests Passed


In [37]:
# Preprocess Training, Validation, and Testing Data
import pickle
def preprocess_and_save_data(text, token_lookup, create_lookup_tables):
    """
    Preprocess Text Data
    """

    # Ignore notice, since we don't use it for analysing the data

    token_dict = token_lookup()
    for key, token in token_dict.items():
        text = text.replace(key, ' {} '.format(token))

    text = text.lower()
    text = text.split()

    vocab_to_int, int_to_vocab = create_lookup_tables(text)
    int_text = [vocab_to_int[word] for word in text]
    pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), open('preprocess.p', 'wb'))


In [38]:
preprocess_and_save_data(poems, token_lookup, create_lookup_tables)

In [39]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import helper
import numpy as np
import problem_unittests as tests

int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

In [40]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.0.0




In [41]:
def get_inputs():
    """
    Create TF Placeholders for input, targets, and learning rate.
    :return: Tuple (input, targets, learning rate)
    """
    """
    tf.placeholder : placeholder(
                        dtype
                        shape = None,
                        name = None)
    """
    inputs = tf.placeholder(tf.int32, [None,None], name='input')
    targets = tf.placeholder(tf.int32, [None,None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    
    return inputs, targets, learning_rate
"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_get_inputs(get_inputs)

Tests Passed


In [42]:
def get_init_cell(batch_size, rnn_size):
    """
    Create an RNN Cell and initialize it.
    :param batch_size: Size of batches
    :param rnn_size: Size of RNNs
    :return: Tuple (cell, initialize state)
    """
    # TODO: Implement Function
    
    num_layers = 2
    
    ### Build the LSTM Cell
    # Use a basic LSTM cell
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell([lstm] * num_layers)
    initial_state = tf.identity(cell.zero_state(batch_size, tf.float32), name="initial_state")
   
    return cell, initial_state


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_get_init_cell(get_init_cell)

Tests Passed


In [43]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Create embedding for <input_data>.
    :param input_data: TF placeholder for text input.
    :param vocab_size: Number of words in vocabulary.
    :param embed_dim: Number of embedding dimensions
    :return: Embedded input.
    """
    # TODO: Implement Function

    embedding = tf.Variable(tf.random_uniform((vocab_size, embed_dim), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, input_data)
    return embed


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_get_embed(get_embed)

Tests Passed


In [44]:
def build_rnn(cell, inputs):
    """
    Create a RNN using a RNN Cell
    :param cell: RNN Cell
    :param inputs: Input text data
    :return: Tuple (Outputs, Final State)
    """
    # TODO: Implement Function

    outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype = tf.float32, initial_state = None)
    final_state = tf.identity(final_state, name = "final_state")
    return outputs, final_state


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_build_rnn(build_rnn)

Tests Passed


In [45]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    """
    Build part of the neural network
    :param cell: RNN cell
    :param rnn_size: Size of rnns
    :param input_data: Input data
    :param vocab_size: Vocabulary size
    :param embed_dim: Number of embedding dimensions
    :return: Tuple (Logits, FinalState)
    """
    # TODO: Implement Function
    embed = get_embed(input_data, vocab_size, embed_dim)
    lstm_output, final_state = build_rnn(cell, embed) 
    
    # Reshape
    #seq_output = tf.concat(lstm_output, axis=1)
    #x = tf.reshape(seq_output, [-1, rnn_size])
    
    weights = tf.truncated_normal_initializer(stddev=0.1)
    bias = tf.zeros_initializer()
    
    logits = tf.contrib.layers.fully_connected(lstm_output, vocab_size, weights_initializer = weights, biases_initializer = bias, activation_fn= None)
                       
    return logits, final_state


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_build_nn(build_nn)

Tests Passed


In [46]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: Batches as a Numpy array
    """
    # TODO: Implement Function
    text_len = len(int_text)
    total_batch = batch_size * seq_length
    no_batches = text_len // total_batch
    int_text = np.array(int_text[:total_batch * no_batches])

    # Create offsetting targets

    targets = np.zeros_like(int_text)
    targets[:-1], targets[-1] = int_text[1:], int_text[0]

    # Create output array

    output = np.zeros(len(int_text) * 2)
    output = np.resize(output,(no_batches,2,batch_size,seq_length))

    # inputs
    int_text = np.reshape(int_text,(batch_size,-1))

    for bs in range(0,batch_size):
        for nb in range(0,no_batches):
            output[nb,0,bs] = int_text[bs,nb * seq_length : nb * seq_length + seq_length]

    # targets
    targets = np.reshape(targets,(batch_size,-1))

    for bs in range(0,batch_size):
        for nb in range(0,no_batches):
            output[nb,1,bs] = targets[bs,nb * seq_length : nb * seq_length + seq_length]

    return output


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_get_batches(get_batches)

Tests Passed


In [47]:
# Number of Epochs
num_epochs = 1
# Batch Size
batch_size = 64
# RNN Size
rnn_size = 4
# Embedding Dimension Size
embed_dim = 20
# Sequence Length
seq_length = 8
# Learning Rate
learning_rate = .001
# Show stats for every n number of batches
show_every_n_batches = 10

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
save_dir = './save'

In [48]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
from tensorflow.contrib import seq2seq

train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)
    input_text, targets, lr = get_inputs()
    input_data_shape = tf.shape(input_text)
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)

    # Probabilities for generating words
    probs = tf.nn.softmax(logits, name='probs')

    # Loss function
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient Clipping
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

In [49]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
batches = get_batches(int_text, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_i, (x, y) in enumerate(batches):
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            # Show every <show_every_n_batches> batches
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   0 Batch    0/11   train_loss = 7.522
Epoch   0 Batch   10/11   train_loss = 7.515
Model Trained and Saved


In [50]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
# Save parameters for checkpoint
helper.save_params((seq_length, save_dir))

In [51]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import tensorflow as tf
import numpy as np
import helper
import problem_unittests as tests

_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
seq_length, load_dir = helper.load_params()

In [52]:
def get_tensors(loaded_graph):
    """
    Get input, initial state, final state, and probabilities tensor from <loaded_graph>
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    # TODO: Implement Function
    InputTensor = loaded_graph.get_tensor_by_name('input:0')
    InitialStateTensor = loaded_graph.get_tensor_by_name('initial_state:0')
    FinalStateTensor = loaded_graph.get_tensor_by_name('final_state:0')
    ProbsTensor = loaded_graph.get_tensor_by_name('probs:0')
    
    return InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_get_tensors(get_tensors)

Tests Passed


In [53]:
def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word in the generated text
    :param probabilities: Probabilites of the next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    # TODO: Implement Function
    word = np.random.choice(list(int_to_vocab.values()),p=probabilities)
    
    return word


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_pick_word(pick_word)

Tests Passed


In [83]:
gen_length = 200
# homer_simpson, moe_szyslak, or Barney_Gumble
prime_word = "sad Hour, selected from all  mourn our loss rouse thy obscure compeers"

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)
    gen_sentences = prime_word.split()
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})
    for n in range(gen_length):
        # Dynamic Input
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        pred_word = pick_word(probabilities[dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)
    
    # Remove tokens
    tv_script = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        ending = ' ' if key in ['\n', '(', '"'] else ''
        tv_script = tv_script.replace(' ' + token.lower(), key)
    tv_script = tv_script.replace('\n ', '\n')
    tv_script = tv_script.replace('( ', '(')
        
    print(tv_script)        

sad Hour, selected from all mourn our loss rouse thy obscure compeers core incantation mourna mingle let happier happier thou cares massy stretch burn breakher saddened she: picturing droop watched pitched lived spreadon tender men faint among-- clips both swift lived bulk silken whose annihilation ages illumine chastisement solemn kings:look boughs wounded wont had renownrose vibrates is:what misery breathwhich no swarms ceased bethe dearwith wanderings dimmed mediterranean changed sapless in welcoming surge wither hyacinth work flocks hectic true-love seed form year bier to checks each spray true-love draw firmament build where'er man grasped guarded ploughfor west slumber fear sword thrown -it phantasies long whence shore brink peace trunkless complaint shrank full loveliest benediction bark angel lovely: sphere life's wan bedthe frost though sate wintry brainthat shattered future watch-tower depart blowher spreads visions soughther mist turn fall. stars bethe traveller rude blind d

In [None]:
   # Get Tensors from loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

    # Sentences generation setup
    gen_sentences = [prime_word + ':']
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        pred_word = pick_word(probabilities[dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)
    
    # Remove tokens
    tv_script = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        ending = ' ' if key in ['\n', '(', '"'] else ''
        tv_script = tv_script.replace(' ' + token.lower(), key)
    tv_script = tv_script.replace('\n ', '\n')
    tv_script = tv_script.replace('( ', '(')
        
    print(tv_script)

In [63]:
t ="sad Hour, selected from all yearsTo mourn our loss, rouse thy obscure compeers"

In [75]:
s =t.split()

In [76]:
s[-8:]

['yearsTo', 'mourn', 'our', 'loss,', 'rouse', 'thy', 'obscure', 'compeers']