## 3.3 TensorFlow Project (TEXT): RNN with Sentiment Analysis

In [6]:
import numpy as np
import tensorflow as tf
from string import punctuation
from collections import Counter

In [4]:
# Read the raw review text and the matching sentiment labels.
with open('../ClassSampleData/Data1.5_reviews.txt', 'r') as f:
    reviews = f.read()
with open('../ClassSampleData/Data1.5_labels.txt', 'r') as f:
    labels = f.read()

# Strip every punctuation character in one pass, then split into one review per line.
all_text = reviews.translate(str.maketrans('', '', punctuation))
reviews = all_text.split('\n')

# Re-join with spaces so line breaks no longer glue words together, then tokenize.
all_text = ' '.join(reviews)
words = all_text.split()

### Encoding

In [7]:
# Rank words by frequency; the most frequent word gets id 1 (0 is kept free for padding).
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: idx for idx, word in enumerate(vocab, 1)}

# Encode every review as a list of word ids.
reviews_ints = [[vocab_to_int[word] for word in review.split()] for review in reviews]
review_lens = Counter(len(encoded) for encoded in reviews_ints)

# 'positive' -> 1, anything else -> 0
labels = np.array([int(label == 'positive') for label in labels.split('\n')])

# Remove 0-length reviews, together with their labels
non_zero_idx = [idx for idx, encoded in enumerate(reviews_ints) if encoded]

reviews_ints = [reviews_ints[idx] for idx in non_zero_idx]
labels = np.array([labels[idx] for idx in non_zero_idx])

### Creating model datasets

In [8]:
seq_len = 200
# Left-pad reviews shorter than seq_len with zeros; reviews longer than
# seq_len are truncated to their first seq_len tokens.
features = np.zeros((len(reviews_ints), seq_len), dtype=int)
for i, row in enumerate(reviews_ints):
    # A negative start beyond the row width clamps to 0, so long reviews fill the whole row.
    features[i, -len(row):] = np.array(row)[:seq_len]

# Fraction of the data used for training; the remainder is split evenly
# into validation and test sets below.
split_frac = 0.8
split_idx = int(len(features)*split_frac)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

In [33]:
def get_batches(x, y, batch_size=100):
    """
    Yield consecutive (x, y) slices of exactly <batch_size> items.
    Trailing items that do not fill a complete batch are dropped.
    :param x: Sliceable sequence of inputs
    :param y: Sliceable sequence of labels, aligned with x
    :param batch_size: Number of items per batch
    """
    full_batches = len(x) // batch_size
    for batch_no in range(full_batches):
        lo = batch_no * batch_size
        hi = lo + batch_size
        yield x[lo:hi], y[lo:hi]

### Build the graph

In [30]:
lstm_size = 256
lstm_layers = 1
batch_size = 500
learning_rate = 0.001

n_words = len(vocab_to_int) + 1 # Adding 1 because we use 0's for padding, dictionary started at 1
embed_size = 300 

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    # Word ids, one row per review
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    # Labels are fed as a column (see y[:, None] in the training loop)
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    # Trainable embedding table, one row per word id (row 0 is the padding id)
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)
    
    # Stack up LSTM layers. Each layer needs its OWN cell object: repeating a
    # single instance ([drop] * lstm_layers) would make every layer share one
    # cell, which fails as soon as lstm_layers > 1.
    cell = tf.contrib.rnn.MultiRNNCell([
        tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.BasicLSTMCell(lstm_size), output_keep_prob=keep_prob)
        for _ in range(lstm_layers)
    ])
    
    # Getting an initial state of all zeros (fixes the batch dimension to batch_size)
    initial_state = cell.zero_state(batch_size, tf.float32)
    
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
    
    # Only the output of the last time step feeds the sigmoid classifier
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
    # Round the sigmoid output to 0/1 and compare with the labels
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


In [35]:
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        # Every epoch starts from the all-zero LSTM state
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            # The final state of this batch is carried over as the initial state of the next one
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%25==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%150==0:
                val_acc = []
                # Reuse the zero-state op that is already in the graph; calling
                # cell.zero_state() here would add new ops on every validation pass.
                val_state = sess.run(initial_state)
                for x, y in get_batches(val_x, val_y, batch_size):
                    # keep_prob of 1 disables dropout during validation
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/10 Iteration: 25 Train loss: 0.225
Epoch: 1/10 Iteration: 50 Train loss: 0.207
Epoch: 1/10 Iteration: 75 Train loss: 0.128
Epoch: 2/10 Iteration: 100 Train loss: 0.144
Epoch: 3/10 Iteration: 125 Train loss: 0.128
Epoch: 3/10 Iteration: 150 Train loss: 0.125
Val acc: 0.786
Epoch: 4/10 Iteration: 175 Train loss: 0.189
Epoch: 4/10 Iteration: 200 Train loss: 0.108
Epoch: 5/10 Iteration: 225 Train loss: 0.067
Epoch: 6/10 Iteration: 250 Train loss: 0.125
Epoch: 6/10 Iteration: 275 Train loss: 0.056
Epoch: 7/10 Iteration: 300 Train loss: 0.083
Val acc: 0.802
Epoch: 8/10 Iteration: 325 Train loss: 0.059
Epoch: 8/10 Iteration: 350 Train loss: 0.047
Epoch: 9/10 Iteration: 375 Train loss: 0.046
Epoch: 9/10 Iteration: 400 Train loss: 0.042


In [36]:
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    # Reuse the zero-state op already in the graph instead of adding a new one
    test_state = sess.run(initial_state)
    for x, y in get_batches(test_x, test_y, batch_size):
        # keep_prob of 1 disables dropout during evaluation
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints\sentiment.ckpt
Test accuracy: 0.854


## 3.4 TensorFlow Project (TEXT): RNN with TV Script

In [58]:
import pickle
import numpy as np

from collections import Counter
from tensorflow.contrib import seq2seq
import tensorflow as tf


In [54]:
# Helper function for 3.4

def load_data(input_file):
    """
    Read a text file and return its whole content as one string.
    :param input_file: Path of the file to read
    :return: File content
    """
    with open(input_file, "r") as source:
        return source.read()

def preprocess_and_save_data(dataset_path, token_lookup, create_lookup_tables):
    """
    Preprocess Text Data and pickle the result to ./checkpoints/N3_3_preprocess.p
    :param dataset_path: Path of the raw text file
    :param token_lookup: Callable returning a dict {punctuation: token}
    :param create_lookup_tables: Callable mapping a word list to (vocab_to_int, int_to_vocab)
    """
    text = load_data(dataset_path)
    
    # Ignore notice, since we don't use it for analysing the data
    # (the first 81 characters of the file are skipped)
    text = text[81:]

    # Surround each punctuation token with spaces so it becomes its own word
    token_dict = token_lookup()
    for key, token in token_dict.items():
        text = text.replace(key, ' {} '.format(token))

    # Lower-casing happens after the replacement, so the tokens are lower-cased too
    text = text.lower()
    text = text.split()

    vocab_to_int, int_to_vocab = create_lookup_tables(text)
    int_text = [vocab_to_int[word] for word in text]
    # Context manager guarantees the pickle file is flushed and closed
    with open('./checkpoints/N3_3_preprocess.p', 'wb') as out_file:
        pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), out_file)


def load_preprocess():
    """
    Load the preprocessed data pickled by preprocess_and_save_data
    :return: The unpickled object, a tuple (int_text, vocab_to_int, int_to_vocab, token_dict)
    """
    # NOTE: pickle.load executes arbitrary code; only load files this notebook wrote itself.
    with open('./checkpoints/N3_3_preprocess.p', mode='rb') as in_file:
        return pickle.load(in_file)


def save_params(params):
    """
    Save parameters to file (./checkpoints/N3_3_params.p)
    :param params: Any picklable object
    """
    # Context manager guarantees the pickle file is flushed and closed
    with open('./checkpoints/N3_3_params.p', 'wb') as out_file:
        pickle.dump(params, out_file)


def load_params():
    """
    Load parameters from file (./checkpoints/N3_3_params.p)
    :return: The object previously stored with save_params
    """
    # NOTE: pickle.load executes arbitrary code; only load files this notebook wrote itself.
    with open('./checkpoints/N3_3_params.p', mode='rb') as in_file:
        return pickle.load(in_file)

def create_lookup_tables(text):
    """
    Build the word <-> id lookup tables.
    Ids start at 1 and follow descending word frequency.
    :param text: List of words
    :return: Tuple (vocab_to_int, int_to_vocab)
    """
    frequency = Counter(text)
    ranked_words = sorted(frequency, key=frequency.get, reverse=True)
    int_to_vocab = dict(enumerate(ranked_words, 1))
    vocab_to_int = {word: word_id for word_id, word in int_to_vocab.items()}
    return vocab_to_int, int_to_vocab

def token_lookup():
    """
    Map each punctuation mark to the placeholder token that replaces it.
    :return: Dict {punctuation: token}
    """
    return {
        '.': '||Period||',
        ',': '||Comma||',
        '"': '||QuotationM||',
        ';': '||SemicolonM||',
        '!': '||ExclamationM||',
        '?': '||QuestionM||',
        '(': '||LeftP||',
        ')': '||RightP||',
        '--': '||Dash||',
        '\n': '||Return||',
    }

In [55]:
# NOTE: despite its name, data_dir holds the path of the script text file itself.
data_dir = '../ClassSampleData/moes_tavern_lines.txt'
# Tokenize and encode the script, pickle the result, then read it straight back.
preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)
# NOTE: this rebinds vocab_to_int, which section 3.3 also defines.
int_text, vocab_to_int, int_to_vocab, token_dict = load_preprocess()

In [56]:
def get_inputs():
    """
    Build the three placeholders the training graph is fed through.
    :return: Tuple (input, targets, learning rate)
    """
    # Creation order is kept: input, targets, then the unnamed learning-rate placeholder
    input_ph = tf.placeholder(tf.int32, shape=[None, None], name='input')
    targets_ph = tf.placeholder(tf.int32, shape=[None, None], name='targets')
    lr_ph = tf.placeholder(tf.float32)
    return input_ph, targets_ph, lr_ph

def get_init_cell(batch_size, rnn_size, keep_prob=0.5):
    """
    Create an RNN Cell and initialize it.
    :param batch_size: Size of batches
    :param rnn_size: Size of RNNs
    :param keep_prob: Output keep probability of the dropout wrapper. Defaults to
        0.5, the value that used to be hard-coded. Whatever is passed here is baked
        into the graph, so a constant also applies when the saved graph is reused
        for generation; pass a scalar tensor to make it feedable.
    :return: Tuple (cell, initialize state)
    """
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([drop])
    # tf.identity gives the zero state a fixed name so it can be fetched by name later
    initial_state = tf.identity(cell.zero_state(batch_size, tf.float32), name='initial_state')
    return cell, initial_state

def get_embed(input_data, vocab_size, embed_dim):
    """
    Look up a trainable embedding vector for every id in <input_data>.
    :param input_data: TF placeholder for text input.
    :param vocab_size: Number of rows in the embedding table.
    :param embed_dim: Number of embedding dimensions
    :return: Embedded input.
    """
    # Table is initialised uniformly in [-1, 1)
    table_init = tf.random_uniform((vocab_size, embed_dim), -1, 1)
    embedding_table = tf.Variable(table_init)
    return tf.nn.embedding_lookup(embedding_table, input_data)

def build_rnn(cell, inputs):
    """
    Unroll <cell> over <inputs> with tf.nn.dynamic_rnn.
    No initial_state is passed to dynamic_rnn, only dtype=tf.float32.
    :param cell: RNN Cell
    :param inputs: Input text data
    :return: Tuple (Outputs, Final State)
    """
    rnn_outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    # Named so the tensor can be fetched as 'final_state:0' from a reloaded graph
    return rnn_outputs, tf.identity(last_state, name='final_state')

def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    """
    Chain embedding -> RNN -> linear output layer.
    :param cell: RNN cell
    :param rnn_size: Size of rnns (not used by this function)
    :param input_data: Input data
    :param vocab_size: Vocabulary size; also the width of the output layer
    :param embed_dim: Number of embedding dimensions
    :return: Tuple (Logits, FinalState)
    """
    embedded = get_embed(input_data, vocab_size, embed_dim)
    rnn_outputs, final_state = build_rnn(cell, embedded)
    # No activation: the raw logits are returned
    logits = tf.contrib.layers.fully_connected(
        rnn_outputs, vocab_size, activation_fn=None)
    return logits, final_state

def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their ids (a list)
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: Float Numpy array of shape (n_batch, 2, batch_size, seq_length);
             index 0 on axis 1 holds the inputs, index 1 the targets
    """
    words_per_batch = batch_size * seq_length
    n_batch = len(int_text) // words_per_batch
    # Words that do not fill a complete batch are dropped
    inputs = int_text[:n_batch * words_per_batch]
    # Targets are the inputs shifted by one word; the last target wraps to the first input
    shifted = inputs[1:] + inputs[:1]

    def arrange(flat):
        # Row j of batch i starts at word j*n_batch*seq_length + i*seq_length
        grid = np.array(flat, dtype=float).reshape(batch_size, n_batch, seq_length)
        return grid.transpose(1, 0, 2)

    return np.stack([arrange(inputs), arrange(shifted)], axis=1)

In [57]:
# Number of Epochs
num_epochs = 50
# Batch Size
# NOTE: rebinds batch_size, which section 3.3 also defines
batch_size = 100
# RNN Size
rnn_size = 1024
# Embedding Dimension Size
embed_dim = 300
# Sequence Length
seq_length = 50
# Learning Rate
# NOTE: rebinds learning_rate, which section 3.3 also defines
learning_rate = 0.003
# Show stats for every n number of batches
show_every_n_batches = 100

# Passed as the save path to saver.save() and stored via save_params() by the training cell
save_dir = './checkpoints'

In [59]:
train_graph = tf.Graph()
with train_graph.as_default():
    # create_lookup_tables numbers words from 1, so the largest id equals
    # len(int_to_vocab). The embedding table and the output layer therefore
    # need one extra slot, otherwise that id is out of range for both.
    # Slot 0 is never used as a target.
    vocab_size = len(int_to_vocab) + 1
    input_text, targets, lr = get_inputs()
    input_data_shape = tf.shape(input_text)
    # The batch size is taken from the fed input at run time
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)

    # Probabilities for generating words
    probs = tf.nn.softmax(logits, name='probs')

    # Loss function (every position of every sequence weighted equally)
    cost = seq2seq.sequence_loss(logits, targets, tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient Clipping: clip every gradient element to [-1, 1]
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

### Train and save model and parameters

In [60]:
# Uses the three-argument get_batches defined for section 3.4; the result is
# indexed as batches[batch][0 = inputs, 1 = targets].
batches = get_batches(int_text, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        # The zero state's batch dimension comes from the fed input, so the first input batch is supplied
        state = sess.run(initial_state, {input_text: batches[0][0]})

        # Iterating over batches yields one (inputs, targets) pair per batch
        for batch_i, (x, y) in enumerate(batches):
            # NOTE(review): build_rnn calls dynamic_rnn without initial_state, so the
            # state fed here does not appear to influence final_state -- confirm.
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            # Show every <show_every_n_batches> batches
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')
# Persist what the generation cells read back through load_params()
save_params((seq_length, save_dir))

Epoch   0 Batch    0/13   train_loss = 8.826
Epoch   7 Batch    9/13   train_loss = 4.013
Epoch  15 Batch    5/13   train_loss = 2.565
Epoch  23 Batch    1/13   train_loss = 1.799
Epoch  30 Batch   10/13   train_loss = 1.222
Epoch  38 Batch    6/13   train_loss = 0.779
Epoch  46 Batch    2/13   train_loss = 0.478
Model Trained and Saved


In [62]:
# Reload the lookup tables from the pickle; the encoded text (first element) is discarded here.
_, vocab_to_int, int_to_vocab, token_dict = load_preprocess()
# Reload the tuple the training cell stored with save_params((seq_length, save_dir)).
seq_length, load_dir = load_params()

In [64]:
def get_tensors(loaded_graph):
    """
    Get input, initial state, final state, and probabilities tensor from <loaded_graph>
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    # The tensors are looked up on the graph directly; no session is needed for that.
    tensor_names = ("input:0", "initial_state:0", "final_state:0", "probs:0")
    return tuple(loaded_graph.get_tensor_by_name(name) for name in tensor_names)

def pick_word(probabilities, int_to_vocab):
    """
    Choose the most probable next word (greedy pick).
    :param probabilities: Probabilities of the next word, indexed by word id
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    best_id = np.asarray(probabilities).argmax()
    return int_to_vocab[best_id]

In [65]:
gen_length = 200
# The prime word must be a lower-cased speaker name from the vocabulary
# (the text is lower-cased during preprocessing),
# e.g. homer_simpson, moe_szyslak, or barney_gumble
prime_word = 'moe_szyslak'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)

    # Get Tensors from loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

    # Sentences generation setup
    gen_sentences = [prime_word + ':']
    # A 1x1 dummy input fixes the batch dimension of the zero state to 1
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input: at most the last seq_length generated words
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        # Only the distribution at the last time step predicts the next word
        pred_word = pick_word(probabilities[0][dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)
    
    # Remove tokens: turn each ' <token>' back into its punctuation mark
    tv_script = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        tv_script = tv_script.replace(' ' + token.lower(), key)
    # Drop the space left behind after a newline or an opening parenthesis
    tv_script = tv_script.replace('\n ', '\n')
    tv_script = tv_script.replace('( ', '(')
        
    print(tv_script)

INFO:tensorflow:Restoring parameters from ./checkpoints
moe_szyslak: really?
moe_szyslak: well, i assume i'm not the clone moe.
moe_szyslak: are you kiddin' me?
homer_simpson: hello, is this one alive.
moe_szyslak:(into phone) moe's tavern. hold on the roof...
moe_szyslak: okay, here's your bets.
moe_szyslak: yuh-huh.
homer_simpson: fine. there are plenty of other ways for me to alter my consciousness.
carl_carlson: you know who the one who was"!
moe_szyslak: what?
moe_szyslak: well, lessee...
moe_szyslak: okay. i got a window here.
homer_simpson:(pointed) and tip.


moe_szyslak: let's see you little here hey moe, you sure look like a jerk later!
homer_simpson: you know, i love you, moe.
moe_szyslak: hey, hey, ya mugs. thinkin' ain't drinkin'!
c. _montgomery_burns: work for the first time.
carl_carlson: yeah, you should not drink to forget your problems.
barney_gumble: yeah, you should only drink to forget your problems.
barney_gumble: yeah,
