# The RNN/LSTM on NYT with SP500 running trend as the sentiment indicator

This is an implementation based on the rnn/lstm model we used in Assignment 1, Part 2, adapted to predict and learn to associate each word of a sentence to its corresponding stock trend.  In this case a stock trend will be 'p' if the SP500 mention company's current weekly average price rised compared to the previous week  

### Model Level Description

in utility.py, I switched the corpus loading from 'brown' to 'text.full.txt', the NYT 130051 sents which has a mention to one of the SP500 company.


![RNNLM - layers](RNNLM - layers.png)

In [2]:
import os, sys, re, json, time, shutil
import itertools
import collections
from IPython.display import display

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf

# Pandas because pandas are awesome, and for pretty-printing
import pandas as pd
# Set pandas floating point display
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# Helper libraries for this notebook
import utils; reload(utils)
import vocabulary; reload(vocabulary)
import rnnsm; reload(rnnsm)

<module 'rnnsm' from 'rnnsm.pyc'>

In [14]:
import tensorflow as tf
import rnnsm; reload(rnnsm)

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)


with tf.Graph().as_default():
  tf.set_random_seed(42)

  sm = rnnsm.RNNSM(V=10000, Z=4, H=200, num_layers=2)
  sm.BuildCoreGraph()
  sm.BuildTrainGraph()
  sm.BuildSamplerGraph()

  summary_writer = tf.train.SummaryWriter("tf_summaries", 
                                          tf.get_default_graph())

Instructions for updating:
Please switch to tf.summary.FileWriter. The interface and behavior is the same; this is just a rename.


Instructions for updating:
Please switch to tf.summary.FileWriter. The interface and behavior is the same; this is just a rename.


In [15]:
def run_epoch(sm, session, batch_iterator, train=False,
              verbose=False, tick_s=10, 
              keep_prob=1.0, learning_rate=0.1):
  start_time = time.time()
  tick_time = start_time  # for showing status
  total_cost = 0.0  # total cost, summed over all words
  total_words = 0

  if train:
    train_op = sm.train_step_
    keep_prob = keep_prob
    loss = sm.train_loss_
  else:
    train_op = tf.no_op()
    keep_prob = 1.0  # no dropout at test time
    loss = sm.loss_  # true loss, if train_loss is an approximation

  for i, (w, y) in enumerate(batch_iterator):
    cost = 0.0
    #### YOUR CODE HERE ####
    np.random.seed(42)

    # At first batch in epoch, get a clean intitial state
    if i == 0:
        h = session.run(sm.initial_h_, {sm.input_w_: w})
 
    cost, h, _ = session.run([loss, sm.final_h_, train_op], feed_dict= {sm.target_y_: y, sm.initial_h_:h,
        sm.input_w_: w, sm.dropout_keep_prob_:keep_prob, sm.learning_rate_:learning_rate})      
    
    #### END(YOUR CODE) ####
    total_cost += cost
    total_words += w.size  # w.size = batch_size * max_time

    ##
    # Print average loss-so-far for epoch
    # If using train_loss_, this may be an underestimate.
    if verbose and (time.time() - tick_time >= tick_s):
      avg_cost = total_cost / total_words
      avg_wps = total_words / (time.time() - start_time)
      print "[batch %d]: seen %d words at %d wps, loss = %.3f" % (i,
          total_words, avg_wps, avg_cost)
      tick_time = time.time()  # reset time ticker

  return total_cost / total_words

In [16]:
# Load the dataset
import time
os.environ['TZ'] = 'US/Pacific'
print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())

np.random.seed(168)
reload(utils)
V = 10000
Z = 4
vocab, svocab, train_ids, train_sids, test_ids, test_sids, dev_ids, dev_sids, test_sents, test_sentis = \
    utils.load_data("text.full.txt", "sn0p.full.txt", train=0.5, test=0.25, V=V, Z=Z, shuffle=True)
print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())

Sun, 18 Dec 2016 07:01:38
Loaded 130051 sentences (4.28041e+06 tokens)
Loaded 130051 sentiments (130051 tokens)
Training set: 65025 sentences (2141129 tokens)
Test set: 32513 sentences (1072187 tokens)
dev set: 32513 sentences (1067090 tokens)
Training set: 65025 sentiments (65025 tokens)
Test set: 32513 sentiments (32513 tokens)
dev set: 32513 sentiments (32513 tokens)
Sun, 18 Dec 2016 07:02:18


In [17]:
collections.Counter([i[0] for i in test_sentis]) # for lm approach model

Counter({'0': 79, 'n': 15738, 'p': 16696})

In [18]:
# Training parameters
max_time = 20
batch_size = 50
learning_rate = 0.5
keep_prob = 1.0
num_epochs = 10

# Model parameters
model_params = dict(V=V, 
                    H=100,
                    Z=Z,
                    num_layers=1)

trained_filename = './tf_saved/tf_saved_rnnsm_trained'

In [19]:
def score_dataset(sm, session, ids, sids, name="Data"):
  bi = utils.batch_generator(ids, sids, batch_size=100, max_time=100)
  cost = run_epoch(sm, session, bi, 
                   learning_rate=1.0, keep_prob=1.0, 
                   train=False, verbose=False, tick_s=3600)
  print "%s: avg. loss: %.03f  (perplexity: %.02f)" % (name, cost, np.exp(cost))

In [20]:
# Will print status every this many seconds
reload(utils)
print_interval = 5

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)

with tf.Graph().as_default(), tf.Session() as session:
  # Seed RNG for repeatability
  os.environ['TZ'] = 'US/Pacific'
  print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
  tf.set_random_seed(42)
  
  with tf.variable_scope("model", reuse=None):
    sm = rnnsm.RNNSM(**model_params)
    sm.BuildCoreGraph()
    sm.BuildTrainGraph()
  
  session.run(tf.initialize_all_variables())
  saver = tf.train.Saver()
  
  for epoch in xrange(1,num_epochs+1):
    t0_epoch = time.time()
    bi = utils.batch_generator(train_ids, train_sids, batch_size, max_time)
    print "[epoch %d] Starting epoch %d" % (epoch, epoch)
    #### YOUR CODE HERE ####

    run_epoch(sm, session, bi, train=True, keep_prob=keep_prob, learning_rate=learning_rate)

    #### END(YOUR CODE) ####
    print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    print "[epoch %d] Completed in %s" % (epoch, utils.pretty_timedelta(since=t0_epoch))
    
    ##
    # score_dataset will run a forward pass over the entire dataset
    # and report perplexity scores. This can be slow (around 1/2 to 
    # 1/4 as long as a full epoch), so you may want to comment it out
    # to speed up training on a slow machine. Be sure to run it at the 
    # end to evaluate your score.
    print ("[epoch %d]" % epoch),
    score_dataset(sm, session, train_ids, train_sids, name="Train set")
    print ("[epoch %d]" % epoch),
    score_dataset(sm, session, dev_ids, dev_sids, name="Test set")
    print ""
    
    # Save a checkpoint
    saver.save(session, './tf_saved/tf_saved_rnnsm', global_step=epoch)
    
  # Save final model
  saver.save(session, trained_filename)
  
  print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())

Sun, 18 Dec 2016 07:02:34
Instructions for updating:
Use `tf.global_variables_initializer` instead.


Instructions for updating:
Use `tf.global_variables_initializer` instead.


[epoch 1] Starting epoch 1
in batch_generator 2206155 2206155 2206150 2206150 2206150 50
Sun, 18 Dec 2016 07:06:34
[epoch 1] Completed in 0:03:58
[epoch 1] in batch_generator 2206155 2206155 2206100 2206100 2206100 100
Train set: avg. loss: 0.719  (perplexity: 2.05)
[epoch 1] in batch_generator 1099604 1099604 1099600 1099600 1099600 100
Test set: avg. loss: 0.717  (perplexity: 2.05)

[epoch 2] Starting epoch 2
in batch_generator 2206155 2206155 2206150 2206150 2206150 50
Sun, 18 Dec 2016 07:11:34
[epoch 2] Completed in 0:03:57
[epoch 2] in batch_generator 2206155 2206155 2206100 2206100 2206100 100
Train set: avg. loss: 0.713  (perplexity: 2.04)
[epoch 2] in batch_generator 1099604 1099604 1099600 1099600 1099600 100
Test set: avg. loss: 0.716  (perplexity: 2.05)

[epoch 3] Starting epoch 3
in batch_generator 2206155 2206155 2206150 2206150 2206150 50
Sun, 18 Dec 2016 07:16:29
[epoch 3] Completed in 0:03:53
[epoch 3] in batch_generator 2206155 2206155 2206100 2206100 2206100 100
Train

In [21]:
def sample_step(sm, session, input_w, initial_h):
  """Run a single RNN step and return sampled predictions.
  
  Args:
    sm : rnnsm.RNNSM
    session: tf.Session
    input_w : [batch_size] list of indices
    initial_h : [batch_size, hidden_dims]
  
  Returns:
    final_h : final hidden state, compatible with initial_h
    samples : [batch_size, 1] vector of indices
  """
  #### YOUR CODE HERE ####
  # Reshape input to column vector
  input_w = np.array(input_w, dtype=np.int32).reshape([-1,1])
  
  # Run sample ops
  final_h, samples = session.run([sm.final_h_, sm.pred_samples_], 
        feed_dict={sm.input_w_: input_w, sm.initial_h_: initial_h, sm.dropout_keep_prob_: 1.0, sm.learning_rate_:0.1})
  
  #### END(YOUR CODE) ####
  return final_h, samples[:,-1,:]

In [22]:
def seq_predict(sm, session, seq, vocab, svocab):
  """Score by test_ids vs test_sids"""
  padded_ids = vocab.words_to_ids(utils.canonicalize_words(["<s>"] + seq, 
                                                           wordset=vocab.word_to_id))
  w = np.reshape(padded_ids[:-1], [1,-1])
  h = session.run(sm.initial_h_, {sm.input_w_: w})
  h, y = sample_step(sm, session, w[:,-1:], h)

  y = [1 if k == 3 else k for k in utils.flatten(y)]

  #return [svocab.ids_to_words(k) for k in y]
  return svocab.ids_to_words(y)[0]


In [40]:
with tf.Graph().as_default(), tf.Session() as session:
    tf.set_random_seed(168)
    with tf.variable_scope("model", reuse=None):
        sm = rnnsm.RNNSM(**model_params)
        sm.BuildCoreGraph()
        sm.BuildSamplerGraph()
        
    # Load the trained model
    saver = tf.train.Saver()
    saver.restore(session, './'+trained_filename)
    pred = []

    for s in test_sents:
        pred.append(seq_predict(sm, session, s, vocab, svocab))

    non0 = 0
    correct = 0
    for i in range(len(test_sents)):
        if not test_sentis[i][0] == '0':
            non0 = non0 + 1
            if pred[i] == test_sentis[i][0]:
                correct = correct + 1
print "Test result:", correct, 'out of', non0, ' correct, and total dev is ', len(test_sents)
print "Accuracy rate is %.2f\n" % (correct * 1.0/ non0)                        

Test result: 15894 out of 32434  correct, and total dev is  32513
Accuracy rate is 0.49



In [26]:
print len(train_sids), len(dev_ids), len(test_sids), len(test_sents), len(test_sentis), (len(train_sids)+len(dev_sids)+len(test_sids))

2206155 1099604 1104701 32513 32513 4410460


In [32]:
# Load the dataset
import time
os.environ['TZ'] = 'US/Pacific'
print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())

np.random.seed(168)
reload(utils)
V = 10000
Z = 4
vocab, svocab, train_ids, train_sids, test_ids, test_sids, dev_ids, dev_sids, train_sents, train_sentis, test_sents, test_sentis, dev_sents, dev_sentis = \
    utils.load_data("text.full.txt", "sn0p.full.txt", train=0.5, test=0.25, V=V, Z=Z, shuffle=True)
print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())

Sun, 18 Dec 2016 08:09:46
Loaded 130051 sentences (4.28041e+06 tokens)
Loaded 130051 sentiments (130051 tokens)
Training set: 65025 sentences (2141129 tokens)
Test set: 32513 sentences (1072187 tokens)
dev set: 32513 sentences (1067090 tokens)
Training set: 65025 sentiments (65025 tokens)
Test set: 32513 sentiments (32513 tokens)
dev set: 32513 sentiments (32513 tokens)
Sun, 18 Dec 2016 08:10:28


In [33]:
print len(train_sids), len(dev_ids), len(test_sids), len(test_sents), len(test_sentis), (len(train_sids)+len(dev_sids)+len(test_sids))


2206155 1099604 1104701 32513 32513 4410460


In [34]:
print collections.Counter([i[0] for i in test_sentis])

Counter({'p': 16696, 'n': 15738, '0': 79})


In [41]:
#For the lm approach, both train_sents and dev_sents together is to be split into hot.train and not.train, then test_sents
#is to be split into hot_test and not_test.
print collections.Counter([i[0] for i in train_sentis])
print collections.Counter([i[0] for i in dev_sentis])

Counter({'p': 33780, 'n': 31068, '0': 177})
Counter({'p': 16691, 'n': 15745, '0': 77})


In [56]:
' '.join(train_sents[0])

u'Analysts were full of suggestions yesterday as to where Kodak should consolidate and prune its businesses , regardless of the outcome of the battle with Fuji .'

In [60]:
f = open('hot.train.txt', 'w')
g = open('not.train.txt', 'w')
#first train_sents
for i in range(len(train_sents)):
    if train_sentis[i][0] == 'p':
        f.write(' '.join(train_sents[i]).encode('utf-8')+'\n')
    else:
        if train_sentis[i][0] == 'n':
            g.write(' '.join(train_sents[i]).encode('utf-8')+'\n')
#then dev_sents
for i in range(len(dev_sents)):
    if train_sentis[i][0] == 'p':
        f.write(' '.join(dev_sents[i]).encode('utf-8')+'\n')
    else:
        if train_sentis[i][0] == 'n':
            g.write(' '.join(dev_sents[i]).encode('utf-8')+'\n')
f.close()
g.close()
#then test
f = open('hot.test.txt', 'w')
g = open('not.test.txt', 'w')
#first train_sents
for i in range(len(test_sents)):
    if train_sentis[i][0] == 'p':
        f.write(' '.join(test_sents[i]).encode('utf-8')+'\n')
    else:
        if train_sentis[i][0] == 'n':
            g.write(' '.join(test_sents[i]).encode('utf-8')+'\n')
f.close()
g.close()
