In [1]:
#!pip install --upgrade pip
#!pip install --upgrade tensorflow
#!pip install --upgrade numpy
#!pip install --upgrade nltk
#!pip install --upgrade pandas


In [2]:
import os, sys, re, json, time, shutil
import itertools
import collections
from IPython.display import display

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf

# Pandas because pandas are awesome, and for pretty-printing
import pandas as pd
# Set pandas floating point display
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# Helper libraries for this notebook
import utils; reload(utils)
import vocabulary; reload(vocabulary)
import rnnsm; reload(rnnsm)

<module 'rnnsm' from 'rnnsm.pyc'>

In [3]:
import tensorflow as tf
import rnnsm; reload(rnnsm)

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)


with tf.Graph().as_default():
  tf.set_random_seed(42)

  sm = rnnsm.RNNSM(V=10000, Z=6, H=200, num_layers=2)
  sm.BuildCoreGraph()
  sm.BuildTrainGraph()
  sm.BuildSamplerGraph()

  summary_writer = tf.train.SummaryWriter("tf_summaries", 
                                          tf.get_default_graph())

In [4]:
def run_epoch(sm, session, batch_iterator, train=False,
              verbose=False, tick_s=10, 
              keep_prob=1.0, learning_rate=0.1):
  start_time = time.time()
  tick_time = start_time  # for showing status
  total_cost = 0.0  # total cost, summed over all words
  total_words = 0

  if train:
    train_op = sm.train_step_
    keep_prob = keep_prob
    loss = sm.train_loss_
  else:
    train_op = tf.no_op()
    keep_prob = 1.0  # no dropout at test time
    loss = sm.loss_  # true loss, if train_loss is an approximation

  for i, (w, y) in enumerate(batch_iterator):
    cost = 0.0
    #### YOUR CODE HERE ####
    np.random.seed(42)

    # At first batch in epoch, get a clean intitial state
    if i == 0:
        h = session.run(sm.initial_h_, {sm.input_w_: w})
 
    cost, h, _ = session.run([loss, sm.final_h_, train_op], feed_dict= {sm.target_y_: y, sm.initial_h_:h,
        sm.input_w_: w, sm.dropout_keep_prob_:keep_prob, sm.learning_rate_:learning_rate})      
    
    #### END(YOUR CODE) ####
    total_cost += cost
    total_words += w.size  # w.size = batch_size * max_time

    ##
    # Print average loss-so-far for epoch
    # If using train_loss_, this may be an underestimate.
    if verbose and (time.time() - tick_time >= tick_s):
      avg_cost = total_cost / total_words
      avg_wps = total_words / (time.time() - start_time)
      print "[batch %d]: seen %d words at %d wps, loss = %.3f" % (i,
          total_words, avg_wps, avg_cost)
      tick_time = time.time()  # reset time ticker

  return total_cost / total_words

In [5]:
# Load the dataset
import time
os.environ['TZ'] = 'US/Pacific'
print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
reload(utils)
V = 10000
Z = 4
vocab, svocab, train_ids, train_sids, test_ids, test_sids, dev_ids, dev_sids, test_sents, test_sentis = \
    utils.load_data("text.full.txt", "sn0p.new.full.txt", train=0.5, test=0.25, V=V, Z=Z, shuffle=True)
print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())

Sat, 17 Dec 2016 19:38:24
Loaded 130051 sentences (4.28041e+06 tokens)
Loaded 130051 sentiments (130051 tokens)
Training set: 65025 sentences (2137524 tokens)
Test set: 32513 sentences (1072135 tokens)
dev set: 32513 sentences (1070747 tokens)
Training set: 65025 sentiments (65025 tokens)
Test set: 32513 sentiments (32513 tokens)
dev set: 32513 sentiments (32513 tokens)
Sat, 17 Dec 2016 19:39:30


In [6]:
print train_ids[:100]
print train_sids[:100]
print test_ids[:100]
print test_sids[:100]
print dev_ids[:100]
print dev_sids[:100]
print test_sents[:30]
print test_sentis[:30]
print len(train_ids), len(train_sids), len(test_ids), len(test_sids), len(dev_ids), len(dev_ids)

[   0  343 5635   62  145   35  532   13 5050    3   45   22   11   41   39
  607  729    9    4    2    7  223   11 1128   62    7 1634 2865    5    0
   22   31  360    9   40   12   93  176    3   33   31  399  388    6 6101
   99  379   70   59  966    3  499   87   17 3996  109    5    0  470  306
    3    9   75    7  174  615 3982    9  333  616    8  411 1576    3   96
   12   22   32  839   14  228  395   26   40  320    3   58   24   48    7
   14  326  219   15   96   22   32 2320  233   40]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[   0   20   11  351   13   14  194 1344    7  115    3  255 2172    8 1001
    6   35  984  413  194   52    3  287 1321    6   43  928   12 1968    5
    0    4  549   31 3191   26    2  210 3062    3   33  658   21   53   49
    9 9111 1131  177  606  751   79    4   25 1231 1401 1456

In [15]:
# Training parameters
max_time = 20
batch_size = 50
learning_rate = 0.5
keep_prob = 1.0
num_epochs = 5

# Model parameters
model_params = dict(V=V, 
                    H=100,
                    Z=Z,
                    num_layers=1)

trained_filename = './tf_saved/tf_saved_rnnsm_trained'

In [16]:
def score_dataset(sm, session, ids, sids, name="Data"):
  bi = utils.batch_generator(ids, sids, batch_size=100, max_time=100)
  cost = run_epoch(sm, session, bi, 
                   learning_rate=1.0, keep_prob=1.0, 
                   train=False, verbose=False, tick_s=3600)
  print "%s: avg. loss: %.03f  (perplexity: %.02f)" % (name, cost, np.exp(cost))

In [9]:
# Will print status every this many seconds
reload(utils)
print_interval = 5

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)

with tf.Graph().as_default(), tf.Session() as session:
  # Seed RNG for repeatability
  os.environ['TZ'] = 'US/Pacific'
  print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
  tf.set_random_seed(42)
  
  with tf.variable_scope("model", reuse=None):
    sm = rnnsm.RNNSM(**model_params)
    sm.BuildCoreGraph()
    sm.BuildTrainGraph()
  
  session.run(tf.initialize_all_variables())
  saver = tf.train.Saver()
  
  for epoch in xrange(1,num_epochs+1):
    t0_epoch = time.time()
    bi = utils.batch_generator(train_ids, train_sids, batch_size, max_time)
    print "[epoch %d] Starting epoch %d" % (epoch, epoch)
    #### YOUR CODE HERE ####

    run_epoch(sm, session, bi, train=True, keep_prob=keep_prob, learning_rate=learning_rate)

    #### END(YOUR CODE) ####
    print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    print "[epoch %d] Completed in %s" % (epoch, utils.pretty_timedelta(since=t0_epoch))
    
    ##
    # score_dataset will run a forward pass over the entire dataset
    # and report perplexity scores. This can be slow (around 1/2 to 
    # 1/4 as long as a full epoch), so you may want to comment it out
    # to speed up training on a slow machine. Be sure to run it at the 
    # end to evaluate your score.
    print ("[epoch %d]" % epoch),
    score_dataset(sm, session, train_ids, train_sids, name="Train set")
    print ("[epoch %d]" % epoch),
    score_dataset(sm, session, dev_ids, dev_sids, name="Test set")
    print ""
    
    # Save a checkpoint
    saver.save(session, './tf_saved/tf_saved_rnnsm', global_step=epoch)
    
  # Save final model
  saver.save(session, trained_filename)
  
  print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())

Wed, 14 Dec 2016 07:26:53
Instructions for updating:
Use `tf.global_variables_initializer` instead.


Instructions for updating:
Use `tf.global_variables_initializer` instead.


[epoch 1] Starting epoch 1
in batch_generator 2206693 2206693 2206650 2206650 2206650 50
Wed, 14 Dec 2016 07:31:29
[epoch 1] Completed in 0:04:34
[epoch 1] in batch_generator 2206693 2206693 2206600 2206600 2206600 100
Train set: avg. loss: 0.726  (perplexity: 2.07)
[epoch 1] in batch_generator 1100956 1100956 1100900 1100900 1100900 100
Test set: avg. loss: 0.722  (perplexity: 2.06)

[epoch 2] Starting epoch 2
in batch_generator 2206693 2206693 2206650 2206650 2206650 50
Wed, 14 Dec 2016 07:37:17
[epoch 2] Completed in 0:04:32
[epoch 2] in batch_generator 2206693 2206693 2206600 2206600 2206600 100
Train set: avg. loss: 0.718  (perplexity: 2.05)
[epoch 2] in batch_generator 1100956 1100956 1100900 1100900 1100900 100
Test set: avg. loss: 0.717  (perplexity: 2.05)

[epoch 3] Starting epoch 3
in batch_generator 2206693 2206693 2206650 2206650 2206650 50
Wed, 14 Dec 2016 07:43:07
[epoch 3] Completed in 0:04:36
[epoch 3] in batch_generator 2206693 2206693 2206600 2206600 2206600 100
Train

In [17]:
def sample_step(sm, session, input_w, initial_h):
  """Run a single RNN step and return sampled predictions.
  
  Args:
    sm : rnnsm.RNNSM
    session: tf.Session
    input_w : [batch_size] list of indices
    initial_h : [batch_size, hidden_dims]
  
  Returns:
    final_h : final hidden state, compatible with initial_h
    samples : [batch_size, 1] vector of indices
  """
  #### YOUR CODE HERE ####
  # Reshape input to column vector
  input_w = np.array(input_w, dtype=np.int32).reshape([-1,1])
  
  # Run sample ops
  final_h, samples = session.run([sm.final_h_, sm.pred_samples_], 
        feed_dict={sm.input_w_: input_w, sm.initial_h_: initial_h, sm.dropout_keep_prob_: 1.0, sm.learning_rate_:0.1})
  
  #### END(YOUR CODE) ####
  return final_h, samples[:,-1,:]

In [18]:
#skip unless mixed
# Same as above, but as a batch
import collections
reload(rnnsm)
max_steps = 20
num_samples = 10
random_seed = 42

with tf.Graph().as_default(), tf.Session() as session:
  # Seed RNG for repeatability
  tf.set_random_seed(random_seed)

  with tf.variable_scope("model", reuse=None):
    sm = rnnsm.RNNSM(**model_params)
    sm.BuildCoreGraph()
    sm.BuildSamplerGraph()

  # Load the trained model
  saver = tf.train.Saver()
  saver.restore(session, trained_filename)

  # Make initial state for a batch with batch_size = num_samples
  w = np.repeat([[vocab.START_ID]], num_samples, axis=0)
  #print w
  h = session.run(sm.initial_h_, {sm.input_w_: w})
  # We'll take one step for each sequence on each iteration 
  for i in xrange(max_steps):
    h, y = sample_step(sm, session, w[:,-1:], h)
    w = np.hstack((w,y))
  #print w
  svocab=vocabulary.Vocabulary(True)
  for row in w:
    k = [sword_id for i, sword_id in enumerate(row)]
    k = [svocab.id_to_word[sword_id] for sword_id in k]
    #print collections.Counter(k)
    #print svocab.id_to_word[sword_id],
  #collections.Counter([svocab.id_to_word(sword_id) for i, sword_id in k])
        
  # Print generated sentences
  #for row in w:
  #  for i, word_id in enumerate(row):
  #    print vocab.id_to_word[word_id],
  #    if (i != 0) and (word_id == vocab.START_ID):
  #      break
  #  print ""

In [46]:
def seq_predict(sm, session, seq, vocab, svocab):
  """Score by test_ids vs test_sids"""
  padded_ids = vocab.words_to_ids(utils.canonicalize_words(["<s>"] + seq, 
                                                           wordset=vocab.word_to_id))
  w = np.reshape(padded_ids[:-1], [1,-1])
  h = session.run(sm.initial_h_, {sm.input_w_: w})
  h, y = sample_step(sm, session, w[:,-1:], h)

  y = [1 if k == 3 else k for k in utils.flatten(y)]

  #return [svocab.ids_to_words(k) for k in y]
  return svocab.ids_to_words(y)[0]


In [48]:
with tf.Graph().as_default(), tf.Session() as session:  
    with tf.variable_scope("model", reuse=None):
        sm = rnnsm.RNNSM(**model_params)
        sm.BuildCoreGraph()
        sm.BuildSamplerGraph()
        
    # Load the trained model
    saver = tf.train.Saver()
    saver.restore(session, './'+trained_filename)
    pred = []
    for s in test_sents:
        pred.append(seq_predict(sm, session, s, vocab, svocab))
                        
    correct = 0
    for i in range(len(test_sents)):
        if pred[i] == test_sentis[i][0]:
            correct = correct + 1
    print correct
print "Accuracy rate is %.2f\n" % (correct * 1.0/ len(test_sents))                        

15396
Accuracy rate is 0.47

