In [1]:
#!pip install --upgrade pip
#!pip install --upgrade tensorflow
#!pip install --upgrade numpy
#!pip install --upgrade nltk
#!pip install --upgrade pandas


In [2]:
import os, sys, re, json, time, shutil
import itertools
import collections
from IPython.display import display

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf

# Pandas because pandas are awesome, and for pretty-printing
import pandas as pd
# Set pandas floating point display
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# Helper libraries for this notebook
import utils; reload(utils)
import vocabulary; reload(vocabulary)
import rnnsm; reload(rnnsm)

<module 'rnnsm' from 'rnnsm.pyc'>

In [3]:
import tensorflow as tf
import rnnsm; reload(rnnsm)

# Clear old log directory so events from earlier runs do not accumulate.
shutil.rmtree("tf_summaries", ignore_errors=True)

# Build the core, training and sampler graphs inside a fresh tf.Graph and
# hand that graph to a summary writer pointed at the "tf_summaries" directory.
with tf.Graph().as_default():
  tf.set_random_seed(42)

  sm = rnnsm.RNNSM(V=10000, Z=6, H=200, num_layers=2)
  sm.BuildCoreGraph()
  sm.BuildTrainGraph()
  sm.BuildSamplerGraph()

  # tf.train.SummaryWriter is deprecated; tf.summary.FileWriter is the renamed
  # class with the same interface and behavior.
  summary_writer = tf.summary.FileWriter("tf_summaries",
                                         tf.get_default_graph())

Instructions for updating:
Please switch to tf.summary.FileWriter. The interface and behavior is the same; this is just a rename.


Instructions for updating:
Please switch to tf.summary.FileWriter. The interface and behavior is the same; this is just a rename.


In [4]:
def run_epoch(sm, session, batch_iterator, train=False,
              verbose=False, tick_s=10, 
              keep_prob=1.0, learning_rate=0.1):
  """Run the model once over every batch and return the average cost.

  Args:
    sm: model object exposing the tensors/ops used below (train_step_,
      train_loss_, loss_, initial_h_, final_h_, input_w_, target_y_,
      dropout_keep_prob_, learning_rate_).
    session: session used to evaluate the fetches (needs a
      run(fetches, feed_dict) method).
    batch_iterator: iterable of (w, y) pairs; w must expose `.size`.
    train: if True, also run sm.train_step_ and accumulate sm.train_loss_;
      otherwise accumulate sm.loss_ and run no training op.
    verbose: if True, print a progress line at most every tick_s seconds.
    tick_s: minimum number of seconds between progress lines.
    keep_prob: value fed to sm.dropout_keep_prob_ when training; forced to
      1.0 when train is False.
    learning_rate: value fed to sm.learning_rate_.

  Returns:
    Accumulated cost divided by the accumulated w.size over all batches, or
    NaN if batch_iterator yielded no batches.
  """
  start_time = time.time()
  tick_time = start_time  # for showing status
  total_cost = 0.0  # running sum of the per-batch cost values
  total_words = 0

  if train:
    loss = sm.train_loss_
    fetches = [loss, sm.final_h_, sm.train_step_]
  else:
    keep_prob = 1.0  # no dropout at test time
    loss = sm.loss_  # true loss, if train_loss is an approximation
    # No training op is fetched here; creating a tf.no_op() placeholder on
    # every call would add a new node to the graph each time.
    fetches = [loss, sm.final_h_]

  # NOTE: the global NumPy RNG is deliberately not reseeded here. Seeding
  # belongs to the caller; doing it per batch would reset any NumPy-based
  # randomness (e.g. in a lazy batch generator) on every iteration.
  for i, (w, y) in enumerate(batch_iterator):
    # At first batch in epoch, get a clean initial state
    if i == 0:
      h = session.run(sm.initial_h_, {sm.input_w_: w})

    feed_dict = {
        sm.target_y_: y,
        sm.initial_h_: h,
        sm.input_w_: w,
        sm.dropout_keep_prob_: keep_prob,
        sm.learning_rate_: learning_rate,
    }
    results = session.run(fetches, feed_dict=feed_dict)
    # Carry the final state of this batch over as the next initial state.
    cost, h = results[0], results[1]

    total_cost += cost
    total_words += w.size  # w.size = batch_size * max_time

    ##
    # Print average loss-so-far for epoch
    # If using train_loss_, this may be an underestimate.
    if verbose and (time.time() - tick_time >= tick_s):
      avg_cost = total_cost / total_words
      avg_wps = total_words / (time.time() - start_time)
      print("[batch %d]: seen %d words at %d wps, loss = %.3f" % (i,
          total_words, avg_wps, avg_cost))
      tick_time = time.time()  # reset time ticker

  if total_words == 0:
    # Empty iterator: there is no meaningful average to report.
    return float("nan")
  return total_cost / total_words

In [5]:
# Load the dataset
import time
# Timestamps below are meant to be shown in US Pacific time. Changing TZ in
# the environment is only guaranteed to take effect after time.tzset(), which
# is not available on every platform, hence the guard.
os.environ['TZ'] = 'US/Pacific'
if hasattr(time, 'tzset'):
  time.tzset()
print(time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()))
reload(utils)
V = 10000
Z = 4
# load_data is called with train=0.5 and test=0.25; the split sizes it
# reports are printed by the helper itself.
(vocab, svocab,
 train_ids, train_sids,
 test_ids, test_sids,
 dev_ids, dev_sids,
 test_sents, test_sentis) = utils.load_data(
    "text.full.txt", "sn0p.new.full.txt",
    train=0.5, test=0.25, V=V, Z=Z, shuffle=True)
print(time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()))

Sat, 17 Dec 2016 20:43:20
Loaded 130051 sentences (4.28041e+06 tokens)
Loaded 130051 sentiments (130051 tokens)
Training set: 65025 sentences (2143415 tokens)
Test set: 32513 sentences (1069358 tokens)
dev set: 32513 sentences (1067633 tokens)
Training set: 65025 sentiments (65025 tokens)
Test set: 32513 sentiments (32513 tokens)
dev set: 32513 sentiments (32513 tokens)
Sat, 17 Dec 2016 20:44:02


In [6]:
print train_ids[:100]
print train_sids[:100]
print test_ids[:100]
print test_sids[:100]
print dev_ids[:100]
print dev_sids[:100]
print test_sents[:30]
print test_sentis[:30]
print len(train_ids), len(train_sids), len(test_ids), len(test_sids), len(dev_ids), len(dev_ids)

[   0  162   11  194  591   15  477 3589  862  676    2 4451 3145   13  209
  483 2327 5990   15 5373    2   12 7726  132    2 2475 1492   26    2   15
 1589   15  165   36   55   37    0    9  685    3  215   18   22   42  647
 1811    8  941 6808    6  805    6  482   14  566  392   26   21   65   51
   26   40    3  376   22 2042    6  143  245  137  219   22   42 1212  511
    5    0  312 1016    2   27  225   27   10   34  247 2047   12  161   22
  316 1819    6  213 4028 3666    5    0  157    4]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[   0  898 2085   16  526 6305    8 2264   13  324    3   30   18 6063 1370
 5575    3 5062    8   69   71    7  913    3   33   16  222    9 3757    2
    3 2603    5    0  209    7  324   61   47 1008    9    4 1065  932  516
   12    4  831   31   41 3957   39 2380   95    4 5376  116

In [8]:
# ---- Training hyperparameters ----
max_time = 20        # passed to utils.batch_generator as max_time
batch_size = 50      # passed to utils.batch_generator as batch_size
learning_rate = 0.5  # forwarded to run_epoch during training
keep_prob = 1.0      # forwarded to run_epoch during training
num_epochs = 10      # number of passes of the training loop

# ---- Model hyperparameters (expanded as keyword arguments to rnnsm.RNNSM) ----
model_params = {
    "V": V,
    "H": 100,
    "Z": Z,
    "num_layers": 1,
}

# Path under which the final trained model is saved and later restored.
trained_filename = './tf_saved/tf_saved_rnnsm_trained'

In [9]:
def score_dataset(sm, session, ids, sids, name="Data",
                  batch_size=100, max_time=100):
  """Run an evaluation pass over a dataset and print its loss and perplexity.

  Args:
    sm: model object, passed through to run_epoch.
    session: session, passed through to run_epoch.
    ids: first sequence given to utils.batch_generator.
    sids: second sequence given to utils.batch_generator.
    name: label used as the prefix of the printed line.
    batch_size: batch size given to utils.batch_generator (default 100).
    max_time: max_time given to utils.batch_generator (default 100).

  Returns:
    The average cost returned by run_epoch, so callers can record it in
    addition to the printed line.
  """
  bi = utils.batch_generator(ids, sids,
                             batch_size=batch_size, max_time=max_time)
  # train=False: run_epoch then uses the non-training loss. tick_s is large
  # and verbose is off, so no progress lines are printed.
  cost = run_epoch(sm, session, bi, 
                   learning_rate=1.0, keep_prob=1.0, 
                   train=False, verbose=False, tick_s=3600)
  print("%s: avg. loss: %.03f  (perplexity: %.02f)" % (name, cost, np.exp(cost)))
  return cost

In [10]:
# Will print status every this many seconds
reload(utils)
print_interval = 5

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)

with tf.Graph().as_default(), tf.Session() as session:
  # Seed RNG for repeatability
  os.environ['TZ'] = 'US/Pacific'
  print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
  tf.set_random_seed(42)
  
  with tf.variable_scope("model", reuse=None):
    sm = rnnsm.RNNSM(**model_params)
    sm.BuildCoreGraph()
    sm.BuildTrainGraph()
  
  session.run(tf.initialize_all_variables())
  saver = tf.train.Saver()
  
  for epoch in xrange(1,num_epochs+1):
    t0_epoch = time.time()
    bi = utils.batch_generator(train_ids, train_sids, batch_size, max_time)
    print "[epoch %d] Starting epoch %d" % (epoch, epoch)
    #### YOUR CODE HERE ####

    run_epoch(sm, session, bi, train=True, keep_prob=keep_prob, learning_rate=learning_rate)

    #### END(YOUR CODE) ####
    print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    print "[epoch %d] Completed in %s" % (epoch, utils.pretty_timedelta(since=t0_epoch))
    
    ##
    # score_dataset will run a forward pass over the entire dataset
    # and report perplexity scores. This can be slow (around 1/2 to 
    # 1/4 as long as a full epoch), so you may want to comment it out
    # to speed up training on a slow machine. Be sure to run it at the 
    # end to evaluate your score.
    print ("[epoch %d]" % epoch),
    score_dataset(sm, session, train_ids, train_sids, name="Train set")
    print ("[epoch %d]" % epoch),
    score_dataset(sm, session, dev_ids, dev_sids, name="Test set")
    print ""
    
    # Save a checkpoint
    saver.save(session, './tf_saved/tf_saved_rnnsm', global_step=epoch)
    
  # Save final model
  saver.save(session, trained_filename)
  
  print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())

Sat, 17 Dec 2016 20:44:29
Instructions for updating:
Use `tf.global_variables_initializer` instead.


Instructions for updating:
Use `tf.global_variables_initializer` instead.


[epoch 1] Starting epoch 1
in batch_generator 2208441 2208441 2208400 2208400 2208400 50
Sat, 17 Dec 2016 20:49:01
[epoch 1] Completed in 0:04:31
[epoch 1] in batch_generator 2208441 2208441 2208400 2208400 2208400 100
Train set: avg. loss: 0.719  (perplexity: 2.05)
[epoch 1] in batch_generator 1100147 1100147 1100100 1100100 1100100 100
Test set: avg. loss: 0.718  (perplexity: 2.05)

[epoch 2] Starting epoch 2
in batch_generator 2208441 2208441 2208400 2208400 2208400 50
Sat, 17 Dec 2016 20:54:48
[epoch 2] Completed in 0:04:35
[epoch 2] in batch_generator 2208441 2208441 2208400 2208400 2208400 100
Train set: avg. loss: 0.712  (perplexity: 2.04)
[epoch 2] in batch_generator 1100147 1100147 1100100 1100100 1100100 100
Test set: avg. loss: 0.715  (perplexity: 2.04)

[epoch 3] Starting epoch 3
in batch_generator 2208441 2208441 2208400 2208400 2208400 50
Sat, 17 Dec 2016 21:00:33
[epoch 3] Completed in 0:04:32
[epoch 3] in batch_generator 2208441 2208441 2208400 2208400 2208400 100
Train

In [11]:
def sample_step(sm, session, input_w, initial_h):
  """Advance the sampler by one call and return the new state and samples.

  Args:
    sm : rnnsm.RNNSM
    session: tf.Session
    input_w : indices, one per batch element; reshaped below into an int32
      column of shape [batch_size, 1]
    initial_h : state fed to sm.initial_h_
      (documented by the original author as [batch_size, hidden_dims])

  Returns:
    final_h : value of sm.final_h_, to be passed back in as initial_h
    samples : the last entry along axis 1 of sm.pred_samples_
  """
  # One index per row: the model is fed a single time step per batch element.
  column_ids = np.array(input_w, dtype=np.int32).reshape(-1, 1)

  feeds = {
      sm.input_w_: column_ids,
      sm.initial_h_: initial_h,
      sm.dropout_keep_prob_: 1.0,
      sm.learning_rate_: 0.1,
  }
  fetched = session.run([sm.final_h_, sm.pred_samples_], feed_dict=feeds)
  next_h, drawn = fetched

  # Keep only the final position along axis 1 of the sampled output.
  return next_h, drawn[:, -1, :]

In [12]:
def seq_predict(sm, session, seq, vocab, svocab):
  """Predict a single label for one word sequence.

  Args:
    sm : rnnsm.RNNSM
    session: tf.Session
    seq : list of words; "<s>" is prepended before canonicalization
    vocab : vocabulary providing word_to_id and words_to_ids
    svocab : vocabulary providing ids_to_words for the predicted labels

  Returns:
    The first element of svocab.ids_to_words applied to the sampled ids.
  """
  words = utils.canonicalize_words(["<s>"] + seq, wordset=vocab.word_to_id)
  padded_ids = vocab.words_to_ids(words)

  # All ids except the last one, as a single row.
  w = np.reshape(padded_ids[:-1], [1, -1])
  start_h = session.run(sm.initial_h_, {sm.input_w_: w})

  # NOTE(review): only the final column of w is passed to the sampler, with a
  # freshly initialised state, so the earlier tokens of the sequence do not
  # appear to influence the prediction -- confirm this is intended.
  _, sampled = sample_step(sm, session, w[:, -1:], start_h)

  # Sampled id 3 is reported as id 1.
  # NOTE(review): the reason for this remapping is not visible here -- confirm.
  label_ids = []
  for k in utils.flatten(sampled):
    label_ids.append(1 if k == 3 else k)

  return svocab.ids_to_words(label_ids)[0]


In [14]:
with tf.Graph().as_default(), tf.Session() as session:  
    with tf.variable_scope("model", reuse=None):
        sm = rnnsm.RNNSM(**model_params)
        sm.BuildCoreGraph()
        sm.BuildSamplerGraph()
        
    # Load the trained model
    saver = tf.train.Saver()
    saver.restore(session, './'+trained_filename)
    pred = []

    for s in test_sents:
        pred.append(seq_predict(sm, session, s, vocab, svocab))

    non0 = 0
    correct = 0
    for i in range(len(test_sents)):
        if not test_sentis[i][0] == '0':
            non0 = non0 + 1
            if pred[i] == test_sentis[i][0]:
                correct = correct + 1
print "Test result:", correct, 'out of', non0, ' correct, and total dev is ', len(test_sents)
print "Accuracy rate is %.2f\n" % (correct * 1.0/ non0)                        


Test result: 15687 out of 32432  correct, and total dev is  32513
Accuracy rate is 0.48

