In [1]:
#!pip install --upgrade pip
#!pip install --upgrade tensorflow
#!pip install --upgrade numpy
#!pip install --upgrade nltk
#!pip install --upgrade pandas


In [2]:
import os, sys, re, json, time, shutil
import itertools
import collections
from IPython.display import display

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf

# Pandas because pandas are awesome, and for pretty-printing
import pandas as pd
# Set pandas floating point display
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# Helper libraries for this notebook
import utils; reload(utils)
import vocabulary; reload(vocabulary)
import rnnsm; reload(rnnsm)

<module 'rnnsm' from 'rnnsm.pyc'>

In [3]:
import tensorflow as tf
import rnnsm; reload(rnnsm)

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)


# Build an RNNSM graph (core, train and sampler parts) inside its own tf.Graph
# and pass that graph to a SummaryWriter pointed at "tf_summaries". No session
# is created in this cell, so nothing is trained or evaluated here.
# NOTE(review): the sizes used here (Z=6, H=200, num_layers=2) differ from the
# model_params defined later in the notebook (Z=63, H=100, num_layers=1) --
# presumably this graph is for inspection only; confirm.
with tf.Graph().as_default():
  tf.set_random_seed(42)

  sm = rnnsm.RNNSM(V=10000, Z=6, H=200, num_layers=2)
  sm.BuildCoreGraph()
  sm.BuildTrainGraph()
  sm.BuildSamplerGraph()

  summary_writer = tf.train.SummaryWriter("tf_summaries", 
                                          tf.get_default_graph())

In [4]:
def run_epoch(sm, session, batch_iterator, train=False,
              verbose=False, tick_s=10,
              keep_prob=1.0, learning_rate=0.1):
  """Run the model over every batch of an iterator and return the mean loss.

  Args:
    sm: model object exposing the tensors/ops used below (input_w_, target_y_,
      initial_h_, final_h_, dropout_keep_prob_, learning_rate_, loss_,
      train_loss_ and train_step_).
    session: object with a tf.Session-style run(fetches, feed_dict) method.
    batch_iterator: iterable of (w, y) pairs; w must expose `.size`.
    train: if True, fetch sm.train_loss_ and run sm.train_step_ on each batch;
      otherwise fetch sm.loss_ only and force the keep probability to 1.0.
    verbose: if True, print running statistics at most every tick_s seconds.
    tick_s: minimum number of seconds between two status lines.
    keep_prob: dropout keep probability fed while training.
    learning_rate: value fed to sm.learning_rate_ on every batch.

  Returns:
    The summed loss divided by the total number of elements of w seen, or NaN
    if batch_iterator yields no batches.
  """
  start_time = time.time()
  tick_time = start_time  # for showing status
  total_cost = 0.0  # total cost, summed over all words
  total_words = 0

  if train:
    loss = sm.train_loss_
    fetches = [loss, sm.final_h_, sm.train_step_]
  else:
    keep_prob = 1.0  # no dropout at test time
    loss = sm.loss_  # true loss, if train_loss is an approximation
    # Evaluation has nothing to update, so no training op is fetched. (Creating
    # a fresh tf.no_op() here would add a node to the graph on every call.)
    fetches = [loss, sm.final_h_]

  # The global NumPy RNG is deliberately left untouched: nothing in this
  # function draws from it, and re-seeding it per batch would clobber the
  # caller's random state.
  h = None
  for i, (w, y) in enumerate(batch_iterator):
    # At the first batch in the epoch, get a clean initial state
    if i == 0:
      h = session.run(sm.initial_h_, {sm.input_w_: w})

    feed_dict = {
        sm.input_w_: w,
        sm.target_y_: y,
        sm.initial_h_: h,  # carry the state over from the previous batch
        sm.dropout_keep_prob_: keep_prob,
        sm.learning_rate_: learning_rate,
    }
    results = session.run(fetches, feed_dict=feed_dict)
    cost, h = results[0], results[1]

    total_cost += cost
    total_words += w.size  # w.size = batch_size * max_time

    ##
    # Print average loss-so-far for epoch
    # If using train_loss_, this may be an underestimate.
    if verbose and (time.time() - tick_time >= tick_s):
      elapsed_s = max(time.time() - start_time, 1e-9)  # avoid dividing by 0
      avg_cost = total_cost / total_words
      avg_wps = total_words / elapsed_s
      print("[batch %d]: seen %d words at %d wps, loss = %.3f" % (i,
          total_words, avg_wps, avg_cost))
      tick_time = time.time()  # reset time ticker

  if total_words == 0:
    # Empty iterator: there is no meaningful average to report.
    return float("nan")
  return total_cost / total_words

In [5]:
# Load the dataset
import time
os.environ['TZ'] = 'US/Pacific'
# Changing TZ is only guaranteed to affect time.localtime() after tzset() has
# been called; tzset() is not available on every platform, hence the guard.
if hasattr(time, 'tzset'):
  time.tzset()
print(time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()))
reload(utils)
# V and Z are forwarded to utils.load_corpus here and later to rnnsm.RNNSM
# through model_params.
V = 10000
Z = 63
vocab, train_ids, train_sids, test_ids, test_sids = utils.load_corpus(
    "text.txt", "senti.txt", split=0.8, V=V, Z=Z, shuffle=False)
print(time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()))

Mon, 12 Dec 2016 20:49:48
Loaded 130043 sentences (4.28016e+06 tokens)
Training set: 104034 sentences (3413086 tokens)
Test set: 26009 sentences (867074 tokens)
Loaded 130043 sentences (130043 tokens)
Training set: 104034 sentences (104034 tokens)
Test set: 26009 sentences (26009 tokens)
Mon, 12 Dec 2016 20:50:33


In [6]:
# Sanity check: show the first 20 entries of each id / sid array, then the four
# array lengths side by side so that ids and sids can be compared.
print train_ids[:20]
print train_sids[:20]
print test_ids[:20]
print test_sids[:20]
print len(train_ids), len(train_sids), len(test_ids), len(test_sids)

[   0  155  807   40    3  101    4  149 7208  108  285   31  207    3    8
  685   40   36    4    2]
[30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30]
[   0   45    4 4064   16  139    7    4  159   11 5645  370    6 3279    4
 2754    7   20 2768    5]
[26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26]
3517121 3517121 893084 893084


In [7]:
# Training parameters
# max_time and batch_size are passed to utils.batch_generator by the training
# cell; learning_rate and keep_prob are passed to run_epoch, which feeds them
# to the model on every training batch; num_epochs is the number of passes of
# the training loop.
max_time = 20
batch_size = 50
learning_rate = 0.5
keep_prob = 1.0
num_epochs = 50

# Model parameters
# Keyword arguments for rnnsm.RNNSM(**model_params); V and Z come from the
# dataset-loading cell. The same dict is reused when the trained model is
# rebuilt for sampling and scoring.
model_params = dict(V=V, 
                    H=100,
                    Z=Z,
                    num_layers=1)

# Path given to saver.save() for the final model and to saver.restore() later.
trained_filename = './tf_saved/rnnsm_trained'

In [8]:
def score_dataset(sm, session, ids, sids, name="Data",
                  batch_size=100, max_time=100):
  """Run one evaluation pass over a dataset and print its loss and perplexity.

  Args:
    sm: model object, forwarded to run_epoch.
    session: session object, forwarded to run_epoch.
    ids: first sequence argument of utils.batch_generator.
    sids: second sequence argument of utils.batch_generator.
    name: label printed in front of the scores.
    batch_size: batch size handed to utils.batch_generator (default 100, the
      value that used to be hard-coded).
    max_time: max_time handed to utils.batch_generator (default 100, the
      value that used to be hard-coded).

  Prints a single line with the average loss returned by run_epoch and
  exp(loss) as the perplexity. Returns None.
  """
  bi = utils.batch_generator(ids, sids, batch_size=batch_size,
                             max_time=max_time)
  # train=False: run_epoch uses the true loss and disables dropout.
  cost = run_epoch(sm, session, bi,
                   learning_rate=1.0, keep_prob=1.0,
                   train=False, verbose=False, tick_s=3600)
  print("%s: avg. loss: %.03f  (perplexity: %.02f)" % (name, cost, np.exp(cost)))

In [9]:
# Will print status every this many seconds
# NOTE(review): print_interval is not passed to run_epoch below (which is
# called with its default verbose=False), so no periodic status is printed.
# Pass verbose=True, tick_s=print_interval to run_epoch to enable it.
reload(utils)
print_interval = 5

# Clear old log directory
# NOTE(review): nothing in this cell writes to "tf_summaries" afterwards, so
# this only deletes whatever an earlier cell logged there -- confirm intent.
shutil.rmtree("tf_summaries", ignore_errors=True)

# Saver.save() does not create missing parent directories, so make sure the
# checkpoint locations exist before training starts (otherwise the first save,
# after a whole epoch of work, fails on a fresh checkout).
epoch_checkpoint_prefix = './tf_saved/rnnsm'
for ckpt_path in (epoch_checkpoint_prefix, trained_filename):
  ckpt_dir = os.path.dirname(ckpt_path)
  if ckpt_dir and not os.path.isdir(ckpt_dir):
    os.makedirs(ckpt_dir)

with tf.Graph().as_default(), tf.Session() as session:
  os.environ['TZ'] = 'US/Pacific'
  # TZ changes are only guaranteed to reach time.localtime() after tzset().
  if hasattr(time, 'tzset'):
    time.tzset()
  print(time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()))
  # Seed RNG for repeatability
  tf.set_random_seed(42)

  with tf.variable_scope("model", reuse=None):
    sm = rnnsm.RNNSM(**model_params)
    sm.BuildCoreGraph()
    sm.BuildTrainGraph()

  # tf.initialize_all_variables is deprecated in favour of
  # tf.global_variables_initializer; use the new name when this TensorFlow
  # build provides it and fall back to the old one otherwise.
  make_init_op = (getattr(tf, "global_variables_initializer", None)
                  or tf.initialize_all_variables)
  session.run(make_init_op())
  saver = tf.train.Saver()

  for epoch in xrange(1,num_epochs+1):
    t0_epoch = time.time()
    # A fresh generator is needed for every pass over the training data.
    bi = utils.batch_generator(train_ids, train_sids, batch_size, max_time)
    print("[epoch %d] Starting epoch %d" % (epoch, epoch))
    #### YOUR CODE HERE ####

    run_epoch(sm, session, bi, train=True, keep_prob=keep_prob, learning_rate=learning_rate)
    print(time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()))

    #### END(YOUR CODE) ####
    print("[epoch %d] Completed in %s" % (epoch, utils.pretty_timedelta(since=t0_epoch)))

    ##
    # score_dataset will run a forward pass over the entire dataset
    # and report perplexity scores. This can be slow (around 1/2 to
    # 1/4 as long as a full epoch), so you may want to comment it out
    # to speed up training on a slow machine. Be sure to run it at the
    # end to evaluate your score.
    # The trailing comma keeps the score on the same line as the epoch tag.
    print ("[epoch %d]" % epoch),
    score_dataset(sm, session, train_ids, train_sids, name="Train set")
    print ("[epoch %d]" % epoch),
    score_dataset(sm, session, test_ids, test_sids, name="Test set")
    print("")

    # Save a checkpoint
    saver.save(session, epoch_checkpoint_prefix, global_step=epoch)

  # Save final model
  saver.save(session, trained_filename)

  print(time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()))

Mon, 12 Dec 2016 20:53:47
Instructions for updating:
Use `tf.global_variables_initializer` instead.


Instructions for updating:
Use `tf.global_variables_initializer` instead.


[epoch 1] Starting epoch 1
Mon, 12 Dec 2016 21:01:14
[epoch 1] Completed in 0:07:25
[epoch 1] Train set: avg. loss: 3.276  (perplexity: 26.47)
[epoch 1] Test set: avg. loss: 3.484  (perplexity: 32.60)

[epoch 2] Starting epoch 2
Mon, 12 Dec 2016 21:10:50
[epoch 2] Completed in 0:08:07
[epoch 2] Train set: avg. loss: 3.256  (perplexity: 25.95)
[epoch 2] Test set: avg. loss: 3.602  (perplexity: 36.69)

[epoch 3] Starting epoch 3
Mon, 12 Dec 2016 21:20:20
[epoch 3] Completed in 0:08:00
[epoch 3] Train set: avg. loss: 3.207  (perplexity: 24.70)
[epoch 3] Test set: avg. loss: 3.539  (perplexity: 34.42)

[epoch 4] Starting epoch 4
Mon, 12 Dec 2016 21:29:29
[epoch 4] Completed in 0:07:34
[epoch 4] Train set: avg. loss: 3.089  (perplexity: 21.96)
[epoch 4] Test set: avg. loss: 3.312  (perplexity: 27.44)

[epoch 5] Starting epoch 5
Mon, 12 Dec 2016 21:38:26
[epoch 5] Completed in 0:07:28
[epoch 5] Train set: avg. loss: 3.100  (perplexity: 22.19)
[epoch 5] Test set: avg. loss: 3.317  (perplexity

In [77]:
def sample_step(sm, session, input_w, initial_h):
  """Advance the RNN by one step and return what it sampled.

  Args:
    sm : rnnsm.RNNSM
    session: tf.Session
    input_w : [batch_size] list of indices
    initial_h : [batch_size, hidden_dims]

  Returns:
    final_h : final hidden state, compatible with initial_h
    samples : [batch_size, 1] vector of indices
  """
  # One id per sequence, laid out as an int32 column of shape [batch_size, 1].
  column_w = np.reshape(np.array(input_w, dtype=np.int32), [-1, 1])

  feeds = {
      sm.input_w_: column_w,
      sm.initial_h_: initial_h,
      sm.dropout_keep_prob_: 1.0,  # sampling runs without dropout
      sm.learning_rate_: 0.1,
  }
  fetched = session.run([sm.final_h_, sm.pred_samples_], feed_dict=feeds)
  final_h, samples = fetched

  # Keep only the last position along axis 1 of the sampled ids.
  last_samples = samples[:, -1, :]
  return final_h, last_samples

In [81]:
# Sample num_samples sequences from the trained model as one batch, taking
# max_steps sampling steps for every sequence.
reload(rnnsm)
max_steps = 20
num_samples = 10
random_seed = 42

with tf.Graph().as_default(), tf.Session() as session:
  # Seed RNG for repeatability
  tf.set_random_seed(random_seed)

  with tf.variable_scope("model", reuse=None):
    sm = rnnsm.RNNSM(**model_params)
    sm.BuildCoreGraph()
    sm.BuildSamplerGraph()

  # Load the trained model
  # trained_filename already starts with "./", so the path restored here is
  # "././tf_saved/rnnsm_trained".
  saver = tf.train.Saver()
  saver.restore(session, './' + trained_filename)

  # Make initial state for a batch with batch_size = num_samples
  # w starts as a [num_samples, 1] array filled with vocab.START_ID.
  w = np.repeat([[vocab.START_ID]], num_samples, axis=0)
  h = session.run(sm.initial_h_, {sm.input_w_: w})
  # We'll take one step for each sequence on each iteration: the last column
  # of w is fed in and the sampled ids are appended as a new column.
  for i in xrange(max_steps):
    h, y = sample_step(sm, session, w[:,-1:], h)
    w = np.hstack((w,y))

  # Print generated sentences
  # Each row is printed word by word on one line; printing of a row stops
  # right after a START_ID that is not in the first position.
  for row in w:
    for i, word_id in enumerate(row):
      print vocab.id_to_word[word_id],
      if (i != 0) and (word_id == vocab.START_ID):
        break
    print ""

<s> an was new new new company from new DGDGDGDG '' was will from at new will by from from new 
<s> said from ( be new will from from from '' '' from new at from new as from was '' 
<s> company from by was which new not new ) will new '' from will '' in from will was from 
<s> new had be ( business from $ company ( was from was was was from was be which new said 
<s> DGDGDGDG will '' new which from which from was ( new which new '' will which new which new will 
<s> '' , which be the had had ( had from said new '' '' said had '' was '' will 
<s> ( '' '' as new an from in '' from ( has ( '' new from '' an '' from 
<s> ( which '' as from said new was said by more new from '' has ( new from new by 
<s> which was from new had from an was was from which from DGDGDGDG will was will from which which '' 
<s> new in '' new which new as was mr. new '' by '' company new by new '' '' in 


In [82]:
def score_seq(sm, session, seq, vocab):
  """Score a sequence of words. Returns total log-probability.

  The words are prefixed with "<s>", canonicalized against the vocabulary and
  mapped to ids; all ids but the last are fed as inputs and all ids but the
  first as targets, each as a single row. The result is the negated sm.loss_.

  NOTE(review): in the recorded run of this notebook the scoring call fails
  with "Received a label value ... outside the valid range of [0, 63)", i.e.
  the word ids fed as target_y_ exceed the range the loss accepts. The loss
  presumably expects class ids in [0, Z) rather than word ids -- confirm
  against rnnsm before relying on this function.
  """
  tokens = utils.canonicalize_words(["<s>"] + seq, wordset=vocab.word_to_id)
  ids = vocab.words_to_ids(tokens)

  # Inputs and targets are the same id sequence shifted by one position.
  w = np.reshape(ids[:-1], [1, -1])
  y = np.reshape(ids[1:], [1, -1])

  start_state = session.run(sm.initial_h_, {sm.input_w_: w})
  loss = session.run(sm.loss_, {
      sm.input_w_: w,
      sm.target_y_: y,
      sm.initial_h_: start_state,
      sm.dropout_keep_prob_: 1.0,
  })
  # Return log(P(seq)) = -1*loss
  return -1 * loss

def load_and_score(inputs, sort=False):
  """Load the trained model and score the given words.

  Args:
    inputs: either one tokenized sentence (a list of strings) or a list of
      tokenized sentences. An empty list is accepted and scores nothing.
    sort: if True, print the results from highest to lowest score.

  Prints one line per sentence of the form `"<words>" : <score>`, where the
  score is whatever score_seq returns. Returns None.
  """
  # Nothing to score: return before paying for graph construction and the
  # checkpoint restore (and before indexing inputs[0] below).
  if len(inputs) == 0:
    return

  # A single tokenized sentence is wrapped so it is handled as a batch of one.
  if isinstance(inputs[0], basestring):
    inputs = [inputs]

  with tf.Graph().as_default(), tf.Session() as session:
    with tf.variable_scope("model", reuse=None):
      lm = rnnsm.RNNSM(**model_params)
      lm.BuildCoreGraph()

    # Load the trained model
    saver = tf.train.Saver()
    saver.restore(session, './'+trained_filename)

    # Actually run scoring
    results = [(score_seq(lm, session, words, vocab), words)
               for words in inputs]

  # Sort if requested
  if sort:
    results = sorted(results, reverse=True)

  # Print results
  for score, words in results:
    print("\"%s\" : %.05f" % (" ".join(words), score))

In [83]:
# Score two example sentences; each is split on whitespace into a word list.
# NOTE(review): the output recorded for this cell is an InvalidArgumentError
# ("label value ... outside the valid range of [0, 63)") raised while scoring.
sents = ["once upon a time",
         "the quick brown fox jumps over the lazy dog"]
load_and_score([s.split() for s in sents])

InvalidArgumentError: Received a label value of 2665 which is outside the valid range of [0, 63).  Label values: 633 2665 10 67
	 [[Node: model/loss/loss = SparseSoftmaxCrossEntropyWithLogits[T=DT_FLOAT, Tlabels=DT_INT32, _device="/job:localhost/replica:0/task:0/cpu:0"](model/loss/Reshape, model/loss/Reshape_1)]]

Caused by op u'model/loss/loss', defined at:
  File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python2.7/dist-packages/traitlets/config/application.py", line 589, in launch_instance
    app.start()
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelapp.py", line 405, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python2.7/dist-packages/tornado/ioloop.py", line 883, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 260, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 212, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 370, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/ipkernel.py", line 175, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2723, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2831, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2885, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-83-369449b7415d>", line 3, in <module>
    load_and_score([s.split() for s in sents])
  File "<ipython-input-82-b36be2bb82a5>", line 20, in load_and_score
    lm.BuildCoreGraph()
  File "rnnsm.py", line 112, in BuildCoreGraph
    self.loss_ = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits_, self.target_y_, name = "loss"))
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 1551, in sparse_softmax_cross_entropy_with_logits
    precise_logits, labels, name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 2378, in _sparse_softmax_cross_entropy_with_logits
    features=features, labels=labels, name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 759, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2240, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1128, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): Received a label value of 2665 which is outside the valid range of [0, 63).  Label values: 633 2665 10 67
	 [[Node: model/loss/loss = SparseSoftmaxCrossEntropyWithLogits[T=DT_FLOAT, Tlabels=DT_INT32, _device="/job:localhost/replica:0/task:0/cpu:0"](model/loss/Reshape, model/loss/Reshape_1)]]


In [49]:
#### YOUR CODE HERE ####
# Three pairs of sentences that differ only in number agreement
# ("are"/"is", "deer"/"deers"), scored in the order listed.
sents = ["the boy and the girl are",
         "the boy and the girl is",
         
         "the boys are", 
         "the boys is",
         
         "there are many deer", 
         "there are many deers"]
load_and_score([s.split() for s in sents])

#### END(YOUR CODE) ####

"the boy and the girl are" : -36.40670
"the boy and the girl is" : -34.56690
"the boys are" : -16.81372
"the boys is" : -16.17620
"there are many deer" : -26.15632
"there are many deers" : -19.46089


In [59]:
#### YOUR CODE HERE ####

# Three sentences that differ only in their final verb.
# str.split() splits on whitespace only, so tokens such as "market," and
# "Apple's" keep their punctuation when passed to load_and_score.
# NOTE(review): whether the vocabulary contains these punctuated forms is not
# visible here -- TODO confirm.
sents = [
         "In a good market, Apple's price falls",
         "In a good market, Apple's price rises",
    "In a good market, Apple's price stays"
         ]
load_and_score([s.split() for s in sents])


#### END(YOUR CODE) ####

"In a good market, Apple's price falls" : -39.03466
"In a good market, Apple's price rises" : -41.51408
"In a good market, Apple's price stays" : -37.77964


In [51]:
# Score every ordering of the three adjectives between the fixed prefix and
# the noun, and print the orderings from highest to lowest score.
prefix = "I have lots of".split()
noun = "toys"
adjectives = ["square", "green", "plastic"]
inputs = []
for adjs in itertools.permutations(adjectives):
  # adjs is a tuple; convert it so it can be concatenated with the lists.
  words = prefix + list(adjs) + [noun]
  inputs.append(words)
    
load_and_score(inputs, sort=True)

"I have lots of green plastic square toys" : -61.22992
"I have lots of green square plastic toys" : -62.09601
"I have lots of plastic green square toys" : -62.21913
"I have lots of square green plastic toys" : -62.24586
"I have lots of plastic square green toys" : -62.28750
"I have lots of square plastic green toys" : -66.11003


In [52]:
# Score every ordering of the three words in `adjectives` between the prefix
# and the noun, printed from highest to lowest score.
# NOTE(review): the output recorded under this cell lists "green"/"plastic"/
# "flat" sentences with the prefix "I have lots of", which does not match the
# prefix and word list below -- the output looks stale; re-run the cell.
prefix = "I have lots of Apple".split()
noun = "toys"
adjectives = ["stocks", "pies", "computers"]
inputs = []
for adjs in itertools.permutations(adjectives):
  # adjs is a tuple; convert it so it can be concatenated with the lists.
  words = prefix + list(adjs) + [noun]
  inputs.append(words)
    
load_and_score(inputs, sort=True)

"I have lots of green plastic flat toys" : -57.62775
"I have lots of green flat plastic toys" : -59.28275
"I have lots of plastic green flat toys" : -59.49638
"I have lots of plastic flat green toys" : -60.39898
"I have lots of flat plastic green toys" : -60.96680
"I have lots of flat green plastic toys" : -61.51214


In [53]:
# Score every ordering of the three adjectives between the prefix and the
# noun, printed from highest to lowest score.
# NOTE(review): the output recorded under this cell starts with "I have lots
# of", not with the prefix set below -- the output looks stale; re-run the
# cell.
prefix = "The future of Apple ".split()
noun = "toys"
adjectives = ["round", "green", "plastic"]
inputs = []
for adjs in itertools.permutations(adjectives):
  # adjs is a tuple; convert it so it can be concatenated with the lists.
  words = prefix + list(adjs) + [noun]
  inputs.append(words)
    
load_and_score(inputs, sort=True)

"I have lots of green plastic round toys" : -59.50684
"I have lots of plastic green round toys" : -61.64307
"I have lots of green round plastic toys" : -61.85661
"I have lots of plastic round green toys" : -62.10223
"I have lots of round green plastic toys" : -62.98605
"I have lots of round plastic green toys" : -63.16181


In [54]:
# Score every ordering of the three adjectives between the fixed prefix and
# the noun, and print the orderings from highest to lowest score.
prefix = "I have lots of".split()
noun = "toys"
adjectives = ["small", "green", "plastic"]
inputs = []
for adjs in itertools.permutations(adjectives):
  # adjs is a tuple; convert it so it can be concatenated with the lists.
  words = prefix + list(adjs) + [noun]
  inputs.append(words)
    
load_and_score(inputs, sort=True)

"I have lots of small plastic green toys" : -55.65118
"I have lots of small green plastic toys" : -56.18058
"I have lots of green plastic small toys" : -58.51525
"I have lots of plastic small green toys" : -58.71899
"I have lots of green small plastic toys" : -59.45413
"I have lots of plastic green small toys" : -60.29629
