In [1]:
import os, sys, re, json, time, shutil
import itertools
import collections
from IPython.display import display

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf

# Pandas because pandas are awesome, and for pretty-printing
import pandas as pd
# Set pandas floating point display
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# Helper libraries for this notebook
import utils; reload(utils)
import vocabulary; reload(vocabulary)
import rnnlm; reload(rnnlm)

<module 'rnnlm' from 'rnnlm.pyc'>

In [2]:
import tensorflow as tf
import rnnlm; reload(rnnlm)

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)


with tf.Graph().as_default():
  tf.set_random_seed(42)

  lm = rnnlm.RNNLM(V=10000, H=200, num_layers=2)
  lm.BuildCoreGraph()
  lm.BuildTrainGraph()
  lm.BuildSamplerGraph()

  summary_writer = tf.train.SummaryWriter("tf_summaries", 
                                          tf.get_default_graph())


Instructions for updating:
Please switch to tf.summary.FileWriter. The interface and behavior is the same; this is just a rename.


Instructions for updating:
Please switch to tf.summary.FileWriter. The interface and behavior is the same; this is just a rename.


In [3]:
def run_epoch(lm, session, batch_iterator, train=False,
              verbose=False, tick_s=10, 
              keep_prob=1.0, learning_rate=0.1):
  start_time = time.time()
  tick_time = start_time  # for showing status
  total_cost = 0.0  # total cost, summed over all words
  total_words = 0

  if train:
    train_op = lm.train_step_
    keep_prob = keep_prob
    loss = lm.train_loss_
  else:
    train_op = tf.no_op()
    keep_prob = 1.0  # no dropout at test time
    loss = lm.loss_  # true loss, if train_loss is an approximation

  for i, (w, y) in enumerate(batch_iterator):
    cost = 0.0
    #### YOUR CODE HERE ####
    np.random.seed(168)

    # At first batch in epoch, get a clean intitial state
    if i == 0:
        h = session.run(lm.initial_h_, {lm.input_w_: w})
 
    cost, h, _ = session.run([loss, lm.final_h_, train_op], feed_dict= {lm.target_y_: y, lm.initial_h_:h,
        lm.input_w_: w, lm.dropout_keep_prob_:keep_prob, lm.learning_rate_:learning_rate})      
    
    #### END(YOUR CODE) ####
    total_cost += cost
    total_words += w.size  # w.size = batch_size * max_time

    ##
    # Print average loss-so-far for epoch
    # If using train_loss_, this may be an underestimate.
    if verbose and (time.time() - tick_time >= tick_s):
      avg_cost = total_cost / total_words
      avg_wps = total_words / (time.time() - start_time)
      print "[batch %d]: seen %d words at %d wps, loss = %.3f" % (i,
          total_words, avg_wps, avg_cost)
      tick_time = time.time()  # reset time ticker

  return total_cost / total_words

In [5]:
# Load the dataset
import time
reload(utils)
os.environ['TZ'] = 'US/Pacific'

print "Grand model starts at:", time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())

np.random.seed(168)

lm_name = 'yelp' # which is also the corpus name which is also the folder name in local directory in which each category
               # is specified as a txt file with <category_name>.txt as the filename.
pos_cat = '5'
neg_cat = '1'

corpus = nltk.corpus.CategorizedPlaintextCorpusReader('./'+lm_name+'/', r'.*\.txt', cat_pattern=r'(\w+)\.txt')

categories_train_set = [pos_cat, neg_cat]

V = 10000
# Training parameters
max_time = 20
batch_size = 50
learning_rate = 0.5
keep_prob = 1.0
num_epochs = 5

#to hold the test sents by categories
test_sents = {}

# Model parameters
model_params = dict(V=V, 
                    H=100, 
                    num_layers=2)

# Will print status every this many seconds
print_interval = 5
print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)

def score_dataset(lm, session, ids, name="Data"):
  bi = utils.batch_generator(ids, batch_size=100, max_time=100)
  cost = run_epoch(lm, session, bi, 
                   learning_rate=1.0, keep_prob=1.0, 
                   train=False, verbose=False, tick_s=3600)
  print "%s: avg. loss: %.03f  (perplexity: %.02f)" % (name, cost, np.exp(cost))

for categories in categories_train_set:
    print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()), "setting up training for '"+categories+"' started" 
    
    vocab, train_ids, test_ids, test_sents[categories] = utils.load_corpus(corpus, split=0.8, V=V, categories=categories, shuffle=True)

    print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()), "setting up training for '"+categories+"' ended"


    trained_filename = './tf_saved/rnnlm_trained' + '_' + lm_name + '_' + categories
    
    print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()), "training for '"+categories+"' started" 

    with tf.Graph().as_default(), tf.Session() as session:
      # Seed RNG for repeatability
      tf.set_random_seed(168)

      with tf.variable_scope("model", reuse=None):
        lm = rnnlm.RNNLM(**model_params)
        lm.BuildCoreGraph()
        lm.BuildTrainGraph()

      session.run(tf.initialize_all_variables())
      saver = tf.train.Saver()

      for epoch in xrange(1,num_epochs+1):
        t0_epoch = time.time()
        bi = utils.batch_generator(train_ids, batch_size, max_time)
        print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()), "epoch for '"+categories+"' started" 

        print "[epoch %d] Starting epoch %d" % (epoch, epoch)
        #### YOUR CODE HERE ####

        run_epoch(lm, session, bi, train=True, keep_prob=keep_prob, learning_rate=learning_rate)

        #### END(YOUR CODE) ####
        print "[epoch %d] Completed in %s" % (epoch, utils.pretty_timedelta(since=t0_epoch))

        ##
        # score_dataset will run a forward pass over the entire dataset
        # and report perplexity scores. This can be slow (around 1/2 to 
        # 1/4 as long as a full epoch), so you may want to comment it out
        # to speed up training on a slow machine. Be sure to run it at the 
        # end to evaluate your score.
        print ("[epoch %d]" % epoch),
        score_dataset(lm, session, train_ids, name="Train set")
        print ("[epoch %d]" % epoch),
        score_dataset(lm, session, test_ids, name="Test set")
        print ""

        # Save a checkpoint
        saver.save(session, './tf_saved/rnnlm' + '_' + lm_name + '_' + categories, global_step=epoch)
        print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()), "epoch for '"+categories+"' ended and saved" 
      # Save final model
      saver.save(session, trained_filename)
        
    print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()), "training for '"+categories+"' ended" 


Grand model starts at: Tue, 20 Dec 2016 05:32:54
Tue, 20 Dec 2016 05:32:54
Tue, 20 Dec 2016 05:32:54 setting up training for '5' started Loaded 6640257 sentences (1.23978e+08 tokens)
Training set: 5312205 sentences (99175646 tokens)
Test set: 1328052 sentences (24802746 tokens)
Instructions for updating:
Use `tf.global_variables_initializer` instead.


Instructions for updating:
Use `tf.global_variables_initializer` instead.


 Tue, 20 Dec 2016 06:03:08 epoch for '5' started [epoch 1] Starting epoch 1


KeyboardInterrupt: 

In [None]:
def sample_step(lm, session, input_w, initial_h):
  """Run a single RNN step and return sampled predictions.
  
  Args:
    lm : rnnlm.RNNLM
    session: tf.Session
    input_w : [batch_size] list of indices
    initial_h : [batch_size, hidden_dims]
  
  Returns:
    final_h : final hidden state, compatible with initial_h
    samples : [batch_size, 1] vector of indices
  """
  #### YOUR CODE HERE ####
  # Reshape input to column vector
  input_w = np.array(input_w, dtype=np.int32).reshape([-1,1])
  
  # Run sample ops
  final_h, samples = session.run([lm.final_h_, lm.pred_samples_], 
        feed_dict={lm.input_w_: input_w, lm.initial_h_: initial_h, lm.dropout_keep_prob_: 1.0, lm.learning_rate_:0.1})
  
  #### END(YOUR CODE) ####
  return final_h, samples[:,-1,:]

In [None]:
# Same as above, but as a batch
reload(rnnlm)
max_steps = 20
num_samples = 10
random_seed = 168

with tf.Graph().as_default(), tf.Session() as session:
  # Seed RNG for repeatability
  tf.set_random_seed(random_seed)

  with tf.variable_scope("model", reuse=None):
    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()
    lm.BuildSamplerGraph()

  # Load the trained model
  saver = tf.train.Saver()
  saver.restore(session, trained_filename)

  # Make initial state for a batch with batch_size = num_samples
  w = np.repeat([[vocab.START_ID]], num_samples, axis=0)
  h = session.run(lm.initial_h_, {lm.input_w_: w})
  # We'll take one step for each sequence on each iteration 
  for i in xrange(max_steps):
    h, y = sample_step(lm, session, w[:,-1:], h)
    w = np.hstack((w,y))

  # Print generated sentences
  for row in w:
    for i, word_id in enumerate(row):
      print vocab.id_to_word[word_id],
      if (i != 0) and (word_id == vocab.START_ID):
        break
    print ""

In [None]:
def score_seq(lm, session, seq, vocab):
  """Score a sequence of words. Returns total log-probability."""
  padded_ids = vocab.words_to_ids(utils.canonicalize_words(["<s>"] + seq, 
                                                           wordset=vocab.word_to_id))
  w = np.reshape(padded_ids[:-1], [1,-1])
  y = np.reshape(padded_ids[1:],  [1,-1])
  h = session.run(lm.initial_h_, {lm.input_w_: w})
  feed_dict = {lm.input_w_:w,
               lm.target_y_:y,
               lm.initial_h_:h,
               lm.dropout_keep_prob_: 1.0}
  # Return log(P(seq)) = -1*loss
  return -1*session.run(lm.loss_, feed_dict)

def load_and_score(inputs, sort=False, silent=False):
  """Load the trained model and score the given words."""
  with tf.Graph().as_default(), tf.Session() as session:  
    with tf.variable_scope("model", reuse=None):
      tf.set_random_seed(168)
      lm = rnnlm.RNNLM(**model_params)
      lm.BuildCoreGraph()
 
    # print 'loading', trained_filename
    # Load the trained model
    saver = tf.train.Saver()
    saver.restore(session, trained_filename)
  
    if isinstance(inputs[0], str) or isinstance(inputs[0], unicode):
      inputs = [inputs]

    # Actually run scoring
    results = []
    for words in inputs:
      score = score_seq(lm, session, words, vocab)
      results.append((score, words))
    
    # Sort if requested
    if sort: results = sorted(results, reverse=True)
    
    # Print results
    for score, words in results:
      if not silent:
          print "\"%s\" : %.05f" % (" ".join(words), score)
    if silent:
      return [score for score, words in results]

In [None]:
print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()), "scoring started", 

#neg predictor
total_pos = 0
total_neg = 0
tn = 0
fn = 0
tp = 0
fp = 0
#num_sample = 10000
nopt = 0
nont = 0

categories = pos_cat
trained_filename = './tf_saved/rnnlm_trained' + '_' + lm_name + '_' +categories

pos_for_pos_score = load_and_score(test_sents[pos_cat], silent=True)
neg_for_pos_score = load_and_score(test_sents[neg_cat], silent=True)


categories = neg_cat
trained_filename = './tf_saved/rnnlm_trained' + '_' + lm_name + '_' + categories

pos_for_neg_score = load_and_score(test_sents[pos_cat], silent=True)
neg_for_neg_score = load_and_score(test_sents[neg_cat], silent=True)

for i in range(len(test_sents[pos_cat])):
    
    if pos_for_pos_score[i] > pos_for_neg_score[i]:
        tp = tp + 1
    else:
        if pos_for_pos_score[i] < pos_for_neg_score[i]:
            fp = fp + 1
        else: #flip a coin
            nopt = nopt + 1
            if np.random.randint(0, 1) == 0:
                tp = tp + 1
            else: 
                fp = fp + 1

for i in range(len(test_sents[neg_cat])):

    if neg_for_pos_score[i] < neg_for_neg_score[i]:
        tn = tn + 1
    else:
        if neg_for_pos_score[i] > neg_for_neg_score[i]:
            fn = fn + 1
        else: #flip a coin
            nont = nont + 1
            if np.random.randint(0, 1) == 0:
                tn = tn + 1
            else:
                fn = fn + 1

print "tp = %d, fp = %d, tn = %d, fn = %d, nopt = %d, nont = %d\n" % (tp, fp, tn, fn, nopt, nont)

precision = tp * 1.0 / (tp+fp)
recall = tp * 1.0 / (tp+fn)
accuracy = (tp + tn) * 1.0 / (tp+fp+tn+fn)
true_neg_rate = tn * 1.0 / (tn + fp)
f_measure = 2.0 * precision * recall / (precision + recall)

print "precision = %.2f, recall = %.2f, accuracy = %.2f, true_neg_rate = %.2f, f-measure = %.2f\n" % \
      (precision*100, recall*100, accuracy*100, true_neg_rate*100, f_measure*100)
print
print time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()), "scoring ended", 
