# Imports

In [1]:
%matplotlib inline
from IPython.display import clear_output
import tensorflow as tf
import pandas as pd
import numpy as np

tf.__version__

  from ._conv import register_converters as _register_converters


'1.8.0'

# Read Data

In [2]:
train_reviews = pd.read_csv('../data/op_spam_v1.4/train_reviews.csv')
valid_reviews = pd.read_csv('../data/op_spam_v1.4/valid_reviews.csv')
vocabulary_file = '../data/op_spam_v1.4/vocab.csv'

with open(vocabulary_file) as f:
    vocab_size = sum(1 for line in f) + 2

print('vocab_size', vocab_size)
train_reviews.head(2)

vocab_size 2857


Unnamed: 0,class,polarity,source,fold,file,review
0,0,positive_polarity,deceptive_from_MTurk,2,d_talbott_9.txt,"excellent staff and customer service, very cle..."
1,0,positive_polarity,deceptive_from_MTurk,2,d_talbott_8.txt,my stay at this hotel was one of the best i ha...


# Model Function

In [3]:
def words_to_word_ids(reviews, vocabulary_file, max_text_len=400):
    # Load vocabolary lookup table to map word => word_id
    vocab_table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=vocabulary_file,
        num_oov_buckets=1,
        default_value=-1)
    
    # Split text to words -> this will produce sparse tensor with variable-lengthes (word count) entries
    words = tf.string_split(reviews)
    # Convert sparse tensor to dense tensor by padding each entry to match the longest in the batch
    dense_words = tf.sparse_tensor_to_dense(words, default_value='#PAD#')
    # Convert word to word_ids via the vocab lookup table
    word_ids = vocab_table.lookup(dense_words)
    # Create a word_ids padding
    padding = tf.constant([[0, 0],[0, max_text_len]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, max_text_len])
    return word_id_vector


def model_fn(features, labels, mode, params):
    default_params = dict(
        n_classes=2,
        embedding_size=5,
        max_text_len=400,
        learning_rate=0.001,
        hidden_units=[24, 16],
        forget_bias=1.,
        keep_prob=0.8,
    )
    
    default_params.update(params)
    params = default_params

    # words to word_id_vector
    word_id_vector = words_to_word_ids(
        features['review'],
        params['vocabulary_file'],
        params['max_text_len']
    )
    
    # word_id to embeddings
    word_embeddings = tf.contrib.layers.embed_sequence(
        word_id_vector,
        vocab_size=params['vocab_size'],
        embed_dim=params['embedding_size']
    )
    
    # configure the RNN
    cells = []
    for size in params['hidden_units']:
        cell = tf.nn.rnn_cell.LSTMCell(
            num_units=size, 
            forget_bias=params['forget_bias'],
            activation=tf.nn.tanh)
        cells.append(cell)

    # create a RNN cell composed sequentially of a number of RNNCells
    multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(cells)
    
    input_layer = tf.unstack(word_embeddings, axis=1)
    
    outputs, _ = tf.nn.static_rnn(cell=multi_rnn_cell, 
                                inputs=input_layer, 
                                dtype=tf.float32)

    logits = tf.layers.dense(inputs=outputs[-1], 
                             units=params['n_classes'], 
                             activation=None)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'predictions': tf.argmax(logits, 1),
            'probabilities': tf.nn.softmax(logits)
        }

        return tf.estimator.EstimatorSpec(mode, predictions=predictions)


    loss = tf.losses.sparse_softmax_cross_entropy(
        logits=logits, labels=labels)
    
    tf.summary.scalar('loss', loss)
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step=tf.train.get_global_step()
        optimizer = tf.train.AdamOptimizer(params['learning_rate'])
        train_op = optimizer.minimize(loss=loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        probabilities = tf.nn.softmax(logits)
        predictions = tf.argmax(probabilities, 1)

        eval_metric_ops = {
            'accuracy': tf.metrics.accuracy(labels, predictions),
            'precision': tf.metrics.precision(labels, predictions),
            'recall': tf.metrics.recall(labels, predictions),
        }
        
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=eval_metric_ops)


# Estimator

In [8]:
model_dir = '/tmp/models/rnn'
#!rm -fr $model_dir
params = dict(
    vocab_size=vocab_size,
    n_classes=2,
    embedding_size=3,
    max_text_len=200,
    learning_rate=0.001,
    vocabulary_file=vocabulary_file,
    hidden_units=[32],
    forget_bias=1.,
    keep_prob=0.8
)

tf.logging.set_verbosity(tf.logging.INFO)

run_config = tf.estimator.RunConfig(
    log_step_count_steps=100,
    tf_random_seed=0,
    model_dir=model_dir
)

estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    params=params, 
    config=run_config
)

INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_global_id_in_cluster': 0, '_model_dir': '/tmp/models/rnn', '_evaluation_master': '', '_num_worker_replicas': 1, '_log_step_count_steps': 100, '_is_chief': True, '_save_checkpoints_steps': None, '_master': '', '_keep_checkpoint_max': 5, '_tf_random_seed': 0, '_num_ps_replicas': 0, '_task_id': 0, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd5e31e0470>, '_save_checkpoints_secs': 600, '_task_type': 'worker', '_session_config': None, '_keep_checkpoint_every_n_hours': 10000, '_train_distribute': None}


# Input Functions

In [9]:
batch_size = 64

train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=train_reviews[['review']],
    y=train_reviews['class'],
    batch_size=batch_size,
    shuffle=True,
    queue_capacity=batch_size*3,
    num_threads=1,
    num_epochs=None,
)
valid_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=valid_reviews[['review']],
    y=valid_reviews['class'],
    batch_size=batch_size,
    num_epochs=1,
    shuffle=False,
    queue_capacity=batch_size*3,
    num_threads=1
)

# Train

In [10]:
# DEBUG, INFO, WARN, ERROR, FATAL
tf.logging.set_verbosity(tf.logging.INFO)
result = pd.DataFrame()

In [None]:
for _ in range(100):
    estimator.train(input_fn=train_input_fn, steps=100)
    res = estimator.evaluate(input_fn=valid_input_fn)

    result = result.append(pd.DataFrame([res]))

    clear_output(wait=True)
    display(result[['global_step', 'loss', 'accuracy', 'precision', 'recall']].set_index('global_step').tail())

Unnamed: 0_level_0,loss,accuracy,precision,recall
global_step,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
400,0.691807,0.54375,0.524138,0.95
500,0.69335,0.50625,0.583333,0.04375


INFO:tensorflow:Calling model_fn.


## 5. Evaluate the Model

In [None]:
train_results = estimator.evaluate(input_fn=train_input_fn)
train_results['result_type'] = 'Train'
valid_results = estimator.evaluate(input_fn=valid_input_fn)
valid_results['result_type'] = 'Valid'
pd.DataFrame([train_results, valid_results]).set_index('result_type')