# Imports

In [1]:
%matplotlib inline
from IPython.display import clear_output
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np

tf.__version__, hub.__version__

  from ._conv import register_converters as _register_converters


('1.8.0', '0.1.0')

# Read Data

In [2]:
# The vocabulary listing sits next to the pre-split review CSVs.
vocabulary_file = '../data/op_spam_v1.4/vocab.csv'
train_reviews = pd.read_csv('../data/op_spam_v1.4/train_reviews.csv')
valid_reviews = pd.read_csv('../data/op_spam_v1.4/valid_reviews.csv')

# Vocabulary size = number of lines in the vocabulary file plus two extra ids.
# NOTE(review): the two extra ids are presumably reserved for the
# out-of-vocabulary bucket and padding - confirm against how vocab.csv was built.
with open(vocabulary_file) as f:
    vocab_size = len(f.readlines()) + 2

print('vocab_size', vocab_size)
train_reviews.head(2)

vocab_size 2857


Unnamed: 0,class,polarity,source,fold,file,review
0,0,positive_polarity,deceptive_from_MTurk,2,d_talbott_9.txt,"excellent staff and customer service, very cle..."
1,0,positive_polarity,deceptive_from_MTurk,2,d_talbott_8.txt,my stay at this hotel was one of the best i ha...


# Model Function

In [3]:
def words_to_word_ids(reviews, vocabulary_file, max_text_len=400):
    """Turn a batch of raw review strings into fixed-width rows of word ids.

    Args:
        reviews: string tensor of review texts (one text per entry).
        vocabulary_file: path of the vocabulary file used to build the
            word -> id lookup table.
        max_text_len: number of ids kept per review; shorter rows are
            right-padded, longer rows are truncated.

    Returns:
        An integer tensor of word ids whose second dimension is exactly
        ``max_text_len``.
    """
    # Vocabulary lookup table (word => word_id); words missing from the file
    # fall into the single out-of-vocabulary bucket.
    lookup_table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=vocabulary_file,
        num_oov_buckets=1,
        default_value=-1)

    # Whitespace split yields a sparse tensor because word counts vary per review.
    tokens_sparse = tf.string_split(reviews)
    # Densify by filling every row up to the longest review in the batch.
    tokens_dense = tf.sparse_tensor_to_dense(tokens_sparse, default_value='#PAD#')
    token_ids = lookup_table.lookup(tokens_dense)

    # Append max_text_len zeros on the right of every row, then cut back to
    # max_text_len so each row ends up with exactly that many ids.
    # NOTE(review): the tail is padded with id 0 while in-batch padding uses the
    # id looked up for '#PAD#' - these only agree if '#PAD#' is the first
    # vocabulary entry; confirm.
    right_pad = tf.constant([[0, 0], [0, max_text_len]])
    return tf.slice(tf.pad(token_ids, right_pad), [0, 0], [-1, max_text_len])


def model_fn(features, labels, mode, params):
    """Estimator model function: embeddings -> 1-D convolution -> dense logits.

    Args:
        features: dict of input tensors; only ``features['review']`` is read.
        labels: class-id tensor used for the loss and the evaluation metrics
            (not read in PREDICT mode).
        mode: one of the ``tf.estimator.ModeKeys`` values.
        params: dict of hyper-parameters overriding the defaults listed below.
            When ``embeddings_source == 'data'`` the keys ``vocabulary_file``
            and ``vocab_size`` must also be supplied (they have no default).

    Returns:
        A ``tf.estimator.EstimatorSpec`` configured for the requested mode.
    """
    default_params = dict(
        n_classes=2,
        embedding_size=5,
        window_size=3,
        max_text_len=400,
        filters=5,
        learning_rate=0.001,
        hidden_units=None,
        # https://www.tensorflow.org/hub/modules/text
        embeddings_source='data',#'https://tfhub.dev/google/nnlm-en-dim50/1'
    )
    # Caller-supplied values win over the defaults.
    default_params.update(params)
    params = default_params

    if params['embeddings_source'] == 'data':
        # words to word_id_vector
        word_id_vector = words_to_word_ids(
            features['review'],
            params['vocabulary_file'],
            params['max_text_len']
        )

        # word_id to embeddings (embedding matrix is learned from the data)
        word_embeddings = tf.contrib.layers.embed_sequence(
            word_id_vector,
            vocab_size=params['vocab_size'],
            embed_dim=params['embedding_size']
        )
    else:
        # Any other value is treated as a TF-Hub module handle.
        # NOTE(review): TF-Hub text modules typically return one vector per
        # input string (rank 2), which the conv1d below would reject - confirm
        # before relying on this branch.
        embed = hub.Module(params['embeddings_source'])
        word_embeddings = embed(features['review'])

    # The stride is half the window, clamped to 1: window_size=1 would
    # otherwise give a stride of 0, which is not a valid convolution stride.
    conv_stride = max(1, params['window_size'] // 2)

    # 1d convolution
    words_conv = tf.layers.conv1d(
        word_embeddings,
        filters=params['filters'],
        kernel_size=params['window_size'],
        strides=conv_stride,
        padding='SAME',
        activation=tf.nn.relu)

    # Flatten (positions x filters) into a single feature vector per example.
    words_conv_shape = words_conv.get_shape()
    dim = words_conv_shape[1] * words_conv_shape[2]
    input_layer = tf.reshape(words_conv,[-1, dim])

    if params['hidden_units'] is not None:
        # Optional stack of fully connected ReLU layers, one per entry.
        hidden_layers = tf.contrib.layers.stack(
            inputs=input_layer,
            layer=tf.contrib.layers.fully_connected,
            stack_args=params['hidden_units'],
            activation_fn=tf.nn.relu)
    else:
        hidden_layers = input_layer

    logits = tf.layers.dense(
        inputs=hidden_layers,
        units=params['n_classes'],
        activation=None)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class': tf.argmax(logits, 1),
            'probabilities': tf.nn.softmax(logits)
        }

        return tf.estimator.EstimatorSpec(
            mode, predictions=predictions)

    # Labels are needed from here on (TRAIN and EVAL).
    loss = tf.losses.sparse_softmax_cross_entropy(
        logits=logits, labels=labels)

    tf.summary.scalar('loss', loss)

    if mode == tf.estimator.ModeKeys.TRAIN:

        global_step=tf.train.get_global_step()
        optimizer = tf.train.AdamOptimizer(params['learning_rate'])
        train_op = optimizer.minimize(loss=loss, global_step=global_step)

        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        probabilities = tf.nn.softmax(logits)
        predictions = tf.argmax(probabilities, 1)
        eval_metric_ops = {
            'accuracy': tf.metrics.accuracy(labels, predictions),
            'precision': tf.metrics.precision(labels, predictions),
            'recall': tf.metrics.recall(labels, predictions),
        }

        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=eval_metric_ops)


# Estimator

In [4]:
# Checkpoints and summaries are written to (and resumed from) this directory.
model_dir = '/tmp/models/cnn'
# Uncomment the next line to delete the directory and train from scratch:
#!rm -fr $model_dir

# Hyper-parameters handed to model_fn; they override its built-in defaults.
params = {
    'vocab_size': vocab_size,
    'n_classes': 2,
    'embedding_size': 3,
    'window_size': 3,
    'max_text_len': 400,
    'filters': 10,
    'learning_rate': 0.0001,
    'vocabulary_file': vocabulary_file,
    'embeddings_source': 'data',
}

tf.logging.set_verbosity(tf.logging.INFO)

# Fixed graph-level seed; step-rate logging every 1000 steps.
run_config = tf.estimator.RunConfig(
    model_dir=model_dir,
    tf_random_seed=0,
    log_step_count_steps=1000,
)

estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params=params,
)

INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_evaluation_master': '', '_service': None, '_task_type': 'worker', '_keep_checkpoint_max': 5, '_global_id_in_cluster': 0, '_save_checkpoints_secs': 600, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc3ca2932b0>, '_log_step_count_steps': 1000, '_task_id': 0, '_master': '', '_num_ps_replicas': 0, '_keep_checkpoint_every_n_hours': 10000, '_num_worker_replicas': 1, '_save_checkpoints_steps': None, '_model_dir': '/tmp/models/cnn', '_session_config': None, '_train_distribute': None, '_tf_random_seed': 0}


# Input Functions

In [5]:
batch_size = 64

# Training input: shuffled and repeated indefinitely (num_epochs=None), so the
# number of training steps is controlled by the caller of estimator.train.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=train_reviews[['review']],
    y=train_reviews['class'],
    num_epochs=None,
    shuffle=True,
    batch_size=batch_size,
    queue_capacity=3 * batch_size,
    num_threads=1,
)

# Validation input: a single ordered pass over the validation set.
valid_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=valid_reviews[['review']],
    y=valid_reviews['class'],
    num_epochs=1,
    shuffle=False,
    batch_size=batch_size,
    queue_capacity=3 * batch_size,
    num_threads=1,
)

# Train

In [6]:
# Accumulates one row of validation metrics per training round.
result = pd.DataFrame()

# Available log levels: DEBUG, INFO, WARN, ERROR, FATAL
tf.logging.set_verbosity(tf.logging.INFO)

In [7]:
# Alternate 1000 training steps with one validation pass, for up to 100 rounds,
# showing the most recent validation metrics after every round.
for _ in range(100):
    estimator.train(input_fn=train_input_fn, steps=1000)
    res = estimator.evaluate(input_fn=valid_input_fn)

    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
    # frame. The empty seed frame is skipped so it cannot affect the dtypes.
    round_metrics = pd.DataFrame([res])
    result = round_metrics if result.empty else pd.concat([result, round_metrics])

    clear_output(wait=True)
    display(result[['global_step', 'loss', 'accuracy', 'precision', 'recall']].set_index('global_step').tail())

Unnamed: 0_level_0,loss,accuracy,precision,recall
global_step,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
23000,0.73861,0.85625,0.831395,0.89375
24000,0.766653,0.859375,0.83237,0.9
25000,0.793,0.8625,0.829545,0.9125
26000,0.817716,0.8625,0.829545,0.9125
27000,0.846147,0.859375,0.824859,0.9125


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/models/cnn/model.ckpt-27000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 27001 into /tmp/models/cnn/model.ckpt.
INFO:tensorflow:step = 27000, loss = 6.332993e-08


KeyboardInterrupt: 

# Evaluate the Model

In [26]:
# Evaluation on the training input is left disabled: train_input_fn is built
# with num_epochs=None, i.e. it never runs out of data.
#train_results = estimator.evaluate(input_fn=train_input_fn)
#train_results['result_type'] = 'Train'

# Score the latest checkpoint on the validation set and show it as a one-row table.
valid_results = estimator.evaluate(input_fn=valid_input_fn)
valid_results.update(result_type='Valid')
pd.DataFrame([valid_results]).set_index('result_type')

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-06-15-00:28:27
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/models/cnn/model.ckpt-27001
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-06-15-00:28:27
INFO:tensorflow:Saving dict for global step 27001: accuracy = 0.859375, global_step = 27001, loss = 0.8458332, precision = 0.8248588, recall = 0.9125


Unnamed: 0_level_0,accuracy,global_step,loss,precision,recall
result_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Valid,0.859375,27001,0.845833,0.824859,0.9125


In [15]:
# Unlabelled reviews to score with the trained model.
predict_data = pd.read_csv('../data/1_reviewsFull.csv')

# Single ordered pass without labels, so predictions line up with the rows.
# NOTE(review): queue_capacity is 1 here but batch_size*3 for the other input
# functions - confirm this is intentional.
test_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=predict_data[['review']],
    y=None,
    num_epochs=1,
    shuffle=False,
    batch_size=batch_size,
    queue_capacity=1,
    num_threads=1,
)

# `result` is now the prediction iterator (it no longer holds the metrics table).
result = estimator.predict(input_fn=test_input_fn)

In [16]:
# Consume the prediction iterator, keeping the probability of class index 1
# for every review.
# NOTE(review): the two training rows shown earlier have class 0 and come from
# the deceptive source, so index 1 may not be the "spam" class - confirm the
# label encoding.
probabilities = [prediction['probabilities'][1] for prediction in result]


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/models/cnn/model.ckpt-27001
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [18]:
# Attach the collected scores as a new column (modifies predict_data in place).
predict_data['spam_probability'] = probabilities

In [19]:
# Persist the scored reviews next to the input file, without the row index.
predict_data.to_csv('../data/1_reviewsFull_result.csv', index=False)

In [25]:
# Average predicted score over all scored reviews.
predict_data['spam_probability'].mean()

0.6421691757892078