# Imports

In [1]:
%matplotlib inline
import shutil

from IPython.display import clear_output
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

tf.__version__, hub.__version__

  from ._conv import register_converters as _register_converters


('1.8.0', '0.1.0')

# Read Data

In [2]:
# Load the pre-split review tables and remember where the vocabulary file lives.
vocabulary_file = '../data/op_spam_v1.4/vocab.csv'
train_reviews = pd.read_csv('../data/op_spam_v1.4/train_reviews.csv')
valid_reviews = pd.read_csv('../data/op_spam_v1.4/valid_reviews.csv')

# Vocabulary size = number of lines in the vocabulary file plus two extra ids
# (presumably reserved for padding / out-of-vocabulary tokens -- TODO confirm).
with open(vocabulary_file) as vocab_handle:
    n_vocab_lines = len(vocab_handle.readlines())
vocab_size = n_vocab_lines + 2

print('vocab_size', vocab_size)
train_reviews.head(2)

vocab_size 2857


Unnamed: 0,class,polarity,source,fold,file,review
0,0,positive_polarity,deceptive_from_MTurk,2,d_talbott_9.txt,"excellent staff and customer service, very cle..."
1,0,positive_polarity,deceptive_from_MTurk,2,d_talbott_8.txt,my stay at this hotel was one of the best i ha...


# Estimator

In [3]:
model_dir = '/tmp/models/hub'
# Empty the model directory so training starts fresh instead of picking up
# checkpoints left there by an earlier run. shutil.rmtree replaces the
# `!rm -fr $model_dir` shell escape: it needs no POSIX shell and no IPython
# variable interpolation, and ignore_errors=True keeps the "missing directory
# is fine" behaviour of `rm -f`.
shutil.rmtree(model_dir, ignore_errors=True)

# Hyper-parameters. This cell only reads n_classes, hidden_units and
# learning_rate; the remaining entries are not used by anything in this cell.
params = dict(
    vocab_size=vocab_size,
    n_classes=2,
    embedding_size=3,
    window_size=3,
    max_text_len=400,
    learning_rate=0.01,
    hidden_units=[256],
    vocabulary_file=vocabulary_file
)

tf.logging.set_verbosity(tf.logging.INFO)

# Fixed random seed for repeatable runs; checkpoints and summaries go to model_dir.
run_config = tf.estimator.RunConfig(
    log_step_count_steps=100,
    tf_random_seed=0,
    model_dir=model_dir
)

# Sentence-level embedding of the raw 'review' text taken from a TF-Hub module.
# trainable=False keeps the module's weights frozen, so only the DNN on top is trained.
text_embedding_column = hub.text_embedding_column(
    key='review',
    module_spec='https://tfhub.dev/google/universal-sentence-encoder/2',
    trainable=False)

feature_columns = [text_embedding_column]

# Feed-forward classifier on top of the embedding column, optimised with Adam.
estimator = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    n_classes=params['n_classes'],
    hidden_units=params['hidden_units'],
    optimizer=tf.train.AdamOptimizer(learning_rate=params['learning_rate']),
    config=run_config
)

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Using config: {'_train_distribute': None, '_service': None, '_tf_random_seed': 0, '_is_chief': True, '_model_dir': '/tmp/models/hub', '_master': '', '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_save_checkpoints_secs': 600, '_session_config': None, '_log_step_count_steps': 100, '_keep_checkpoint_every_n_hours': 10000, '_task_id': 0, '_save_checkpoints_steps': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f22bdd37da0>, '_global_id_in_cluster': 0, '_save_summary_steps': 100, '_evaluation_master': '', '_num_worker_replicas': 1}


# Input Functions

In [4]:
batch_size = 128

# Batching / queue settings shared by the training and validation pipelines.
queue_options = dict(
    batch_size=batch_size,
    queue_capacity=3 * batch_size,
    num_threads=1,
    num_epochs=1
)

# Training pipeline: one shuffled pass over the training reviews per call.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=train_reviews[['review']],
    y=train_reviews['class'],
    shuffle=True,
    **queue_options
)

# Validation pipeline: one pass over the validation reviews, unshuffled.
valid_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=valid_reviews[['review']],
    y=valid_reviews['class'],
    shuffle=False,
    **queue_options
)

# Train

In [5]:
# Empty frame that will collect the evaluation metrics produced during training.
result = pd.DataFrame()
# Logging level; available levels: DEBUG, INFO, WARN, ERROR, FATAL
tf.logging.set_verbosity(tf.logging.INFO)

In [6]:
# Alternate between a training call and a validation pass, refreshing the cell
# output with the most recent validation metrics after every round.
#
# The metric dicts are collected in a plain list and `result` is rebuilt from
# it each round. This avoids DataFrame.append, which is deprecated since
# pandas 1.4 and removed in pandas 2.0, and which copies the frame every round.
# The list is seeded from the current `result` so that re-running this cell
# keeps extending the existing history, as before.
eval_history = result.to_dict('records')

for _ in range(100):
    # steps=1000 is an upper bound: train_input_fn is built with num_epochs=1,
    # so a train() call also ends once the training data is exhausted.
    estimator.train(input_fn=train_input_fn, steps=1000)
    eval_history.append(estimator.evaluate(input_fn=valid_input_fn))

    result = pd.DataFrame(eval_history)

    clear_output(wait=True)
    display(result[['global_step', 'loss', 'accuracy', 'precision', 'recall']].set_index('global_step').tail())

Unnamed: 0_level_0,loss,accuracy,precision,recall
global_step,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,75.676552,0.5,0.5,1.0
20,73.93792,0.5,0.5,1.0
30,74.343712,0.5,0.0,0.0
40,73.938263,0.5,0.0,0.0


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/review_hub_module_embedding/module/Embeddings_en/sharded_0:0 from checkpoint b'/tmp/tfhub_modules/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47/variables/variables' with Embeddings_en/sharded_0
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/review_hub_module_embedding/module/Embeddings_en/sharded_1:0 from checkpoint b'/tmp/tfhub_modules/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47/variables/variables' with Embeddings_en/sharded_1
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/review_hub_module_embedding/module/Embeddings_en/sharded_10:0 from checkpoint b'/tmp/tfhub_modules/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47/variables/variables' with Embeddings_en/sharded_10
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/review_hub_module_embedding/module/Embeddings_en/sharded_11:0 from checkpoint b'/

KeyboardInterrupt: 

# Evaluate the Model

In [None]:
# Score the estimator on the training and validation data, tag each metrics
# dict with the split it came from, and show both as rows of one table.
train_results = dict(estimator.evaluate(input_fn=train_input_fn), result_type='Train')
valid_results = dict(estimator.evaluate(input_fn=valid_input_fn), result_type='Valid')

split_metrics = pd.DataFrame([train_results, valid_results])
split_metrics.set_index('result_type')