In [1]:
from __future__ import print_function

import os, pprint
import tensorflow as tf
import tfx_utils

tf.enable_eager_execution()
pp = pprint.PrettyPrinter(indent=4)

def _make_default_sqlite_uri(pipeline_name):
    return os.path.join('/tmp/tmpxd30uz6t/testSimplePipeline/tfx/metadata', pipeline_name, 'metadata.db')

def get_metadata_store(pipeline_name):
    return tfx_utils.TFXReadonlyMetadataStore.from_sqlite_db(_make_default_sqlite_uri(pipeline_name))

pipeline_name = 'chicago_taxi_simple' # or taxi_solution
pipeline_db_path = _make_default_sqlite_uri(pipeline_name)
print('Pipeline DB:\n{}'.format(pipeline_db_path))

store = get_metadata_store(pipeline_name)

  'Running the Apache Beam SDK on Python 3 is not yet fully supported. '


Pipeline DB:
/tmp/tmpxd30uz6t/testSimplePipeline/tfx/metadata/chicago_taxi_simple/metadata.db


# TFX & WIT
### Exploring using the What-If Tool with TFX

This notebook explores using MLMD payloads from the Chicago Taxi pipeline example, which are created in the TFX developer tutorial, with the What-If Tool.

# Something seems wrong
I load the SavedModel created by Trainer, which should include the transform graph created by Transform.  It should expect the tf.Examples created by ExampleGen, and the schema from SchemaGen, **but that doesn't work**.  What does work is to give the model the transformed examples and transformed schema from **Transform**.  This suggests to me that **Trainer isn't including the transform graph in the SavedModel that it creates**.

Get the SavedModel:

In [2]:
from os import listdir
models = store.get_artifacts_of_type_df(tfx_utils.TFXArtifactTypes.MODEL)
modelroot = os.path.join(models.URI.iloc[0], 'serving_model_dir', 'export', 'chicago-taxi')
newest = str(sorted([int(f) for f in listdir(modelroot) if f.isdigit()])[-1])
modeldir = os.path.join(modelroot, newest)

print('modeldir: {}'.format(modeldir))

modeldir: /tmp/tmpxd30uz6t/testSimplePipeline/tfx/pipelines/chicago_taxi_simple/Trainer/output/6/serving_model_dir/export/chicago-taxi/1561488174


Prepare the feature columns for the Estimator:

In [3]:
# Categorical features are assumed to each have a maximum value in the dataset.
_MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]

_CATEGORICAL_FEATURE_KEYS = [
    'trip_start_hour', 'trip_start_day', 'trip_start_month',
    'pickup_census_tract', 'dropoff_census_tract', 'pickup_community_area',
    'dropoff_community_area'
]

_DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

# Number of buckets used by tf.transform for encoding each feature.
_FEATURE_BUCKET_COUNT = 10

_BUCKET_FEATURE_KEYS = [
    'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
    'dropoff_longitude'
]

_VOCAB_FEATURE_KEYS = [
    'payment_type',
    'company',
]

# Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
_VOCAB_SIZE = 1000

# Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
_OOV_SIZE = 10

# WEIRD: Since we're using the features created by Transform, we need to change the names to match what Transform names them
def _transformed_name(key):
    return key + '_xf'

def _transformed_names(keys):
    return [_transformed_name(key) for key in keys]

real_valued_columns = [
    tf.feature_column.numeric_column(key, shape=(), default_value=0)
    for key in _transformed_names(_DENSE_FLOAT_FEATURE_KEYS)
]

categorical_columns = [
    tf.feature_column.categorical_column_with_identity(key, num_buckets=_VOCAB_SIZE + _OOV_SIZE, default_value=0)
    for key in _transformed_names(_VOCAB_FEATURE_KEYS)
]

categorical_columns += [
    tf.feature_column.categorical_column_with_identity(
        key, num_buckets=_FEATURE_BUCKET_COUNT, default_value=0)
    for key in _transformed_names(_BUCKET_FEATURE_KEYS)
]

categorical_columns += [
    tf.feature_column.categorical_column_with_identity(  # pylint: disable=g-complex-comprehension
        key, num_buckets=num_buckets, default_value=0) for key, num_buckets in zip(
        _transformed_names(_CATEGORICAL_FEATURE_KEYS), _MAX_CATEGORICAL_FEATURE_VALUES)
]

Instantiate the trained Estimator from the SavedModel saved by Trainer:

In [4]:
# Number of nodes in the first layer of the DNN
first_dnn_layer_size = 100
num_dnn_layers = 4
dnn_decay_factor = 0.7

hidden_units=[
    max(2, int(first_dnn_layer_size * dnn_decay_factor**i))
    for i in range(num_dnn_layers)
]

model = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=categorical_columns,
    dnn_feature_columns=real_valued_columns,
    dnn_hidden_units=hidden_units,
    warm_start_from=modeldir)
    
print('model is a ({})'.format(type(model)))

INFO:tensorflow:Using default config.


I0625 18:22:06.853214 140249631389504 estimator.py:1739] Using default config.




W0625 18:22:06.856242 140249631389504 estimator.py:1760] Using temporary folder as model directory: /tmp/tmpmvcmzu9o


INFO:tensorflow:Using config: {'_num_worker_replicas': 1, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f8e5d351518>, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_model_dir': '/tmp/tmpmvcmzu9o', '_log_step_count_steps': 100, '_evaluation_master': '', '_tf_random_seed': None, '_device_fn': None, '_is_chief': True, '_keep_checkpoint_every_n_hours': 10000, '_experimental_distribute': None, '_global_id_in_cluster': 0, '_task_id': 0, '_protocol': None, '_eval_distribute': None, '_save_checkpoints_steps': None, '_num_ps_replicas': 0, '_master': '', '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_train_distribute': None, '_task_type': 'worker', '_keep_checkpoint_max': 5}


I0625 18:22:06.859966 140249631389504 estimator.py:201] Using config: {'_num_worker_replicas': 1, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f8e5d351518>, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_model_dir': '/tmp/tmpmvcmzu9o', '_log_step_count_steps': 100, '_evaluation_master': '', '_tf_random_seed': None, '_device_fn': None, '_is_chief': True, '_keep_checkpoint_every_n_hours': 10000, '_experimental_distribute': None, '_global_id_in_cluster': 0, '_task_id': 0, '_protocol': None, '_eval_distribute': None, '_save_checkpoints_steps': None, '_num_ps_replicas': 0, '_master': '', '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_train_distribute': None, '_task_type': 'worker', '_keep_checkpoint_max': 5}


INFO:tensorflow:Warm-starting from a SavedModel


I0625 18:22:06.862369 140249631389504 estimator.py:2292] Warm-starting from a SavedModel


model is a (<class 'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier'>)


**WEIRD: Grab the schema created by Transform:**

In [5]:
transform_root = store.get_artifacts_of_type_df(tfx_utils.TFXArtifactTypes.TRANSFORMED_EXAMPLES).iloc[0].URI
print('transform_root: {}'.format(transform_root))
!ls -RF {transform_root}
schema_uri = os.path.join(transform_root, 'transformed_metadata', 'schema.pbtxt')
print('schema_uri: {}'.format(schema_uri))

example_gen_root = store.get_artifacts_of_type_df(tfx_utils.TFXArtifactTypes.EXAMPLES).iloc[0].URI
print('example_gen_root: {}'.format(example_gen_root))
schema_gen_root = store.get_artifacts_of_type_df(tfx_utils.TFXArtifactTypes.SCHEMA).iloc[0].URI
print('schema_gen_root: {}'.format(schema_gen_root))

original_schema_uri = os.path.join(schema_gen_root, 'schema.pbtxt')
!ls -RF {original_schema_uri}

transform_root: /tmp/tmpxd30uz6t/testSimplePipeline/tfx/pipelines/chicago_taxi_simple/Transform/transform_output/4/
/tmp/tmpxd30uz6t/testSimplePipeline/tfx/pipelines/chicago_taxi_simple/Transform/transform_output/4/:
metadata/  transformed_metadata/  transform_fn/

/tmp/tmpxd30uz6t/testSimplePipeline/tfx/pipelines/chicago_taxi_simple/Transform/transform_output/4/metadata:
schema.pbtxt

/tmp/tmpxd30uz6t/testSimplePipeline/tfx/pipelines/chicago_taxi_simple/Transform/transform_output/4/transformed_metadata:
schema.pbtxt

/tmp/tmpxd30uz6t/testSimplePipeline/tfx/pipelines/chicago_taxi_simple/Transform/transform_output/4/transform_fn:
assets/  saved_model.pb  variables/

/tmp/tmpxd30uz6t/testSimplePipeline/tfx/pipelines/chicago_taxi_simple/Transform/transform_output/4/transform_fn/assets:
vocab_compute_and_apply_vocabulary_1_vocabulary
vocab_compute_and_apply_vocabulary_vocabulary

/tmp/tmpxd30uz6t/testSimplePipeline/tfx/pipelines/chicago_taxi_simple/Transform/transform_output/4/transform_fn

In [6]:
import tensorflow_transform as tft
from tfx.utils import io_utils
from tensorflow_metadata.proto.v0 import schema_pb2

schema_utils = tft.tf_metadata.schema_utils
schema_proto = io_utils.parse_pbtxt_file(file_name=schema_uri, message=schema_pb2.Schema())
feature_spec, domains = schema_utils.schema_as_feature_spec(schema_proto)

pp.pprint(feature_spec)

original_feature_spec, original_domains = schema_utils.schema_as_feature_spec(io_utils.parse_pbtxt_file(file_name=original_schema_uri, message=schema_pb2.Schema()))

print('Feature spec for original schema')
pp.pprint(original_feature_spec)

{   'company_xf': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
    'dropoff_census_tract_xf': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
    'dropoff_community_area_xf': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
    'dropoff_latitude_xf': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
    'dropoff_longitude_xf': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
    'fare_xf': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
    'payment_type_xf': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
    'pickup_census_tract_xf': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
    'pickup_community_area_xf': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
    'pickup_latitude_xf': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
    'pickup_longitude_xf': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
    'tips_

**WEIRD: Extract the tf.Examples from the tf.Records file created by Transform**

In [7]:
# examples_root = transform_root.replace('transform_output', 'transformed_examples')
examples_root = os.path.dirname(store.get_artifacts_of_type_df(tfx_utils.TFXArtifactTypes.EXAMPLES).iloc[0].URI)
examples_root = os.path.dirname(examples_root)
print('examples_root: {}'.format(examples_root))
!ls -RF {examples_root}
examples_uri = os.path.join(examples_root, 'eval', listdir(os.path.join(examples_root, 'eval'))[0])
print('examples_uri: {}\n'.format(examples_uri))

raw_dataset = tf.data.TFRecordDataset([examples_uri], compression_type='GZIP')
print('raw_dataset: ({}) {}'.format(type(raw_dataset), raw_dataset))

parsed_examples = []
for ex in raw_dataset:
    ex2 = tf.train.Example.FromString(ex.numpy())
    parsed_examples.append(ex2)
print('parsed_examples is a: {}'.format(type(raw_dataset)))

examples_root: /tmp/tmpxd30uz6t/testSimplePipeline/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1
/tmp/tmpxd30uz6t/testSimplePipeline/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1:
eval/  train/

/tmp/tmpxd30uz6t/testSimplePipeline/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval:
data_tfrecord-00000-of-00001.gz

/tmp/tmpxd30uz6t/testSimplePipeline/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train:
data_tfrecord-00000-of-00001.gz
examples_uri: /tmp/tmpxd30uz6t/testSimplePipeline/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/data_tfrecord-00000-of-00001.gz

raw_dataset: (<class 'tensorflow.python.data.ops.readers.TFRecordDatasetV1'>) <TFRecordDatasetV1 shapes: (), types: tf.string>
Instructions for updating:
Colocations handled automatically by placer.


W0625 18:22:07.538847 140249631389504 deprecation.py:323] From /usr/local/google/home/zhitaoli/tfx-env/lib/python3.5/site-packages/tensorflow/python/data/ops/iterator_ops.py:532: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


parsed_examples is a: <class 'tensorflow.python.data.ops.readers.TFRecordDatasetV1'>


**WEIRD: This should not work.  The model should be expecting the examples from ExampleGen, and the schema from SchemaGen.  We're giving it the transformed examples from Transform, and the transformed schema from Transform.  Something is fishy.**

Now analyze the model performance:

In [8]:
from witwidget.notebook.visualization import WitConfigBuilder
from witwidget.notebook.visualization import WitWidget

tool_height_in_px = 1000

# Old code: Setup the tool with the test examples and the trained classifier
# config_builder = WitConfigBuilder(parsed_examples).set_estimator_and_feature_spec(
#    model, original_feature_spec).set_label_vocab(['good_tipper', 'bad_tipper'])
# WitWidget(config_builder, height=tool_height_in_px)


# New code: Setup the tool with the test examples and the trained classifier
config_builder = WitConfigBuilder(parsed_examples).set_inference_address(
    "127.0.0.1:8500").set_model_name("chicago_taxi").set_label_vocab(['good_tipper', 'bad_tipper'])
WitWidget(config_builder, height=tool_height_in_px)



WitWidget(config={'model_name': 'chicago_taxi', 'inference_address': '127.0.0.1:8500', 'are_sequence_examples'…

