In [60]:
# import required libs
import glob
import os

import tensorflow as tf
import tensorflow_data_validation as tfdv
from tfx import v1 as tfx

from pipeline.config import pipe_config

In [63]:
from tfx.orchestration.metadata import Metadata
from tfx.types import standard_artifacts
from tfx.orchestration.portable.mlmd import execution_lib
from tfx.orchestration.experimental.interactive import visualizations

from pipeline.schema_pipeline.utils import get_latest_artifacts

def visualize_artifacts(artifacts):
    """Visualizes artifacts using standard visualization modules.

    Each artifact's ``type_name`` is looked up in the visualization registry;
    when a visualization is registered for that type it is used to display
    the artifact, otherwise the artifact is silently skipped.

    Args:
        artifacts: Iterable of artifact objects exposing a ``type_name``
            attribute. May be empty, in which case nothing is displayed.
    """
    for artifact in artifacts:
        visualization = visualizations.get_registry().get_visualization(
            artifact.type_name)
        # This check has to live inside the loop: when it sat after the loop
        # only the last artifact was ever displayed, and an empty input
        # raised NameError because `visualization` was never bound.
        if visualization:
            visualization.display(artifact)

# Build the connection config for the metadata database located at
# pipe_config.METADATA_PATH (SQLite, per the helper's name).
metadata_connection_config = (
    tfx.orchestration.metadata.sqlite_metadata_connection_config(
        pipe_config.METADATA_PATH))

# Fetch every recorded artifact of the three types inspected below:
# statistics, schema and anomalies. The comprehension is fully evaluated
# while the store is still open.
with Metadata(metadata_connection_config) as store:
    stats_artifacts, schema_artifacts, anomalies_artifacts = [
        store.get_artifacts_by_type(artifact_class.TYPE_NAME)
        for artifact_class in (
            standard_artifacts.ExampleStatistics,
            standard_artifacts.Schema,
            standard_artifacts.ExampleAnomalies,
        )
    ]

In [64]:
# Resolve the on-disk files behind the artifacts fetched from the metadata
# store. Each lookup takes the LAST artifact in its list.
# NOTE(review): presumably the last entry is the most recent run - confirm
# the ordering returned by the store.
stats_path = stats_artifacts[-1].uri
train_stats_file, eval_stats_file = [
    os.path.join(stats_path, split_dir, 'FeatureStats.pb')
    for split_dir in ('Split-train', 'Split-eval')
]
print("Train stats file:{}, Eval stats file:{}".format(train_stats_file,
                                                       eval_stats_file))

# Schema file sits directly under the schema artifact's URI.
schema_file = os.path.join(schema_artifacts[-1].uri, 'schema.pbtxt')
print("Generated schame file:{}".format(schema_file))

# Only the train split's anomalies file is resolved here.
anomalies_file = os.path.join(
    anomalies_artifacts[-1].uri, 'Split-train', 'SchemaDiff.pb')
print("Generated anomalies file:{}".format(anomalies_file))

Train stats file:pipeline_output/penguin-e2e/StatisticsGen/statistics/7/Split-train/FeatureStats.pb, Eval stats file:pipeline_output/penguin-e2e/StatisticsGen/statistics/7/Split-eval/FeatureStats.pb
Generated schame file:schema/penguin-schema/schema.pbtxt/schema.pbtxt
Generated anomalies file:pipeline_output/penguin-e2e/ExampleValidator/anomalies/10/Split-train/SchemaDiff.pb


In [65]:
# Read the binary statistics files for both splits and render them side by
# side: eval on the left-hand side, train on the right-hand side.
train_stats = tfdv.load_stats_binary(train_stats_file)
eval_stats = tfdv.load_stats_binary(eval_stats_file)
tfdv.visualize_statistics(
    lhs_statistics=eval_stats,
    lhs_name='EVAL_DATASET',
    rhs_statistics=train_stats,
    rhs_name='TRAIN_DATASET',
)

In [66]:
# Parse the text-format schema file resolved earlier and render it as tables.
schema = tfdv.load_schema_text(schema_file)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'bill_depth_mm',FLOAT,required,,-
'bill_length_mm',FLOAT,required,,-
'body_mass_g',FLOAT,required,,-
'flipper_length_mm',FLOAT,required,,-
'island',STRING,required,,'island'
'sex',STRING,required,,'sex'
'species',INT,required,,-
'year',INT,required,,-


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'island',"'Biscoe', 'Dream', 'Torgersen'"
'sex',"'female', 'male'"


In [67]:
# Check the train-split statistics against the loaded schema and show
# whatever anomalies the validation reports.
anomalies = tfdv.validate_statistics(train_stats, schema)
tfdv.display_anomalies(anomalies=anomalies)

  pd.set_option('max_colwidth', -1)
