Runs post pipeline analysis

In [36]:
from tfx import v1 as tfx
from pipeline.config import pipe_config
# import tensorflow_decision_forests as tfdf

In [37]:
from tfx.orchestration.experimental.interactive import visualizations

def visualize_artifacts(artifacts):
  """Visualizes artifacts using standard visualization modules."""
  for artifact in artifacts:
    visualization = visualizations.get_registry().get_visualization(
        artifact.type_name)
    if visualization:
      visualization.display(artifact)

from tfx.orchestration.experimental.interactive import standard_visualizations
standard_visualizations.register_standard_visualizations()

# Connect to Pipeline output

In [38]:
metadata_connection_config = tfx.orchestration.metadata.sqlite_metadata_connection_config(
    pipe_config.METADATA_PATH)

In [77]:
from pipeline.schema_pipeline.utils import get_latest_artifacts
from tfx.orchestration.metadata import Metadata
from tfx.types import standard_component_specs

with Metadata(metadata_connection_config) as metadata_handler:
    stat_gen_output = get_latest_artifacts(metadata_handler, pipe_config.PIPELINE_NAME,
                                         'StatisticsGen')
    stats_artifacts = stat_gen_output[standard_component_specs.STATISTICS_KEY]

    ev_output = get_latest_artifacts(metadata_handler, pipe_config.PIPELINE_NAME,
                                   'ExampleValidator')
    anomalies_artifacts = ev_output[standard_component_specs.ANOMALIES_KEY]

    trainer_outputs = get_latest_artifacts(metadata_handler, pipe_config.PIPELINE_NAME,
                                          'Trainer')
    example_gen_outputs = get_latest_artifacts(metadata_handler, pipe_config.PIPELINE_NAME,
                                          'CsvExampleGen')
    evaluator_outputs = get_latest_artifacts(metadata_handler, pipe_config.PIPELINE_NAME,
                                          'Evaluator')
    eval_artifact = evaluator_outputs[standard_component_specs.EVALUATION_KEY][0]
    pusher_outputs = get_latest_artifacts(metadata_handler, pipe_config.PIPELINE_NAME,
                                          'Pusher')
    

In [78]:
import tensorflow_data_validation as tfdv
import os
path1 = os.path.join(stats_artifacts[0].uri,'Split-train/FeatureStats.pb')
path2 = os.path.join(stats_artifacts[0].uri,'Split-eval/FeatureStats.pb')
stats1 = tfdv.load_stats_binary(path1)
stats2 = tfdv.load_stats_binary(path2)

tfdv.visualize_statistics(stats1, stats2)

## Example Validator

In [79]:
schema = tfdv.load_schema_text('./schema/penguin-schema/schema.pbtxt/schema.pbtxt')
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'bill_depth_mm',FLOAT,required,,-
'bill_length_mm',FLOAT,required,,-
'body_mass_g',FLOAT,required,,-
'flipper_length_mm',FLOAT,required,,-
'island',STRING,required,,'island'
'sex',STRING,required,,'sex'
'species',INT,required,,-
'year',INT,required,,-


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'island',"'Biscoe', 'Dream', 'Torgersen'"
'sex',"'female', 'male'"


In [80]:
visualize_artifacts(anomalies_artifacts)

  pd.set_option('max_colwidth', -1)


## Trainer

In [81]:
model_run_artifact_dir = trainer_outputs['model_run'][0].uri
model_run_artifact_dir

'pipeline_output/penguin-e2e/Trainer/model_run/21'

In [82]:
# %reload_ext tensorboard
%tensorboard --logdir {model_run_artifact_dir} 

## Evaluator

In [83]:
import tensorflow_model_analysis as tfma

In [84]:
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter nbextension enable --py tensorflow_model_analysis
# !jupyter labextension install tensorflow_model_analysis@0.33.0

In [85]:
!jupyter nbextension list
!jupyter labextension list  # for JupyterLab


Known nbextensions:
  config dir: /usr/local/etc/jupyter/nbconfig
    notebook section
      jupyter-js-widgets/extension  enabled 
      - Validating: OK
JupyterLab v3.0.0
/usr/local/share/jupyter/labextensions
        @jupyter-widgets/jupyterlab-manager v3.0.0 enabled OK (python, jupyterlab_widgets)

Other labextensions (built into JupyterLab)
   app dir: /usr/local/share/jupyter/lab
        tensorflow_model_analysis v0.33.0 enabled OK



In [86]:
eval_result = tfma.load_eval_result(eval_artifact.uri,model_name="candidate")

In [87]:
tfma.view.render_slicing_metrics(eval_result)

SlicingMetricsViewer(config={'weightedExamplesColumn': 'example_count'}, data=[{'slice': 'Overall', 'metrics':…

In [88]:
tfma.view.render_slicing_metrics(eval_result,slicing_column='sex')

SlicingMetricsViewer(config={'weightedExamplesColumn': 'example_count'}, data=[{'slice': 'sex:male', 'metrics'…

In [89]:
tfma.addons.fairness.view.widget_view.render_fairness_indicator(eval_result)

FairnessIndicatorViewer(slicingMetrics=[{'sliceValue': 'male', 'slice': 'sex:male', 'metrics': {'sparse_catego…

In [90]:
tfma.view.render_plot(eval_result, tfma.SlicingSpec(feature_values={"sex": "male"}))

PlotViewer(config={'sliceName': 'sex:male', 'metricKeys': {'calibrationPlot': {'metricName': 'calibrationHisto…

In [91]:
tfma.view.render_plot(eval_result, tfma.SlicingSpec()) # whole dataset

PlotViewer(config={'sliceName': 'Overall', 'metricKeys': {'calibrationPlot': {'metricName': 'calibrationHistog…

In [92]:
# Print validation result
validation_output_path = os.path.join(eval_artifact.uri,'validations')
validation_result = tfma.load_validation_result(validation_output_path)
print(validation_result.validation_ok)

False


### Model performance over time

In [93]:
example_gen_outputs['examples'][0].uri

'pipeline_output/penguin-e2e/CsvExampleGen/examples/17'

In [94]:
# from tfx.utils import proto_utils
# import json

# eval_config_path = os.path.join(eval_artifact.uri,'eval_config.json')
# with open(eval_config_path) as f:
#     eval_config_dict = json.load(f)

    
# eval_config = proto_utils.dict_to_proto(eval_config_dict['evalConfig'],tfma.EvalConfig())

In [95]:
from google.protobuf import text_format

# TODO how to create EvalConfig from eval_config.json from pipeline_output

eval_config = text_format.Parse("""
  ## Model information
  model_specs {
    # For keras (and serving models), you need to add a `label_key`.
    label_key: "species"
  }

  ## Post training metric information. These will be merged with any built-in
  ## metrics from training.
  metrics_specs {
    metrics { class_name: "ExampleCount" }
    metrics { class_name: "SparseCategoricalAccuracy" }
    metrics { class_name: "AUC" }
    metrics { class_name: "AUCPrecisionRecall" }
    metrics { class_name: "Precision" }
    metrics { class_name: "Recall" }
    metrics { class_name: "MeanLabel" }
    metrics { class_name: "MeanPrediction" }
    metrics { class_name: "Calibration" }
    metrics { class_name: "CalibrationPlot" }
    metrics { class_name: "ConfusionMatrixPlot" }
    # ... add additional metrics and plots ...
  }

  ## Slicing information

  # overall slice
  slicing_specs {}

  # slice specific features
  slicing_specs {
    feature_keys: ["sex"]
  }
  slicing_specs {
    feature_keys: ["island"]
  }
""", tfma.EvalConfig())

In [96]:
eval_config

model_specs {
  label_key: "species"
}
slicing_specs {
}
slicing_specs {
  feature_keys: "sex"
}
slicing_specs {
  feature_keys: "island"
}
metrics_specs {
  metrics {
    class_name: "ExampleCount"
  }
  metrics {
    class_name: "SparseCategoricalAccuracy"
  }
  metrics {
    class_name: "AUC"
  }
  metrics {
    class_name: "AUCPrecisionRecall"
  }
  metrics {
    class_name: "Precision"
  }
  metrics {
    class_name: "Recall"
  }
  metrics {
    class_name: "MeanLabel"
  }
  metrics {
    class_name: "MeanPrediction"
  }
  metrics {
    class_name: "Calibration"
  }
  metrics {
    class_name: "CalibrationPlot"
  }
  metrics {
    class_name: "ConfusionMatrixPlot"
  }
}

In [97]:
import gzip
import shutil
import os
import tempfile

DATA_ROOT = tempfile.mkdtemp(prefix='past-data') 
TFRECORD_DAY1 = os.path.join(DATA_ROOT, 't1.tfrecord')
TFRECORD_DAY2 = os.path.join(DATA_ROOT, 't2.tfrecord')
TFRECORD_DAY1_gz = os.path.join(example_gen_outputs['examples'][0].uri, 'Split-train','data_tfrecord-00000-of-00001.gz')
TFRECORD_DAY2_gz = os.path.join(example_gen_outputs['examples'][0].uri, 'Split-eval','data_tfrecord-00000-of-00001.gz')

with gzip.open(TFRECORD_DAY1_gz, 'rb') as f_in:
    with open(TFRECORD_DAY1, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
with gzip.open(TFRECORD_DAY2_gz, 'rb') as f_in:
    with open(TFRECORD_DAY2, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# change model folder so that time series graph can show x value as category
model_path = pusher_outputs['pushed_model'][0].uri
new_model_path = os.path.join(DATA_ROOT, 'model')
!cp -R {model_path} {new_model_path}

In [98]:
import tensorflow_decision_forests

# Put data paths we prepared earlier in a list
TFRECORDS = [TFRECORD_DAY1, TFRECORD_DAY2]

# Initialize output paths list for each result 
output_paths = []

# Run eval on each tfrecord separately
for num, tfrecord in enumerate(TFRECORDS):

    # Use the same model as before
    eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=new_model_path,
      eval_config=eval_config)

    # Prepare output path name
    output_path = os.path.join('.', 'time_series', str(num))
    output_paths.append(output_path)

    # Run TFMA on the current tfrecord in the loop
    tfma.run_model_analysis(eval_shared_model=eval_shared_model,
                          eval_config=eval_config,
                          data_location=tfrecord,
                          output_path=output_path)



OSError: SavedModel file does not exist at: /tmp/past-dataiqa7nhlr/model/{saved_model.pbtxt|saved_model.pb} [while running 'ExtractEvaluateAndWriteResults/ExtractAndEvaluate/ExtractTransformedFeatures/Predict']

In [None]:
# Load results for day 1 and day 2 datasets
eval_results_from_disk = tfma.load_eval_results(output_paths[:2])

# Visualize results
tfma.view.render_time_series(eval_results_from_disk)