In [337]:
# Installation

#To use this project, you'll need to install the TensorFlow Extended (TFX) library. You can do this using pip:

!pip install tfx

In [338]:
# Import the libraries and modules used throughout the notebook.
# NOTE(review): urllib, polars (pl) and absl are not referenced in any of the
# visible cells — confirm whether they are still needed.
import os
import pprint
import tempfile
import urllib
import polars as pl
import absl
import tensorflow as tf
import tensorflow_model_analysis as tfma

# Disable TensorFlow logger propagation to avoid duplicate log messages
tf.get_logger().propagate = False

# Pretty printer used for readable display of nested data structures
pp = pprint.PrettyPrinter()

# TFX public API (v1) and the InteractiveContext class for running components interactively.
# NOTE(review): InteractiveContext is only mentioned in commented-out lines of the visible cells.
from tfx import v1 as tfx
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

# (Re)load the TFX `skip` notebook extension for IPython
%reload_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip


In [339]:
# Report the library versions this notebook is running against
print(f"TensorFlow version: {tf.__version__}")
print(f"TFX version: {tfx.__version__}")


TensorFlow version: 2.15.1
TFX version: 1.15.0


In [340]:
# datetime is used to stamp resource names with the current time
from datetime import datetime

# --- Google Cloud settings ---
GOOGLE_CLOUD_REGION = 'us-central1'
GOOGLE_CLOUD_PROJECT = 'brldi-ds-capabilities-ccai'
GCS_BUCKET_NAME = 'chicago_taxi_mlops_pipeline'

# --- Pipeline settings ---
PIPELINE_NAME = 'tensorflow-pipeline'
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"  # e.g. 20240514000533

# Every pipeline location lives in the same bucket, keyed by pipeline name.
PIPELINE_ROOT = 'gs://{}/pipeline_root/{}'.format(GCS_BUCKET_NAME, PIPELINE_NAME)  # pipeline artifacts
MODULE_ROOT = 'gs://{}/pipeline_module/{}'.format(GCS_BUCKET_NAME, PIPELINE_NAME)  # user Python modules
DATA_ROOT = 'gs://{}/data/{}'.format(GCS_BUCKET_NAME, PIPELINE_NAME)  # input data

# --- Vertex AI endpoint settings ---
# The timestamp suffix gives each run of this cell a distinct endpoint name.
ENDPOINT_NAME = f'{PIPELINE_NAME}-{TIMESTAMP}'

In [341]:
# Import the BigQuery library from Google Cloud
from google.cloud import bigquery

# Define a SQL query to fetch data from the BigQuery public dataset
query = """
SELECT * FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE EXTRACT(year FROM trip_start_timestamp) > 2019
"""


In [342]:
# Submit the query through a BigQuery client created with default settings
client = bigquery.Client()
query_job = client.query(query)

# Materialise the query result as a pandas DataFrame
data = query_job.to_dataframe()

# Report the (rows, columns) shape of the downloaded table
print(data.shape)

(48191, 23)


In [343]:
# Add a "trip_start_day" column holding the day of the month taken from "trip_start_timestamp".
# The vectorised `.dt` accessor replaces a per-row Python lambda; it requires the
# column to have a datetime dtype.
data["trip_start_day"] = data["trip_start_timestamp"].dt.day


In [344]:
# Add a "trip_start_month" column holding the month number taken from "trip_start_timestamp".
# The vectorised `.dt` accessor replaces a per-row Python lambda; it requires the
# column to have a datetime dtype.
data["trip_start_month"] = data["trip_start_timestamp"].dt.month


In [345]:
# Add a "trip_start_hour" column holding the hour of day taken from "trip_start_timestamp".
# The vectorised `.dt` accessor replaces a per-row Python lambda; it requires the
# column to have a datetime dtype.
data["trip_start_hour"] = data["trip_start_timestamp"].dt.hour


In [346]:
# Save the DataFrame to a CSV file named "Chicago_Taxi_From_2020.csv".
# index=False keeps the DataFrame's row index out of the file; otherwise it is
# written as an extra, unnamed first column that is not part of the source data.
data.to_csv("Chicago_Taxi_From_2020.csv", index=False)


In [347]:
# Note: These are shell commands, not Python code.

# Create a new Google Cloud Storage bucket with the specified name and region
!gsutil mb -l {GOOGLE_CLOUD_REGION} gs://{GCS_BUCKET_NAME}

# Copy the "Chicago_Taxi_From_2020.csv" file from the current working directory to the specified path in the Google Cloud Storage bucket
!gsutil cp Chicago_Taxi_From_2020.csv {DATA_ROOT}/


Creating gs://chicago_taxi_mlops_pipeline/...
ServiceException: 409 A Cloud Storage bucket named 'chicago_taxi_mlops_pipeline' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.
Copying file://Chicago_Taxi_From_2020.csv [Content-Type=text/csv]...
/ [1 files][ 19.2 MiB/ 19.2 MiB]                                                
Operation completed over 1 objects/19.2 MiB.                                     


In [348]:
# exampleGen is a project-local helper module; importlib is used to refresh it
import importlib
import exampleGen

# Pick up any edits made to exampleGen since it was first imported
exampleGen = importlib.reload(exampleGen)

# Build the example-generation step from the data stored under DATA_ROOT
example_gen = exampleGen.exampleGen(DATA_ROOT)




In [349]:
# Note: This code snippet is commented out.

# Get the first output artifact from the 'examples' output of the example_gen instance
# artifact = example_gen.outputs['examples'].get()[0]

# Print the split names and URI of the output artifact
# print(artifact.split_names, artifact.uri)


In [350]:
# Note: This code snippet is commented out.

# Get the URI of the output artifact representing the training examples, which is a directory
# train_uri = os.path.join(example_gen.outputs['examples'].get()[0].uri, 'Split-train')

# Get the list of files in this directory (all compressed TFRecord files)
# tfrecord_filenames = [os.path.join(train_uri, name)
#                       for name in os.listdir(train_uri)]

# Create a `TFRecordDataset` to read these files
# dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

# Iterate over the first 3 records and decode them.
# for tfrecord in dataset.take(3):
#   serialized_example = tfrecord.numpy()
#   example = tf.train.Example()
#   example.ParseFromString(serialized_example)
#   pp.pprint(example)


In [351]:
# statisticsGen is a project-local helper module; importlib is used to refresh it
import importlib
import statisticsGen

# Pick up any edits made to statisticsGen since it was first imported
statisticsGen = importlib.reload(statisticsGen)

# Build the statistics step on top of the example_gen step
statistics_gen = statisticsGen.statisticsGen(example_gen)

# To execute this step interactively, define an InteractiveContext named `context` and uncomment:
# context.run(statistics_gen, enable_cache=True)


In [352]:
# context.show(statistics_gen.outputs['statistics'])

In [353]:
# schemaGen is a project-local helper module; importlib is used to refresh it
import importlib
import schemaGen

# Pick up any edits made to schemaGen since it was first imported
schemaGen = importlib.reload(schemaGen)

# Build the schema step on top of the statistics_gen step
schema_gen = schemaGen.schemaGen(statistics_gen)

# To execute this step interactively, define an InteractiveContext named `context` and uncomment:
# context.run(schema_gen, enable_cache=True)


In [354]:
# Note: This code snippet is commented out.

# Display the output of the schema_gen pipeline, specifically the "schema" output.
# context.show(schema_gen.outputs["schema"])


In [355]:
# exampleValidator is a project-local helper module; importlib is used to refresh it
import importlib
import exampleValidator

# Pick up any edits made to exampleValidator since it was first imported
exampleValidator = importlib.reload(exampleValidator)

# Build the validation step from the statistics_gen and schema_gen steps
example_validator = exampleValidator.exampleValidator(statistics_gen, schema_gen)

# To execute this step interactively, define an InteractiveContext named `context` and uncomment:
# context.run(example_validator, enable_cache=True)


In [356]:
# Note: This code snippet is commented out.

# Display the output of the example_validator pipeline, specifically the "anomalies" output.
# context.show(example_validator.outputs['anomalies'])


In [357]:
# Location of the custom transform module inside the project bucket
transform_module_file = f"gs://{GCS_BUCKET_NAME}/models/taxi_transform.py"

# Transform component fed by the ExampleGen examples and the SchemaGen schema
transform = tfx.components.Transform(
    module_file=transform_module_file,
    schema=schema_gen.outputs['schema'],
    examples=example_gen.outputs['examples'],
)

# To execute this component interactively, define an InteractiveContext named `context` and uncomment:
# context.run(transform, enable_cache=True)


In [358]:
# Note: This code snippet is commented out.

# Display the output of the transform pipeline, specifically the "transform_graph" output.
# context.show(transform.outputs["transform_graph"])


In [359]:
# Note: This code snippet is commented out.

# Get the URI of the output artifact holding the transform graph, which is a directory
# train_uri = transform.outputs['transform_graph'].get()[0].uri

# List the contents of the directory
# os.listdir(train_uri)


In [360]:
# Note: This code snippet is commented out.

# Get the URI of the output artifact representing the transformed examples, which is a directory
# train_uri = os.path.join(transform.outputs['transformed_examples'].get()[0].uri, 'Split-train')

# Get the list of files in this directory (all compressed TFRecord files)
# tfrecord_filenames = [os.path.join(train_uri, name)
#                       for name in os.listdir(train_uri)]

# Create a `TFRecordDataset` to read these files
# dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

# Iterate over the first 3 records and decode them.
# for tfrecord in dataset.take(3):
#   serialized_example = tfrecord.numpy()
#   example = tf.train.Example()
#   example.ParseFromString(serialized_example)
#   pp.pprint(example)


In [361]:
# Location of the custom trainer module inside the project bucket
trainer_module_file = f"gs://{GCS_BUCKET_NAME}/models/taxi_trainer.py"

# Step budgets for training and evaluation
trainer_train_args = tfx.proto.TrainArgs(num_steps=10000)
trainer_eval_args = tfx.proto.EvalArgs(num_steps=5000)

# Trainer component fed by the Transform outputs and the SchemaGen schema
trainer = tfx.components.Trainer(
    module_file=trainer_module_file,
    examples=transform.outputs['transformed_examples'],
    transform_graph=transform.outputs['transform_graph'],
    schema=schema_gen.outputs['schema'],
    train_args=trainer_train_args,
    eval_args=trainer_eval_args,
)

# To execute this component interactively, define an InteractiveContext named `context` and uncomment:
# context.run(trainer, enable_cache=True)


In [362]:
# Note: This code snippet is commented out.

# Get the URI of the output artifact representing the trained model
# model_artifact_dir = trainer.outputs['model'].get()[0].uri

# List the contents of the directory
# pp.pprint(os.listdir(model_artifact_dir))

# Get the path to the directory containing the model files in the SavedModel format
# model_dir = os.path.join(model_artifact_dir, 'Format-Serving')

# List the contents of the directory
# pp.pprint(os.listdir(model_dir))


In [363]:
# Note: This code snippet is commented out.

# Get the URI of the output artifact representing the model run, which contains the TensorBoard logs
# model_run_artifact_dir = trainer.outputs['model_run'].get()[0].uri

# Load the TensorBoard extension
# %load_ext tensorboard

# Launch TensorBoard to visualize the logs
# %tensorboard --logdir {model_run_artifact_dir}


In [364]:
# Importing required libraries
import sys
import importlib
import importlib.util
from google.cloud import storage

# Where taxi_constants.py lives and where to put the local copy.
# bucket_name reuses the notebook-wide GCS_BUCKET_NAME constant so the bucket is
# configured in a single place; blob_name is the object path inside the bucket;
# local_file_path points into the platform's temporary directory instead of a
# hard-coded "/tmp".
bucket_name = GCS_BUCKET_NAME
blob_name = "models/taxi_constants.py"
local_file_path = os.path.join(tempfile.gettempdir(), "taxi_constants.py")

# Download the file from Cloud Storage to local_file_path
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(blob_name)
blob.download_to_filename(local_file_path)

# Load the downloaded file dynamically as a module named "taxi_constants":
# build an import spec for the file, create the module object, then execute it
spec = importlib.util.spec_from_file_location("taxi_constants", local_file_path)
if spec is None or spec.loader is None:
    # Fail with a clear message instead of an AttributeError on `spec.loader`
    raise ImportError(f"Cannot build an import spec for {local_file_path}")
taxi_constants = importlib.util.module_from_spec(spec)
spec.loader.exec_module(taxi_constants)

# LABEL_KEY from the loaded module is used as the label key of the evaluation config
label_key = taxi_constants.LABEL_KEY

# Define the evaluation configuration for the model:
# - model_specs: which signature to evaluate, the label key and the preprocessing function
# - metrics_specs: metrics to compute, with validation thresholds
# - slicing_specs: data slices for which the metrics are computed
eval_config = tfma.EvalConfig(
    model_specs=[
        # This assumes a serving model with signature 'serving_default'. If
        # using estimator based EvalSavedModel, add signature_name: 'eval' and
        # remove the label_key.
        tfma.ModelSpec(
            signature_name='serving_default',
            label_key=label_key,
            preprocessing_function_names=['transform_features'],
            )
        ],
    metrics_specs=[
        tfma.MetricsSpec(
            # The metrics added here are in addition to those saved with the
            # model (assuming either a keras model or EvalSavedModel is used).
            # Any metrics added into the saved model (for example using
            # model.compile(..., metrics=[...]), etc) will be computed
            # automatically.
            # To add validation thresholds for metrics saved with the model,
            # add them keyed by metric name to the thresholds map.
            metrics=[
                tfma.MetricConfig(class_name='ExampleCount'),
                tfma.MetricConfig(class_name='BinaryAccuracy',
                  threshold=tfma.MetricThreshold(
                      value_threshold=tfma.GenericValueThreshold(
                          lower_bound={'value': 0.5}),
                      # Change threshold will be ignored if there is no
                      # baseline model resolved from MLMD (first run).
                      change_threshold=tfma.GenericChangeThreshold(
                          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                          absolute={'value': -1e-10})))
            ]
        )
    ],
    slicing_specs=[
        # An empty slice spec means the overall slice, i.e. the whole dataset.
        tfma.SlicingSpec(),
        # Data can be sliced along a feature column. In this case, data is
        # sliced along feature column trip_start_hour.
        tfma.SlicingSpec(feature_keys=['trip_start_hour'])
    ])


In [365]:
# TensorFlow Model Analysis (TFMA) is used further on to compute evaluation
# statistics for the model and to validate them against a baseline.
#
# A resolver is only needed because the candidate model is validated as well as
# evaluated: it looks up the latest blessed model to serve as that baseline. If
# no model has been blessed before (as in this case), the evaluator will make
# our candidate the first blessed model.

# Channels through which the resolver looks up the model and its blessing
model_channel = tfx.dsl.Channel(type=tfx.types.standard_artifacts.Model)
model_blessing_channel = tfx.dsl.Channel(
    type=tfx.types.standard_artifacts.ModelBlessing)

# Resolver using the experimental LatestBlessedModelStrategy
latest_blessed_resolver = tfx.dsl.Resolver(
    strategy_class=tfx.dsl.experimental.LatestBlessedModelStrategy,
    model=model_channel,
    model_blessing=model_blessing_channel)

# Give the resolver node an explicit id
model_resolver = latest_blessed_resolver.with_id('latest_blessed_model_resolver')

# To execute the resolver interactively, define an InteractiveContext named
# `context` and uncomment:
# context.run(model_resolver, enable_cache=True)


0,1
.execution_id,20
.component,<tfx.dsl.components.common.resolver.Resolver object at 0x7f5efe894f10>
.component.inputs,"['model']ResolvedChannel(artifact_type=Model, LatestBlessedModelStrategy(Dict(model=Input(), model_blessing=Input()))[""model""])['model_blessing']ResolvedChannel(artifact_type=ModelBlessing, LatestBlessedModelStrategy(Dict(model=Input(), model_blessing=Input()))[""model_blessing""])"
.component.outputs,['model'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Model' (1 artifact) at 0x7f5efe896f50.type_nameModel._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Model' (uri: /var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Trainer/model/6) at 0x7f5efe894670.type<class 'tfx.types.standard_artifacts.Model'>.uri/var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Trainer/model/6['model_blessing'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ModelBlessing' (1 artifact) at 0x7f5efe897850.type_nameModelBlessing._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ModelBlessing' (uri: /var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Evaluator/blessing/8) at 0x7f5efebfa1a0.type<class 'tfx.types.standard_artifacts.ModelBlessing'>.uri/var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Evaluator/blessing/8

0,1
['model'],"ResolvedChannel(artifact_type=Model, LatestBlessedModelStrategy(Dict(model=Input(), model_blessing=Input()))[""model""])"
['model_blessing'],"ResolvedChannel(artifact_type=ModelBlessing, LatestBlessedModelStrategy(Dict(model=Input(), model_blessing=Input()))[""model_blessing""])"

0,1
['model'],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Model' (1 artifact) at 0x7f5efe896f50.type_nameModel._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Model' (uri: /var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Trainer/model/6) at 0x7f5efe894670.type<class 'tfx.types.standard_artifacts.Model'>.uri/var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Trainer/model/6
['model_blessing'],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ModelBlessing' (1 artifact) at 0x7f5efe897850.type_nameModelBlessing._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ModelBlessing' (uri: /var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Evaluator/blessing/8) at 0x7f5efebfa1a0.type<class 'tfx.types.standard_artifacts.ModelBlessing'>.uri/var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Evaluator/blessing/8

0,1
.type_name,Model
._artifacts,[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Model' (uri: /var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Trainer/model/6) at 0x7f5efe894670.type<class 'tfx.types.standard_artifacts.Model'>.uri/var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Trainer/model/6

0,1
[0],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Model' (uri: /var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Trainer/model/6) at 0x7f5efe894670.type<class 'tfx.types.standard_artifacts.Model'>.uri/var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Trainer/model/6

0,1
.type,<class 'tfx.types.standard_artifacts.Model'>
.uri,/var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Trainer/model/6

0,1
.type_name,ModelBlessing
._artifacts,[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ModelBlessing' (uri: /var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Evaluator/blessing/8) at 0x7f5efebfa1a0.type<class 'tfx.types.standard_artifacts.ModelBlessing'>.uri/var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Evaluator/blessing/8

0,1
[0],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ModelBlessing' (uri: /var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Evaluator/blessing/8) at 0x7f5efebfa1a0.type<class 'tfx.types.standard_artifacts.ModelBlessing'>.uri/var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Evaluator/blessing/8

0,1
.type,<class 'tfx.types.standard_artifacts.ModelBlessing'>
.uri,/var/tmp/tfx-interactive-2024-05-14T00_05_33.210611-l7_h3fve/Evaluator/blessing/8


In [366]:
# Inputs of the Evaluator component, gathered in one place:
evaluator_inputs = dict(
    # data to evaluate on: the output of the example_gen step
    examples=example_gen.outputs['examples'],
    # candidate model: the output of the trainer component
    model=trainer.outputs['model'],
    # optional baseline for comparison: the model found by model_resolver
    baseline_model=model_resolver.outputs['model'],
    # evaluation settings: the eval_config object defined earlier
    eval_config=eval_config,
)

# Evaluator component that measures the performance of our model
evaluator = tfx.components.Evaluator(**evaluator_inputs)

# To execute the evaluator interactively, define an InteractiveContext named
# `context` and uncomment:
# context.run(evaluator, enable_cache=True)


In [367]:
 # Show the Evaluator's output channels ('evaluation' and 'blessing')
 evaluator.outputs

{'evaluation': OutputChannel(artifact_type=ModelEvaluation, producer_component_id=Evaluator, output_key=evaluation, additional_properties={}, additional_custom_properties={}, _input_trigger=None, _is_async=False),
 'blessing': OutputChannel(artifact_type=ModelBlessing, producer_component_id=Evaluator, output_key=blessing, additional_properties={}, additional_custom_properties={}, _input_trigger=None, _is_async=False)}

In [369]:
# Import the TensorFlow Model Analysis library
#import tensorflow_model_analysis as tfma

# Get the TFMA output result path and load the result.
# Note: The evaluator object should have been created and run before this point.
#PATH_TO_RESULT = evaluator.outputs['evaluation'].get()[0].uri
#tfma_result = tfma.load_eval_result(PATH_TO_RESULT)

# Show data sliced along feature column 'trip_start_hour'.
# This will display a visualization of the slicing metrics, which can help identify performance
# differences across various slices of the data.
#tfma.view.render_slicing_metrics(
#    tfma_result, slicing_column='trip_start_hour')


In [370]:
# Get the blessing output path.
# The blessing output contains information about whether the model passed or failed the evaluation.
#blessing_uri = evaluator.outputs['blessing'].get()[0].uri

# List the contents of the blessing output directory.
# This will show the files that were generated as part of the blessing output.
#!ls -l {blessing_uri}


In [371]:
# Load the TensorFlow Model Analysis validation result from the specified file path
#PATH_TO_RESULT = evaluator.outputs['evaluation'].get()[0].uri
#validation_result = tfma.load_validation_result(PATH_TO_RESULT)


In [372]:
# Define the Pusher component, which pushes the trained model to a filesystem destination.
# - model: the output of the trainer component
# - model_blessing: the blessing output of the evaluator component
# - push_destination: the "pusher" folder of the project bucket; the bucket is
#   taken from GCS_BUCKET_NAME so it is configured in a single place
pusher = tfx.components.Pusher(
    model=trainer.outputs['model'],
    model_blessing=evaluator.outputs['blessing'],
    push_destination=tfx.proto.PushDestination(
        filesystem=tfx.proto.PushDestination.Filesystem(
            base_directory=f"gs://{GCS_BUCKET_NAME}/pusher")))

# To execute the Pusher interactively, define an InteractiveContext named `context` and uncomment:
# context.run(pusher, enable_cache=True)


In [374]:
# Uncomment the line below after running the Pusher component to load the pushed model from the specified URI
# push_uri = pusher.outputs['pushed_model'].get()[0].uri

# Uncomment the lines below after loading the pushed model to inspect its signatures
# model = tf.saved_model.load(push_uri)
# for item in model.signatures.items():
#   pp.pprint(item)


In [376]:
# Ordered list of the steps that make up the pipeline's workflow
components = [
    # data ingestion and validation
    example_gen, statistics_gen, schema_gen, example_validator,
    # feature engineering and training
    transform, trainer,
    # evaluation against the latest blessed model, then push
    model_resolver, evaluator, pusher,
]

In [377]:
# Assemble the TFX pipeline object from its name, root directory and components
pipeline = tfx.dsl.Pipeline(
    components=components,        # the steps that define the workflow
    pipeline_root=PIPELINE_ROOT,  # root directory for the pipeline's artifacts
    pipeline_name=PIPELINE_NAME,  # name of the pipeline
)


In [379]:
# Name of the pipeline definition file: the pipeline name plus the '-pipeline.json' suffix.
PIPELINE_DEFINITION_FILE = PIPELINE_NAME + '-pipeline.json'

# Configure the Kubeflow V2 DAG runner with its default configuration.
# output_filename is the file the runner writes the pipeline definition to
# (the format accepted by Vertex AI Pipelines).
runner = tfx.orchestration.experimental.KubeflowV2DagRunner(
    config=tfx.orchestration.experimental.KubeflowV2DagRunnerConfig(),
    output_filename=PIPELINE_DEFINITION_FILE)


In [380]:
# Run the pipeline with the configured runner.
# The return value is bound to `_` so that it is not echoed as the cell's output.
# The recorded output shows the Transform and Trainer module files being
# packaged (bdist_wheel) during this call.
_ = runner.run(pipeline)


running bdist_wheel
running build
running build_py
creating build
creating build/lib
copying taxi_transform.py -> build/lib
installing to /var/tmp/tmp2jczwv76
running install
running install_lib
copying build/lib/taxi_transform.py -> /var/tmp/tmp2jczwv76
running install_egg_info
running egg_info
creating tfx_user_code_Transform.egg-info
writing tfx_user_code_Transform.egg-info/PKG-INFO
writing dependency_links to tfx_user_code_Transform.egg-info/dependency_links.txt
writing top-level names to tfx_user_code_Transform.egg-info/top_level.txt
writing manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
reading manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
writing manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
Copying tfx_user_code_Transform.egg-info to /var/tmp/tmp2jczwv76/tfx_user_code_Transform-0.0+81f524a1c5c8e7c5d8afa3fa47bbb1b9952677d155fa51958bc176ffd58f6f8f-py3.10.egg-info
running install_scripts
creating /var/tmp/tmp2jczwv76/tfx_user_code_Transf

!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()


running bdist_wheel
running build
running build_py
creating build
creating build/lib
copying taxi_trainer.py -> build/lib
installing to /var/tmp/tmpbxjn66kh
running install
running install_lib
copying build/lib/taxi_trainer.py -> /var/tmp/tmpbxjn66kh
running install_egg_info
running egg_info
creating tfx_user_code_Trainer.egg-info
writing tfx_user_code_Trainer.egg-info/PKG-INFO
writing dependency_links to tfx_user_code_Trainer.egg-info/dependency_links.txt
writing top-level names to tfx_user_code_Trainer.egg-info/top_level.txt
writing manifest file 'tfx_user_code_Trainer.egg-info/SOURCES.txt'
reading manifest file 'tfx_user_code_Trainer.egg-info/SOURCES.txt'
writing manifest file 'tfx_user_code_Trainer.egg-info/SOURCES.txt'
Copying tfx_user_code_Trainer.egg-info to /var/tmp/tmpbxjn66kh/tfx_user_code_Trainer-0.0+393214a367044f6d52018f533ee0f28dff5042920e5d9279e0d8c450f92d0b7f-py3.10.egg-info
running install_scripts
creating /var/tmp/tmpbxjn66kh/tfx_user_code_Trainer-0.0+393214a367044f6d

!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()


In [381]:
# Display the evaluation configuration: the model spec (signature, label key,
# preprocessing function), the slicing specs, and the metrics with their thresholds.
eval_config

model_specs {
  signature_name: "serving_default"
  label_key: "tips"
  preprocessing_function_names: "transform_features"
}
slicing_specs {
}
slicing_specs {
  feature_keys: "trip_start_hour"
}
metrics_specs {
  metrics {
    class_name: "ExampleCount"
  }
  metrics {
    class_name: "BinaryAccuracy"
    threshold {
      value_threshold {
        lower_bound {
          value: 0.5
        }
      }
      change_threshold {
        absolute {
          value: -1e-10
        }
        direction: HIGHER_IS_BETTER
      }
    }
  }
}