In [1]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Overview

This notebook demonstrates how to track metrics and parameters for Vertex AI custom training jobs, and how to perform detailed analysis using this data.

### Objective

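In this notebook, you learn how to use the Vertex AI SDK for Python to track the parameters and metrics of a custom training job and to analyze them within an experiment.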

This tutorial uses the following Google Cloud ML services and resources:
- Vertex AI Dataset
- Vertex AI Model
- Vertex AI Endpoint
- Vertex AI Custom Training Job

The steps performed include:
- Track training parameters and prediction metrics for a custom training job.
- Extract and perform analysis for all parameters and metrics within an Experiment.

### Dataset

This example uses the Abalone Dataset. For more information about this dataset please visit: https://archive.ics.uci.edu/ml/datasets/abalone

### Costs 


This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Before you begin

#### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [2]:
# Google Cloud project ID used by the gcloud command below.
# Replace the value with your own project ID before running the notebook.
PROJECT_ID = "manipalpr-1669047537901"  # @param {type:"string"}

# Set the project id
! gcloud config set project {PROJECT_ID}

Updated property [core/project].


#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [3]:
# Region in which the Vertex AI resources of this notebook are created.
REGION = "us-central1"  # @param {type: "string"}

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [14]:
# Cloud Storage bucket URI (gs://...) that stores intermediate artifacts such as datasets.
BUCKET_URI = "gs://fractal-parameters-nov27"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [15]:
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI

Creating gs://fractal-parameters-nov27/...


### Import libraries and define constants

Import required libraries.


In [16]:
import os

import pandas as pd
from google.cloud import aiplatform
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.python.keras.utils import data_utils

## Initialize Vertex AI and set an _experiment_


Define experiment name.

In [17]:
# Name of the Vertex AI experiment that groups the runs tracked in this notebook.
EXPERIMENT_NAME = "fractal-experiment"

Initialize the *client* for Vertex AI.

In [18]:
# Configure the SDK defaults once; later calls reuse these settings.
aiplatform.init(
    experiment=EXPERIMENT_NAME,  # experiment that runs are attached to
    staging_bucket=BUCKET_URI,  # bucket used for staged artifacts
    location=REGION,
    project=PROJECT_ID,
)

## Tracking parameters and metrics in Vertex AI custom training jobs

### Download the Dataset to Cloud Storage

In [19]:
!wget https://storage.googleapis.com/download.tensorflow.org/data/abalone_train.csv
!gsutil cp abalone_train.csv {BUCKET_URI}/data/

gcs_csv_path = f"{BUCKET_URI}/data/abalone_train.csv"

--2022-11-27 04:04:48--  https://storage.googleapis.com/download.tensorflow.org/data/abalone_train.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.152.128, 142.250.128.128, 142.251.6.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.152.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 145915 (142K) [text/csv]
Saving to: ‘abalone_train.csv.1’


2022-11-27 04:04:48 (98.8 MB/s) - ‘abalone_train.csv.1’ saved [145915/145915]

Copying file://abalone_train.csv [Content-Type=text/csv]...
/ [1 files][142.5 KiB/142.5 KiB]                                                
Operation completed over 1 objects/142.5 KiB.                                    


### Create a Vertex AI Tabular dataset from CSV data

A Vertex AI dataset can be used to create an AutoML model or a custom model. 

In [20]:
# Register the uploaded CSV as a Vertex AI tabular dataset.
ds = aiplatform.TabularDataset.create(
    gcs_source=[gcs_csv_path],
    display_name="abalone",
)

# Display the resource name of the dataset that was just created.
ds.resource_name

Creating TabularDataset


INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset


Create TabularDataset backing LRO: projects/1078378940735/locations/us-central1/datasets/2293469105354702848/operations/997128747129765888


INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/1078378940735/locations/us-central1/datasets/2293469105354702848/operations/997128747129765888


TabularDataset created. Resource name: projects/1078378940735/locations/us-central1/datasets/2293469105354702848


INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/1078378940735/locations/us-central1/datasets/2293469105354702848


To use this TabularDataset in another session:


INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:


ds = aiplatform.TabularDataset('projects/1078378940735/locations/us-central1/datasets/2293469105354702848')


INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/1078378940735/locations/us-central1/datasets/2293469105354702848')


'projects/1078378940735/locations/us-central1/datasets/2293469105354702848'

### Write the training script

Next, you create the training script that is used in the sample custom training job.

In [21]:
%%writefile training_script.py

import pandas as pd
import argparse
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

parser = argparse.ArgumentParser()
parser.add_argument('--epochs', dest='epochs',
                    default=10, type=int,
                    help='Number of epochs.')
parser.add_argument('--num_units', dest='num_units',
                    default=64, type=int,
                    help='Number of unit for first layer.')
args = parser.parse_args()

# Column names applied to the CSV files when they are read (the last one is the label).
col_names = ["Length", "Diameter", "Height", "Whole weight", "Shucked weight", "Viscera weight", "Shell weight", "Age"]
# Name of the label column; all other columns are used as features.
target = "Age"

def aip_data_to_dataframe(wild_card_path):
    """Read every CSV file matching ``wild_card_path`` into one DataFrame.

    The pattern is expanded with ``tf.data.Dataset.list_files``; each matched
    file is parsed with ``pd.read_csv`` using ``col_names`` as the column names,
    and the resulting frames are concatenated.
    """
    matched_files = tf.data.Dataset.list_files([wild_card_path])
    frames = []
    for file_tensor in matched_files:
        # Each element is a scalar string tensor; convert it to a Python str path.
        file_path = file_tensor.numpy().decode()
        frames.append(pd.read_csv(file_path, names=col_names))
    return pd.concat(frames)

def get_features_and_labels(df):
    """Split ``df`` into ``(features, labels)`` NumPy arrays.

    The labels are the ``target`` column; the features are all other columns.
    """
    feature_frame = df.drop(columns=target)
    label_series = df[target]
    return feature_frame.values, label_series.values

def data_prep(wild_card_path):
    """Load the files matching ``wild_card_path`` and return ``(features, labels)``."""
    frame = aip_data_to_dataframe(wild_card_path)
    return get_features_and_labels(frame)


# The hidden layer needs a non-linear activation: two stacked linear Dense
# layers collapse into a single linear map, which would make the tracked
# `num_units` hyperparameter irrelevant to the model's capacity.
model = tf.keras.Sequential(
    [layers.Dense(args.num_units, activation='relu'), layers.Dense(1)]
)
model.compile(loss='mse', optimizer='adam')

# The locations of the training/validation/test splits are read from the
# AIP_* environment variables.
train_features, train_labels = data_prep(os.environ["AIP_TRAINING_DATA_URI"])
validation_data = data_prep(os.environ["AIP_VALIDATION_DATA_URI"])

model.fit(train_features, train_labels,
          epochs=args.epochs,
          validation_data=validation_data)

# Print the loss on the held-out test split.
print(model.evaluate(*data_prep(os.environ["AIP_TEST_DATA_URI"])))

# save as Vertex AI Managed model
tf.saved_model.save(model, os.environ["AIP_MODEL_DIR"])

Overwriting training_script.py


### Launch a custom training job and track its training parameters on Vertex ML Metadata

In [26]:
# Define the custom training job around the script written above.
job = aiplatform.CustomTrainingJob(
    display_name="train-abalone-job-nov27",
    # Local training script.
    script_path="training_script.py",
    # Extra pip requirement passed along with the script.
    requirements=["gcsfs==0.7.1"],
    # Container image used for training.
    container_uri="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest",
    # Container image used to serve the resulting model.
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest",
)

Start a new experiment run to track training parameters and start the training job. Note that this operation will take around 10 mins.

In [27]:
# Open an experiment run; change the name to your desired run name.
aiplatform.start_run("custom-training-run-nov27-1")

# Log the hyperparameters on the run before training starts.
parameters = {"epochs": 10, "num_units": 64}
aiplatform.log_params(parameters)

# Forward each logged parameter to the training script as a `--name=value` flag,
# so the tracked values and the values used for training cannot diverge.
training_args = [f"--{name}={value}" for name, value in parameters.items()]

model = job.run(
    ds,
    replica_count=1,
    model_display_name="abalone-model",
    args=training_args,
)

Associating projects/1078378940735/locations/us-central1/metadataStores/default/contexts/fractal-experiment-custom-training-run-nov27-1 to Experiment: fractal-experiment


INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/1078378940735/locations/us-central1/metadataStores/default/contexts/fractal-experiment-custom-training-run-nov27-1 to Experiment: fractal-experiment


Training script copied to:
gs://fractal-parameters-nov27/aiplatform-2022-11-27-04:08:58.059-aiplatform_custom_trainer_script-0.1.tar.gz.


INFO:google.cloud.aiplatform.utils.source_utils:Training script copied to:
gs://fractal-parameters-nov27/aiplatform-2022-11-27-04:08:58.059-aiplatform_custom_trainer_script-0.1.tar.gz.


Training Output directory:
gs://fractal-parameters-nov27/aiplatform-custom-training-2022-11-27-04:08:58.143 


INFO:google.cloud.aiplatform.training_jobs:Training Output directory:
gs://fractal-parameters-nov27/aiplatform-custom-training-2022-11-27-04:08:58.143 


No dataset split provided. The service will use a default split.


INFO:google.cloud.aiplatform.training_jobs:No dataset split provided. The service will use a default split.


View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/4456786545428398080?project=1078378940735


INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/4456786545428398080?project=1078378940735


CustomTrainingJob projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:CustomTrainingJob projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080 current state:
PipelineState.PIPELINE_STATE_RUNNING


CustomTrainingJob projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:CustomTrainingJob projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080 current state:
PipelineState.PIPELINE_STATE_RUNNING


CustomTrainingJob projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:CustomTrainingJob projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080 current state:
PipelineState.PIPELINE_STATE_RUNNING


CustomTrainingJob projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:CustomTrainingJob projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080 current state:
PipelineState.PIPELINE_STATE_RUNNING


CustomTrainingJob projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:CustomTrainingJob projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080 current state:
PipelineState.PIPELINE_STATE_RUNNING


View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/3588576979764379648?project=1078378940735


INFO:google.cloud.aiplatform.training_jobs:View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/3588576979764379648?project=1078378940735


CustomTrainingJob projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:CustomTrainingJob projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080 current state:
PipelineState.PIPELINE_STATE_RUNNING


CustomTrainingJob run completed. Resource name: projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080


INFO:google.cloud.aiplatform.training_jobs:CustomTrainingJob run completed. Resource name: projects/1078378940735/locations/us-central1/trainingPipelines/4456786545428398080


Model available at projects/1078378940735/locations/us-central1/models/4120232908113838080


INFO:google.cloud.aiplatform.training_jobs:Model available at projects/1078378940735/locations/us-central1/models/4120232908113838080


### Deploy model and calculate prediction metrics

Next, deploy your Vertex AI Model resource to a Vertex AI Endpoint resource. This operation will take 10-20 mins.

In [None]:
# Deploy the trained model on n1-standard-4 machines; the returned endpoint
# is used for online prediction in the following cells.
endpoint = model.deploy(machine_type="n1-standard-4")

### Prediction dataset preparation and online prediction

Once the model is deployed, perform online prediction using the `abalone_test` dataset and calculate prediction metrics.

Prepare the prediction dataset.

In [None]:
def read_data(uri):
    """Download the CSV file at ``uri`` and load it into a DataFrame.

    Args:
        uri: URL of a headerless, comma-separated Abalone data file.

    Returns:
        A pandas DataFrame whose columns are the seven Abalone features
        followed by the "Age" label column.
    """
    import posixpath
    from urllib.parse import urlparse

    # Name the cached copy after the remote file. With a fixed cache name, a
    # call with a different ``uri`` would silently return the file that was
    # downloaded first instead of the one requested.
    file_name = posixpath.basename(urlparse(uri).path) or "abalone_test.data"
    dataset_path = data_utils.get_file(file_name, uri)
    col_names = [
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
        "Age",
    ]
    dataset = pd.read_csv(
        dataset_path,
        names=col_names,  # the file has no header row
        na_values="?",
        comment="\t",
        sep=",",
        skipinitialspace=True,
    )
    return dataset


def get_features_and_labels(df):
    """Split ``df`` into ``(features, labels)`` NumPy arrays.

    The labels are the "Age" column; the features are all remaining columns.
    """
    label_column = "Age"
    feature_frame = df.drop(columns=label_column)
    label_series = df[label_column]
    return feature_frame.values, label_series.values


# Download the held-out Abalone test split and separate features from labels.
test_df = read_data(
    "https://storage.googleapis.com/download.tensorflow.org/data/abalone_test.csv"
)
test_dataset, test_labels = get_features_and_labels(test_df)

Perform online prediction.

In [None]:
# Send every test row to the deployed endpoint for online prediction.
prediction = endpoint.predict(test_dataset.tolist())

# Preview only the first few predictions; displaying the whole response
# would print one entry per test instance and flood the notebook output.
prediction.predictions[:5]

Calculate and track prediction evaluation metrics.

In [None]:
# Compare the online predictions with the true labels of the test split.
mse = mean_squared_error(test_labels, prediction.predictions)
mae = mean_absolute_error(test_labels, prediction.predictions)

# Attach the evaluation metrics to the active experiment run.
aiplatform.log_metrics({"mse": mse, "mae": mae})

# Close the run opened with aiplatform.start_run(); otherwise the run is
# never marked as finished.
aiplatform.end_run()

### Extract all parameters and metrics created during this experiment.

In [None]:
# Display the parameters and metrics logged in the experiment as a DataFrame.
aiplatform.get_experiment_df()

### View data in the Cloud Console

Parameters and metrics can also be viewed in the Cloud Console. 


In [None]:
# Build the Cloud Console link to the experiments page of this project.
experiments_url = (
    "https://console.cloud.google.com/ai/platform/experiments/experiments"
    f"?folder=&organizationId=&project={PROJECT_ID}"
)

print("Vertex AI Experiments:")
print(experiments_url)

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

* Vertex AI Dataset
* Vertex AI Experiment
* Training Job
* Model
* Endpoint
* Cloud Storage Bucket


In [None]:
# Warning: Setting this to true will delete everything in your bucket
delete_bucket = False

# Delete dataset
ds.delete()

# Delete experiment
experiment = aiplatform.Experiment(
    experiment_name=EXPERIMENT_NAME, project=PROJECT_ID, location=REGION
)
experiment.delete()

# Delete the training job
job.delete()

# Undeploy model from endpoint
endpoint.undeploy_all()

# Delete the endpoint
endpoint.delete()

# Delete the model
model.delete()


if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil -m rm -r $BUCKET_URI