In [1]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

This notebook was authored with assistance from [Ivan Nardini](https://github.com/inardini)

# Vertex AI Experiments


### Install Vertex AI SDK for Python and other required packages


In [2]:
! pip3 install --upgrade --quiet google-cloud-aiplatform

### Authenticate your notebook environment


In [3]:
import sys
from IPython.display import Markdown, display

PROJECT_ID="ai-hangsik"
LOCATION="us-central1"

# For only colab user, no need this process for Colab Enterprise in Vertex AI.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id=PROJECT_ID)

# set project.
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [4]:
# Create a bucket.
BUCKET_URI = f"gs://mlops-{PROJECT_ID}-1209"
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://mlops-ai-hangsik-1209/...
ServiceException: 409 A Cloud Storage bucket named 'mlops-ai-hangsik-1209' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [5]:
shell_output = ! gcloud projects describe  $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")

SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

print(f"SERVICE_ACCOUNT: {SERVICE_ACCOUNT}")

SERVICE_ACCOUNT: 721521243942-compute@developer.gserviceaccount.com


In [6]:
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

No changes made to gs://mlops-ai-hangsik-1209/
No changes made to gs://mlops-ai-hangsik-1209/


In [7]:
import os
import uuid

import google.cloud.aiplatform as aiplatform

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [8]:
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

## Introduction to Vertex AI Experiments

Learn more about [Experiments]( https://cloud.google.com/vertex-ai/docs/experiments/intro-vertex-ai-experiments).

Learn more about [Vertex ML Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction).

#### Create experiment for tracking training related metadata

First, you create an experiment using the `init()` method and then initialize a run within the experiment using `start_run()`.

- `aiplatform.init()` - Create an experiment instance
- `aiplatform.start_run()` - Track a specific run within the experiment.

In [12]:
# Create experiment
EXPERIMENT_NAME = f"forus-experiments-{uuid.uuid1()}"
aiplatform.init(experiment=EXPERIMENT_NAME)
aiplatform.start_run("run-1")

INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/721521243942/locations/us-central1/metadataStores/default/contexts/forus-experiments-b7511da0-b5b5-11ef-8a30-0242ac1c000c-run-1 to Experiment: forus-experiments-b7511da0-b5b5-11ef-8a30-0242ac1c000c


<google.cloud.aiplatform.metadata.experiment_run_resource.ExperimentRun at 0x7e663983ec80>

#### Log parameters for the experiment



In [13]:
metaparams = {}
metaparams["units"] = 128
aiplatform.log_params(metaparams)

hyperparams = {}
hyperparams["epochs"] = 100
hyperparams["batch_size"] = 32
hyperparams["learning_rate"] = 0.01
aiplatform.log_params(hyperparams)

#### Log metrics for the experiment


In [14]:
metrics = {}
metrics["test_acc"] = 98.7
metrics["train_acc"] = 99.3
aiplatform.log_metrics(metrics)

#### Get the experiment results

When you are finished with a run within an experiment, call `end_run()` method to complete the logging for that run.

Next, use the experiment name as a parameter to the method `get_experiment_df()` to get the results of the experiment as a pandas dataframe.

In [15]:
aiplatform.end_run()

experiment_df = aiplatform.get_experiment_df()
experiment_df = experiment_df[experiment_df.experiment_name == EXPERIMENT_NAME]
print(experiment_df.T)

                                                                     0
experiment_name      forus-experiments-b7511da0-b5b5-11ef-8a30-0242...
run_name                                                         run-1
run_type                                          system.ExperimentRun
state                                                         COMPLETE
param.batch_size                                                  32.0
param.learning_rate                                               0.01
param.epochs                                                     100.0
param.units                                                      128.0
metric.train_acc                                                  99.3
metric.test_acc                                                   98.7


#### Start subsequent run in an experiment

Next, create a second run for the same experiment. In this example, you change the metaparameter for `units` from 126 to 256, and log different metric results.

In [16]:
aiplatform.start_run("run-2")

metaparams = {}
metaparams["units"] = 256  # changed the value
aiplatform.log_params(metaparams)

hyperparams = {}
hyperparams["epochs"] = 100
hyperparams["batch_size"] = 32
hyperparams["learning_rate"] = 0.01
aiplatform.log_params(hyperparams)

metrics = {}
metrics["test_acc"] = 98.8  # value changed
metrics["train_acc"] = 99.5  # value changed
aiplatform.log_metrics(metrics)

INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/721521243942/locations/us-central1/metadataStores/default/contexts/forus-experiments-b7511da0-b5b5-11ef-8a30-0242ac1c000c-run-2 to Experiment: forus-experiments-b7511da0-b5b5-11ef-8a30-0242ac1c000c


#### Comparing runs in the same experiment

Finally, use the experiment name as a parameter to the method `get_experiment_df()` to get the results of all the runs within the experiment as a pandas dataframe.

In [17]:
aiplatform.end_run()

experiment_df = aiplatform.get_experiment_df()
experiment_df = experiment_df[experiment_df.experiment_name == EXPERIMENT_NAME]
print(experiment_df.T)

                                                                     0  \
experiment_name      forus-experiments-b7511da0-b5b5-11ef-8a30-0242...   
run_name                                                         run-2   
run_type                                          system.ExperimentRun   
state                                                         COMPLETE   
param.batch_size                                                  32.0   
param.epochs                                                     100.0   
param.learning_rate                                               0.01   
param.units                                                      256.0   
metric.train_acc                                                  99.5   
metric.test_acc                                                   98.8   

                                                                     1  
experiment_name      forus-experiments-b7511da0-b5b5-11ef-8a30-0242...  
run_name                               

#### Delete the experiment

Next, delete the experiment using the `delete()` method.

In [18]:
exp = aiplatform.Experiment(EXPERIMENT_NAME)
try:
    exp.delete()
except Exception as e:
    print(e)

To delete backing tensorboard run, execute the following:
tensorboard_run_artifact = aiplatform.metadata.artifact.Artifact(artifact_name=f"forus-experiments-b7511da0-b5b5-11ef-8a30-0242ac1c000c-run-2-tb-run")
tensorboard_run_resource = aiplatform.TensorboardRun(tensorboard_run_artifact.metadata["resourceName"])
tensorboard_run_resource.delete()
tensorboard_run_artifact.delete()
INFO:google.cloud.aiplatform.base:Deleting Context : projects/721521243942/locations/us-central1/metadataStores/default/contexts/forus-experiments-b7511da0-b5b5-11ef-8a30-0242ac1c000c-run-2
INFO:google.cloud.aiplatform.base:Context deleted. . Resource name: projects/721521243942/locations/us-central1/metadataStores/default/contexts/forus-experiments-b7511da0-b5b5-11ef-8a30-0242ac1c000c-run-2
INFO:google.cloud.aiplatform.base:Deleting Context resource: projects/721521243942/locations/us-central1/metadataStores/default/contexts/forus-experiments-b7511da0-b5b5-11ef-8a30-0242ac1c000c-run-2
INFO:google.cloud.aiplatfo

### Create artifact lineage in experiment runs

In this example, you add artifact lineage to your experiment run. First, create an experiment and then start a run within the experiment.

In [19]:
# Create experiment
EXPERIMENT_NAME = f"artifact-{uuid.uuid1()}"
aiplatform.init(experiment=EXPERIMENT_NAME)
aiplatform.start_run("run-1")

INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/721521243942/locations/us-central1/metadataStores/default/contexts/artifact-c16c1ede-b5b5-11ef-8a30-0242ac1c000c-run-1 to Experiment: artifact-c16c1ede-b5b5-11ef-8a30-0242ac1c000c


<google.cloud.aiplatform.metadata.experiment_run_resource.ExperimentRun at 0x7e66398a98d0>

#### Create a dataset and model artifacts

Next, you create synthetic artifacts in the Vertex ML Metadata to be associated with this run in the experiment, as lineage. You'll create:

- `dataset_artifact`: A dataset that is the input to the experiment run.
- `model_artifact`: A model that is the output from the experiment run.

In [20]:
DATASET_URI = "gs://example/dataset.csv"
MODEL_URI = "gs://example/saved_model.pb"

dataset_artifact = aiplatform.Artifact.create(
    schema_title="system.Dataset", display_name="example_dataset", uri=DATASET_URI
)

model_artifact = aiplatform.Artifact.create(
    schema_title="system.Model", display_name="example_modl", uri=MODEL_URI
)

#### Create the artifact lineage


In [21]:
with aiplatform.start_execution(
    schema_title="system.ContainerExecution", display_name="example_training"
) as execution:
    execution.assign_input_artifacts([dataset_artifact])

    aiplatform.log_params({"units": 256})
    aiplatform.log_metrics({"acc": 96.8})

    execution.assign_output_artifacts([model_artifact])

    aiplatform.log_metrics(
        {"lineage": execution.get_output_artifacts()[0].lineage_console_uri}
    )

#### Get the experiment results


In [22]:
aiplatform.end_run()

experiment_df = aiplatform.get_experiment_df()
experiment_df = experiment_df[experiment_df.experiment_name == EXPERIMENT_NAME]
print(experiment_df.T)

                                                                 0
experiment_name      artifact-c16c1ede-b5b5-11ef-8a30-0242ac1c000c
run_name                                                     run-1
run_type                                      system.ExperimentRun
state                                                     COMPLETE
param.units                                                  256.0
metric.acc                                                    96.8
metric.lineage   https://console.cloud.google.com/vertex-ai/loc...


#### Visualize the artifact lineage

Next, open the link below to visualize the artifact lineage.

In [23]:
print(
    "Open the following link:", execution.get_output_artifacts()[0].lineage_console_uri
)

Open the following link: https://console.cloud.google.com/vertex-ai/locations/us-central1/metadata-stores/default/artifacts/9289d665-7785-4833-b865-5457da022b9c?project=ai-hangsik


#### Delete the artifact lineage

Next, use the `delete()` method to delete the artifact lineage.

In [24]:
try:
    dataset_artifact.delete()
except Exception as e:
    print(e)
try:
    model_artifact.delete()
except Exception as e:
    print(e)

INFO:google.cloud.aiplatform.base:Deleting Artifact : projects/721521243942/locations/us-central1/metadataStores/default/artifacts/b67dd391-a5da-4273-b298-62df7a3564db
INFO:google.cloud.aiplatform.base:Artifact deleted. . Resource name: projects/721521243942/locations/us-central1/metadataStores/default/artifacts/b67dd391-a5da-4273-b298-62df7a3564db
INFO:google.cloud.aiplatform.base:Deleting Artifact resource: projects/721521243942/locations/us-central1/metadataStores/default/artifacts/b67dd391-a5da-4273-b298-62df7a3564db
INFO:google.cloud.aiplatform.base:Delete Artifact backing LRO: projects/721521243942/locations/us-central1/metadataStores/default/artifacts/b67dd391-a5da-4273-b298-62df7a3564db/operations/3125891113722839040
INFO:google.cloud.aiplatform.base:Artifact resource projects/721521243942/locations/us-central1/metadataStores/default/artifacts/b67dd391-a5da-4273-b298-62df7a3564db deleted.
INFO:google.cloud.aiplatform.base:Deleting Artifact : projects/721521243942/locations/us-c

#### Delete the experiment

Next, you delete the experiment using the `delete()` method.

In [25]:
try:
    exp.delete()
except Exception as e:
    print(e)

INFO:google.cloud.aiplatform.base:Deleting Context : projects/721521243942/locations/us-central1/metadataStores/default/contexts/forus-experiments-b7511da0-b5b5-11ef-8a30-0242ac1c000c


### Cloud development in Vertex AI Training

You can track an experiment in your cloud development using Vertex AI Training, by:

In your Python training script, repeat the same steps as in local development:

- Wrap (preamble) the creation of an experiment.
- Instantiate a run per training run in the experiment.
- Within the local training run, log the corresponding parameters and results.
- Create lineage to the artifacts and experiment data.
- Retreive the experiment data.

#### Package layout

Before you start the training, you'll look at how a Python package is assembled for a custom training job. When unarchived, the package contains the following directory/file layout.

- PKG-INFO
- README.md
- setup.cfg
- setup.py
- trainer
  - \_\_init\_\_.py
  - task.py

The files `setup.cfg` and `setup.py` are the instructions for installing the package into the operating environment of the Docker image.

The file `trainer/task.py` is the Python script for executing the custom training job. *Note*, when we referred to it in the worker pool specification, we replace the directory slash with a dot (`trainer.task`) and dropped the file suffix (`.py`).

#### Package Assembly

In the following cells, assemble the training package.

In [26]:
# Make folder for Python training script
! rm -rf custom
! mkdir custom

# Add package information
! touch custom/README.md

setup_cfg = "[egg_info]\n\ntag_build =\n\ntag_date = 0"
! echo "$setup_cfg" > custom/setup.cfg

setup_py = "import setuptools\n\nsetuptools.setup(\n\n    install_requires=[\n\n        'google-cloud-aiplatform',\n\n  ],\n\n    packages=setuptools.find_packages())"
! echo "$setup_py" > custom/setup.py

pkg_info = "Metadata-Version: 1.0\n\nName: Synethic Training Script for Experiments\n\nVersion: 0.0.0\n\nSummary: Demostration training script\n\nHome-page: www.google.com\n\nAuthor: Google\n\nAuthor-email: aferlitsch@google.com\n\nLicense: Public\n\nDescription: Demo\n\nPlatform: Vertex"
! echo "$pkg_info" > custom/PKG-INFO

# Make the training subfolder
! mkdir custom/trainer
! touch custom/trainer/__init__.py

#### Create synthetic training script

First, write a synthetic training script. It won't actually train a model, but instead mimics the training of the model:

- Argument parsing
  - `experiment`: The name of the experiment.
  - `run`: The name of the run within the experiment.
  - `epochs`: The number of epochs.
  - `dataset-uri`: The Cloud Storage location of the training data.
  - `model-dir`: The Cloud Storage location to save the trained model artifacts.
- Training functions
  - `get_data()`:
      - Get the training data.
      - Create the input dataset artifact.
      - Attach dataset artifact as input to execution context.
  - `get_model()`:
      - Get the model architecture.
  - `train_model()`:
      - Train the model.
  - `save_model()`:
      - Save the model.
      - Create the output model artifact.
      - Attach model artifact as output to execution context.
- Initialize the experiment (`init()`) and start a run (`start_run()`) within the experiment.
- Wrap the training with a `start_execution()`.
- Log the lineage to the experiment parameters (`log_metrics({"lineage"...)`).
- End the experiment run (`end_run()`).

In [27]:
%%writefile custom/trainer/task.py

import argparse
import os

import google.cloud.aiplatform as aiplatform

parser = argparse.ArgumentParser()
# Args for experiment
parser.add_argument('--experiment', dest='experiment',
                    required=True, type=str,
                    help='Name of experiment')
parser.add_argument('--run', dest='run',
                    required=True, type=str,
                    help='Name of run within the experiment')

# Hyperparameters for experiment
parser.add_argument('--epochs', dest='epochs',
                    default=10, type=int,
                    help='Number of epochs.')

parser.add_argument('--dataset-uri', dest='dataset_uri',
                    required=True, type=str,
                    help='Location of the dataset')

parser.add_argument('--model-dir', dest='model_dir',
                    default=os.getenv("AIP_MODEL_DIR"), type=str,
                    help='Storage location for the model')
args = parser.parse_args()

def get_data(dataset_uri, execution):
    # get the training data

    dataset_artifact = aiplatform.Artifact.create(
        schema_title="system.Dataset", display_name="example_dataset", uri=dataset_uri
    )

    execution.assign_input_artifacts([dataset_artifact])

    return None

def get_model():
    # get or create the model architecture
    return None

def train_model(dataset, model, epochs):
    aiplatform.log_params({"epochs": epochs})
    # train the model
    return model

def save_model(model, model_dir, execution):
    # save the model

    model_artifact = aiplatform.Artifact.create(
        schema_title="system.Model", display_name="example_model", uri=model_dir
    )
    execution.assign_output_artifacts([model_artifact])

# Create a run within the experiment
aiplatform.init(experiment=args.experiment)
aiplatform.start_run(args.run)

with aiplatform.start_execution(
    schema_title="system.ContainerExecution", display_name="example_training"
) as execution:
    dataset = get_data(args.dataset_uri, execution)
    model = get_model()
    model = train_model(dataset, model, args.epochs)
    save_model(model, args.model_dir, execution)

    # Store the lineage link in the experiment
    aiplatform.log_metrics({"lineage": execution.get_output_artifacts()[0].lineage_console_uri})

aiplatform.end_run()

Writing custom/trainer/task.py


#### Store training script on your Cloud Storage bucket

Next, package the training folder into a compressed tar ball, and then store it in your Cloud Storage bucket.

In [28]:
! rm -f custom.tar custom.tar.gz
! tar cvf custom.tar custom
! gzip custom.tar
! gsutil cp custom.tar.gz $BUCKET_URI/trainer.tar.gz

custom/
custom/setup.py
custom/trainer/
custom/trainer/__init__.py
custom/trainer/task.py
custom/README.md
custom/setup.cfg
custom/PKG-INFO
Copying file://custom.tar.gz [Content-Type=application/x-tar]...
/ [1 files][  1.3 KiB/  1.3 KiB]                                                
Operation completed over 1 objects/1.3 KiB.                                      


#### Create custom training job

A custom training job is created with the `CustomTrainingJob` class, with the following parameters:

- `display_name`: The human readable name for the custom training job.
- `container_uri`: The training container image.

- `python_package_gcs_uri`: The location of the Python training package as a tarball.
- `python_module_name`: The relative path to the training script in the Python package.

*Note:* There is no requirements parameter. You specify any requirements in the `setup.py` script in your Python package.

In [29]:
DISPLAY_NAME = "example"
TRAIN_IMAGE = "us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-12.py310:latest"

job = aiplatform.CustomPythonPackageTrainingJob(
    display_name=DISPLAY_NAME,
    python_package_gcs_uri=f"{BUCKET_URI}/trainer.tar.gz",
    python_module_name="trainer.task",
    container_uri=TRAIN_IMAGE,
)

#### Run the custom training job

Next, run the custom training job to start the training job by invoking the method `run()`, with the following parameters:

- `args`: The arguments to pass to the training script
    - `model_dir`: The Cloud Storage location to store the model.
    - `dataset_uri`: The Cloud Storage location of the dataset.
    - `epochs`: The number of epochs (hyperparameter).
    - `experiment`: The name of the experiment.
    - `run`: The name of the run within the experiment.
- `replica_count`: The number of VM instances.
- `machine_type`: The machine type for each VM instance.

In [None]:
TRAIN_COMPUTE = "n1-standard-4"
EXPERIMENT_NAME = f"example-{uuid.uuid1()}"
aiplatform.init(experiment=EXPERIMENT_NAME)

CMDARGS = [
    "--model-dir=" + BUCKET_URI,
    "--dataset-uri=gs://example/foo.csv",
    "--epochs=5",
    f"--experiment={EXPERIMENT_NAME}",
    "--run=run-1",
]

job.run(
    args=CMDARGS,
    replica_count=1,
    machine_type=TRAIN_COMPUTE,
    service_account=SERVICE_ACCOUNT,
    sync=True,
)

INFO:google.cloud.aiplatform.training_jobs:Training Output directory:
gs://mlops-ai-hangsik-1209/aiplatform-custom-training-2024-12-08-22:43:08.859 
INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/8801799728074326016?project=721521243942
INFO:google.cloud.aiplatform.training_jobs:CustomPythonPackageTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/8801799728074326016 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/7986085245566844928?project=721521243942
INFO:google.cloud.aiplatform.training_jobs:CustomPythonPackageTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/8801799728074326016 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:CustomPythonPackageTrainingJob p

#### Get the experiment results

Next, use the experiment name as a parameter to the method `get_experiment_df()` to get the results of the experiment as a pandas dataframe.

In this example, you stored the resource URI to the lineage as a metric value `lineage` in the execution run.

In [None]:
experiment_df = aiplatform.get_experiment_df()
experiment_df = experiment_df[experiment_df.experiment_name == EXPERIMENT_NAME]
print(experiment_df.T)

#### Visualize the artifact lineage

Next, open the link below to visualize the artifact lineage.

In [None]:
try:
    print("Open the following link", experiment_df["metric.lineage"][0])
except Exception as e:
    print(e)

#### Delete the custom training job

You can delete your custom training job using the `delete()` method.

In [None]:
job.delete()

#### Delete the experiment

Since the experiment was created within Vertex AI Training, to delete the experiment you use the `list()` method to obtain all the experiments for the project, and then filter on the experiment name.

In [None]:
experiments = aiplatform.Experiment.list()
for experiment in experiments:
    if experiment.name == EXPERIMENT_NAME:
        experiment.delete()

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.

In [None]:
! rm -rf custom

delete_bucket = False

if delete_bucket:
    ! gsutil rm -rf {BUCKET_URI}