In [20]:
# Copyright 2024 Forusone

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI TensorBoard custom training with custom container

* https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/tensorboard/tensorboard_custom_training_with_custom_container.ipynb
* [flower dataset](https://www.tensorflow.org/datasets/catalog/tf_flowers)

### Configuration

In [1]:
%pip install --user --quiet google-cloud-aiplatform 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type: "string"}

! gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [3]:
import random
import string

# Generate a uuid of a specifed length(default=8)
def generate_uuid(length: int = 8) -> str:
    return "".join(random.choices(string.ascii_lowercase + string.digits, k=length))

UUID = generate_uuid()

In [4]:
BUCKET_URI = f"gs://mlops-0221"  # @param {type:"string"}

In [26]:
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://mlops-0221/...
ServiceException: 409 A Cloud Storage bucket named 'mlops-0221' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


### Initialize aiplatform

In [5]:
import os
import google.cloud.aiplatform as aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

### Enable Artifact Registry API

In [7]:
! gcloud services enable artifactregistry.googleapis.com

### Create Docker repository

Create a Docker repository named `DOCKER_REPOSITORY` in your `LOCATION`.
This docker repository is deleted in the cleaning up section at the end.

In [8]:
DOCKER_REPOSITORY = "tensorboard"  # @param {type:"string"}
print("Docker repository to create:", DOCKER_REPOSITORY)

Docker repository to create: tensorboard


In [9]:
! gcloud  artifacts repositories create  $DOCKER_REPOSITORY --project={PROJECT_ID} \
--repository-format=docker \
--location={LOCATION} --description="Repository for TensorBoard Custom Training Job" 

[1;31mERROR:[0m (gcloud.artifacts.repositories.create) ALREADY_EXISTS: the repository already exists


In [10]:
! gcloud artifacts repositories list --project={PROJECT_ID}

Listing items under project ai-hangsik, across all locations.

                                                                                               ARTIFACT_REGISTRY
REPOSITORY                         FORMAT  MODE                 DESCRIPTION                                     LOCATION         LABELS  ENCRYPTION          CREATE_TIME          UPDATE_TIME          SIZE (MB)
cloud-run-source-deploy            DOCKER  STANDARD_REPOSITORY  Cloud Run Source Deployments                    asia-northeast3          Google-managed key  2024-03-01T14:59:17  2024-03-01T23:38:33  1505.522
kubeflow-test                      DOCKER  STANDARD_REPOSITORY                                                  asia-northeast3          Google-managed key  2024-11-10T07:54:48  2024-11-10T09:23:27  604.943
cpr-handler-prediction             DOCKER  STANDARD_REPOSITORY                                                  us-central1              Google-managed key  2025-02-03T22:42:11  2025-02-03T22:43:39  4

## Create a custom container image and push to Artifact Registry


In [11]:
!pwd

/home/jupyter/mlops_vertexai/03.training/tensorboard


In [12]:
# Create a folder for the image.

!mkdir tb-custom-container
%cd tb-custom-container

/home/jupyter/mlops_vertexai/03.training/tensorboard/tb-custom-container


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


### Create a training code


In [16]:
%%writefile task.py

import logging
import os

import tensorflow as tf
import tensorflow_datasets as tfds

IMG_WIDTH = 128

def normalize_img(image):
    """Normalizes image.

    * Resizes image to IMG_WIDTH x IMG_WIDTH pixels
    * Casts values from `uint8` to `float32`
    * Scales values from [0, 255] to [0, 1]

    Returns:
      A tensor with shape (IMG_WIDTH, IMG_WIDTH, 3). (3 color channels)
    """
    image = tf.image.resize_with_pad(image, IMG_WIDTH, IMG_WIDTH)
    return image / 255.


def normalize_img_and_label(image, label):
    """Normalizes image and label.

    * Performs normalize_img on image
    * Passes through label unchanged

    Returns:
      Tuple (image, label) where
      * image is a tensor with shape (IMG_WIDTH, IMG_WIDTH, 3). (3 color
        channels)
      * label is an unchanged integer [0, 4] representing flower type
    """
    return normalize_img(image), label

logging.info('Loading and preprocessing data ...')
dataset = tfds.load('tf_flowers:3.*.*',
                    split='train',
                    try_gcs=True,
                    shuffle_files=True,
                    as_supervised=True)
dataset = dataset.map(normalize_img_and_label,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.cache()
dataset = dataset.shuffle(1000)
dataset = dataset.batch(128)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

logging.info('Creating and training model ...')

model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(16,
                           3,
                           padding='same',
                           activation='relu',
                           input_shape=(IMG_WIDTH, IMG_WIDTH, 3)),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation="relu"),
    tf.keras.layers.Dense(5)  # 5 classes
])

logging.info('Compiling model ...')

model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])

log_dir = "logs"

if 'AIP_TENSORBOARD_LOG_DIR' in os.environ:
    log_dir=os.environ['AIP_TENSORBOARD_LOG_DIR']

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, 
                                                      histogram_freq=1)

logging.info('Training model ...')
model.fit(dataset, epochs=13, callbacks=[tensorboard_callback])

logging.info('Model training done')

Overwriting task.py


In [17]:
%%writefile Dockerfile

# Specifies base image and tag
FROM us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest
RUN pip install tensorflow-datasets
WORKDIR /root

# Installs additional packages as you need.

# Copies the trainer code to the docker image.
COPY task.py /root/task.py

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python", "task.py"]

Overwriting Dockerfile


In [18]:
IMAGE_NAME = "tensorboard-custom-container"
IMAGE_TAG = "v1"
IMAGE_URI = "{}-docker.pkg.dev/{}/{}/{}:{}".format(
    LOCATION, PROJECT_ID, DOCKER_REPOSITORY, IMAGE_NAME, IMAGE_TAG
)

! gcloud builds submit --project {PROJECT_ID} --region={LOCATION} --tag {IMAGE_URI} --timeout=20m

Creating temporary archive of 2 file(s) totalling 2.9 KiB before compression.
Uploading tarball of [.] to [gs://ai-hangsik_cloudbuild/source/1740100161.370207-a82e56bab1bb453d9cc6cf781b2dfbbd.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/ai-hangsik/locations/us-central1/builds/f2f48ac2-6636-4345-b89d-c73901d3f6c0].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds;region=us-central1/f2f48ac2-6636-4345-b89d-c73901d3f6c0?project=721521243942 ].
Waiting for build to complete. Polling interval: 1 second(s).
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "f2f48ac2-6636-4345-b89d-c73901d3f6c0"

FETCHSOURCE
Fetching storage object: gs://ai-hangsik_cloudbuild/source/1740100161.370207-a82e56bab1bb453d9cc6cf781b2dfbbd.tgz#1740100161561621
Copying gs://ai-hangsik_cloudbuild/source/1740100161.370207-a82e56bab1bb453d9cc6cf781b2dfbbd.tgz#1740100161561621...
/ [1 files][  1.4 KiB/  1.4 KiB]                       

## Setup service account and permissions

In [19]:
# @title Service account
shell_output = ! gcloud projects describe  $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print(f"SERVICE_ACCOUNT: {SERVICE_ACCOUNT}")

SERVICE_ACCOUNT: 721521243942-compute@developer.gserviceaccount.com


In [20]:
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

No changes made to gs://mlops-0221/
No changes made to gs://mlops-0221/


## Create a custom training job with your container
Create a TensorBoard instance to be used by the custom training job.

In [21]:
TENSORBOARD_NAME = "tensorboard-test"  # @param {type:"string"}

tensorboard = aiplatform.Tensorboard.create(
    display_name=TENSORBOARD_NAME, project=PROJECT_ID, location=LOCATION
)

TENSORBOARD_RESOURCE_NAME = tensorboard.gca_resource.name
print("TensorBoard resource name:", TENSORBOARD_RESOURCE_NAME)

Creating Tensorboard
Create Tensorboard backing LRO: projects/721521243942/locations/us-central1/tensorboards/8757086747701018624/operations/311680657069703168
Tensorboard created. Resource name: projects/721521243942/locations/us-central1/tensorboards/8757086747701018624
To use this Tensorboard in another session:
tb = aiplatform.Tensorboard('projects/721521243942/locations/us-central1/tensorboards/8757086747701018624')
TensorBoard resource name: projects/721521243942/locations/us-central1/tensorboards/8757086747701018624


Run the following example request to create your own custom training job using the container you just built and uploaded to Artifact Registry, and stream the training results to TensorBoard.

In [22]:
JOB_NAME = "tensorboard-example-job-{}".format(UUID)
BASE_OUTPUT_DIR = "{}/{}".format(BUCKET_URI, JOB_NAME)

job = aiplatform.CustomContainerTrainingJob(
    display_name=JOB_NAME,
    container_uri=IMAGE_URI,
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

job.run(
    machine_type="n1-standard-8",
    replica_count=1,
    service_account=SERVICE_ACCOUNT,
    tensorboard=TENSORBOARD_RESOURCE_NAME,
)

Training Output directory:
gs://mlops-0221/aiplatform-custom-training-2025-02-21-01:13:45.783 
View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/8786164397849444352?project=721521243942
CustomContainerTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/8786164397849444352 current state:
PipelineState.PIPELINE_STATE_RUNNING
View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/1005852993939046400?project=721521243942
View tensorboard:
https://us-central1.tensorboard.googleusercontent.com/experiment/projects+721521243942+locations+us-central1+tensorboards+8757086747701018624+experiments+1005852993939046400
CustomContainerTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/8786164397849444352 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomContainerTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/8786164397849444352 current 

In Google Cloud console, you can monitor your training job at Vertex AI > Training > Custom Jobs. In each custom training job, near real time updated TensorBoard is available at `OPEN TENSORBOARD` button. 
Learn more see [View Vertex AI TensorBoard data](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-view)

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, **if you created the individual resources in the notebook** you can delete them as follow:

In [None]:
import os

# Delete docker repository.
! gcloud artifacts repositories delete $DOCKER_REPOSITORY --project {PROJECT_ID} --location {LOCATION} --quiet

# Delete TensorBoard instance.
! gcloud ai tensorboards delete {TENSORBOARD_RESOURCE_NAME}

# Delete custom job.
job.delete()

# Delete GCS bucket.
delete_bucket = False
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI