## Run an AutoML Tabular Training Pipeline in Vertex AI (No Containerized Code)

## 1 - Pip Install Relevant Libraries

In [None]:
import os

# The Vertex AI Workbench Notebook product has specific requirements
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME") and not os.getenv("VIRTUAL_ENV")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
    "/opt/deeplearning/metadata/env_version"
)

# Vertex AI Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_WORKBENCH_NOTEBOOK:
    USER_FLAG = "--user"

! pip3 install --user --force-reinstall 'google-cloud-aiplatform>=1.15' -q --no-warn-conflicts
! pip3 install google-cloud-pipeline-components -q --no-warn-conflicts
! pip3 install google-cloud-bigquery -q --no-warn-conflicts
! pip3 install {USER_FLAG} tensorflow==2.15.0 --upgrade -q --no-warn-conflicts
! pip3 install {USER_FLAG} pandas
! pip3 install {USER_FLAG} db-dtypes
! pip3 install {USER_FLAG} scikit-learn
! pip3 install {USER_FLAG} kfp

In [None]:
# Restart the kernel so the packages installed above are picked up.
# Skipped when the IS_TESTING environment variable is set.
import os

if not os.getenv("IS_TESTING"):
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
# Double-check which TensorFlow packages (and versions) are installed
! pip3 freeze | grep tensorflow

## 2 - Define Constants

In [None]:
# set project ID

import os

project_id = "" #set to your project ID
location = '' #set to your region, for example us-central1

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    project_id = shell_output[0]
    print("Project ID: ", project_id)


pipeline_bucket_name = f'${project_id}-mlops-spend'
pipeline_root_path = f'gs://{pipeline_bucket_name}'
service_account = "pipeline-sa@${project_id}.iam.gserviceaccount.com"

## 3 - Import Libraries

In [6]:
# import libraries

import kfp
from google.cloud import aiplatform
from google_cloud_pipeline_components.v1.dataset import TabularDatasetCreateOp
from google_cloud_pipeline_components.v1.automl.training_job import AutoMLTabularTrainingJobRunOp
from google_cloud_pipeline_components.v1.endpoint import EndpointCreateOp, ModelDeployOp
from google_cloud_pipeline_components.v1.model import ModelExportOp
import google.cloud.aiplatform as aiplatform
from google.cloud import storage


## 4 - Define the workflow of the pipeline

In [7]:
# Define the workflow of the pipeline.
@kfp.dsl.pipeline(
    name="automl-tabular-training-v2",
    pipeline_root=pipeline_root_path)
def pipeline(project_id: str):
    """Create a dataset, train an AutoML regression model, then deploy and export it.

    Steps:
      1. Create a tabular dataset from the BigQuery table
         `<project_id>.unified_data.spend`.
      2. Run an AutoML tabular training job that predicts
         `spend_virtual_currency_value` (regression, minimize RMSE).
      3. Create an endpoint and deploy the trained model to it.
      4. Export the trained model in `tf-saved-model` format to the pipeline root.

    `location` and `pipeline_root_path` are read from the notebook's global
    variables; only the project is a pipeline parameter.

    Args:
        project_id: Google Cloud project used for the dataset, the training job
            and the endpoint.
    """
    ds_op = TabularDatasetCreateOp(
        project=project_id,
        location=location,
        display_name="spend-dataset-pipelines",
        bq_source=f"bq://{project_id}.unified_data.spend"
    )

    # The training job must run in the same region as the dataset, so `location`
    # is passed explicitly instead of relying on the component default.
    training_job_run_op = AutoMLTabularTrainingJobRunOp(
        dataset=ds_op.outputs["dataset"],
        target_column="spend_virtual_currency_value",
        project=project_id,
        location=location,
        display_name="spend-automl-pipelines-minimize-rmse",
        # NOTE(review): the leading "i" looks like a typo for "spend-..." - confirm
        # before renaming, in case something looks the model up by this name.
        model_display_name="ispend-automl-pipelines-minimize-rmse",
        optimization_prediction_type="regression",
        budget_milli_node_hours=1000,
        optimization_objective="minimize-rmse"
    )

    # The endpoint is created in the same region as the model it will serve.
    create_endpoint_op = EndpointCreateOp(
        project=project_id,
        location=location,
        display_name="spend-automl-pipelines-minimize-rmse-endpoint",
    )

    model_deploy_op = ModelDeployOp(
        model=training_job_run_op.outputs["model"],
        endpoint=create_endpoint_op.outputs['endpoint'],
        dedicated_resources_machine_type="n1-highmem-4",
        dedicated_resources_accelerator_type="ACCELERATOR_TYPE_UNSPECIFIED",
        dedicated_resources_min_replica_count=1,
        dedicated_resources_max_replica_count=1,
    )

    model_export_op = ModelExportOp(
        model=training_job_run_op.outputs["model"],
        export_format_id="tf-saved-model",
        artifact_destination=pipeline_root_path
    )

    # Turn caching off for the export step
    model_export_op.set_caching_options(False)

## 5 - Compile the pipeline

In [11]:
# Compile the pipeline function into a YAML pipeline definition
compiler = kfp.compiler.Compiler()
compiler.compile(pipeline_func=pipeline, package_path='spend_pipeline.yaml')

## 5.1 - Upload the compiled file to GCS

In [None]:
# Copy the compiled pipeline YAML into the pipeline bucket on Google Cloud Storage.
# The object keeps the same name as the local file.
compiled_pipeline_file = 'spend_pipeline.yaml'

storage_client = storage.Client(project=project_id)
bucket = storage_client.bucket(pipeline_bucket_name)
blob = bucket.blob(compiled_pipeline_file)
blob.upload_from_filename(compiled_pipeline_file)

## 6 - Prepare the pipeline job

In [None]:
# Point the Vertex AI SDK at the project and region configured earlier
aiplatform.init(project=project_id, location=location)

# Values for the pipeline's runtime parameters (the `pipeline` function arguments)
pipeline_parameters = {'project_id': project_id}

# Build the pipeline job from the locally compiled YAML; nothing is submitted yet
job = aiplatform.PipelineJob(
    display_name="spend-pipeline",
    template_path="spend_pipeline.yaml",
    pipeline_root=pipeline_root_path,
    parameter_values=pipeline_parameters,
)

## 7 - Submit the job

In [None]:
# Submit the prepared pipeline job, running it as the pipeline service account
job.submit(service_account=service_account)