In [None]:
# Copyright 2024 Forusone(shins777@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# AutoML training - Tabular Regression
This notebook is a simplified version of the notebook below from the official Google GitHub repository. You can find more diverse code samples and detailed information at the following links.
*  https://colab.sandbox.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/automl/sdk_automl_tabular_regression_online_bq.ipynb?authuser=2#scrollTo=iFuEezd2CzIk
*  https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/automl/
*  https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform

### Dataset

The dataset used for this tutorial is the GSOD dataset from [BigQuery public datasets](https://cloud.google.com/bigquery/public-data). This tutorial uses only the fields year, month and day to predict the value of the mean daily temperature (mean_temp).

### Install Vertex AI SDK

In [1]:
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                google-cloud-storage

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/6.5 MB[0m [31m22.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/6.5 MB[0m [31m26.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/6.5 MB[0m [31m26.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.5/6.5 MB[0m [31m44.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.5/6.5 MB[0m [31m44.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/131.8 kB[0m [31m?[0m eta [36m-:--:--

## Configuration

### Authenticate your notebook environment

In [2]:
import sys
from IPython.display import Markdown, display

# Google Cloud project ID and region; both are passed to aiplatform.init() in
# the next code cell. Replace PROJECT_ID with your own project before running.
PROJECT_ID="ai-hangsik"
LOCATION="us-central1"

# Interactive sign-in is only performed when the google.colab module is loaded,
# i.e. for regular Colab users. Colab Enterprise in Vertex AI does not need
# this step.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id=PROJECT_ID)

# Make PROJECT_ID the active project of the gcloud CLI as well.
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


### Initialize Vertex AI SDK

In [3]:
from google.cloud import aiplatform
from google.cloud import bigquery

# Register the project and region configured above with the Vertex AI SDK.
aiplatform.init(
    location=LOCATION,
    project=PROJECT_ID,
)

## Data preparation

#### Location of BigQuery training data.

In [4]:
# BigQuery source of the training data, written as a bq:// URI.
IMPORT_FILE = "bq://bigquery-public-data.samples.gsod"

### Create the Dataset

In [5]:
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.TabularDataset#google_cloud_aiplatform_TabularDataset_create

# Create a Vertex AI tabular dataset backed by the BigQuery table.
# `bq_source` is documented as a single "bq://project.dataset.table" URI
# string, so the URI is passed directly rather than wrapped in a list.
dataset = aiplatform.TabularDataset.create(
    display_name="automl_tabular_regression_online",
    bq_source=IMPORT_FILE,
)

# Name of the column the model is trained to predict.
label_column = "mean_temp"

# Show the full resource name of the newly created dataset.
print(dataset.resource_name)

INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/721521243942/locations/us-central1/datasets/2660831883113267200/operations/5526907849436823552
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/721521243942/locations/us-central1/datasets/2660831883113267200
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/721521243942/locations/us-central1/datasets/2660831883113267200')


projects/721521243942/locations/us-central1/datasets/2660831883113267200


## Model Training

### Create the training pipeline

In [8]:
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.AutoMLTabularTrainingJob

# Input columns used as features; each one gets the "auto" transformation.
feature_columns = ["year", "month", "day"]

# Define (but do not start) an AutoML regression job that minimizes RMSE.
job = aiplatform.AutoMLTabularTrainingJob(
    display_name="automl_tabular_regression_online",
    optimization_prediction_type="regression",
    optimization_objective="minimize-rmse",
    column_transformations=[
        {"auto": {"column_name": column}} for column in feature_columns
    ],
)

print(job)

<google.cloud.aiplatform.training_jobs.AutoMLTabularTrainingJob object at 0x78578167ecb0>


### Run the training pipeline

In [None]:
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.AutoMLTabularTrainingJob#google_cloud_aiplatform_AutoMLTabularTrainingJob_run

# Start training on the dataset created earlier and keep the resulting model.
model = job.run(
    dataset=dataset,
    model_display_name="automl_tabular_regression_online",
    # 60% / 20% / 20% split between training, validation and test data.
    training_fraction_split=0.6,
    validation_fraction_split=0.2,
    test_fraction_split=0.2,
    # Training budget in milli node hours (8000 = 8 node hours).
    budget_milli_node_hours=8000,
    # Early stopping stays enabled.
    disable_early_stopping=False,
    # Use the label column declared with the dataset instead of repeating
    # the literal, so the two cannot drift apart.
    target_column=label_column,
)

## Review model evaluation scores

In [None]:
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Model
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Model#google_cloud_aiplatform_Model_list_model_evaluations

# Fetch every evaluation attached to the trained model.
model_evaluations = model.list_model_evaluations()

# Bind the name up front so the print below cannot raise NameError when the
# model has no evaluations; in that case None is printed.
evaluation_metrics = None

if len(model_evaluations) > 0:
    # Take the metrics of the first evaluation in the list.
    eval_res = model_evaluations[0].to_dict()
    evaluation_metrics = eval_res["metrics"]

print(model_evaluations)
print(evaluation_metrics)

{'rootMeanSquaredError': 20.726614, 'meanAbsoluteError': 15.598564, 'meanAbsolutePercentageError': 9897303.0, 'rSquared': 0.2675524, 'rootMeanSquaredLogError': 0.84477913}


## Deploy the model

In [None]:
# Deploy the trained model to an endpoint. The minimum and maximum replica
# counts are both 1, so the deployment stays at exactly one replica.
replica_count = 1

endpoint = model.deploy(
    deployed_model_display_name="automl_tabular_regression_online",
    machine_type="n1-standard-4",
    min_replica_count=replica_count,
    max_replica_count=replica_count,
)

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/721521243942/locations/us-central1/endpoints/2128312563258097664/operations/1045544745226469376
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/721521243942/locations/us-central1/endpoints/2128312563258097664
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/721521243942/locations/us-central1/endpoints/2128312563258097664')
INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/721521243942/locations/us-central1/endpoints/2128312563258097664
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/721521243942/locations/us-central1/endpoints/2128312563258097664/operations/9156527674120732672
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/72152124394

## Send an online prediction request

In [13]:
# A single prediction row: the three feature columns, with values as strings.
INSTANCE = dict(year="1932", month="11", day="6")

# The endpoint is called with a list of instances; here it holds just one row.
instances_list = [INSTANCE]

In [15]:
# Send the request to the `endpoint` returned by model.deploy() above, so the
# cell works for whichever endpoint this run created instead of depending on
# an endpoint ID that only exists in one project.
# To reattach to an existing endpoint in a new session, first run:
#   endpoint = aiplatform.Endpoint(endpoint_name="<YOUR_ENDPOINT_ID>")
prediction = endpoint.predict(instances_list)
print(prediction)

Prediction(predictions=[{'value': 45.2983283996582, 'lower_bound': 12.78389739990234, 'upper_bound': 61.65780258178711}], deployed_model_id='2326501733187977216', metadata=None, model_version_id='1', model_resource_name='projects/721521243942/locations/us-central1/models/7398170640118710272', explanations=None)


## Undeploy the model

In [None]:
# Undeploy all models from the endpoint.
endpoint.undeploy_all()

# Cleaning up

In [None]:
# Delete the dataset using the Vertex dataset object
dataset.delete()

# Delete the model using the Vertex model object
model.delete()

# Delete the endpoint using the Vertex endpoint object
endpoint.delete()

# Delete the AutoML trainig job
job.delete()

delete_bucket = False  # set True to delete bucket

if delete_bucket:
    ! gsutil rm -r $BUCKET_URI