In [None]:
# Copyright 2024 Forusone(shins777@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# AutoML training tabular regression model for batch prediction using BigQuery

This notebook is a simplified version of the following official sample notebook, covering AutoML training and batch prediction:
*  https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/automl/sdk_automl_tabular_regression_batch_bq.ipynb#scrollTo=title

### Dataset

The dataset used for this tutorial is the [GSOD dataset](https://console.cloud.google.com/marketplace/product/noaa-public/gsod) from [BigQuery public datasets](https://cloud.google.com/bigquery/public-data). In this version of the dataset, you use the year, month, and day fields to predict the mean daily temperature (`mean_temp`).

### Install Vertex AI SDK


In [None]:
# Install the packages
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                    'google-cloud-bigquery[bqstorage,pandas]' \
                                    google-cloud-storage


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.8/131.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.1/240.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h

## Set configuration

### Authenticate your notebook environment

In [None]:
import sys
from IPython.display import Markdown, display

PROJECT_ID="ai-hangsik"
LOCATION="us-central1"

# For only colab user, no need this process for Colab Enterprise in Vertex AI.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id=PROJECT_ID)

# set project.
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


### Initialize Vertex AI SDK

In [None]:
from google.cloud import aiplatform, bigquery
# Initialize the Vertex AI SDK with the project and region configured above.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
)

## Data preparation

### Location of BigQuery training data.

Set the `IMPORT_FILE` variable to the location of the data table in BigQuery.

In [None]:
# BigQuery table that holds the source GSOD data; it is interpolated into the
# FROM clause of the queries that build the training and prediction tables.
IMPORT_FILE = "bigquery-public-data.samples.gsod"

#### Prepare the batch prediction data

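Create two BigQuery datasets in your project: one for the training data and one for the batch prediction input. Each is then filled with rows copied from the original table.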

In [None]:
# Create client in default location
# The client is bound to PROJECT_ID and reuses the credentials stored in the
# Vertex AI SDK's global configuration instead of authenticating separately.
bq_client = bigquery.Client(
    project=PROJECT_ID,
    credentials=aiplatform.initializer.global_config.credentials,
)

In [None]:
# Names of the BigQuery datasets that hold the training data and the
# batch prediction input data.
TRAINING_INPUT_DATASET_ID = "gsod_training_unique"
PREDICTION_INPUT_DATASET_ID = "gsod_prediction_unique"

# Create both datasets in the default location.
# exists_ok=True makes this cell safe to re-run: without it, create_dataset
# raises a Conflict error when the dataset already exists, so the message
# below is also printed when an existing dataset is reused.
for dataset_id in (TRAINING_INPUT_DATASET_ID, PREDICTION_INPUT_DATASET_ID):
    bq_dataset = bigquery.Dataset(f"{PROJECT_ID}.{dataset_id}")
    bq_dataset = bq_client.create_dataset(bq_dataset, exists_ok=True)
    print(f"Created dataset {bq_client.project}.{bq_dataset.dataset_id}")

Created dataset ai-hangsik.gsod_training_unique
Created dataset ai-hangsik.gsod_prediction_unique


In [None]:
# Copy the first TRAINING_SIZE rows returned from the source table into the
# training table.
# NOTE(review): there is no ORDER BY, so BigQuery does not guarantee which rows
# LIMIT returns; the rows picked here are not guaranteed to be disjoint from
# the rows later selected with OFFSET for prediction.
TRAINING_SIZE = 3000
query = f"""
        SELECT *
        FROM `{IMPORT_FILE}`
        LIMIT {TRAINING_SIZE}
        """

TRAINING_INPUT_TABLE_ID = f"{PROJECT_ID}.{TRAINING_INPUT_DATASET_ID}.test"
job_config = bigquery.QueryJobConfig(
    destination=TRAINING_INPUT_TABLE_ID,
    # Overwrite the destination so re-running this cell does not fail because
    # the table already contains data.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

query_job = bq_client.query(query, job_config=job_config)  # API request
# Wait for the query to finish; the trailing semicolon hides the RowIterator repr.
query_job.result();

<google.cloud.bigquery.table.RowIterator at 0x7eeec7552590>

In [None]:
# Copy PREDICTION_SIZE rows, skipping the first TRAINING_SIZE rows, from the
# source table into the prediction input table.
# NOTE(review): there is no ORDER BY, so LIMIT/OFFSET does not guarantee that
# these rows are disjoint from the rows copied into the training table.
PREDICTION_SIZE = 100
query = f"""
        SELECT *
        FROM `{IMPORT_FILE}`
        LIMIT {PREDICTION_SIZE}
        OFFSET {TRAINING_SIZE}
        """

PREDICTION_INPUT_TABLE_ID = f"{PROJECT_ID}.{PREDICTION_INPUT_DATASET_ID}.prediction"
job_config = bigquery.QueryJobConfig(
    destination=PREDICTION_INPUT_TABLE_ID,
    # Overwrite the destination so re-running this cell does not fail because
    # the table already contains data.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

query_job = bq_client.query(query, job_config=job_config)  # API request
# Wait for the query to finish; the trailing semicolon hides the RowIterator repr.
query_job.result();

<google.cloud.bigquery.table.RowIterator at 0x7eeec7551d20>

### Create the Dataset

In [None]:
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.TabularDataset#google_cloud_aiplatform_TabularDataset_create

# Register the BigQuery training table as a Vertex AI tabular dataset.
# `bq_source` is documented as a single "bq://project.dataset.table" string,
# so the URI is passed as a str rather than wrapped in a list.
dataset = aiplatform.TabularDataset.create(
    display_name="NOAA historical weather data_unique",
    bq_source=f"bq://{TRAINING_INPUT_TABLE_ID}",
)

# Resource name of the new dataset.
print(dataset.resource_name)

INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/721521243942/locations/us-central1/datasets/7502201482536550400/operations/2192401343694503936
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/721521243942/locations/us-central1/datasets/7502201482536550400
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/721521243942/locations/us-central1/datasets/7502201482536550400')


projects/721521243942/locations/us-central1/datasets/7502201482536550400


In [None]:
# Columns handed to the training job as column_specs, each mapped to "auto".
COLUMN_SPECS = dict.fromkeys(("year", "month", "day"), "auto")

# Column the model is trained to predict.
label_column = "mean_temp"

### Create and run training pipeline

In [None]:
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.AutoMLTabularTrainingJob

# Define (but do not yet run) an AutoML tabular regression job that minimizes
# RMSE and uses the columns listed in COLUMN_SPECS.
training_job = aiplatform.AutoMLTabularTrainingJob(
    display_name="job_automl_tabular_regression_batch",
    column_specs=COLUMN_SPECS,
    optimization_prediction_type="regression",
    optimization_objective="minimize-rmse",
)

print(training_job)

<google.cloud.aiplatform.training_jobs.AutoMLTabularTrainingJob object at 0x7eeec74cf340>


#### Run the training pipeline

In [None]:
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.AutoMLTabularTrainingJob#google_cloud_aiplatform_AutoMLTabularTrainingJob_run

# Train on the tabular dataset with a 60/20/20 train/validation/test split
# and keep the resulting model.
model = training_job.run(
    dataset=dataset,
    target_column=label_column,
    model_display_name="automl_tabular_regression_batch",
    training_fraction_split=0.6,
    validation_fraction_split=0.2,
    test_fraction_split=0.2,
    # Budget is expressed in milli node hours: 1,000 means 1 node hour.
    budget_milli_node_hours=1000,
    disable_early_stopping=False,
)

INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/3820928491365335040?project=721521243942
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/3820928491365335040 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/3820928491365335040 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/3820928491365335040 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/3820928491365335040 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud

## Review model evaluation scores
After model training is complete, you can review its evaluation scores.

In [None]:
# Get evaluations
model_evaluations = model.list_model_evaluations()

# Keep the first evaluation from the returned collection and print it.
# NOTE(review): indexing [0] raises IndexError if no evaluations are returned.
model_evaluation = list(model_evaluations)[0]
print(model_evaluation)

<google.cloud.aiplatform.model_evaluation.model_evaluation.ModelEvaluation object at 0x7eeec71cd510> 
resource name: projects/721521243942/locations/us-central1/models/4744424559690645504@1/evaluations/7446181540179244394


## Send a batch prediction request

Now you can make a batch prediction.

### Create a results dataset

Create a dataset to store the prediction results.

In [None]:
# Create the results dataset in the default location.
# exists_ok=True makes this cell safe to re-run: without it, create_dataset
# raises a Conflict error when the dataset already exists, so the message
# below is also printed when an existing dataset is reused.
RESULTS_DATASET_ID = "gsod_results_unique"
bq_dataset = bigquery.Dataset(f"{PROJECT_ID}.{RESULTS_DATASET_ID}")
bq_dataset = bq_client.create_dataset(bq_dataset, exists_ok=True)
print(f"Created dataset {bq_client.project}.{bq_dataset.dataset_id}")

Created dataset ai-hangsik.gsod_results_unique


### Make the batch prediction request

You can make a batch prediction by invoking the `batch_predict()` method, with the following parameters:

- `job_display_name`: The human readable name for the batch prediction job.
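- `bigquery_source`: The BigQuery URI of the input table (`bq://project.dataset.table`), as used in this notebook. Alternatively, `gcs_source` takes a list of one or more batch request input files in Cloud Storage.
- `bigquery_destination_prefix`: The BigQuery location for storing the batch prediction results, as used in this notebook. Alternatively, `gcs_destination_prefix` takes the Cloud Storage location for storing the results.
- `instances_format`: The format for the input instances, either 'bigquery', 'csv' or 'jsonl'. Defaults to 'jsonl'.
- `predictions_format`: The format for the output predictions, either 'bigquery', 'csv' or 'jsonl'. Defaults to 'jsonl'.
- `machine_type`: The type of machine to use for prediction.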
- `accelerator_type`: The hardware accelerator type.
- `accelerator_count`: The number of accelerators to attach to a worker replica.
- `sync`: Set `True` to wait until the completion of the job.

Batch prediction job takes roughly 1 hour to finish.

In [17]:
# The BigQuery input table and the destination dataset must be in the same location.
PREDICTION_RESULTS_DATASET_ID = f"{PROJECT_ID}.{RESULTS_DATASET_ID}"

# Read instances from the prediction input table and write the predictions
# into the results dataset.
batch_predict_job = model.batch_predict(
    job_display_name="tabular_regression_batch_predict_job",
    instances_format="bigquery",
    bigquery_source=f"bq://{PREDICTION_INPUT_TABLE_ID}",
    predictions_format="bigquery",
    bigquery_destination_prefix=f"bq://{PREDICTION_RESULTS_DATASET_ID}",
)

INFO:google.cloud.aiplatform.jobs:Creating BatchPredictionJob
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob created. Resource name: projects/721521243942/locations/us-central1/batchPredictionJobs/7154841260828655616
INFO:google.cloud.aiplatform.jobs:To use this BatchPredictionJob in another session:
INFO:google.cloud.aiplatform.jobs:bpj = aiplatform.BatchPredictionJob('projects/721521243942/locations/us-central1/batchPredictionJobs/7154841260828655616')
INFO:google.cloud.aiplatform.jobs:View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/7154841260828655616?project=721521243942
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/721521243942/locations/us-central1/batchPredictionJobs/7154841260828655616 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/721521243942/locations/us-central1/batchPredictionJobs/7154841260828655616 current state:
JobState.JOB_STAT

###  View the batch prediction results

Use the BigQuery Python client to query the destination table and return results as a Pandas dataframe.

In [19]:
# Query every table in the results dataset (wildcard match) and load the rows
# into a pandas DataFrame.
dataframe = bq_client.query(
    f"SELECT * FROM `{PREDICTION_RESULTS_DATASET_ID}.*`"
).result().to_dataframe()

dataframe

Unnamed: 0,day,fog,hail,max_gust_wind_speed,max_sustained_wind_speed,max_temperature,max_temperature_explicit,mean_dew_point,mean_sealevel_pressure,mean_station_pressure,...,predicted_mean_temp,rain,snow,snow_depth,station_number,thunder,tornado,total_precipitation,wban_number,year
0,1,False,False,,23.9,37.000000,True,,1019.599976,,...,"{'value': 52.18701171875, 'lower_bound': 34.60...",False,False,,70050,False,False,0.0,99999,1940
1,11,False,False,,5.1,46.000000,True,,,,...,"{'value': 51.26826477050781, 'lower_bound': 34...",False,False,,70700,False,False,0.0,99999,1940
2,26,False,False,,11.1,74.800003,True,,,,...,"{'value': 37.13731002807617, 'lower_bound': 18...",False,False,,942945,False,False,0.0,99999,1940
3,27,False,False,,14.0,75.699997,True,,,,...,"{'value': 28.0535888671875, 'lower_bound': 8.9...",False,False,,943350,False,False,0.0,99999,1940
4,9,False,False,,9.9,39.900002,True,,,,...,"{'value': 61.446102142333984, 'lower_bound': 4...",False,False,,943740,False,False,0.0,99999,1940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,13,True,True,,15.9,24.400000,True,24.700001,1012.000000,,...,"{'value': 34.59357452392578, 'lower_bound': 18...",True,True,,749105,True,True,,99999,1939
96,26,True,True,,9.9,54.299999,True,54.900002,1019.099976,,...,"{'value': 43.83852767944336, 'lower_bound': 29...",True,True,,749105,True,True,,99999,1939
97,9,False,False,,14.0,69.300003,True,66.699997,1008.200012,,...,"{'value': 61.446102142333984, 'lower_bound': 4...",False,False,,749105,False,False,0.0,99999,1939
98,12,False,False,,26.0,40.299999,True,36.700001,1008.200012,,...,"{'value': 45.62776565551758, 'lower_bound': 24...",False,False,,749105,False,False,,99999,1939




# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- BigQuery datasets (training, prediction input, and results)
- Vertex AI Dataset
- Model
- AutoML Training Job
- Batch Job

In [None]:
# Delete BigQuery datasets, including the tables inside them; a dataset that
# is already gone is ignored.
for dataset_id in (
    TRAINING_INPUT_DATASET_ID,
    PREDICTION_INPUT_DATASET_ID,
    RESULTS_DATASET_ID,
):
    bq_client.delete_dataset(
        f"{PROJECT_ID}.{dataset_id}",
        delete_contents=True,
        not_found_ok=True,
    )

# Delete Vertex AI resources
for resource in (dataset, model, training_job, batch_predict_job):
    resource.delete()