In [None]:
# Copyright 2024 Forusone (shins777@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# AutoML tabular training - Classification
This notebook is a simplified version of the notebook below from the official Google Cloud GitHub repository. You can find more diverse code samples and detailed information at the following links.
*  https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/google_cloud_pipeline_components_automl_tabular.ipynb
*  https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/


### Install Python packages


In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                 google-cloud-storage \
                                 kfp \
                                 google-cloud-pipeline-components

### Check package version

In [None]:
# Read the installed versions from the kernel's own environment instead of
# spawning `python3`, which may be a different interpreter than the kernel.
# importlib.metadata reads package metadata without importing the packages.
import importlib.metadata

print('KFP SDK version: {}'.format(importlib.metadata.version("kfp")))
print('google_cloud_pipeline_components version: {}'.format(importlib.metadata.version("google-cloud-pipeline-components")))

KFP SDK version: 2.10.1
google_cloud_pipeline_components version: 2.18.0


### Set configuration

#### Authenticate to access Google Cloud

In [None]:
import sys
from IPython.display import Markdown, display

PROJECT_ID="ai-hangsik"
LOCATION="us-central1"

# For only colab user, no need this process for Colab Enterprise in Vertex AI.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id=PROJECT_ID)

# set project.
!gcloud config set project {PROJECT_ID}


Updated property [core/project].


### Import libraries

In [None]:
from typing import NamedTuple

import google.cloud.aiplatform as aiplatform
import kfp
from google.cloud import bigquery
from kfp import compiler, dsl
from kfp.dsl import ( Artifact,
                      ClassificationMetrics,
                      Input,
                      Metrics,
                      Output,
                      component)

### Data Preparation

#### Create a bucket

In [None]:
# Create a bucket.
BUCKET_URI = f"gs://mlops-{PROJECT_ID}-1209-pipeline"
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://mlops-ai-hangsik-1209-pipeline/...


#### Copy the dataset into the bucket
*  gs://cloud-samples-data/vertex-ai/pipeline-deployment/datasets/california_housing/california_housing_train.csv

In [None]:
# ! gsutil cp gs://cloud-ml-data/NL-classification/happiness.csv {BUCKET_URI}/data/
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/"

TRAIN_FILE_NAME = "california_housing_train.csv"
! gsutil cp gs://cloud-samples-data/vertex-ai/pipeline-deployment/datasets/california_housing/california_housing_train.csv {PIPELINE_ROOT}/data/

GCS_CSV_PATH = f"{PIPELINE_ROOT}/data/{TRAIN_FILE_NAME}"

Copying gs://cloud-samples-data/vertex-ai/pipeline-deployment/datasets/california_housing/california_housing_train.csv [Content-Type=text/csv]...
/ [1 files][  1.6 MiB/  1.6 MiB]                                                
Operation completed over 1 objects/1.6 MiB.                                      


#### Set pipeline configuration

### Initialize Vertex AI SDK

In [None]:
# Configure the Vertex AI SDK with the project, region and staging bucket
# defined in the cells above.
aiplatform.init(
    staging_bucket=BUCKET_URI,
    location=LOCATION,
    project=PROJECT_ID,
)

In [None]:

@kfp.dsl.pipeline(name="automl-tabular-training")
def automl_pipeline(project: str = PROJECT_ID, region: str = LOCATION):
    """Create a tabular dataset, train an AutoML regression model and deploy it.

    Args:
        project: Google Cloud project ID passed to the dataset, training and
            endpoint steps.
        region: Vertex AI region passed to the dataset, training and endpoint
            steps.
    """
    from google_cloud_pipeline_components.v1.automl.training_job import \
        AutoMLTabularTrainingJobRunOp
    from google_cloud_pipeline_components.v1.dataset import \
        TabularDatasetCreateOp
    from google_cloud_pipeline_components.v1.endpoint import (EndpointCreateOp,
                                                              ModelDeployOp)

    # Every column of the housing CSV is declared numeric, including the
    # target column, exactly as in the original transformation list.
    numeric_columns = (
        "longitude",
        "latitude",
        "housing_median_age",
        "total_rooms",
        "total_bedrooms",
        "population",
        "households",
        "median_income",
        "median_house_value",
    )

    # `location=region` is passed explicitly so the `region` parameter is
    # honoured by the dataset and training steps, not only by the endpoint.
    dataset_create_op = TabularDatasetCreateOp(
        project=project,
        location=region,
        display_name="housing",
        gcs_source=GCS_CSV_PATH,
    )

    training_op = AutoMLTabularTrainingJobRunOp(
        project=project,
        location=region,
        display_name="train-automl-cal_housing",
        optimization_prediction_type="regression",
        optimization_objective="minimize-rmse",
        column_transformations=[
            {"numeric": {"column_name": column}} for column in numeric_columns
        ],
        dataset=dataset_create_op.outputs["dataset"],
        target_column="median_house_value",
    )

    endpoint_op = EndpointCreateOp(
        project=project,
        location=region,
        display_name="train-automl-cal_housing_endpoint",
    )

    # Deploy the trained model to the new endpoint on a single
    # n1-standard-4 replica (min and max replica count are both 1).
    ModelDeployOp(
        model=training_op.outputs["model"],
        endpoint=endpoint_op.outputs["endpoint"],
        dedicated_resources_machine_type="n1-standard-4",
        dedicated_resources_min_replica_count=1,
        dedicated_resources_max_replica_count=1,
    )

### Compile the pipeline

In [None]:

from kfp import compiler

# Compile the pipeline function into a YAML template file that the
# PipelineJob cell loads by the same file name.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    package_path="tabular_regression_pipeline.yaml",
    pipeline_func=automl_pipeline,
)

In [None]:
import datetime

# Build a display name with a timestamp suffix (YYYYMMDD-HHMMSS, local time).
now = datetime.datetime.now()
now_format = f"{now:%Y%m%d-%H%M%S}"
DISPLAY_NAME = f"cal_housing_{now_format}"

# Create the job from the compiled template; caching is turned off.
job = aiplatform.PipelineJob(
    template_path="tabular_regression_pipeline.yaml",
    pipeline_root=PIPELINE_ROOT,
    display_name=DISPLAY_NAME,
    enable_caching=False,
)

# Submit and run the pipeline job.
job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/721521243942/locations/us-central1/pipelineJobs/automl-tabular-training-20241207090310
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/721521243942/locations/us-central1/pipelineJobs/automl-tabular-training-20241207090310')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/automl-tabular-training-20241207090310?project=721521243942
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/721521243942/locations/us-central1/pipelineJobs/automl-tabular-training-20241207090310 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/721521243942/location

In [None]:
# Disabled: undeploy a model from an endpoint.
# NOTE(review): neither `endpoint` nor `prediction` is defined in the cells
# visible in this notebook — confirm where they come from before enabling.
#endpoint.undeploy(deployed_model_id=prediction.deployed_model_id)


In [None]:
# Optional cleanup, left commented out.
# NOTE(review): at this point in the notebook `job` is the PipelineJob from
# the cell above; `model` is only assigned in a later cell and `endpoint` is
# not assigned in any visible cell — confirm the intended order before
# enabling these lines.

# # Delete the training job
# job.delete()

# # Delete the model
# model.delete()

# # Delete the endpoint
# endpoint.delete()

# # Warning: Setting this to true will delete everything in your bucket
# delete_bucket = False

# if delete_bucket:
#     ! gsutil -m rm -r $BUCKET_URI

In [None]:
IMPORT_FILE = "petfinder-tabular-classification.csv"
! gsutil cp gs://cloud-samples-data/ai-platform-unified/datasets/tabular/{IMPORT_FILE} {BUCKET_URI}/data/

gcs_source = f"{BUCKET_URI}/data/{IMPORT_FILE}"

Copying gs://cloud-samples-data/ai-platform-unified/datasets/tabular/petfinder-tabular-classification.csv [Content-Type=text/csv]...
- [1 files][872.8 KiB/872.8 KiB]                                                
Operation completed over 1 objects/872.8 KiB.                                    


### Set constants

In [None]:
# Pipeline naming and artifact location.
#   PIPELINE_NAME: name for the pipeline.
#   PIPELINE_ROOT: Cloud Storage path under which pipeline artifacts are stored.
# NOTE: this rebinds PIPELINE_ROOT, which an earlier cell set to
# "<BUCKET_URI>/pipeline_root/".
PIPELINE_NAME = "automl-tabular-beans-training"
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/beans"

### Initialize Vertex AI SDK

In [None]:
import google.cloud.aiplatform as aiplatform

# Point the Vertex AI SDK at the configured project and region.
aiplatform.init(
    location=LOCATION,
    project=PROJECT_ID,
)

### Create a Managed tabular dataset from a CSV

In [None]:
# Create a Vertex AI managed tabular dataset from the CSV copied to the bucket.
dataset = aiplatform.TabularDataset.create(
    gcs_source=gcs_source,
    display_name="petfinder-tabular-dataset",
)
ds = dataset  # short alias; the training cell passes `ds` to job.run()

# Show the resource name of the new dataset as the cell output.
ds.resource_name

INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/721521243942/locations/asia-northeast3/datasets/5353566096106455040/operations/2516156493863059456
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/721521243942/locations/asia-northeast3/datasets/5353566096106455040
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/721521243942/locations/asia-northeast3/datasets/5353566096106455040')


'projects/721521243942/locations/asia-northeast3/datasets/5353566096106455040'

In [None]:
# (column name, transformation type) pairs, in the order they are handed to
# the training job.
petfinder_feature_types = [
    ("Type", "categorical"),
    ("Age", "numeric"),
    ("Breed1", "categorical"),
    ("Color1", "categorical"),
    ("Color2", "categorical"),
    ("MaturitySize", "categorical"),
    ("FurLength", "categorical"),
    ("Vaccinated", "categorical"),
    ("Sterilized", "categorical"),
    ("Health", "categorical"),
    ("Fee", "numeric"),
    ("PhotoAmt", "numeric"),
]

# Define the AutoML classification training job.
job = aiplatform.AutoMLTabularTrainingJob(
    display_name="train-petfinder-automl-1",
    optimization_prediction_type="classification",
    column_transformations=[
        {kind: {"column_name": column}}
        for column, kind in petfinder_feature_types
    ],
)

# This takes about an hour to run
# Train on the managed dataset with an 80/10/10 train/validation/test split,
# predicting the "Adopted" column.
model = job.run(
    dataset=ds,
    target_column="Adopted",
    model_display_name="adopted-prediction-model",
    training_fraction_split=0.8,
    validation_fraction_split=0.1,
    test_fraction_split=0.1,
    disable_early_stopping=False,
)

INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/asia-northeast3/training/3291437535770705920?project=721521243942
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/721521243942/locations/asia-northeast3/trainingPipelines/3291437535770705920 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/721521243942/locations/asia-northeast3/trainingPipelines/3291437535770705920 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/721521243942/locations/asia-northeast3/trainingPipelines/3291437535770705920 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/721521243942/locations/asia-northeast3/trainingPipelines/3291437535770705920 current state:
PipelineState.PIPELINE_STATE_RUNNI