In [None]:
# Copyright 2024 Forusone(shins777@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# AutoML training - Tabular Classification
This notebook is a simplified version of the notebook below from the official Google Cloud GitHub repository. You can find more diverse code samples and detailed information at the following links.
*  https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/automl/automl-tabular-classification.ipynb
*  https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/automl/
*  https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform

#### Dataset

The dataset we are using is the PetFinder Dataset, available locally in Colab. To learn more about this dataset, visit https://www.kaggle.com/c/petfinder-adoption-prediction.



## Install Python packages


In [1]:
! pip3 install --quiet --upgrade google-cloud-aiplatform \
                                 google-cloud-storage

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.8/131.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h

## Set configuration

### Authenticate to access GCP

In [2]:
import sys
from IPython.display import Markdown, display

PROJECT_ID="ai-hangsik"
LOCATION="us-central1"

# For only colab user, no need this process for Colab Enterprise in Vertex AI.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id=PROJECT_ID)

# set project.
!gcloud config set project {PROJECT_ID}


Updated property [core/project].


### Initialize Vertex AI SDK

In [3]:
from google.cloud import aiplatform

# Register the project and region with the Vertex AI SDK.
aiplatform.init(
    location=LOCATION,
    project=PROJECT_ID,
)

## Data preparation

### Create a bucket

In [4]:
# Create a bucket.
BUCKET_URI = f"gs://mlops-{PROJECT_ID}-1209"
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://mlops-ai-hangsik-1209/...


### Copy the dataset into the bucket

In [12]:
IMPORT_FILE = "petfinder-tabular-classification.csv"
! gsutil cp gs://cloud-samples-data/ai-platform-unified/datasets/tabular/{IMPORT_FILE} {BUCKET_URI}/automl/data/

gcs_source = f"{BUCKET_URI}/automl/data/{IMPORT_FILE}"

Copying gs://cloud-samples-data/ai-platform-unified/datasets/tabular/petfinder-tabular-classification.csv [Content-Type=text/csv]...
/ [1 files][872.8 KiB/872.8 KiB]                                                
Operation completed over 1 objects/872.8 KiB.                                    


### Create a Managed tabular dataset from a CSV

Choose the dataset class that matches the type of your source data.

*   [TabularDataset](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.TabularDataset)
*   [TextDataset](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.TextDataset)
*   [TimeSeriesDataset](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.TimeSeriesDataset)
*   [ImageDataset](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.ImageDataset)
*   [VideoDataset](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.VideoDataset)




In [16]:
# API reference (class and its create() method):
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.TabularDataset
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.TabularDataset#google_cloud_aiplatform_TabularDataset_create

# Create a tabular dataset from the CSV that was copied into the bucket.
dataset = aiplatform.TabularDataset.create(
    gcs_source=gcs_source,
    display_name="automl_tabular_classification_petfinder",
)
ds = dataset  # short alias used by the training cell

# Show the dataset's resource name as the cell output.
ds.resource_name

INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/721521243942/locations/us-central1/datasets/1183651205335744512/operations/4255203904658079744
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/721521243942/locations/us-central1/datasets/1183651205335744512
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/721521243942/locations/us-central1/datasets/1183651205335744512')


'projects/721521243942/locations/us-central1/datasets/1183651205335744512'

### Model training

The following classes are used to create an AutoML training job.  
Note: To analyze text data, it is recommended to use Gemini (an LLM) rather than AutoML Text training.

*   [AutoMLTabularTrainingJob](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.AutoMLTabularTrainingJob)
*   [AutoMLForecastingTrainingJob](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.AutoMLForecastingTrainingJob)
*   [AutoMLImageTrainingJob](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.AutoMLImageTrainingJob)
*   [AutoMLVideoTrainingJob](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.AutoMLVideoTrainingJob)



#### Create a training job

In [17]:

# API reference:
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.AutoMLTabularTrainingJob

# Each (column, kind) pair below becomes one {kind: {"column_name": column}}
# entry, in the listed order.
job = aiplatform.AutoMLTabularTrainingJob(
    display_name="automl_tabular_classification_petfinder",
    optimization_prediction_type="classification",
    column_transformations=[
        {kind: {"column_name": column}}
        for column, kind in (
            ("Type", "categorical"),
            ("Age", "numeric"),
            ("Breed1", "categorical"),
            ("Color1", "categorical"),
            ("Color2", "categorical"),
            ("MaturitySize", "categorical"),
            ("FurLength", "categorical"),
            ("Vaccinated", "categorical"),
            ("Sterilized", "categorical"),
            ("Health", "categorical"),
            ("Fee", "numeric"),
            ("PhotoAmt", "numeric"),
        )
    ],
)

#### Run a training job

In [18]:
# Training takes about several hours to run, depending on the amount of data.

# API reference for run():
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.AutoMLTabularTrainingJob#google_cloud_aiplatform_AutoMLTabularTrainingJob_run
# Return type:
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Model

model = job.run(
    dataset=ds,
    target_column="Adopted",  # column the model is trained to predict
    model_display_name="automl_tabular_classification_petfinder",
    # 80% / 10% / 10% split between training, validation and test data.
    training_fraction_split=0.8,
    validation_fraction_split=0.1,
    test_fraction_split=0.1,
    disable_early_stopping=False,
)

INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/669446691182608384?project=721521243942
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/669446691182608384 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/669446691182608384 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/669446691182608384 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/721521243942/locations/us-central1/trainingPipelines/669446691182608384 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aipl

### Create endpoint

In [20]:
# Reference documentation
#   Model:
#     https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Model
#     https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Model#google_cloud_aiplatform_Model_delete
#   Endpoint:
#     https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Endpoint
#     https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Endpoint#google_cloud_aiplatform_Endpoint_deploy

# Deploy the trained model on a single n1-standard-4 replica; the returned
# endpoint is used by the prediction and cleanup cells.
endpoint = model.deploy(
    machine_type="n1-standard-4",
    min_replica_count=1,
    max_replica_count=1,
    deployed_model_display_name="automl_tabular_classification_petfinder",
)


INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/721521243942/locations/us-central1/endpoints/3054928186589577216/operations/3380027833320472576
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/721521243942/locations/us-central1/endpoints/3054928186589577216
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/721521243942/locations/us-central1/endpoints/3054928186589577216')
INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/721521243942/locations/us-central1/endpoints/3054928186589577216
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/721521243942/locations/us-central1/endpoints/3054928186589577216/operations/8903692776290385920
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/72152124394

### Prediction

In [21]:

# Reference documentation for Endpoint and Endpoint.predict():
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Endpoint
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Endpoint#google_cloud_aiplatform_Endpoint_predict

# Send a single instance to the endpoint; every feature value is passed as a string.
prediction = endpoint.predict(
    instances=[
        dict(
            Type="Cat",
            Age="3",
            Breed1="Tabby",
            Gender="Male",
            Color1="Black",
            Color2="White",
            MaturitySize="Small",
            FurLength="Short",
            Vaccinated="No",
            Sterilized="No",
            Health="Healthy",
            Fee="100",
            PhotoAmt="2",
        )
    ]
)

print(prediction)


Prediction(predictions=[{'classes': ['Yes', 'No'], 'scores': [0.7114446759223938, 0.2885552048683167]}], deployed_model_id='1862630971568816128', metadata=None, model_version_id='1', model_resource_name='projects/721521243942/locations/us-central1/models/225062333624352768', explanations=None)


### Undeploy the model

In [None]:
# Undeploy the model that served the prediction above, identified by the
# deployed_model_id carried on the prediction result.
endpoint.undeploy(
    deployed_model_id=prediction.deployed_model_id,
)


### Cleaning up

In [None]:

# Delete the training job
job.delete()

# Delete the model
model.delete()

# Delete the endpoint
endpoint.delete()

# Warning: Setting this to true will delete everything in your bucket
delete_bucket = False

if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI