In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License. 

## KFP modularization

In [1]:
# Install the packages
%pip install --user --quiet google-cloud-aiplatform \
                         google-cloud-storage \
                         google-cloud-pipeline-components \
                         kfp


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Print the installed KFP and google_cloud_pipeline_components versions.
# Each check is run through the shell in its own python3 process.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

KFP SDK version: 2.10.1
google_cloud_pipeline_components version: 2.19.0


In [3]:
import sys
from IPython.display import Markdown, display

PROJECT_ID="ai-hangsik"
LOCATION="us-central1"

# For only colab user, no need this process for Colab Enterprise in Vertex AI.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id=PROJECT_ID)

# set project.
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [81]:
import json
from typing import NamedTuple, List

from google.cloud import aiplatform

from kfp import compiler, dsl
from kfp import client,compiler, dsl
from kfp.dsl import Artifact, Metrics, Dataset, Input, Model, Output, component

# Initialise the Vertex AI SDK with the project and region configured above.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
)

In [6]:
# Create a bucket.
BUCKET_URI = f"gs://mlops-poc-0303"
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://mlops-poc-0303/...
ServiceException: 409 A Cloud Storage bucket named 'mlops-poc-0303' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [7]:
# Bucket folder passed as pipeline_root to the pipeline definition and the PipelineJob.
PIPELINE_ROOT = BUCKET_URI + "/pipeline/iris/"

In [8]:
shell_output = ! gcloud projects describe  $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")

SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print(f"SERVICE_ACCOUNT: {SERVICE_ACCOUNT}")

SERVICE_ACCOUNT: 721521243942-compute@developer.gserviceaccount.com


In [9]:
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewerroles/logging.logWriter

No changes made to gs://mlops-poc-0303/
No changes made to gs://mlops-poc-0303/


### Modules

In [82]:
@dsl.component(base_image = "python:3.10",
               packages_to_install=['pandas'])
def create_dataset(dataset: Output[Dataset]):
    """Download the Iris CSV and store it as the `dataset` output artifact.

    Args:
        dataset: Output artifact; the CSV (with a header row) is written to
            `dataset.path`.
    """
    import pandas as pd

    csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
    # The source file has no header row, so column names are supplied here.
    col_names = [
        'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Labels'
    ]
    df = pd.read_csv(csv_url, names=col_names)

    # index=False: do not write the row index. Otherwise it is read back
    # downstream as an extra "Unnamed: 0" column and treated as a feature.
    with open(dataset.path, 'w') as f:
        df.to_csv(f, index=False)

# Compile the component to a reusable YAML definition.
compiler.Compiler().compile(create_dataset, "create_dataset.yaml")

In [83]:
@dsl.component(base_image = "python:3.10",
               packages_to_install=['pandas', 'scikit-learn'])
def normalize_dataset(
    input_dataset: Input[Dataset],
    normalized_dataset: Output[Dataset],
    standard_scaler: bool,
    min_max_scaler: bool,
):
    """Scale the feature columns of a CSV dataset and write the result as CSV.

    Args:
        input_dataset: Input CSV artifact containing a 'Labels' column.
        normalized_dataset: Output artifact; the scaled features plus the
            original 'Labels' column are written to its path.
        standard_scaler: Use sklearn's StandardScaler.
        min_max_scaler: Use sklearn's MinMaxScaler.

    Raises:
        ValueError: If both flags are True or both are False.
    """
    if standard_scaler is min_max_scaler:
        raise ValueError(
            'Exactly one of standard_scaler or min_max_scaler must be True.')

    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import StandardScaler

    with open(input_dataset.path) as f:
        df = pd.read_csv(f)

    # A CSV written with its row index comes back with an "Unnamed: N" column;
    # drop it so the index is not scaled and passed on as a feature.
    df = df.drop(columns=[c for c in df.columns if str(c).startswith('Unnamed')])
    labels = df.pop('Labels')

    # The check above guarantees exactly one of the two flags is set.
    scaler = StandardScaler() if standard_scaler else MinMaxScaler()

    df = pd.DataFrame(scaler.fit_transform(df))
    df['Labels'] = labels
    # index=False keeps the row index out of the output file.
    with open(normalized_dataset.path, 'w') as f:
        df.to_csv(f, index=False)

# Compile the component to a reusable YAML definition.
compiler.Compiler().compile(normalize_dataset, "normalize_dataset.yaml")

In [100]:
@dsl.component(base_image="python:3.10",
               packages_to_install=['pandas', 'scikit-learn'])
def train_model(
    n_neighbors: int,
    normalized_dataset: Input[Dataset],
    model: Output[Model],
):
    """Fit a k-nearest-neighbours classifier and save it as `model.joblib`.

    Args:
        n_neighbors: Number of neighbours for KNeighborsClassifier.
        normalized_dataset: Input CSV artifact containing a 'Labels' column;
            all remaining columns are used as features.
        model: Output artifact; `model.path` is created as a directory and the
            fitted classifier is dumped into it as "model.joblib".
    """
    import os

    import joblib
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    with open(normalized_dataset.path) as f:
        df = pd.read_csv(f)

    # A CSV written with its row index comes back with an "Unnamed: N" column;
    # drop it so the classifier is not trained on the row number.
    df = df.drop(columns=[c for c in df.columns if str(c).startswith('Unnamed')])

    y = df.pop('Labels')
    X = df

    # Only the training split is used here; the held-out split is discarded.
    X_train, _, y_train, _ = train_test_split(X, y, random_state=0)

    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(X_train, y_train)

    # Export the model to a file inside the artifact directory.
    os.makedirs(model.path, exist_ok=True)
    joblib.dump(clf, os.path.join(model.path, "model.joblib"))

# Compile the component to a reusable YAML definition.
compiler.Compiler().compile(train_model, "train_model.yaml")

In [101]:
@dsl.component(
    base_image="python:3.10",
    packages_to_install=['google-cloud-aiplatform', 'pandas', 'scikit-learn'],
)
def deploy_model(
    model: Input[Model],
    project_id: str,
    model_display_name: str,
    model_serving_container_image_uri: str,
    model_serving_machine_type: str,
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model],
):
    """Upload the trained model to Vertex AI and deploy it.

    Args:
        model: Input model artifact; its URI is used as the upload artifact_uri.
        project_id: Project passed to `aiplatform.init`.
        model_display_name: Display name for the uploaded model.
        model_serving_container_image_uri: Serving container image for the model.
        model_serving_machine_type: Machine type passed to `deploy`.
        vertex_endpoint: Output artifact; its URI is set to the endpoint resource name.
        vertex_model: Output artifact; its URI is set to the model resource name.
    """
    from google.cloud import aiplatform

    aiplatform.init(project=project_id)

    # Upload the model, then deploy the uploaded model.
    uploaded_model = aiplatform.Model.upload(
        serving_container_image_uri=model_serving_container_image_uri,
        artifact_uri=model.uri,
        display_name=model_display_name,
    )
    serving_endpoint = uploaded_model.deploy(
        machine_type=model_serving_machine_type,
    )

    # Record the created resources on the output artifacts.
    vertex_model.uri = uploaded_model.resource_name
    vertex_endpoint.uri = serving_endpoint.resource_name

# Compile the component to a reusable YAML definition.
compiler.Compiler().compile(deploy_model, "deploy_model.yaml")

In [102]:
def build_pipeline(pipeline_name: str,
                   pipeline_desc: str,
                   pipeline_root: str,

                   create_dataset_file: str,
                   normalize_dataset_file: str,
                   train_model_file: str,
                   deploy_model_file: str,

                   model_display_name: str,
                   model_serving_container_image_uri: str,
                   model_serving_machine_type: str,

                   package_path: str = "kfp_module_pipeline.yaml",
                   ):
    """Assemble the four component YAML files into a pipeline and compile it.

    The steps are chained as create dataset -> normalize -> train -> deploy.
    The deploy step uses the notebook-level PROJECT_ID as its project.

    Args:
        pipeline_name: Name given to the pipeline.
        pipeline_desc: Description given to the pipeline.
        pipeline_root: Pipeline root passed to the pipeline definition.
        create_dataset_file: Path to the compiled create-dataset component.
        normalize_dataset_file: Path to the compiled normalize component.
        train_model_file: Path to the compiled training component.
        deploy_model_file: Path to the compiled deployment component.
        model_display_name: Display name forwarded to the deploy step.
        model_serving_container_image_uri: Serving image forwarded to the deploy step.
        model_serving_machine_type: Machine type forwarded to the deploy step.
        package_path: Where the compiled pipeline is written. Defaults to
            the previously hard-coded "kfp_module_pipeline.yaml".
    """
    from kfp import components

    # Load every component once, before the pipeline is defined.
    create_dataset_comp = components.load_component_from_file(create_dataset_file)
    normalize_dataset_comp = components.load_component_from_file(normalize_dataset_file)
    train_model_comp = components.load_component_from_file(train_model_file)
    deploy_model_comp = components.load_component_from_file(deploy_model_file)

    @dsl.pipeline(
        name=pipeline_name,
        description=pipeline_desc,
        pipeline_root=pipeline_root,
    )
    def module_pipeline(standard_scaler: bool,
                        min_max_scaler: bool,
                        n_neighbors: int):
        # 1. create dataset
        create_dataset_task = create_dataset_comp()

        # 2. normalize dataset
        normalize_dataset_task = normalize_dataset_comp(
            input_dataset=create_dataset_task.outputs['dataset'],
            standard_scaler=standard_scaler,
            min_max_scaler=min_max_scaler,
        )

        # 3. model training
        train_model_task = train_model_comp(
            normalized_dataset=normalize_dataset_task.outputs['normalized_dataset'],
            n_neighbors=n_neighbors,
        )

        # 4. deploy model
        deploy_model_comp(
            model=train_model_task.outputs['model'],
            project_id=PROJECT_ID,
            model_display_name=model_display_name,
            model_serving_container_image_uri=model_serving_container_image_uri,
            model_serving_machine_type=model_serving_machine_type,
        )

    compiler.Compiler().compile(pipeline_func=module_pipeline, package_path=package_path)


In [107]:
build_pipeline(
    pipeline_name="kfp_module_pipeline",
    pipeline_desc="desc for kfp_module_pipeline",
    pipeline_root=PIPELINE_ROOT,

    # Component definitions compiled by the cells above.
    create_dataset_file='create_dataset.yaml',
    normalize_dataset_file='normalize_dataset.yaml',
    train_model_file='train_model.yaml',
    deploy_model_file='deploy_model.yaml',

    model_display_name='kfp_module_model',

    # Pre-built serving containers:
    # https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers#expandable-4
    model_serving_container_image_uri='us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-5:latest',

    # E2 machine types:
    # https://cloud.google.com/compute/docs/general-purpose-machines#e2_machine_types_table
    model_serving_machine_type='e2-standard-4',
)

In [108]:

# Runtime values for the parameters declared by the compiled pipeline.
pipeline_params = {
    'min_max_scaler': True,
    'standard_scaler': False,
    'n_neighbors': 3,
}

# Create the job from the compiled pipeline template.
job = aiplatform.PipelineJob(
    display_name="kfp_module_pipeline",
    template_path="kfp_module_pipeline.yaml",
    parameter_values=pipeline_params,
    pipeline_root=PIPELINE_ROOT,
    enable_caching=True,
)

# Run the job under the service account resolved earlier in the notebook.
job.run(service_account=SERVICE_ACCOUNT)

Creating PipelineJob
PipelineJob created. Resource name: projects/721521243942/locations/us-central1/pipelineJobs/kfp-module-pipeline-20250305183059
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/721521243942/locations/us-central1/pipelineJobs/kfp-module-pipeline-20250305183059')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/kfp-module-pipeline-20250305183059?project=721521243942
PipelineJob projects/721521243942/locations/us-central1/pipelineJobs/kfp-module-pipeline-20250305183059 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/721521243942/locations/us-central1/pipelineJobs/kfp-module-pipeline-20250305183059 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/721521243942/locations/us-central1/pipelineJobs/kfp-module-pipeline-20250305183059 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/721521243942/locations/us-