In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License. 

## KFP modularization

This notebook explains how to modularize a Kubeflow pipeline by building it from compiled component files.

In [1]:
# Install the packages
%pip install --user --quiet  google-cloud-aiplatform \
                             google-cloud-storage \
                             google-cloud-pipeline-components \
                             kfp


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Print the installed KFP and google_cloud_pipeline_components versions.
# Each check runs in a separate python3 process started from the shell,
# not inside the notebook kernel.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

KFP SDK version: 2.10.1
google_cloud_pipeline_components version: 2.19.0


### Configuration

In [3]:
import sys
from IPython.display import Markdown, display

# Google Cloud project and region used by the cells below.
PROJECT_ID="ai-hangsik"
LOCATION="us-central1"

# Explicit sign-in is only needed on Colab; it is skipped whenever the
# google.colab module is not loaded (e.g. Colab Enterprise in Vertex AI).
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id=PROJECT_ID)

# Point the gcloud CLI at the same project.
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


### Initialize Vertex AI

In [4]:
import json
from typing import NamedTuple, List

from google.cloud import aiplatform

# NOTE(review): the second kfp import below already brings in compiler and dsl,
# so this first line is redundant.
from kfp import compiler, dsl
from kfp import client,compiler, dsl
from kfp.dsl import Artifact, Metrics, Dataset, Input, Model, Output, component

# Initialize the Vertex AI SDK with the configured project and region.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

### Create a bucket for pipeline root to store artifacts

In [5]:
# Create a bucket.
BUCKET_URI = f"gs://mlops-poc-0303"
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://mlops-poc-0303/...
ServiceException: 409 A Cloud Storage bucket named 'mlops-poc-0303' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [6]:
# Cloud Storage prefix passed as the pipeline root when building and running the pipeline.
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline/iris/"

### Set access to the service account.

In [7]:
shell_output = ! gcloud projects describe  $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")

SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print(f"SERVICE_ACCOUNT: {SERVICE_ACCOUNT}")

SERVICE_ACCOUNT: 721521243942-compute@developer.gserviceaccount.com


In [8]:
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewerroles/logging.logWriter

No changes made to gs://mlops-poc-0303/
No changes made to gs://mlops-poc-0303/


## Modules

### Component 1 : Template to create dataset. 
Generate a YAML file including the code to create the dataset.

In [9]:
@dsl.component(base_image = "python:3.10",
               packages_to_install=['pandas'])
def create_dataset(
                data_args : dict,
                data_artifact : Output[Artifact],
                dataset: Output[Dataset]
):
    """Read a header-less CSV and publish it as a pipeline dataset.

    Args:
        data_args: Must contain 'csv_url' (a location pandas.read_csv can
            read) and 'col_names' (a string holding a Python list literal of
            column names, e.g. "['a', 'b']").
        data_artifact: Output artifact that receives data_args as JSON.
        dataset: Output dataset that receives the data as CSV with a header
            row and no row-index column.

    Raises:
        KeyError: If 'csv_url' or 'col_names' is missing from data_args.
    """
    import ast
    import json

    import pandas as pd

    csv_url = data_args['csv_url']

    # col_names arrives as a string; literal_eval parses it without executing code.
    col_names = ast.literal_eval(data_args['col_names'])

    # Write the data args with type of Artifact.
    with open(data_artifact.path, 'w') as f:
        f.write(json.dumps(data_args))

    df = pd.read_csv(csv_url, names=col_names)

    # Write the dataset with type of Dataset.
    # index=False: otherwise the row number is saved as an extra unnamed column,
    # which a later read_csv turns into a feature. Rows that are ordered by
    # label would then leak the label through that column.
    with open(dataset.path, 'w') as f:
        df.to_csv(f, index=False)

compiler.Compiler().compile(create_dataset, "create_dataset.yaml")

### Component 2 : Template to normalize dataset. 
Generate a YAML file including the code to normalize the dataset.

In [10]:
@dsl.component(base_image = "python:3.10",
               packages_to_install=['pandas', 'scikit-learn'])
def normalize_dataset(
    input_dataset: Input[Dataset],
    normalized_dataset: Output[Dataset],
    normalize_args:dict,
):
    """Scale every feature column of a CSV dataset and keep 'Labels' untouched.

    Args:
        input_dataset: CSV dataset with a header row and a 'Labels' column.
        normalized_dataset: Output dataset that receives the scaled features
            followed by the 'Labels' column, without a row-index column.
        normalize_args: Must contain 'standard_scaler' and 'min_max_scaler';
            the two flags are read by truthiness and exactly one must be true.

    Raises:
        ValueError: If both flags are true or both are false.
        KeyError: If a flag is missing or the dataset has no 'Labels' column.
    """
    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import StandardScaler

    standard_scaler = bool(normalize_args['standard_scaler'])
    min_max_scaler = bool(normalize_args['min_max_scaler'])

    # Compare truth values rather than object identity, so equal flags are
    # rejected whatever concrete type they arrive as.
    if standard_scaler == min_max_scaler:
        raise ValueError(
            'Exactly one of standard_scaler or min_max_scaler must be True.')

    with open(input_dataset.path) as f:
        df = pd.read_csv(f)

    # A CSV saved together with its row index comes back with an 'Unnamed: N'
    # column; drop it so the row number is never scaled and used as a feature.
    df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
    labels = df.pop('Labels')

    # Exactly one flag is true at this point, so a single choice is enough.
    scaler = StandardScaler() if standard_scaler else MinMaxScaler()

    df = pd.DataFrame(scaler.fit_transform(df))
    df['Labels'] = labels
    with open(normalized_dataset.path, 'w') as f:
        df.to_csv(f, index=False)

compiler.Compiler().compile(normalize_dataset, "normalize_dataset.yaml")

### Component 3 : Template to train a model
Generate a YAML file including the code to train a model.

In [11]:
@dsl.component(base_image="python:3.10",
               packages_to_install=['pandas', 'scikit-learn'])
def train_model(
    train_args:dict,
    normalized_dataset: Input[Dataset],
    model: Output[Model],
    metrics: Output[Metrics],

):
    """Train a k-nearest-neighbours classifier and export it with its metrics.

    Args:
        train_args: Must contain 'hyper_params' with 'in_test_size' (fraction
            of rows held out for evaluation) and 'n_neighbors'; each may be a
            number or a numeric string.
        normalized_dataset: CSV dataset with a header row and a 'Labels' column.
        model: Output model; the classifier is saved as 'model.joblib' inside
            the directory at model.path.
        metrics: Output metrics; receives 'accuracy' (fraction of correct
            predictions on the held-out rows), 'framework', 'dataset_size'
            (number of rows read) and, when it can be computed, 'AUC'.

    Raises:
        KeyError: If a hyper-parameter or the 'Labels' column is missing.
        ValueError: If a hyper-parameter cannot be parsed as a number.
    """
    import joblib
    import os
    import pandas as pd
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    hyper_params = train_args['hyper_params']
    # str() accepts both "0.2" and 0.2; going through float() also accepts a
    # neighbour count that arrives as 3.0 instead of "3".
    test_size = float(str(hyper_params['in_test_size']).strip())
    n_neighbors = int(float(str(hyper_params['n_neighbors']).strip()))

    with open(normalized_dataset.path) as f:
        df = pd.read_csv(f)

    # Drop a saved row index ('Unnamed: N') so it is not used as a feature.
    df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]

    y = df.pop('Labels')
    X = df

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=test_size)

    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(X_train, y_train)

    # Log values measured on the held-out split instead of fixed placeholders.
    metrics.log_metric("accuracy", float(clf.score(X_test, y_test)))
    metrics.log_metric("framework", "sklearn")
    metrics.log_metric("dataset_size", len(y))

    # ROC AUC needs at least two classes, all of them present in the held-out
    # split; when that does not hold the metric is left out.
    classes = list(clf.classes_)
    if len(classes) >= 2 and set(y_test) == set(classes):
        proba = clf.predict_proba(X_test)
        if len(classes) == 2:
            # Binary case: score with the probability of the second class.
            auc = roc_auc_score(y_test, proba[:, 1])
        else:
            auc = roc_auc_score(y_test, proba, multi_class="ovr", labels=classes)
        metrics.log_metric("AUC", float(auc))

    print(f"Metrics URI : {metrics.uri}")

    # Export the model to a file
    os.makedirs(model.path, exist_ok=True)
    joblib.dump(clf, os.path.join(model.path, "model.joblib"))

compiler.Compiler().compile(train_model, "train_model.yaml")

### Component 4 : Template to deploy a model
Generate a YAML file including the code to deploy a model.

In [12]:
@dsl.component(base_image="python:3.10",
               packages_to_install=['google-cloud-aiplatform', 'pandas', 'scikit-learn'])
def deploy_model(
    deploy_args:dict,
    model: Input[Model],
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model],
):
    """Upload the trained model to Vertex AI and deploy it to an endpoint.

    Args:
        deploy_args: Must contain 'project_id', 'display_name',
            'container_image_uri' and 'machine_type'.
        model: Input model whose uri is used as the upload artifact location.
        vertex_endpoint: Output artifact; its uri is set to the endpoint's
            resource name.
        vertex_model: Output model; its uri is set to the uploaded model's
            resource name.
    """
    from google.cloud import aiplatform

    # Read every setting before touching Vertex AI, so a missing key raises
    # before anything is uploaded.
    project, shown_name, serving_image, machine = [
        deploy_args[key]
        for key in ('project_id', 'display_name', 'container_image_uri', 'machine_type')
    ]

    aiplatform.init(project=project)

    uploaded = aiplatform.Model.upload(
        display_name=shown_name,
        artifact_uri=model.uri,
        serving_container_image_uri=serving_image,
    )
    serving_endpoint = uploaded.deploy(machine_type=machine)

    # Publish the created resources through the output artifacts.
    vertex_endpoint.uri = serving_endpoint.resource_name
    vertex_model.uri = uploaded.resource_name

compiler.Compiler().compile(deploy_model, "deploy_model.yaml")

### Build a pipeline
This wrapper function builds the pipeline from the component YAML files and compiles it.

In [13]:
def build_pipeline (pipeline_name:str,
                    pipeline_desc:str,
                    pipeline_root:str,
                    component_yaml_files : dict,
                    package_path:str = "kfp_module_pipeline.yaml",
                    ):
    """Assemble the four-step pipeline from component YAML files and compile it.

    Args:
        pipeline_name: Name given to the pipeline.
        pipeline_desc: Description given to the pipeline.
        pipeline_root: Pipeline root passed to dsl.pipeline.
        component_yaml_files: Paths of the component YAML files, keyed by
            'create_dataset_file', 'normalize_dataset_file',
            'train_model_file' and 'deploy_model_file'.
        package_path: File the compiled pipeline is written to. Defaults to
            "kfp_module_pipeline.yaml".

    Raises:
        KeyError: If component_yaml_files lacks one of the four keys.
    """
    from kfp import components

    # Load every component up front so a missing key or file is reported
    # before the pipeline definition is evaluated.
    create_dataset_comp = components.load_component_from_file(component_yaml_files['create_dataset_file'])
    normalize_dataset_comp = components.load_component_from_file(component_yaml_files['normalize_dataset_file'])
    train_model_comp = components.load_component_from_file(component_yaml_files['train_model_file'])
    deploy_model_comp = components.load_component_from_file(component_yaml_files['deploy_model_file'])

    @dsl.pipeline(
        name = pipeline_name,
        description = pipeline_desc,
        pipeline_root = pipeline_root,)
    def module_pipeline(
                        data_args: dict,
                        normalize_args:dict,
                        train_args:dict,
                        deploy_args:dict,

        ):

        # 1. create dataset
        create_dataset_task = create_dataset_comp(data_args=data_args)

        # 2. normalize dataset
        normalize_dataset_task = normalize_dataset_comp(
            input_dataset=create_dataset_task.outputs['dataset'],
            normalize_args = normalize_args,
        )

        # 3. model training
        train_model_task = train_model_comp(
            normalized_dataset=normalize_dataset_task.outputs['normalized_dataset'],
            train_args = train_args
        )

        # 4. deploy model
        deploy_model_comp(model= train_model_task.outputs['model'],
                          deploy_args = deploy_args,
                    )

    compiler.Compiler().compile(pipeline_func=module_pipeline, package_path=package_path)


### Compile the pipeline

In [14]:

# Compiled component files, keyed by the names build_pipeline() looks up.
component_yaml_files = dict(
    create_dataset_file='create_dataset.yaml',
    normalize_dataset_file='normalize_dataset.yaml',
    train_model_file='train_model.yaml',
    deploy_model_file='deploy_model.yaml',
)

# Assemble the pipeline from those files and compile it.
build_pipeline(
    pipeline_name="kfp_module_pipeline",
    pipeline_desc="desc for kfp_module_pipeline",
    pipeline_root=PIPELINE_ROOT,
    component_yaml_files=component_yaml_files,
)

### Run a pipeline

In [16]:

# Submit the compiled pipeline to Vertex AI Pipelines. Each top-level key of
# parameter_values matches one argument of the pipeline function.
job = aiplatform.PipelineJob(

    display_name="kfp_module_pipeline",
    template_path="kfp_module_pipeline.yaml",

    parameter_values = {
        'data_args' : {
            'csv_url': 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
            # Passed as a string; create_dataset parses it with ast.literal_eval.
            'col_names': "['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Labels']",
        },

        'normalize_args' : {
            'standard_scaler': False,
            'min_max_scaler': True,
        },

        'train_args' : {
            'hyper_params' : {
                'n_neighbors': "3",
                'in_test_size' : "0.2"
            },
        },

        'deploy_args' : {
            # Reuse the configured project instead of repeating its id as a
            # literal, so the deployment cannot drift from PROJECT_ID.
            'project_id' : PROJECT_ID,
            'display_name' : 'kfp_module_model',
            'container_image_uri': 'us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-5:latest',
            'machine_type': 'e2-standard-4',
        },

    },

    pipeline_root=PIPELINE_ROOT,
    enable_caching = True
)

# Run the job under the service account prepared earlier in the notebook.
job.run(service_account = SERVICE_ACCOUNT)

Creating PipelineJob
PipelineJob created. Resource name: projects/721521243942/locations/us-central1/pipelineJobs/kfp-module-pipeline-20250311015708
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/721521243942/locations/us-central1/pipelineJobs/kfp-module-pipeline-20250311015708')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/kfp-module-pipeline-20250311015708?project=721521243942
PipelineJob projects/721521243942/locations/us-central1/pipelineJobs/kfp-module-pipeline-20250311015708 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/721521243942/locations/us-central1/pipelineJobs/kfp-module-pipeline-20250311015708 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/721521243942/locations/us-central1/pipelineJobs/kfp-module-pipeline-20250311015708 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/721521243942/locations/us-

RuntimeError: Job failed with:
code: 9
message: " The DAG failed because some tasks failed. The failed tasks are: [train-model].; Job (project_id = ai-hangsik, job_id = 4658415400046821376) is failed due to the above error.; Failed to handle the job: {project_number = 721521243942, job_id = 4658415400046821376}"


In [75]:
import ast

# Scratch check: column names are handed to the pipeline as a string, so
# confirm that ast.literal_eval turns such a string into a real list.
a = "['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Labels']"
b = ast.literal_eval(a)

# The last expression is what the cell displays: the type of the parsed value.
type(b)


list