In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lightweight KFP Pipelines
* In this tutorial, you learn how to use the KFP SDK to build lightweight Python function-based components, and then how to run the resulting pipeline on Vertex AI Pipelines.
* This lab is a simplified version of the original notebook [Lightweight kfp](https://colab.sandbox.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/lightweight_functions_component_io_kfp.ipynb).

## Install Vertex AI SDK for Python and other required packages

In [2]:
# Install the Vertex AI SDK, the Cloud Storage client, the KFP SDK and the
# Google Cloud pipeline components into the user site-packages (--user).
# NOTE(review): numpy is capped below 2.0, presumably for compatibility with
# one of the other packages — confirm before lifting the pin.
%pip install --user --quiet google-cloud-aiplatform \
                                 google-cloud-storage \
                                 kfp \
                                 "numpy<2" \
                                 google-cloud-pipeline-components


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
# @title Authentication to GCP
import sys

# Interactive authentication is only attempted when the google.colab module
# is already loaded; otherwise this cell does nothing.
# NOTE(review): outside Colab, credentials are presumably expected to be
# configured already (e.g. application default credentials) — confirm.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()

In [5]:
# @title Set GCP information
# Project, region and bucket used by the later cells; edit them to match
# your own environment before running the notebook.
PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
# Bucket names are globally unique, so point this at a bucket you own.
BUCKET_URI = "gs://mlops-0221"  # @param {type:"string"}

In [6]:
# @title Create a bucket.

!gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://mlops-0221/...
ServiceException: 409 A Cloud Storage bucket named 'mlops-0221' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [7]:
# @title Service account
shell_output = ! gcloud projects describe  $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")

SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

print(f"SERVICE_ACCOUNT: {SERVICE_ACCOUNT}")

SERVICE_ACCOUNT: 721521243942-compute@developer.gserviceaccount.com


In [8]:
!gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI
!gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

No changes made to gs://mlops-0221/
No changes made to gs://mlops-0221/


In [43]:
# @title Import libraries
from typing import NamedTuple

import kfp
from google.cloud import aiplatform
from kfp import compiler, dsl
from kfp.dsl import (Artifact, 
                     Dataset, 
                     Input, 
                     InputPath, 
                     Model, 
                     Output,
                     OutputPath, 
                     component)

In [44]:
# @title Pipelines constants
# Cloud Storage prefix under BUCKET_URI that is passed as `pipeline_root`
# both to the pipeline definition and to the PipelineJob.
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline/shakespeare"

In [45]:
# @title Initialize Vertex AI SDK
# Pass the region explicitly so that Vertex AI resources are created in
# LOCATION (the region the bucket is created in) rather than in the SDK's
# default region, which would silently diverge if LOCATION were changed.
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

In [46]:
# @title Define Python function-based pipeline components
@component(base_image="python:3.9")
def preprocess(
    message: str,
    out_dataset1: Output[Dataset],
    out_dataset2: Output[Dataset],
    out_param_path: OutputPath(str),
):
    """Mock preprocessing step.

    Tags each output dataset with one metadata entry, then writes ``message``
    unchanged to both dataset artifacts and to the output parameter file.

    Args:
        message: Text copied into every output.
        out_dataset1: Dataset artifact; receives the metadata entry ``hello`` = ``there``.
        out_dataset2: Dataset artifact; receives the metadata entry ``world`` = ``OK``.
        out_param_path: Path of the file that carries the string output parameter.
    """
    out_dataset1.metadata["hello"] = "there"
    out_dataset2.metadata["world"] = "OK"

    # The same payload goes to all three destinations, in declaration order.
    for destination in (out_dataset1.path, out_dataset2.path, out_param_path):
        with open(destination, "w") as sink:
            sink.write(message)
        

In [47]:
# @title Define train component

@component(base_image="python:3.9")
def train(
    message: str,
    in_dataset1: Input[Dataset],
    in_dataset2: Input[Dataset],
    imported_dataset: Input[Dataset],
    model: Output[Model],
    num_steps: int = 3,
) -> NamedTuple(
    "Outputs",
    [
        ("output_message", str),  # Return parameter.
        ("generic_artifact", Artifact),  # Return generic Artifact.
    ],
):
    """Mock training step.

    Reads and prints both input datasets, writes a placeholder model file with
    a few metadata entries attached, and returns a repeated message together
    with the concatenated dataset contents.

    Args:
        message: Text repeated to build ``output_message``.
        in_dataset1: First dataset artifact to read.
        in_dataset2: Second dataset artifact to read.
        imported_dataset: Dataset artifact accepted as an input; its contents
            are not read by this step.
        model: Output model artifact; receives the text ``My Model`` and the
            ``accuracy``, ``framework`` and ``time_to_train_in_seconds`` metadata.
        num_steps: How many times ``message`` is repeated.

    Returns:
        A tuple ``(output_message, generic_artifact)``: ``message`` repeated
        ``num_steps`` times separated by single spaces, and the two dataset
        contents joined by a newline.
    """
    with open(in_dataset1.path) as source:
        first_text = source.read()
    print(f"read_in_dataset1 : {first_text}")

    with open(in_dataset2.path) as source:
        second_text = source.read()
    print(f"read_in_dataset2 : {second_text}")

    # Placeholder payload: no real training happens in this mock step.
    with open(model.path, "w") as model_file:
        model_file.write("My Model")
    print("Model Saved:", model)

    model.metadata["accuracy"] = 0.9
    model.metadata["framework"] = "Tensorflow"
    model.metadata["time_to_train_in_seconds"] = 257

    repeated_message = " ".join([message] * num_steps)
    combined_datasets = "\n".join((first_text, second_text))

    return (repeated_message, combined_datasets)


In [48]:
# @title Define read_artifact_input component
@component(base_image="python:3.9")
def read_artifact_input(
    generic: Input[Artifact],
):
    """Print the contents of a generic artifact.

    Args:
        generic: Artifact whose file is read as text and echoed to stdout.
    """
    with open(generic.path) as artifact_file:
        print(f"generic contents: {artifact_file.read()}")

In [49]:
# @title Define a pipeline that uses your components and the Importer
@dsl.pipeline(
    # A name for the pipeline. Use to determine the pipeline Context.
    name="metadata-pipeline-v2",
    # Default pipeline root. You can override it when submitting the pipeline.
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(message: str):
    """Chain the importer, preprocess, train and read_artifact_input steps.

    Args:
        message: Text handed to the preprocessing step.
    """
    # https://www.kubeflow.org/docs/components/pipelines/user-guides/components/importer-component/
    shakespeare_import = dsl.importer(
        artifact_uri="gs://ml-pipeline-playground/shakespeare1.txt",
        artifact_class=Dataset,
        reimport=False,
    )

    prep = preprocess(message=message)
    prep_out = prep.outputs

    trained = train(
        message=prep_out["out_param_path"],
        in_dataset1=prep_out["out_dataset1"],
        in_dataset2=prep_out["out_dataset2"],
        imported_dataset=shakespeare_import.output,
        num_steps=5,
    )

    # The returned task handle is not used afterwards, so it is not bound to a name.
    read_artifact_input(generic=trained.outputs["generic_artifact"])

In [52]:
# @title Compile the pipeline
# Compile the `pipeline` function into the package file lightweight_kfp.yaml
# (relative path); the run cell loads this same file as its template.
compiler.Compiler().compile(
    pipeline_func=pipeline, package_path="lightweight_kfp.yaml"
)

In [53]:
# @title Run the pipeline
# Display name given to the PipelineJob below.
DISPLAY_NAME = "shakespeare"

# Build the job from the compiled spec. `parameter_values` supplies the
# pipeline's `message` argument, and caching is turned off for this job.
job = aiplatform.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path="lightweight_kfp.yaml",
    pipeline_root=PIPELINE_ROOT,
    parameter_values={"message": "Hello, World"},
    enable_caching=False,
)

# Submit the job; the recorded output shows its state being reported
# repeatedly while the pipeline is running.
job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/721521243942/locations/us-central1/pipelineJobs/metadata-pipeline-v2-20250220140150
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/721521243942/locations/us-central1/pipelineJobs/metadata-pipeline-v2-20250220140150')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/metadata-pipeline-v2-20250220140150?project=721521243942
PipelineJob projects/721521243942/locations/us-central1/pipelineJobs/metadata-pipeline-v2-20250220140150 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/721521243942/locations/us-central1/pipelineJobs/metadata-pipeline-v2-20250220140150 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/721521243942/locations/us-central1/pipelineJobs/metadata-pipeline-v2-20250220140150 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/721521243942/locatio

In [None]:
# @title Delete the pipeline job
# Delete the PipelineJob held in `job`; the variable must still be defined
# in this kernel from the run cell.
job.delete()

INFO:google.cloud.aiplatform.base:Deleting PipelineJob : projects/721521243942/locations/us-central1/pipelineJobs/metadata-pipeline-v2-20250102224741
INFO:google.cloud.aiplatform.base:PipelineJob deleted. . Resource name: projects/721521243942/locations/us-central1/pipelineJobs/metadata-pipeline-v2-20250102224741
INFO:google.cloud.aiplatform.base:Deleting PipelineJob resource: projects/721521243942/locations/us-central1/pipelineJobs/metadata-pipeline-v2-20250102224741
INFO:google.cloud.aiplatform.base:Delete PipelineJob backing LRO: projects/721521243942/locations/us-central1/operations/1685094924975865856
INFO:google.cloud.aiplatform.base:PipelineJob resource projects/721521243942/locations/us-central1/pipelineJobs/metadata-pipeline-v2-20250102224741 deleted.


In [None]:
# @title Cleaning up
delete_bucket = False

if delete_bucket:
    ! gsutil rm -r $BUCKET_URI

! rm lightweight_pipeline.yaml