In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# ML Metadata on Vertex AI


## Install Vertex AI SDK

In [10]:
# Install (or upgrade) the Vertex AI SDK, the Cloud Storage client, the Google Cloud
# Pipeline Components and the Kubeflow Pipelines (KFP) SDK into the user site-packages.
# NOTE(review): versions are not pinned, so a re-run may pull newer releases than the
# ones printed by the version check in the next cell.
%pip install --user --quiet --upgrade google-cloud-aiplatform \
                         google-cloud-storage \
                         google-cloud-pipeline-components \
                         kfp

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[0m

In [11]:
# Print the installed KFP and google_cloud_pipeline_components versions.
# Each check runs in a separate python3 process, not in the notebook kernel.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

KFP SDK version: 2.10.1
google_cloud_pipeline_components version: 2.19.0


## Configuration

### Authenticate your notebook environment

In [3]:
import sys
from IPython.display import Markdown, display

PROJECT_ID="ai-hangsik"
LOCATION="us-central1"

# For only colab user, no need this process for Colab Enterprise in Vertex AI.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id=PROJECT_ID)

# set project.
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


### Initialize Vertex AI SDK

In [28]:
import json
from typing import NamedTuple

from google.cloud import aiplatform
#from kfp import compiler, dsl

import kfp
from kfp.dsl import component
from google.cloud import aiplatform, bigquery



### Create a bucket

In [4]:
# Create a bucket.
BUCKET_URI = f"gs://mlops-0324"
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://mlops-0324/...


### Specifying a service account to use for a pipeline run

In [5]:
shell_output = ! gcloud projects describe  $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")

SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print(f"SERVICE_ACCOUNT: {SERVICE_ACCOUNT}")

SERVICE_ACCOUNT: 721521243942-compute@developer.gserviceaccount.com


### Grant access to the service account

In [6]:
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewerroles/logging.logWriter

In [34]:
import json
from typing import NamedTuple
from google.cloud import aiplatform

import kfp
from kfp import client,compiler, dsl

from kfp.dsl import Artifact, Metrics, Dataset, Input, Model, Output, component

from google.cloud import aiplatform

In [58]:
@dsl.component(base_image='python:3.10',
               packages_to_install=['google-cloud-aiplatform', 'pandas', 'scikit-learn']
               )
def preprocess_data(
    input_data_path: str,
    output_data_path: Output[Dataset],
    project: str,
    location: str,
):
    """Preprocess a CSV file and record the step in Vertex ML Metadata.

    Reads the CSV at ``input_data_path``, adds a ``processed_feature`` column
    (first column multiplied by 2), writes the result to the output dataset
    artifact and links that artifact to a metadata execution.

    Args:
        input_data_path: Location of the input CSV (a ``gs://`` URI or a local path).
        output_data_path: KFP output artifact that receives the processed CSV.
        project: Google Cloud project used to initialise the Vertex AI SDK.
        location: Vertex AI region used to initialise the SDK.

    Raises:
        Exception: Any failure is propagated so the pipeline step is marked as
            failed instead of silently succeeding without output.
    """
    import os

    import pandas as pd
    from google.cloud import aiplatform

    aiplatform.init(project=project, location=location)

    # Vertex AI Pipelines exposes Cloud Storage through Cloud Storage FUSE under /gcs/,
    # so a gs:// URI can be read as a local file without an extra GCS filesystem package.
    if input_data_path.startswith("gs://"):
        local_input_path = "/gcs/" + input_data_path[len("gs://"):]
    else:
        local_input_path = input_data_path

    # Used as a context manager, the execution is closed as COMPLETE on success and
    # FAILED on error, and the exception still propagates to the pipeline.
    with aiplatform.start_execution(
        schema_title="system.ContainerExecution",
        display_name="preprocess_execution",
        metadata={"input_path": input_data_path},
    ) as execution:
        # Simulate data preprocessing
        df = pd.read_csv(local_input_path)
        df['processed_feature'] = df.iloc[:, 0] * 2  # Example processing

        # The artifact object is not a string: .path is the local file location,
        # .uri is the storage URI recorded in metadata.
        os.makedirs(os.path.dirname(output_data_path.path), exist_ok=True)
        df.to_csv(output_data_path.path, index=False)

        processed_artifact = aiplatform.Artifact.create(
            schema_title="system.Dataset",
            display_name="artifacts_processed_data",
            uri=output_data_path.uri,
        )
        execution.assign_output_artifacts([processed_artifact])



In [60]:
@dsl.component(base_image='python:3.10',
               packages_to_install=['google-cloud-aiplatform', 'pandas', 'scikit-learn']

               )
def train_model(
    dataset_path: Input[Dataset],
    model_path: Output[Dataset],
    learning_rate: float,
    epochs: int,
    project: str,
    location: str
):
    """Simulate model training and record the step in Vertex ML Metadata.

    Creates a placeholder "model" (a dict holding a random number), stores it
    with joblib under the output artifact directory and links the stored model
    to a metadata execution that carries the hyperparameters.

    Args:
        dataset_path: KFP input artifact holding the preprocessed dataset.
        model_path: KFP output artifact whose path receives ``model.joblib``.
            NOTE(review): declared as ``Output[Dataset]`` although it stores a
            model; kept unchanged to preserve the component interface.
        learning_rate: Learning rate recorded in the execution metadata.
        epochs: Number of epochs recorded in the execution metadata.
        project: Google Cloud project used to initialise the Vertex AI SDK.
        location: Vertex AI region used to initialise the SDK.

    Raises:
        Exception: Any failure is propagated so the pipeline step is marked as
            failed instead of silently succeeding without a model.
    """
    import os
    import random

    # joblib is not listed in packages_to_install; it is expected to arrive as a
    # dependency of scikit-learn.
    import joblib
    from google.cloud import aiplatform

    aiplatform.init(project=project, location=location)

    # Used as a context manager, the execution is closed as COMPLETE on success and
    # FAILED on error, and the exception still propagates to the pipeline.
    with aiplatform.start_execution(
        schema_title="system.ContainerExecution",
        display_name="training_execution",
        metadata={
            "learning_rate": learning_rate,
            "epochs": epochs,
            # Metadata values must be serialisable, so store the URI, not the artifact object.
            "dataset": dataset_path.uri,
        },
    ) as execution:
        # Simulate model training
        print(f"Training model with learning rate: {learning_rate}, epochs: {epochs}")
        model = {"trained_data": random.random()} # replace with real training

        # The artifact path is used as a directory that holds the serialised model.
        os.makedirs(model_path.path, exist_ok=True)
        joblib.dump(model, os.path.join(model_path.path, "model.joblib"))

        trained_model_artifact = aiplatform.Artifact.create(
            schema_title="system.Model",
            display_name="artifact_trained_model",
            uri=model_path.uri,
        )
        execution.assign_output_artifacts([trained_model_artifact])

In [61]:
# Pipeline names must be lowercase alphanumerics and dashes; the previous name
# contained spaces and capitals.
@dsl.pipeline(name="data-preprocessing-and-model-training", pipeline_root=BUCKET_URI)
def my_pipeline(project: str,
                location: str,
                initial_data_path:str,
                learning_rate: float = 0.01,
                epochs: int = 10):
    """Preprocess the input CSV, then train a model on the processed dataset.

    Args:
        project: Google Cloud project passed to both components.
        location: Vertex AI region passed to both components.
        initial_data_path: Location of the raw CSV consumed by preprocessing.
        learning_rate: Learning rate forwarded to the training component.
        epochs: Number of epochs forwarded to the training component.
    """
    preprocess_task = preprocess_data(
        input_data_path=initial_data_path,
        project=project,
        location=location,
    )

    # Consuming the preprocessing output makes training run after preprocessing.
    train_model(
        dataset_path=preprocess_task.outputs["output_data_path"],
        learning_rate=learning_rate,
        epochs=epochs,
        project=project,
        location=location,
    )

# Compile the pipeline into the JSON template consumed by aiplatform.PipelineJob.
compiler.Compiler().compile(
        pipeline_func=my_pipeline, package_path="my_pipeline.json"
    )

In [62]:
PROJECT_ID = "ai-hangsik"
REGION = "us-central1"
initial_data_path = "gs://mlops-0324/training_data/iris.csv"

# aiplatform.init() is never called in this notebook, so the project and region are
# passed explicitly instead of relying on environment defaults.
pipeline_job = aiplatform.PipelineJob(
    display_name="metadata",
    template_path="my_pipeline.json",
    pipeline_root=BUCKET_URI,
    parameter_values={
        # Reuse the variable above rather than repeating the literal.
        "initial_data_path": initial_data_path,
        "project": PROJECT_ID,
        "location": REGION,
    },
    enable_caching=True,
    project=PROJECT_ID,
    location=REGION,
)

# Run with the service account that was granted bucket and logging access above.
pipeline_job.run(service_account=SERVICE_ACCOUNT)

In [None]:
@component(base_image="python:3.10")
def get_input(text: str) -> str:
    """Print the received text and return it unchanged."""
    message = f"### input: {text}"
    print(message)
    return text


# Write the component definition to a YAML file.
compiler.Compiler().compile(pipeline_func=get_input, package_path="get_input.yaml")

# You can load from yaml file in the pipeline.
# loaded_component = components.load_component_from_file('./get_input.yaml')

### Define *translation* component

In [None]:
@component(base_image="python:3.10",
           packages_to_install=["google-cloud-translate"])
def translate(
    text: str,
    project:str,

) -> NamedTuple(
    "Outputs",
    [
        ("output_1", str),
        ("output_2", str),
    ],
):
    """Translate the text twice with the Cloud Translation v3 client.

    Args:
        text: Text to translate.
        project: Google Cloud project used to build the Translation API parent.

    Returns:
        A pair ``(output_1, output_2)`` holding the labelled "en" and "ja"
        translations.
    """
    from google.cloud import translate_v3

    client = translate_v3.TranslationServiceClient()
    parent = f"projects/{project}/locations/global"

    def _first_translation(source_code, target_code):
        # One Translation API call; return the text of the first translation.
        response = client.translate_text(
            contents=[text],
            source_language_code=source_code,
            target_language_code=target_code,
            parent=parent,
            mime_type="text/plain",
        )
        return response.translations[0].translated_text

    english_text = _first_translation("ko-KR", "en")
    japanese_text = _first_translation("ko", "ja")

    o1 = f"translation 1: {english_text}"
    o2 = f"translation 2: {japanese_text}"

    print(f"### output 1: {o1}; output 2: {o2}")

    return (o1, o2)


# Write the component definition to a YAML file.
compiler.Compiler().compile(pipeline_func=translate, package_path="translate.yaml")


### Define *collect* component

In [None]:
@component(base_image="python:3.10")
def collect(original: str, tran_output_1: str, tran_output_2: str) -> str:
    """Log the original text and both translations, and return them as one string.

    Args:
        original: The untranslated input text.
        tran_output_1: First translation output.
        tran_output_2: Second translation output.

    Returns:
        A single string combining the original text and the two translations.
    """
    import logging

    root_logger = logging.getLogger()

    log_lines = (
        f"### original: {original}",
        f"### tran_output_1: {tran_output_1}",
        f"### translation_2: {tran_output_2}",
    )
    for line in log_lines:
        root_logger.info(line)

    return f"original: {original}; translation_1: {tran_output_1}; translation_2: {tran_output_2}"


# Write the component definition to a YAML file.
compiler.Compiler().compile(pipeline_func=collect, package_path="collect.yaml")


### Define a pipeline that uses the components

In [None]:
# PIPELINE_ROOT is not defined anywhere earlier in the notebook; derive it from the
# bucket created above so this cell also works on a fresh kernel.
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root"


@dsl.pipeline(
    name="translation-pipeline",
    description="pipeline to translate and collect",
    pipeline_root=PIPELINE_ROOT,
)
def translation_pipeline(text: str,
                         project: str,):
    """Pass the text through get_input, translate it, and collect the results.

    Args:
        text: Text to translate.
        project: Google Cloud project forwarded to the translate component.
    """
    input_text = get_input(text=text)
    translated_texts = translate(text=input_text.output, project=project)
    collect(
        original=input_text.output,
        tran_output_1=translated_texts.outputs["output_1"],
        tran_output_2=translated_texts.outputs["output_2"],
    )

## Compile the pipeline

In [None]:
# Serialise the translation pipeline into the JSON template used by PipelineJob.
compiler.Compiler().compile(
    pipeline_func=translation_pipeline,
    package_path="translation_pipeline.json",
)

## Run the pipeline

### Run a pipeline

In [None]:

# Korean input text handed to the pipeline.
text = "머신러닝에서 파이프라인을 만들기 위한 방법들이 무엇이 있나요 ?"

# Build the job from the compiled template, then run it with the pipeline service account.
job = aiplatform.PipelineJob(
    display_name="translation_pipeline",
    template_path="translation_pipeline.json",
    parameter_values={
        "text": text,
        "project": PROJECT_ID,
    },
    pipeline_root=PIPELINE_ROOT,
)

job.run(service_account=SERVICE_ACCOUNT)

Creating PipelineJob
PipelineJob created. Resource name: projects/721521243942/locations/us-central1/pipelineJobs/translation-pipeline-20250221055257
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/721521243942/locations/us-central1/pipelineJobs/translation-pipeline-20250221055257')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/translation-pipeline-20250221055257?project=721521243942
PipelineJob projects/721521243942/locations/us-central1/pipelineJobs/translation-pipeline-20250221055257 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/721521243942/locations/us-central1/pipelineJobs/translation-pipeline-20250221055257 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/721521243942/locations/us-central1/pipelineJobs/translation-pipeline-20250221055257 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/721521243942/locatio

### Enable caching

In [None]:
# Same input text as the previous run.
text = "머신러닝에서 파이프라인을 만들기 위한 방법들이 무엇이 있나요 ?"

# Identical job definition, but with enable_caching switched on.
job = aiplatform.PipelineJob(
    display_name="translation_pipeline",
    template_path="translation_pipeline.json",
    parameter_values={
        "text": text,
        "project": PROJECT_ID,
    },
    pipeline_root=PIPELINE_ROOT,
    enable_caching=True,
)

job.run(service_account=SERVICE_ACCOUNT)