In [None]:
# Copyright 2024 Forusone (shins777@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Kubeflow basics


### Install Python packages


In [None]:
! pip3 install --user --upgrade --quiet google-cloud-aiplatform \
                                 google-cloud-storage \
                                 kfp \
                                 "numpy<2" \
                                 google-cloud-pipeline-components

  Preparing metadata (setup.py) ... done
  Preparing metadata (setup.py) ... done


### Set configuration

#### Authenticate to access GCP

In [None]:
import sys
from IPython.display import Markdown, display

# Google Cloud project and region used by the cells below.
PROJECT_ID="ai-hangsik"
LOCATION="asia-northeast3"

# Only plain Colab needs an explicit login; this step is not required on
# Colab Enterprise in Vertex AI.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id=PROJECT_ID)

# Make PROJECT_ID the active project (core/project) for gcloud commands.
!gcloud config set project {PROJECT_ID}


Updated property [core/project].


#### Create a bucket

In [None]:
# Create a bucket.
BUCKET_URI = f"gs://mlops-{PROJECT_ID}-unique"
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://mlops-ai-hangsik-unique/...


In [None]:
# Pipeline artifacts are stored beneath this prefix of the staging bucket.
PIPELINE_ROOT = BUCKET_URI + "/pipeline_root/forusone"

#### Initialize Vertex AI

In [None]:
from typing import NamedTuple

import kfp
from google.cloud import aiplatform
from kfp import compiler, dsl
from kfp.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                     OutputPath, component)

# Initialize the Vertex AI SDK. The region is passed explicitly so that
# resources are created in LOCATION (the region the staging bucket was created
# in) instead of the SDK's default region.
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)


### Set pre-built containers

Set the pre-built Docker container image for training and prediction.


*  For the latest list, see [Pre-built containers for training](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers).


*  For the latest list, see [Pre-built containers for prediction](https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers).

In [None]:
# Iris sample CSV files (data and target) in the cloud-samples-data bucket.
IRIS_DATA_FILENAME = "gs://cloud-samples-data/ai-platform/iris/iris_data.csv"
IRIS_TARGET_FILENAME = "gs://cloud-samples-data/ai-platform/iris/iris_target.csv"

# Disabled, kept for reference: resolves a pre-built XGBoost 1.1 CPU container.
# NOTE(review): the helper returns a *prediction* container URI although the
# variable is named TRAIN_IMAGE - confirm which image is intended.
# TRAIN_IMAGE = aiplatform.helpers.get_prebuilt_prediction_container_uri(
#     framework="xgboost", framework_version="1.1", accelerator="cpu"
# )
# TRAIN_IMAGE

'us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-1:latest'

In [None]:
import pandas as pd
import xgboost as xgb

@component(
    base_image="python:3.8",
    packages_to_install=[
        "numpy==1.18.5",
        "pandas==1.0.4",
        "scikit-learn==0.23.1",
        "xgboost==1.1.1",
    ],
)
def get_data(
    train_path: str,
    label_path: str,
    matrix: Output[Dataset],
    train: Output[Dataset],
    label: Output[Dataset],
):
    """Load the data, split it, and publish the pieces as Dataset artifacts.

    A KFP component cannot declare ``xgb.DMatrix`` or ``pd.DataFrame`` as
    return types (doing so raises "Artifacts must have both a schema_title and
    a schema_version" when the decorator runs), so the three results are
    written to output artifacts instead. The output names ``matrix``,
    ``train`` and ``label`` are kept from the previous return signature.

    Args:
        train_path: Path of the CSV file with the feature data.
        label_path: Path of the CSV file with the labels.
        matrix: Output artifact that receives the training ``DMatrix`` in
            XGBoost's binary format.
        train: Output artifact that receives the held-out feature rows as CSV.
            (Despite its name it carries the 20% test split, matching what
            the original code returned in this position.)
        label: Output artifact that receives the held-out labels as CSV.
    """
    import pandas as pd
    import xgboost as xgb
    from sklearn.model_selection import train_test_split

    # NOTE(review): reading gs:// URIs with pandas needs `gcsfs` inside the
    # container; it is not in packages_to_install - confirm, or pass paths
    # that are readable without it.
    # Load data into pandas, then use `.values` to get NumPy arrays.
    data = pd.read_csv(train_path).values
    labels = pd.read_csv(label_path).values

    # Convert the one-column 2D label array into a 1D array for XGBoost.
    labels = labels.reshape((labels.size,))
    train_data, test_data, train_labels, test_labels = train_test_split(
        data, labels, test_size=0.2, random_state=7
    )

    # Build the training DMatrix and persist it at the artifact's path.
    dtrain = xgb.DMatrix(train_data, label=train_labels)
    dtrain.save_binary(matrix.path)

    # Persist the held-out split. A header row is written so that a consumer
    # using `pd.read_csv(path).values` gets the same rows back.
    pd.DataFrame(test_data).to_csv(train.path, index=False)
    pd.DataFrame(test_labels).to_csv(label.path, index=False)

TypeError: Artifacts must have both a schema_title and a schema_version, separated by `@`. Got: DMatrix

In [None]:

@component(base_image="python:3.9")
def preprocessing(text: str) -> str:
    """Prefix ``text`` with the current local time.

    Args:
        text: Message to stamp.

    Returns:
        The string ``"<YYYY-MM-DD HH:MM:SS>:<text>"``, where the timestamp is
        the local time at the moment the component runs.
    """
    import time

    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    return f"{stamp}:{text}"

# Serialize the component definition to a YAML file.
compiler.Compiler().compile(
    pipeline_func=preprocessing, package_path="preprocessing_comp.yaml"
)

In [None]:

@component(packages_to_install=["google-cloud-aiplatform"])
def executing(
    text: str,
) -> NamedTuple(
    "Outputs",
    [
        ("output_one", str),  # Return parameters
        ("output_two", str),
    ],
):
    """Derive two strings from ``text``, print them, and return both.

    Args:
        text: Input text embedded in both output strings.

    Returns:
        A 2-tuple ``(output_one, output_two)`` of strings.
    """
    # Deliberately unused: the import only shows that a package made available
    # through `packages_to_install` can be imported inside the component
    # (presumably google-cloud-storage arrives as a dependency of
    # google-cloud-aiplatform - confirm).
    from google.cloud import storage  # noqa: F401

    first, second = (
        f"output {ordinal} from text: {text}" for ordinal in ("one", "two")
    )
    print(f"output one: {first}; output_two: {second}")
    return (first, second)