# Install KFP and db_dtypes

In [None]:
# Flag passed to pip below; "--user" installs into the per-user site-packages.
USER_FLAG = "--user"
# `!` runs the line in a shell; {USER_FLAG} is interpolated from the Python variable.
# The Kubeflow Pipelines SDK is pinned to 1.8.9.
!pip3 install {USER_FLAG} kfp==1.8.9
!pip3 install {USER_FLAG} db_dtypes

# Restart notebook kernel to load new modules

In [None]:
import os

# Freshly installed packages only become importable after the kernel restarts.
# The restart is skipped when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    import IPython

    # Passing True requests a restart rather than a plain shutdown.
    IPython.Application.instance().kernel.do_shutdown(True)

# Set the PROJECT_ID and BUCKET_NAME variables

In [None]:
PROJECT_ID = ""
# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print(f"Project ID: {PROJECT_ID}")
    
BUCKET_NAME="gs://" + PROJECT_ID + "-bucket"
print(f"Bucket Name: {BUCKET_NAME}")

# Set the PATH environment variable and the REGION & PIPELINE_ROOT variables

In [None]:
# Read the current PATH through the %env magic, then extend it with the
# per-user bin directory (presumably where `pip install --user` places
# command-line entry points - confirm for your environment).
PATH=%env PATH
%env PATH={PATH}:/home/jupyter/.local/bin
REGION="us-central1"
# Pipeline root is a folder inside the bucket defined earlier.
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root/"
# Bare expression so the notebook displays the value as the cell output.
PIPELINE_ROOT

# LAB BEGINS HERE
Import required modules

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from kfp.v2 import compiler, dsl
from kfp.v2.dsl import pipeline, component, Artifact, Dataset, Input, Metrics, Model, Output, InputPath, OutputPath
from google.cloud import aiplatform
# We'll use this namespace for metadata querying
from google.cloud import aiplatform_v1

# Pipeline Step 1:  [Component] Load data from BQ
- Extracts training data from the BigQuery table passed as input to the component
- Loads the data into a pandas DataFrame
- Outputs the data from the component as a CSV file

In [None]:
@component(
    packages_to_install=["google-cloud-bigquery", "pandas", "pyarrow", "db_dtypes"],
    base_image="python:3.9",
    # output_component_file="create_dataset.yaml"
    )
def get_dataframe(
    bq_table: str,
    output_data_path: OutputPath("Dataset")
    ):
    """Export a BigQuery table to a shuffled CSV file.

    Args:
        bq_table: Table ID string parsed by ``TableReference.from_string``;
            it must include the project (``project.dataset.table``).
        output_data_path: Path the CSV is written to.
    """
    # Imported inside the function body, alongside the packages listed in
    # packages_to_install on the decorator.
    from google.cloud import bigquery

    table = bigquery.TableReference.from_string(bq_table)
    # Use the project named in the table ID instead of a hand-edited
    # placeholder string, so the component works without manual changes.
    bqclient = bigquery.Client(project=table.project)
    rows = bqclient.list_rows(table)
    dataframe = rows.to_dataframe(create_bqstorage_client=True)
    # Shuffle all rows; the fixed seed keeps the order reproducible.
    dataframe = dataframe.sample(frac=1, random_state=2)
    # NOTE(review): to_csv also writes the DataFrame index as the first,
    # unnamed column, so readers of this file see one extra column -
    # confirm this is intended.
    dataframe.to_csv(output_data_path)

# Pipeline Step 2:  [Component] Train Scikit-learn model
- Takes the CSV data from step 1 as input
- Trains a scikit-learn decision tree model
- Outputs the trained model

In [None]:
@component(
    # "scikit-learn" is the real PyPI distribution; the old "sklearn" alias
    # is deprecated and no longer installs.
    # NOTE(review): the version is unpinned while deploy_model serves with a
    # "sklearn-cpu.0-24" container - confirm the versions are compatible.
    packages_to_install=["scikit-learn", "pandas", "joblib"],
    base_image="python:3.9",
    output_component_file="beans_model_component.yaml",
)
def sklearn_train(
    dataset: Input[Dataset],
    metrics: Output[Metrics],
    model: Output[Model]
):
    """Train a decision tree classifier on the CSV dataset and save it.

    Args:
        dataset: Input artifact whose ``path`` is a CSV file containing a
            "Class" label column; every other column is used as a feature.
        metrics: Output artifact that receives "accuracy" (as a percentage),
            "framework" and "dataset_size".
        model: Output artifact; the fitted model is written to
            ``model.path + ".joblib"``.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split
    from joblib import dump
    import pandas as pd

    df = pd.read_csv(dataset.path)
    # pop() removes the label column from df, leaving only the features.
    labels = df.pop("Class").tolist()
    data = df.values.tolist()
    x_train, x_test, y_train, y_test = train_test_split(data, labels)

    skmodel = DecisionTreeClassifier()
    skmodel.fit(x_train, y_train)
    score = skmodel.score(x_test, y_test)
    print('accuracy is:', score)

    metrics.log_metric("accuracy", (score * 100.0))
    metrics.log_metric("framework", "Scikit Learn")
    metrics.log_metric("dataset_size", len(df))
    dump(skmodel, model.path + ".joblib")

# Pipeline Step 3:  [Component] Upload & Deploy model to Vertex AI
- Takes the model from step 2 as input
- Uploads the model to Vertex AI
- Deploys the model to a Vertex AI endpoint

In [None]:
@component(
    packages_to_install=["google-cloud-aiplatform"],
    base_image="python:3.9",
    output_component_file="beans_deploy_component.yaml",
)
def deploy_model(
    model: Input[Model],
    project: str,
    region: str,
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model]
):
    """Upload the trained model to Vertex AI and deploy it to an endpoint.

    Args:
        model: Input model artifact; the directory containing it is used as
            the upload's ``artifact_uri``.
        project: Project passed to ``aiplatform.init``.
        region: Location passed to ``aiplatform.init``.
        vertex_endpoint: Output artifact whose ``uri`` is set to the
            endpoint's resource name.
        vertex_model: Output artifact whose ``uri`` is set to the uploaded
            model's resource name.
    """
    from google.cloud import aiplatform

    aiplatform.init(project=project, location=region)

    # Drop only the final path segment of the artifact URI to get its parent
    # directory (with a trailing slash). A plain str.replace("model", "")
    # would also strip "model" from bucket, project or job names in the URI.
    artifact_dir = model.uri.rpartition("/")[0] + "/"

    uploaded_model = aiplatform.Model.upload(
        display_name="beans-model-pipeline",
        artifact_uri=artifact_dir,
        serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-24:latest"
    )
    endpoint = uploaded_model.deploy(machine_type="n1-standard-4")

    # Save data to the output params
    vertex_endpoint.uri = endpoint.resource_name
    vertex_model.uri = uploaded_model.resource_name

# Pipeline Definition: Create a pipeline from the 3 components

In [None]:
# Use the decorator through the `dsl` module: the function below is itself
# named `pipeline` and rebinds that name, so a bare `@pipeline(...)` would
# call this function instead of the decorator when the cell is re-run.
@dsl.pipeline(
    # Default pipeline root. You can override it when submitting the pipeline.
    pipeline_root=PIPELINE_ROOT,
    # A name for the pipeline.
    name="mlmd-pipeline",
)
def pipeline(
    bq_table: str = "",
    output_data_path: str = "data.csv",
    project: str = PROJECT_ID,
    region: str = REGION
):
    """Chain the three components: load data, train the model, deploy it.

    Args:
        bq_table: BigQuery table ID passed to ``get_dataframe``.
        output_data_path: Not referenced in the pipeline body; kept so the
            pipeline's parameter list stays unchanged.
        project: Project passed to ``deploy_model``.
        region: Region passed to ``deploy_model``.
    """
    dataset_task = get_dataframe(bq_table)
    model_task = sklearn_train(dataset_task.output)
    # sklearn_train has two outputs, so the model is selected by name.
    deploy_model(
        model=model_task.outputs["model"],
        project=project,
        region=region
    )

# Compile pipeline to JSON

In [None]:
# Compile the pipeline function and write the result to mlmd_pipeline.json.
compiler.Compiler().compile(
    package_path="mlmd_pipeline.json",
    pipeline_func=pipeline,
)

# Execute Pipeline Runs

In [None]:
from datetime import datetime

# Second-resolution timestamp (YYYYMMDDHHMMSS) taken from the local clock.
TIMESTAMP = format(datetime.now(), "%Y%m%d%H%M%S")
print(TIMESTAMP)

# Create Pipeline Job 1: Small Dataset Table

In [None]:
# Job definition for the small table; TIMESTAMP is embedded in the job ID.
run1 = aiplatform.PipelineJob(
    job_id="mlmd-pipeline-small-{0}".format(TIMESTAMP),
    display_name="mlmd-pipeline",
    template_path="mlmd_pipeline.json",
    parameter_values={
        "bq_table": "{0}.beans.dry_bean_tbl_small".format(PROJECT_ID),
    },
    enable_caching=True,
)

# Create Pipeline Job 2:  Full Dataset Table

In [None]:
# Job definition for the full table; same compiled template, different input.
run2 = aiplatform.PipelineJob(
    job_id="mlmd-pipeline-large-{0}".format(TIMESTAMP),
    display_name="mlmd-pipeline",
    template_path="mlmd_pipeline.json",
    parameter_values={
        "bq_table": "{0}.beans.dry_bean_tbl".format(PROJECT_ID),
    },
    enable_caching=True,
)

# Execute Jobs

In [None]:
# Submit the small-dataset pipeline job.
run1.submit()
# The full-dataset job is left disabled; uncomment the next line to submit it too.
#run2.submit()