In [11]:
import kfp

from typing import NamedTuple

from kfp.dsl import pipeline
from kfp.dsl import component
from kfp.dsl import OutputPath
from kfp.dsl import InputPath
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        Markdown,
                        Condition)


from kfp.dsl import Output
from kfp.dsl import Metrics

from kfp import compiler
#from kfp.google.client import AIPlatformClient


from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs

#from google_cloud_pipeline_components import aiplatform as gcc_aip

from google_cloud_pipeline_components.v1.model import ModelUploadOp
from typing import NamedTuple

import os
from google.oauth2 import service_account


In [13]:
# Path to the service-account key file.
# Resolved from the GOOGLE_APPLICATION_CREDENTIALS environment variable when it
# is set, so the notebook is not tied to one machine's directory layout; the
# original local path is kept as the fallback so existing behaviour is unchanged
# when the variable is absent.
key_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/shlba/Desktop/Docs/Study/code/ml_pipelines_kfp/deeplearning-sahil-e50332de6687.json",
)

# Build credentials from the key file, scoped to the Cloud Platform API.
credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

In [14]:
# Google Cloud project and region the pipeline is submitted to.
PROJECT_ID = "deeplearning-sahil"
REGION = "us-central1"

# Cloud Storage location passed as the pipeline root when the job is created.
PIPELINE_ROOT = "gs://sb-vertex/"

# Service account the pipeline job runs as; it lives in the same project.
SERVICE_ACCOUNT = f"kfp-mlops@{PROJECT_ID}.iam.gserviceaccount.com"

### Data

In [15]:
@component(
    base_image="python:3.9",
    packages_to_install=[
        "pandas==2.0.0",
        "scikit-learn==1.5.1",
        "numpy==1.23.0",
    ],
)
def get_data(
    dataset_train: Output[Dataset],
    dataset_test: Output[Dataset],
):
    """Load the breast-cancer dataset and write train/test CSV artifacts.

    The scikit-learn breast-cancer data is turned into a DataFrame with one
    column per feature plus a ``target`` column, split 70/30, and each part is
    written as CSV to the corresponding output artifact path.

    Args:
        dataset_train: Output artifact that receives the training rows.
        dataset_test: Output artifact that receives the held-out rows.
    """
    from sklearn import datasets
    from sklearn.model_selection import train_test_split as tts
    import pandas as pd

    # dataset https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
    data_raw = datasets.load_breast_cancer()
    data = pd.DataFrame(data_raw.data, columns=data_raw.feature_names)
    data["target"] = data_raw.target

    # Fixed seed so that re-running the component yields the same split.
    train, test = tts(data, test_size=0.3, random_state=42)

    # index=False: otherwise the row index is written as an extra unnamed
    # column, which a plain pd.read_csv() then loads as if it were a feature.
    train.to_csv(dataset_train.path, index=False)
    test.to_csv(dataset_test.path, index=False)

### Training

In [16]:
@component(
    base_image="python:3.9",
    packages_to_install=[
        "pandas==2.0.0",
        "numpy==1.23.0",
        "xgboost==1.7.5",
        "scikit-learn==1.5.1",  # xgboost requires scikit-learn
    ],
)
def train_model(
    dataset: Input[Dataset],
    model: Output[Model]
):
    """Fit an XGBoost classifier on the training CSV and persist it.

    Every column except ``target`` is used as a feature. The classifier's
    ``score`` on the training data and the framework name are recorded in the
    output artifact's metadata, and the fitted estimator is dumped with joblib
    to ``<model.path>.joblib``.

    Args:
        dataset: Input artifact holding the training CSV.
        model: Output artifact describing the trained model.
    """
    import logging

    import joblib
    import pandas as pd
    from xgboost import XGBClassifier

    logging.basicConfig(level=logging.DEBUG)

    frame = pd.read_csv(dataset.path)
    features = frame.drop(columns=["target"])
    labels = frame["target"]

    classifier = XGBClassifier(objective="binary:logistic")
    classifier.fit(features, labels)

    # Score on the same rows the model was fitted on (a training score).
    model.metadata["train_score"] = float(classifier.score(features, labels))
    model.metadata["framework"] = "XGBoost"

    print(model.path)

    # The estimator is stored next to the artifact path with a .joblib suffix.
    joblib.dump(classifier, model.path + ".joblib")

### Evaluation

In [17]:
@component(
    base_image="python:3.9",
    packages_to_install=[
        "pandas==2.0.0",
        "numpy==1.23.0",
        "xgboost==1.7.5",
        "scikit-learn==1.5.1",  # xgboost requires scikit-learn
    ],
)
def eval_model(
    test_set: Input[Dataset],
    xgb_model: Input[Model],
    metrics: Output[ClassificationMetrics],
    smetrics: Output[Metrics]
) -> NamedTuple("Outputs", [("deploy", str)]):
    """Evaluate the trained classifier on the test CSV and log its metrics.

    Loads the estimator from ``<xgb_model.path>.joblib``, scores it on the test
    rows, logs a ROC curve and a confusion matrix to ``metrics`` and the scalar
    score to ``smetrics``.

    Args:
        test_set: Input artifact holding the held-out CSV.
        xgb_model: Input artifact of the trained model.
        metrics: Output artifact receiving the ROC curve and confusion matrix.
        smetrics: Output artifact receiving the scalar ``score`` metric.

    Returns:
        A one-element tuple ``(deploy,)``; ``deploy`` is currently always the
        string ``"true"``.
    """
    import joblib
    import numpy as np
    import pandas as pd
    from sklearn.metrics import confusion_matrix, roc_curve
    from xgboost import XGBClassifier

    frame = pd.read_csv(test_set.path)
    features = frame.drop(columns=["target"])
    labels = frame["target"]

    classifier = joblib.load(xgb_model.path + ".joblib")
    test_score = float(classifier.score(features, labels))

    # ROC curve from the predicted probability of the second class column.
    positive_proba = classifier.predict_proba(features)[:, 1]
    fpr, tpr, thresholds = roc_curve(
        y_true=labels.to_numpy(), y_score=positive_proba, pos_label=True
    )

    # Drop points whose threshold is infinite before logging the curve.
    finite = ~np.isinf(thresholds)
    metrics.log_roc_curve(
        fpr[finite].tolist(),
        tpr[finite].tolist(),
        thresholds[finite].tolist(),
    )

    predictions = classifier.predict(features)
    metrics.log_confusion_matrix(
        ["False", "True"],
        confusion_matrix(labels, predictions).tolist(),
    )

    xgb_model.metadata["test_score"] = test_score
    smetrics.log_metric("score", test_score)

    # TODO: compare the score against a threshold or a previous model instead
    # of unconditionally approving deployment.
    deploy = "true"

    return (deploy,)

### Deployment

In [18]:
@component(base_image="python:3.9", packages_to_install=["google-cloud-aiplatform==1.3.0"])
def deploy(
    model: Input[Model],
    project: str,
    region: str,):
    """Upload the trained model to the Vertex AI model registry.

    The directory part of the model artifact URI is used as the upload
    location, and the model is registered with a pre-built XGBoost serving
    container.

    Args:
        model: Input artifact of the trained model; only its ``uri`` is used.
        project: Google Cloud project to upload into.
        region: Vertex AI location to upload into.
    """
    import logging
    import os

    from google.cloud import aiplatform

    aiplatform.init(project=project, location=region)

    logging.basicConfig(level=logging.DEBUG)
    logging.debug(model)

    print(model)
    print(model.uri)

    # Upload the folder that contains the artifact rather than the artifact
    # URI itself.
    # NOTE(review): presumably this is where the training step's .joblib file
    # ends up -- confirm against the trainer's output location.
    artifact_dir = os.path.dirname(model.uri)
    print(artifact_dir)
    logging.info(artifact_dir)

    # serving image https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers#xgboost
    # The container's XGBoost version should match the one used for training
    # (xgboost==1.7.5 in this notebook's training component); the previous
    # 1-4 image could fail to load a model pickled with 1.7.
    aiplatform.Model.upload(
        display_name="xgboost-pipeline",
        artifact_uri=artifact_dir,
        serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-7:latest",
    )

### Pipeline

In [19]:
# The decorator is referenced through its module because this function is also
# called `pipeline`: once the cell has run, the bare name `pipeline` no longer
# refers to the decorator, and re-running the cell with `@pipeline(...)` fails.
@kfp.dsl.pipeline(
    # Default pipeline root. You can override it when submitting the pipeline.
    # pipeline_root=PIPELINE_ROOT + "xgboost-pipeline",
    # A name for the pipeline. Use to determine the pipeline Context.
    name="xgboost-pipeline-with-deployment",
)
def pipeline():
    """Fetch data, train, evaluate and conditionally deploy an XGBoost model.

    The deploy step runs only when the evaluation step's ``deploy`` output is
    the string ``"true"``.
    """
    dataset_op = get_data()
    training_op = train_model(dataset=dataset_op.outputs["dataset_train"])
    eval_op = eval_model(
        test_set=dataset_op.outputs["dataset_test"],
        xgb_model=training_op.outputs["model"],
    )

    with Condition(
        eval_op.outputs["deploy"] == "true",
        name="deploy",
    ):
        deploy(
            model=training_op.outputs["model"],
            project=PROJECT_ID,
            region=REGION,
        )


In [20]:
# Compile the pipeline function into a spec file that the submission cell
# loads via template_path.
compiler.Compiler().compile(pipeline_func=pipeline, package_path='xgb_pipeline.json')

In [22]:
# Point the Vertex AI SDK at the target project and region, authenticating
# with the service-account credentials created earlier in the notebook.
aiplatform.init(project=PROJECT_ID, location=REGION, credentials=credentials)

# Describe the run from the compiled pipeline spec, then submit it under the
# dedicated pipeline service account.
job = pipeline_jobs.PipelineJob(
    template_path="xgb_pipeline.json",
    display_name="xgb-pipeline",
    pipeline_root=PIPELINE_ROOT,
    credentials=credentials,
)
job.run(service_account=SERVICE_ACCOUNT)

Creating PipelineJob
PipelineJob created. Resource name: projects/57434141298/locations/us-central1/pipelineJobs/xgboost-pipeline-with-deployment-20250504233448
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/57434141298/locations/us-central1/pipelineJobs/xgboost-pipeline-with-deployment-20250504233448')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/xgboost-pipeline-with-deployment-20250504233448?project=57434141298
PipelineJob created. Resource name: projects/57434141298/locations/us-central1/pipelineJobs/xgboost-pipeline-with-deployment-20250504233448
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/57434141298/locations/us-central1/pipelineJobs/xgboost-pipeline-with-deployment-20250504233448')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/xgboost-pipeline-with-deployment-20250504233448