In [96]:
from datetime import datetime

from google.cloud import aiplatform

In [97]:
REGION = "us-central1"
PROJECT_ID = !(gcloud config get-value project)
PROJECT_ID = PROJECT_ID[0]

In [98]:
# Set `PATH` to include the directory containing KFP CLI
PATH = %env PATH
%env PATH=/home/jupyter/.local/bin:{PATH}

env: PATH=/home/jupyter/.local/bin:/home/jupyter/.local/bin:/home/jupyter/.local/bin:/home/jupyter/.local/bin:/home/jupyter/.local/bin:/home/jupyter/.local/bin:/home/jupyter/.local/bin:/home/jupyter/.local/bin:/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/home/jupyter/.local/bin:


In [99]:
!cat trainer_image_vertex/Dockerfile

FROM gcr.io/deeplearning-platform-release/tf-gpu.2-8
RUN pip install -U fire cloudml-hypertune
WORKDIR /app
COPY model.py .

ENTRYPOINT ["python", "model.py"]

In [100]:
#!pip install google_cloud_pipeline_components

In [101]:
IMAGE_NAME = "trainer_image_kidneytx_vertex"
TAG = "latest"
TRAINING_CONTAINER_IMAGE_URI = f"gcr.io/{PROJECT_ID}/{IMAGE_NAME}:{TAG}"
TRAINING_CONTAINER_IMAGE_URI

'gcr.io/qwiklabs-asl-00-c812c3b423f2/trainer_image_kidneytx_vertex:latest'

In [102]:
!gcloud builds submit --timeout 15m --tag $TRAINING_CONTAINER_IMAGE_URI trainer_image_vertex

Creating temporary tarball archive of 2 file(s) totalling 316 bytes before compression.
Uploading tarball of [trainer_image_vertex] to [gs://qwiklabs-asl-00-c812c3b423f2_cloudbuild/source/1686853771.516691-4e4d276082564549aae9e05cf9de6828.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/qwiklabs-asl-00-c812c3b423f2/locations/global/builds/29b3f829-f51d-4665-8758-8447d0b7d6e4].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/29b3f829-f51d-4665-8758-8447d0b7d6e4?project=469700469475 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "29b3f829-f51d-4665-8758-8447d0b7d6e4"

FETCHSOURCE
Fetching storage object: gs://qwiklabs-asl-00-c812c3b423f2_cloudbuild/source/1686853771.516691-4e4d276082564549aae9e05cf9de6828.tgz#1686853771758819
Copying gs://qwiklabs-asl-00-c812c3b423f2_cloudbuild/source/1686853771.516691-4e4d276082564549aae9e05cf9de6828.tgz#1686853771758819...
/ [1 files][  404.0 B/  404.0 B]        

In [30]:
SERVING_CONTAINER_IMAGE_URI = (
    "us-docker.pkg.dev/vertex-ai/prediction/tf-gpu.2-8:latest"

)

In [23]:
!mkdir -p pipeline_vertex

In [12]:
!rm ./pipeline_vertex/pipeline_prebuilt.py

In [13]:
BUCKET = !gcloud storage ls
BUCKET = BUCKET[-1].split("//")[-1]
OUTDIR = gs://${BUCKET}spectrain_cnn/kfp_hp_tuning_$TIMESTAMP

%env BUCKET={BUCKET}
%env OUTDIR={OUTDIR}

env: BUCKET=spectrain_new/
env: OUTDIR=spectrain_proc_img_trained


In [14]:
%%writefile ./pipeline_vertex/pipeline_prebuilt.py
# Copyright 2021 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
"""Kubeflow Covertype Pipeline."""
import os

from google.cloud.aiplatform import hyperparameter_tuning as hpt
from google_cloud_pipeline_components.aiplatform import (
    EndpointCreateOp,
    ModelDeployOp,
    ModelUploadOp,
)
from google_cloud_pipeline_components.experimental import (
    hyperparameter_tuning_job,
)
from google_cloud_pipeline_components.experimental.custom_job import (
    CustomTrainingJobOp,
)
from kfp.v2 import dsl

PIPELINE_ROOT = os.getenv("PIPELINE_ROOT")
PROJECT_ID = os.getenv("PROJECT_ID")
REGION = os.getenv("REGION")

TRAINING_CONTAINER_IMAGE_URI = os.getenv("TRAINING_CONTAINER_IMAGE_URI")
SERVING_CONTAINER_IMAGE_URI = os.getenv("SERVING_CONTAINER_IMAGE_URI")
SERVING_MACHINE_TYPE = os.getenv("SERVING_MACHINE_TYPE", "n1-standard-16")

TRAINING_FILE_PATH = os.getenv("TRAINING_FILE_PATH")
VALIDATION_FILE_PATH = os.getenv("VALIDATION_FILE_PATH")

MAX_TRIAL_COUNT = int(os.getenv("MAX_TRIAL_COUNT", "5"))
PARALLEL_TRIAL_COUNT = int(os.getenv("PARALLEL_TRIAL_COUNT", "5"))
THRESHOLD = float(os.getenv("THRESHOLD", "0.6"))

PIPELINE_NAME = os.getenv("PIPELINE_NAME", "covertype")
BASE_OUTPUT_DIR = os.getenv("BASE_OUTPUT_DIR", PIPELINE_ROOT)
MODEL_DISPLAY_NAME = os.getenv("MODEL_DISPLAY_NAME", PIPELINE_NAME)


# NEEDED FOR BHAVANI CODE
BUCKET = os.getenv("BUCKET")
OUTDIR = os.getenv("OUTDIR")


@dsl.pipeline(
    name=f"{PIPELINE_NAME}-kfp-pipeline",
    description="Kubeflow pipeline that tunes, trains, and deploys on Vertex",
    pipeline_root=PIPELINE_ROOT,
)
def create_pipeline():

    worker_pool_specs = [
        {
            "machine_spec": {
                "machine_type": "n1-standard-4",
                "accelerator_type": "NVIDIA_TESLA_V100",
                "accelerator_count": 1,
            },
            "replica_count": 1,
            "container_spec": {
                "image_uri": TRAINING_CONTAINER_IMAGE_URI,
                "args": [
                    f"--train_data_path=gs://{BUCKET}/bhavani/train_images",
                    f"--eval_data_path=gs://{BUCKET}/bhavani/valid_images",
                    f"--output_dir={OUTDIR}",
                    "--batch_size=10",
                    "--num_epochs=1",
                    "--train_examples=1",
                    "--eval_steps=1",
                    
                    # below are defaults from Bhavani
                    "--nnsize_1=512",
                    "--nnsize_2=64",
                    "--ksize=4",
                    "--pool_ksize=64",
                    "--filt_size1=64",
                    "--filt_size2=32",
                    
                    # hptune
                    "--hptune",
                ],
            },
        }
    ]

#    metric_spec = hyperparameter_tuning_job.serialize_metrics(
#        {"auc": "maximize"}
#    )

#    parameter_spec = hyperparameter_tuning_job.serialize_parameters(
#        {
#            "alpha": hpt.DoubleParameterSpec(
#                min=1.0e-4, max=1.0e-1, scale="linear"
#            ),
#            "max_iter": hpt.DiscreteParameterSpec(
#                values=[1, 2], scale="linear"
#            ),
#        }
#    )

#    hp_tuning_task = hyperparameter_tuning_job.HyperparameterTuningJobRunOp(
#        display_name=f"{PIPELINE_NAME}-kfp-tuning-job",
#        project=PROJECT_ID,
#        location=REGION,
#        worker_pool_specs=worker_pool_specs,
#        study_spec_metrics=metric_spec,
#        study_spec_parameters=parameter_spec,
#        max_trial_count=MAX_TRIAL_COUNT,
#        parallel_trial_count=PARALLEL_TRIAL_COUNT,
#        base_output_directory=PIPELINE_ROOT,
#    )

#    trials_task = hyperparameter_tuning_job.GetTrialsOp(
#        gcp_resources=hp_tuning_task.outputs["gcp_resources"]
#    )

#    best_hyperparameters_task = (
#        hyperparameter_tuning_job.GetBestHyperparametersOp(
#            trials=trials_task.output, study_spec_metrics=metric_spec
#        )
#    )

    # Construct new worker_pool_specs and
    # train new model based on best hyperparameters
#    worker_pool_specs_task = hyperparameter_tuning_job.GetWorkerPoolSpecsOp(
#        best_hyperparameters=best_hyperparameters_task.output,
#        worker_pool_specs=[
#            {
#                "machine_spec": {"machine_type": "n1-standard-4"},
#                "replica_count": 1,
#                "container_spec": {
#                    "image_uri": TRAINING_CONTAINER_IMAGE_URI,
#                    "args": [
#                    f"--train_data_path=gs://{BUCKET}/bhavani/train_images",
#                    f"--eval_data_path=gs://{BUCKET}/bhavani/valid_images",
#                    f"--output_dir={OUTDIR}",
#                    "--batch_size=10",
#                    "--num_epochs=1",
#                    "--train_examples=1",
#                    "--eval_steps=1",
#                    
#                    # below are defaults from Bhavani
#                    "--nnsize_1=512",
#                    "--nnsize_2=64",
#                    "--ksize=4",
#                    "--pool_ksize=64",
#                    "--filt_size1=64",
#                    "--filt_size2=32",
#                    
#                    # hptune
#                    "--nohptune",
#                    ],
#                },
#            }
#        ],
#    )
    
    worker_pool_specs=[
        {
            "machine_spec": {"machine_type": "n1-standard-4"},
            "replica_count": 1,
            "container_spec": {
                "image_uri": TRAINING_CONTAINER_IMAGE_URI,
                "args": [
                f"--train_data_path=gs://{BUCKET}/bhavani/train_images",
                f"--eval_data_path=gs://{BUCKET}/bhavani/valid_images",
                f"--output_dir={OUTDIR}",
                "--batch_size=10",
                "--num_epochs=1",
                "--train_examples=1",
                "--eval_steps=1",

                # below are defaults from Bhavani
                "--nnsize_1=512",
                "--nnsize_2=64",
                "--ksize=4",
                "--pool_ksize=64",
                "--filt_size1=64",
                "--filt_size2=32",

                # hptune
                "--nohptune",
                ],
            },
        }
    ]

    training_task = CustomTrainingJobOp(
        project=PROJECT_ID,
        location=REGION,
        display_name=f"{PIPELINE_NAME}-kfp-training-job",
#        worker_pool_specs=worker_pool_specs_task.output,
        worker_pool_specs=worker_pool_specs,
        base_output_directory=BASE_OUTPUT_DIR,
    )

    model_upload_task = ModelUploadOp(
        project=PROJECT_ID,
        display_name=f"{PIPELINE_NAME}-kfp-model-upload-job",
        artifact_uri=f"{BASE_OUTPUT_DIR}/model",
        serving_container_image_uri=SERVING_CONTAINER_IMAGE_URI,
    )
    model_upload_task.after(training_task)

    endpoint_create_task = EndpointCreateOp(
        project=PROJECT_ID,
        display_name=f"{PIPELINE_NAME}-kfp-create-endpoint-job",
    )
    endpoint_create_task.after(model_upload_task)

    model_deploy_op = ModelDeployOp(  # pylint: disable=unused-variable
        model=model_upload_task.outputs["model"],
        endpoint=endpoint_create_task.outputs["endpoint"],
        deployed_model_display_name=MODEL_DISPLAY_NAME,
        dedicated_resources_machine_type=SERVING_MACHINE_TYPE,
        dedicated_resources_min_replica_count=1,
        dedicated_resources_max_replica_count=1,
    )


Writing ./pipeline_vertex/pipeline_prebuilt.py


In [15]:
ARTIFACT_STORE = f"gs://{PROJECT_ID}-kfp-artifact-store"
PIPELINE_ROOT = f"{ARTIFACT_STORE}/pipeline"
DATA_ROOT = f"{ARTIFACT_STORE}/data"

TRAINING_FILE_PATH = f"{DATA_ROOT}/training/dataset.csv"
VALIDATION_FILE_PATH = f"{DATA_ROOT}/validation/dataset.csv"

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
BASE_OUTPUT_DIR = f"{ARTIFACT_STORE}/models/{TIMESTAMP}"

%env PIPELINE_ROOT={PIPELINE_ROOT}
%env PROJECT_ID={PROJECT_ID}
%env REGION={REGION}
%env SERVING_CONTAINER_IMAGE_URI={SERVING_CONTAINER_IMAGE_URI}
%env TRAINING_CONTAINER_IMAGE_URI={TRAINING_CONTAINER_IMAGE_URI}
%env TRAINING_FILE_PATH={TRAINING_FILE_PATH}
%env VALIDATION_FILE_PATH={VALIDATION_FILE_PATH}
%env BASE_OUTPUT_DIR={BASE_OUTPUT_DIR}

env: PIPELINE_ROOT=gs://qwiklabs-asl-00-c812c3b423f2-kfp-artifact-store/pipeline
env: PROJECT_ID=qwiklabs-asl-00-c812c3b423f2
env: REGION=us-central1
env: SERVING_CONTAINER_IMAGE_URI=us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-20:latest
env: TRAINING_CONTAINER_IMAGE_URI=gcr.io/qwiklabs-asl-00-c812c3b423f2/trainer_image_kidneytx_vertex:latest
env: TRAINING_FILE_PATH=gs://qwiklabs-asl-00-c812c3b423f2-kfp-artifact-store/data/training/dataset.csv
env: VALIDATION_FILE_PATH=gs://qwiklabs-asl-00-c812c3b423f2-kfp-artifact-store/data/validation/dataset.csv
env: BASE_OUTPUT_DIR=gs://qwiklabs-asl-00-c812c3b423f2-kfp-artifact-store/models/20230615074214


In [16]:
!gsutil ls | grep ^{ARTIFACT_STORE}/$ || gsutil mb -l {REGION} {ARTIFACT_STORE}

gs://qwiklabs-asl-00-c812c3b423f2-kfp-artifact-store/


In [17]:
PIPELINE_JSON = "covertype_kfp_pipeline.json"

In [18]:
from kfp.v2 import compiler

from pipeline_vertex.pipeline_prebuilt import create_pipeline

compiler.Compiler().compile(
    pipeline_func=create_pipeline, 
    package_path=PIPELINE_JSON,
)



In [24]:
!cat {PIPELINE_JSON}

{
  "pipelineSpec": {
    "components": {
      "comp-custom-training-job": {
        "executorLabel": "exec-custom-training-job",
        "inputDefinitions": {
          "parameters": {
            "base_output_directory": {
              "type": "STRING"
            },
            "display_name": {
              "type": "STRING"
            },
            "enable_web_access": {
              "type": "STRING"
            },
            "encryption_spec_key_name": {
              "type": "STRING"
            },
            "labels": {
              "type": "STRING"
            },
            "location": {
              "type": "STRING"
            },
            "network": {
              "type": "STRING"
            },
            "project": {
              "type": "STRING"
            },
            "reserved_ip_ranges": {
              "type": "STRING"
            },
            "restart_job_on_worker_restart": {
              "type": "STRING"
            },
            "service_acc

In [40]:
aiplatform.init(project=PROJECT_ID, location=REGION)

pipeline = aiplatform.PipelineJob(
    display_name="covertype_kfp_pipeline",
    template_path=PIPELINE_JSON,
    enable_caching=True,
)

pipeline.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/469700469475/locations/us-central1/pipelineJobs/covertype-kfp-pipeline-20230615115251
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/469700469475/locations/us-central1/pipelineJobs/covertype-kfp-pipeline-20230615115251')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/covertype-kfp-pipeline-20230615115251?project=469700469475
PipelineJob projects/469700469475/locations/us-central1/pipelineJobs/covertype-kfp-pipeline-20230615115251 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/469700469475/locations/us-central1/pipelineJobs/covertype-kfp-pipeline-20230615115251 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/469700469475/locations/us-central1/pipelineJobs/covertype-kfp-pipeline-20230615115251 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/46970046

RuntimeError: Job failed with:
code: 9
message: "The DAG failed because some tasks failed. The failed tasks are: [custom-training-job].; Job (project_id = qwiklabs-asl-00-c812c3b423f2, job_id = 5854290838221225984) is failed due to the above error.; Failed to handle the job: {project_number = 469700469475, job_id = 5854290838221225984}"
