### Reference: https://github.com/amygdala/code-snippets/blob/master/ml/vertex_pipelines/pytorch/cifar/pytorch_cifar10_vertex_pipelines.ipynb

In [1]:
# pip flag for a user-level install.
# NOTE(review): not referenced by any visible cell; the install commands below spell out `--user` literally.
USER_FLAG = '--user'

In [2]:
!pip3 install --user -U google-cloud-aiplatform 
!pip3 install --user -U kfp
!pip3 install --user -U google_cloud_pipeline_components

Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.15.0-py2.py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: google-cloud-aiplatform
[0mSuccessfully installed google-cloud-aiplatform-1.15.0
Collecting kfp
  Downloading kfp-1.8.12.tar.gz (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.2/301.2 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting absl-py<2,>=0.9
  Downloading absl_py-1.1.0-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.7/123.7 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyYAML<6,>=5.3
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m636.6/636.6 kB[0m [31m36.0 MB/s[0m e

In [3]:
# Run this cell if kfp was newly installed by the cell above.
import os

# The restart is skipped when the IS_TESTING environment variable is set to a non-empty value.
if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

In [2]:
import json
from typing import NamedTuple


from kfp import dsl
from kfp.v2 import compiler
from typing import NamedTuple
from kfp.v2 import dsl
from kfp.v2.dsl import (
    component,
    InputPath,
    OutputPath,
    Input,
    Output,
    Artifact,
    Dataset,
    Model,
    ClassificationMetrics,
    Metrics,
)

from kfp.v2.google.client import AIPlatformClient

from google_cloud_pipeline_components import aiplatform as gcc_aip
from google.cloud import aiplatform

In [3]:
# Google Cloud project ID; used as the default and submitted value of the pipeline's `project` parameter.
PROJECT_ID = 'argolis-demo-project'  # <---CHANGE THIS

In [4]:
# Cloud Storage bucket URI; PIPELINE_ROOT is built underneath it.
BUCKET_NAME = "gs://argolis-demo-senchan"  # @param {type:"string"}
# Google Cloud region name.
REGION = "us-central1"  # @param {type:"string"}

In [5]:
# Append /home/jupyter/.local/bin to PATH for this kernel
# (presumably so command-line tools installed with `pip install --user` are found).
PATH=%env PATH
%env PATH={PATH}:/home/jupyter/.local/bin

# USER becomes the last path segment of the pipeline root inside the bucket.
USER = 'argolis-demo' # <---CHANGE THIS
PIPELINE_ROOT = '{}/pipeline_root/{}'.format(BUCKET_NAME, USER)

# Display the resulting pipeline root.
PIPELINE_ROOT

env: PATH=/opt/conda/bin:/opt/conda/condabin:/opt/conda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/home/jupyter/.local/bin


'gs://argolis-demo-senchan/pipeline_root/argolis-demo'

In [6]:
# Base image of the preprocessing and MAR-generation components.
CONTAINER_URI = "gcr.io/google-samples/pytorch-pl:v2"
# Base image of the training component.
GPU_CONTAINER_URI = "gcr.io/google-samples/pytorch-pl-gpu:v5"

In [7]:
@component(
    base_image=CONTAINER_URI,
    output_component_file="cifar_preproc.yaml",
)
def cifar_preproc(
    cifar_dataset: Output[Dataset],
):
    """Download CIFAR-10 and write it as WebDataset tar shards.

    The official training set is split 80/20 (stratified by label, fixed seed)
    into train and validation parts; the official test set is kept as is.
    Shards are written to `<cifar_dataset.path>/{train,val,test}/<split>-<n>.tar`
    with at most 1000 samples per shard.

    Args:
        cifar_dataset: Output dataset artifact; shards are written under its path.
    """
    import subprocess
    import logging
    from pathlib import Path

    import torchvision
    import webdataset as wds
    from sklearn.model_selection import train_test_split

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Dataset path is: %s", cifar_dataset.path)
    output_pth = cifar_dataset.path

    Path(output_pth).mkdir(parents=True, exist_ok=True)

    trainset = torchvision.datasets.CIFAR10(
        root="./", train=True, download=True
    )
    testset = torchvision.datasets.CIFAR10(
        root="./", train=False, download=True
    )

    # Stratify on the labels so every class keeps the same share in train and val.
    random_seed = 25
    labels = trainset.targets
    trainset, valset, _, _ = train_test_split(
        trainset,
        labels,
        stratify=labels,
        shuffle=True,
        test_size=0.2,
        random_state=random_seed,
    )

    for split_name, samples in (
        ("train", trainset),
        ("val", valset),
        ("test", testset),
    ):
        split_dir = output_pth + "/" + split_name
        Path(split_dir).mkdir(parents=True, exist_ok=True)
        with wds.ShardWriter(
            split_dir + "/" + split_name + "-%d.tar",
            maxcount=1000,
        ) as sink:
            for index, (image, cls) in enumerate(samples):
                sink.write(
                    {"__key__": "%06d" % index, "ppm": image, "cls": cls}
                )

    # Diagnostic listing of what was written. The output is decoded to text so
    # the log shows one entry per line instead of a single bytes repr.
    listing = subprocess.run(
        ["ls", "-R", output_pth],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    print(listing.stdout)


In [8]:
@component(
    output_component_file="cifar_config.yaml",
)
def cifar_config(
    mar_model_name: str,
    version: str,
    port: int,
    cifar_config: Output[Artifact],
):
    """Write the TorchServe `config.properties` and `Dockerfile` into the output artifact.

    Args:
        mar_model_name: Archive name (without `.mar`) referenced by both files.
        version: Model version key used in the `model_snapshot` entry.
        port: Port of the TorchServe inference address.
        cifar_config: Output artifact; both files are created in its directory.
    """
    import os
    from pathlib import Path

    artifact_dir = cifar_config.path
    Path(artifact_dir).mkdir(parents=True, exist_ok=True)

    # TorchServe settings; the snapshot registers exactly one model at startup.
    properties_text = f"""inference_address=http://0.0.0.0:{port}
management_address=http://0.0.0.0:8081
metrics_address=http://0.0.0.0:8082
enable_metrics_api=true
metrics_format=prometheus
number_of_netty_threads=4
job_queue_size=10
service_envelope=kfserving
model_store=/home/model-server/model-store
model_snapshot={{"name":"startup.cfg","modelCount":1,"models":{{"{mar_model_name}":{{"{version}":{{"defaultVersion":true,"marName":"{mar_model_name}.mar","minWorkers":1,"maxWorkers":5,"batchSize":1,"maxBatchDelay":5000,"responseTimeout":120}}}}}}}}
"""

    # Serving image recipe: copies the config and the model archive into the image.
    dockerfile_text = f"""FROM pytorch/torchserve:0.5.1-gpu

RUN pip install --upgrade pip
RUN pip install grpcio==1.32.0
RUN pip install pytorch-lightning

COPY config.properties /home/model-server/config.properties
COPY {mar_model_name}.mar /home/model-server/model-store/
"""

    # Write both files into the artifact directory.
    for file_name, content in (
        ("config.properties", properties_text),
        ("Dockerfile", dockerfile_text),
    ):
        with open(os.path.join(artifact_dir, file_name), "w") as handle:
            handle.write(content)


In [9]:
@component(
    base_image=GPU_CONTAINER_URI,
    output_component_file="cifar_train.yaml",
)
def cifar_train(
    model_name: str,
    max_epochs: int,
    model_display_name: str,
    tensorboard_instance:str,
    cifar_dataset: Input[Dataset],
    cifar_model: Output[Model],
):
    """Configure and construct the `pytorch_pipeline` Trainer for CIFAR-10.

    Args:
        model_name: Not referenced in the body of this component.
        max_epochs: Passed to the trainer arguments as `max_epochs`.
        model_display_name: Experiment name given to the TensorBoard uploader.
        tensorboard_instance: TensorBoard resource name for the uploader; when
            empty, the uploader is not started.
        cifar_dataset: Input dataset artifact; its path is passed as `train_glob`.
        cifar_model: Output model artifact; checkpoint and TensorBoard
            directories are configured underneath its path.
    """

    import pytorch_lightning as pl
    import logging
    import os
    from subprocess import Popen, DEVNULL
    import sys

    from pytorch_pipeline.components.trainer.component import Trainer
    from argparse import ArgumentParser
    from pytorch_lightning.loggers import TensorBoardLogger
    from pytorch_lightning.callbacks import (
        EarlyStopping,
        LearningRateMonitor,
        ModelCheckpoint,
    )

    logging.getLogger().setLevel(logging.INFO)
    logging.info("datset root path: %s", cifar_dataset.path)
    logging.info("model root path: %s", cifar_model.path)
    model_output_root = cifar_model.path

    # Argument parser for user defined paths
    parser = ArgumentParser()

    parser.add_argument(
        "--tensorboard_root",
        type=str,
        default=f"{model_output_root}/tensorboard",
        help="Tensorboard Root path (default: output/tensorboard)",
    )

    parser.add_argument(
        "--checkpoint_dir",
        type=str,
        default=f"{model_output_root}/train/models",
        help="Path to save model checkpoints ",
    )

    parser.add_argument(
        "--dataset_path",
        type=str,
        default=cifar_dataset.path,
        help="Cifar10 Dataset path (default: output/processing)",
    )

    parser.add_argument(
        "--model_name",
        type=str,
        default="resnet.pth",
        help="Name of the model to be saved as (default: resnet.pth)",
    )

    # Drop the command-line arguments of the component launcher so that
    # parse_args() below only sees the defaults declared above.
    sys.argv = sys.argv[:1]

    parser = pl.Trainer.add_argparse_args(parent_parser=parser)
    # NOTE(review): `args` is not used afterwards; the parser object itself is
    # what gets handed to the Trainer as `module_file_args`.
    args = vars(parser.parse_args())

    # Enabling Tensorboard Logger, ModelCheckpoint, Earlystopping
    lr_logger = LearningRateMonitor()
    tboard = TensorBoardLogger(f"{model_output_root}/tensorboard")

    early_stopping = EarlyStopping(
        monitor="val_loss", mode="min", patience=5, verbose=True
    )
    checkpoint_callback = ModelCheckpoint(
        dirpath=f"{model_output_root}/train/models",
        filename="cifar10_{epoch:02d}",
        save_top_k=1,
        verbose=True,
        monitor="val_loss",
        mode="min",
    )

    # Setting the trainer-specific arguments
    trainer_args = {
        "logger": tboard,
        "profiler": "pytorch",
        "checkpoint_callback": True,
        "max_epochs": max_epochs,
        "callbacks": [lr_logger, early_stopping, checkpoint_callback],
        "gpus": 1,
    }

    # Setting the datamodule specific arguments
    data_module_args = {"train_glob": cifar_dataset.path}

    # Best effort: start the TensorBoard uploader as a background process with
    # its output discarded; a failure to launch it is only logged.
    if tensorboard_instance:
      try:
        logging.warning('setting up Vertex tensorboard experiment')
        # Rewrite the local "/gcs/" path prefix into a "gs://" URI for the uploader.
        tb_gs = f"{model_output_root}/tensorboard".replace("/gcs/", "gs://")
        logging.info('tb gs path: %s', tb_gs)
        tb_args = ["/opt/conda/bin/tb-gcp-uploader", "--tensorboard_resource_name", tensorboard_instance, 
                        "--logdir", tb_gs, "--experiment_name", model_display_name,
                        # '--one_shot=True'
                        ]
        logging.warning('tb args: %s', tb_args)
        Popen(tb_args, stdout=DEVNULL, stderr=DEVNULL)
      except Exception as e:
        logging.warning(e)

    # Initiating the training process
    logging.info("about to call the Trainer...")

    # NOTE(review): presumably constructing the Trainer runs the training and
    # saves the model under the checkpoint directory — confirm against the
    # `pytorch_pipeline` package; nothing else is called on `trainer` here.
    trainer = Trainer(
        module_file="cifar10_train.py",
        data_module_file="cifar10_datamodule.py",
        module_file_args=parser,
        data_module_args=data_module_args,
        trainer_args=trainer_args,
    )

 


In [10]:
@component(
    base_image=CONTAINER_URI,
    output_component_file="mar.yaml",
)
def generate_mar_file(
    model_name: str,
    mar_model_name: str,
    handler: str,
    version: str,
    cifar_model: Input[Model],
    cifar_mar: Output[Model],
):
    """Package the trained model into a TorchServe model archive (.mar).

    Runs `torch-model-archiver` and writes the archive into
    `<cifar_mar.path>/model-store`. The step fails when the archiver cannot be
    started or exits with a non-zero code, so a missing archive is reported
    here instead of surfacing later in the image build.

    Args:
        model_name: File name of the serialized model, looked up under
            `<cifar_model.path>/train/models`.
        mar_model_name: Name given to the archive (`--model-name`).
        handler: TorchServe handler passed as `--handler`.
        version: Model version passed as `-v`.
        cifar_model: Input artifact holding the trained model.
        cifar_mar: Output artifact that receives the `model-store` directory.

    Raises:
        ValueError: If a mandatory archiver setting is missing.
        RuntimeError: If `torch-model-archiver` cannot be launched or fails.
    """
    import logging
    import os
    import shlex
    import subprocess
    import time

    from pathlib import Path

    def _validate_mar_config(mar_config):
        """Raise ValueError listing every mandatory key absent from mar_config."""
        mandatory_args = (
            "MODEL_NAME",
            "SERIALIZED_FILE",
            "MODEL_FILE",
            "HANDLER",
            "VERSION",
        )
        missing_list = [key for key in mandatory_args if key not in mar_config]
        if missing_list:
            raise ValueError(
                "Following Mandatory keys are missing in the config file {} ".format(
                    missing_list
                )
            )

    logging.getLogger().setLevel(logging.INFO)

    model_output_root = cifar_model.path
    export_path = f"{cifar_mar.path}/model-store"
    try:
        Path(export_path).mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logging.warning(e)
        # retry once after a short pause
        time.sleep(2)
        Path(export_path).mkdir(parents=True, exist_ok=True)

    mar_config = {
        "MODEL_NAME": mar_model_name,
        "MODEL_FILE": "pytorch_pipeline/examples/cifar10/cifar10_train.py",
        "HANDLER": handler,
        "SERIALIZED_FILE": os.path.join(
            f"{model_output_root}/train/models",
            model_name,
        ),
        "VERSION": version,
        "EXPORT_PATH": export_path,
    }
    logging.warning("mar_config: %s", mar_config)
    print(f"mar_config: {mar_config}")

    logging.info("validating config")
    _validate_mar_config(mar_config)

    # Build an argument vector instead of a shell string: values that contain
    # spaces or shell metacharacters are then passed through unchanged.
    archiver_cmd = [
        "torch-model-archiver",
        "--force",
        "--model-name", mar_config["MODEL_NAME"],
        "--serialized-file", mar_config["SERIALIZED_FILE"],
        "--model-file", mar_config["MODEL_FILE"],
        "--handler", mar_config["HANDLER"],
        "-v", mar_config["VERSION"],
    ]
    # Optional settings are only forwarded when present in the config.
    for config_key, flag in (
        ("EXPORT_PATH", "--export-path"),
        ("EXTRA_FILES", "--extra-files"),
        ("REQUIREMENTS_FILE", "-r"),
    ):
        if config_key in mar_config:
            archiver_cmd += [flag, mar_config[config_key]]

    printable_cmd = " ".join(shlex.quote(str(arg)) for arg in archiver_cmd)
    print("Running Archiver cmd: ", printable_cmd)
    logging.warning("archiver command: %s", printable_cmd)

    try:
        return_code = subprocess.run(archiver_cmd).returncode
    except OSError as e:
        raise RuntimeError(
            "Unable to start command {archiver_cmd}: {error}".format(
                archiver_cmd=printable_cmd, error=e
            )
        ) from e

    if return_code != 0:
        error_msg = (
            "Error running command {archiver_cmd} {return_code}".format(
                archiver_cmd=printable_cmd, return_code=return_code
            )
        )
        print(error_msg)
        # Fail the step: downstream steps need the archive to exist.
        raise RuntimeError(error_msg)


In [12]:
from google.protobuf.duration_pb2 import Duration

In [11]:
@component(
    base_image="gcr.io/deeplearning-platform-release/tf2-gpu.2-3:latest",
    output_component_file="build_image.yaml",
)
def build_torchserve_image(
    model_name: str,
    cifar_mar: Input[Model],
    cifar_config: Input[Artifact],
    project: str,
) -> NamedTuple("Outputs", [("serving_container_uri", str),],):
    """Build the TorchServe serving image with Cloud Build.

    The build copies `config.properties`, the model archive and the
    `Dockerfile` from Cloud Storage into the build workspace and then runs
    `docker build`.

    Args:
        model_name: Archive name without the `.mar` suffix.
        cifar_mar: Input artifact containing `model-store/<model_name>.mar`.
        cifar_config: Input artifact containing `Dockerfile` and `config.properties`.
        project: Project in which the build is created and under which the
            image is tagged.

    Returns:
        A one-element tuple holding the URI of the image that was built.
    """
    from datetime import datetime
    import logging
    import os

    import google.auth
    from google.cloud.devtools import cloudbuild_v1
    from google.protobuf.duration_pb2 import Duration

    def _as_gs_uri(mounted_path):
        # Rewrite the "/gcs/" path prefix into a "gs://" URI.
        return mounted_path.replace("/gcs/", "gs://")

    logging.getLogger().setLevel(logging.INFO)
    _credentials, _default_project = google.auth.default()
    build_client = cloudbuild_v1.services.cloud_build.CloudBuildClient()

    mar_file_name = f"{model_name}.mar"
    build_tag = datetime.now().strftime("%Y%m%d%H%M%S")

    dockerfile_uri = _as_gs_uri(os.path.join(cifar_config.path, "Dockerfile"))
    config_prop_uri = _as_gs_uri(
        os.path.join(cifar_config.path, "config.properties")
    )

    model_store_dir = f"{cifar_mar.path}/model-store"
    mar_uri = _as_gs_uri(os.path.join(model_store_dir, mar_file_name))
    logging.warning("gs_model_path: %s", mar_uri)

    image_uri = f"gcr.io/{project}/torchservetest:{build_tag}"
    logging.info("image uri: %s", image_uri)

    # Each (source, destination) pair becomes one `gsutil cp` build step.
    files_to_fetch = [
        (config_prop_uri, "config.properties"),
        (mar_uri, mar_file_name),
        (dockerfile_uri, "Dockerfile"),
    ]
    build_steps = [
        {"name": "gcr.io/cloud-builders/gsutil", "args": ["cp", source, target]}
        for source, target in files_to_fetch
    ]
    build_steps.append(
        {
            "name": "gcr.io/cloud-builders/docker",
            "args": ["build", "-t", image_uri, "."],
        }
    )

    build = cloudbuild_v1.Build(images=[image_uri])
    build.steps = build_steps
    # Build timeout of 7200 seconds.
    build.timeout = Duration(seconds=7200)

    operation = build_client.create_build(project_id=project, build=build)
    print("IN PROGRESS:")
    print(operation.metadata)

    # Wait for the build operation and report its final status.
    finished_build = operation.result()
    print("RESULT:", finished_build.status)
    return (image_uri,)


In [12]:
from datetime import datetime
# Timestamp taken when this cell runs; makes the model display name distinct per run of the cell.
ts = datetime.now().strftime("%Y%m%d%H%M%S")
# Default `model_display_name` of the pipeline and display name of the submitted PipelineJob.
MODEL_NAME = f'resnet{ts}'
# Port of the TorchServe inference address and of the serving container.
PORT = 8080
# Model archive name; also the last segment of the prediction route.
MAR_MODEL_NAME = 'cifar10'

In [18]:
# Sanity check: show the configured serving port.
print(PORT)

8080


In [22]:
# Sanity check: show the generated model display name.
print(MODEL_NAME)

resnet20220714090809


In [25]:

@dsl.pipeline(
    name="pytorch-cifar-pipeline",
    pipeline_root=PIPELINE_ROOT,
)
def pytorch_cifar_pipeline(
    project: str = PROJECT_ID,
    model_name: str = "resnet.pth",
    model_display_name: str = MODEL_NAME,
    max_epochs: int = 1,
    mar_model_name: str = MAR_MODEL_NAME,
    handler: str = "image_classifier",
    version: str = "1.0",
    port: int = PORT,
    tensorboard_instance: str = ''
):
    """Preprocess, train, archive, build the serving image, upload and deploy.

    Args:
        project: Project used for the image build, model upload and endpoint.
        model_name: File name of the serialized model.
        model_display_name: Display name of the model, endpoint and deployment.
        max_epochs: Maximum number of training epochs.
        mar_model_name: Name of the TorchServe model archive.
        handler: TorchServe handler used when archiving the model.
        version: Model version.
        port: Port written into the TorchServe configuration.
        tensorboard_instance: TensorBoard resource name for the training step.
    """
    # Serving configuration and data preparation do not depend on each other.
    config_task = cifar_config(
        mar_model_name=mar_model_name, version=version, port=port
    )
    preproc_task = cifar_preproc()

    train_task = cifar_train(
        model_name=model_name,
        max_epochs=max_epochs,
        model_display_name=model_display_name,
        tensorboard_instance=tensorboard_instance,
        cifar_dataset=preproc_task.outputs["cifar_dataset"],
    )
    train_task.set_memory_limit('32G')
    train_task.set_gpu_limit(1)
    # You can change this to use a different accelerator. Ensure you have quota for it.
    train_task.add_node_selector_constraint(
        "cloud.google.com/gke-accelerator", "nvidia-tesla-v100"
    )

    mar_task = generate_mar_file(
        model_name=model_name,
        mar_model_name=mar_model_name,
        handler=handler,
        version=version,
        cifar_model=train_task.outputs["cifar_model"],
    )

    image_task = build_torchserve_image(
        model_name=mar_model_name,
        cifar_mar=mar_task.outputs["cifar_mar"],
        cifar_config=config_task.outputs["cifar_config"],
        project=project,
    )

    # Run the three Google Cloud pipeline components with the same pinned
    # container image (an earlier revision used tag 0.1.7).
    gcpc_image = "gcr.io/ml-pipeline/google-cloud-pipeline-components:1.0.13"
    for gcpc_op in (
        gcc_aip.ModelUploadOp,
        gcc_aip.EndpointCreateOp,
        gcc_aip.ModelDeployOp,
    ):
        gcpc_op.component_spec.implementation.container.image = gcpc_image

    # NOTE(review): the route and port below come from the module constants
    # MAR_MODEL_NAME / PORT, not from the `mar_model_name` / `port` pipeline
    # parameters, so run-time overrides of those parameters are not applied here.
    upload_task = gcc_aip.ModelUploadOp(
        project=project,
        display_name=model_display_name,
        serving_container_image_uri=image_task.outputs["serving_container_uri"],
        serving_container_predict_route=f"/predictions/{MAR_MODEL_NAME}",
        serving_container_health_route="/ping",
        serving_container_ports=[{"containerPort": PORT}],
    )

    endpoint_task = gcc_aip.EndpointCreateOp(
        project=project,
        display_name=model_display_name,
    )

    # Deployment without an accelerator; to serve on GPU, also set
    # dedicated_resources_accelerator_type / dedicated_resources_accelerator_count.
    deploy_task = gcc_aip.ModelDeployOp(
        endpoint=endpoint_task.outputs["endpoint"],
        model=upload_task.outputs["model"],
        deployed_model_display_name=model_display_name,
        dedicated_resources_machine_type="n1-standard-32",
        dedicated_resources_min_replica_count=1,
        traffic_split={"0": 100},
    )



In [26]:
# Compile the pipeline function into the JSON job spec that the PipelineJob below is created from.
from kfp.v2 import compiler as v2compiler
v2compiler.Compiler().compile(pipeline_func=pytorch_cifar_pipeline,
                              package_path='pytorch_pipeline_spec.json')



In [27]:
# Create the pipeline run from the compiled spec.
# `project` and `location` are passed explicitly so the job is created in
# PROJECT_ID / REGION rather than in whatever defaults the SDK picks up from
# the environment (REGION was otherwise never applied).
job = aiplatform.PipelineJob(
    display_name=MODEL_NAME,
    template_path="pytorch_pipeline_spec.json",
    pipeline_root=PIPELINE_ROOT,
    parameter_values={
        "model_name": "resnet.pth", "max_epochs": 5,
        "project": PROJECT_ID, "model_display_name": MODEL_NAME,
        # Optional; TENSORBOARD_INSTANCE is not defined in this notebook.
        # "tensorboard_instance": TENSORBOARD_INSTANCE
    },
    project=PROJECT_ID,
    location=REGION,
)

# Submit without blocking the cell until the pipeline finishes.
job.run(sync=False)

Creating PipelineJob
PipelineJob created. Resource name: projects/278305018396/locations/us-central1/pipelineJobs/pytorch-cifar-pipeline-20220715023900
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/278305018396/locations/us-central1/pipelineJobs/pytorch-cifar-pipeline-20220715023900')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/pytorch-cifar-pipeline-20220715023900?project=278305018396
PipelineJob projects/278305018396/locations/us-central1/pipelineJobs/pytorch-cifar-pipeline-20220715023900 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/278305018396/locations/us-central1/pipelineJobs/pytorch-cifar-pipeline-20220715023900 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/278305018396/locations/us-central1/pipelineJobs/pytorch-cifar-pipeline-20220715023900 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/27830501

In [16]:
# ID of the endpoint queried by the prediction cell below.  <---CHANGE THIS
# NOTE(review): presumably copied by hand from the endpoint created by a pipeline run — confirm before reuse.
ENDPOINT_ID = '73139513479659520'

In [None]:
!gcloud ai endpoints predict {ENDPOINT_ID} --json-request=input.json

Please specify a region:
 [1] asia-east1
 [2] asia-east2
 [3] asia-northeast1
 [4] asia-northeast3
 [5] asia-south1
 [6] asia-southeast1
 [7] australia-southeast1
 [8] europe-west1
 [9] europe-west2
 [10] europe-west3
 [11] europe-west4
 [12] europe-west6
 [13] northamerica-northeast1
 [14] northamerica-northeast2
 [15] southamerica-east1
 [16] us-central1
 [17] us-east1
 [18] us-east4
 [19] us-west1
 [20] us-west2
 [21] us-west4
 [22] cancel
Please enter your numeric choice:  