In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Online Prediction with a PSC-based private endpoint

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/get_started_with_psc_private_endpoint.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fprediction%2Fget_started_with_psc_private_endpoint.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/prediction/get_started_with_psc_private_endpoint.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/get_started_with_psc_private_endpoint.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

Compared to the current PSA (private services access) based Private Endpoint, the PSC (Private Service Connect) based Private Endpoint has the following benefits:
1. Simpler setup process: Currently, the only extra step users need to take is to create an endpoint in their VPC. This step will be done by PSC automatically before our GA launch.

2. No more IP exhaustion issues: The GKE cluster is hosted in the tenant project's VPC, so we can create much bigger clusters that aren't affected by IP exhaustion in the user's VPC.

3. Unified experience with public endpoints: The API is the same as for public endpoints, so users can use our SDK/client libraries. We also provide quota, IAM, and monitoring metrics, as public endpoints do.


## Get started

### Install Vertex AI SDK for Python and other required packages

In [2]:
! pip3 install --upgrade --user --quiet google-cloud-aiplatform

[0m

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Colab only: shut the kernel down with restart=True so that the packages
# installed above are picked up. Outside Colab this cell does nothing.
if "google.colab" in sys.modules:
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Colab only: run the interactive Google authentication flow.
# In any other environment this cell is a no-op.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [3]:
# Google Cloud project and region used by the cells below.
# Replace these values with your own project ID and preferred region.
PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [4]:
# Create GCS Bucket
# Bucket names must be globally unique, so change BUCKET_URI to a name of your
# own; `gsutil mb` fails with a 409 error if the name is already taken.
BUCKET_URI = "gs://20250211_psc_endpoint"  # @param {type:"string"}
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://20250211_psc_endpoint/...
ServiceException: 409 A Cloud Storage bucket named '20250211_psc_endpoint' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [5]:
from google.cloud import aiplatform

# Initialize the Vertex AI SDK with the project, region, and staging bucket
# configured in the cells above.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

## Prepare Test Models

We prepared some test models; feel free to use your own models instead.

In [6]:
# Copy Models to the Bucket
# Recursively (-r) copies everything under the public test-models-requests
# folder into the bucket. Later cells read the model artifacts (for example
# {BUCKET_URI}/tensorflow) and the request files ({BUCKET_URI}/requests) from there.
! gsutil cp -r "gs://cloud-samples-data/vertex-ai/prediction/test-models-requests/*" {BUCKET_URI}

Copying gs://cloud-samples-data/vertex-ai/prediction/test-models-requests/churn/assets/country.txt [Content-Type=application/octet-stream]...
Copying gs://cloud-samples-data/vertex-ai/prediction/test-models-requests/churn/assets/language.txt [Content-Type=application/octet-stream]...
Copying gs://cloud-samples-data/vertex-ai/prediction/test-models-requests/churn/assets/operating_system.txt [Content-Type=application/octet-stream]...
/ [3 files][  2.0 KiB/  2.0 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://cloud-samples-data/vertex-ai/prediction/test-models-requests/churn/assets/user_pseudo_id.txt [Content-Type=application/octet-stream]...
Copying gs://cloud-samples-data/vertex-ai/prediction/test-models-requests/churn/ex

### Upload Model

In [7]:
# Choose ONE model: leave its section uncommented, comment out the others,
# and then run this cell. Each section sets the display name, the artifact
# location in the bucket, the serving container image, and the request file.

# TensorFlow model
DISPLAY_NAME = "tensorflow model"  # @param {type:"string"}
ARTIFACT_URI = f"{BUCKET_URI}/tensorflow"
IMAGE_URI = "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-12:latest"
REQUEST_FILE = "tensorflow_request.json"


# PyTorch model
# DISPLAY_NAME = "Pytorch model"
# ARTIFACT_URI = f"{BUCKET_URI}/pytorch"
# IMAGE_URI = "us-docker.pkg.dev/vertex-ai/prediction/pytorch-cpu.2-0:latest"
# REQUEST_FILE = "pytorch_request.json"


# scikit-learn model
# DISPLAY_NAME = "Sklearn model"
# ARTIFACT_URI = f"{BUCKET_URI}/sklearn"
# IMAGE_URI = "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-2:latest"
# REQUEST_FILE = "sklearn_request.json"


# XGBoost model
# DISPLAY_NAME = "xgboost model"
# ARTIFACT_URI = f"{BUCKET_URI}/xgboost"
# IMAGE_URI = "us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-7:latest"
# REQUEST_FILE = "xgboost_request.json"

In [8]:
# Register the model selected above: its artifacts in the bucket plus the
# serving container image to use for it.
# sync=False starts the upload asynchronously; model.wait() below then blocks
# until the upload has finished, so later cells can rely on `model`.
model = aiplatform.Model.upload(
    display_name=DISPLAY_NAME,
    artifact_uri=ARTIFACT_URI,
    serving_container_image_uri=IMAGE_URI,
    sync=False,
)

model.wait()

Creating Model
Create Model backing LRO: projects/721521243942/locations/us-central1/models/3752849789390159872/operations/565909735644069888
Model created. Resource name: projects/721521243942/locations/us-central1/models/3752849789390159872@1
To use this Model in another session:
model = aiplatform.Model('projects/721521243942/locations/us-central1/models/3752849789390159872@1')


### Create PSC based Prediction Private Endpoint


In [9]:
# Private Service Connect settings: only the projects in the allowlist
# (here, just this project) are listed as allowed for the endpoint.
psc_config = aiplatform.PrivateEndpoint.PrivateServiceConnectConfig(
    project_allowlist=[PROJECT_ID],
)

# Create the PSC-based private endpoint in the configured project and region.
psc_endpoint = aiplatform.PrivateEndpoint.create(
    display_name="psc-endpoint",
    project=PROJECT_ID,
    location=LOCATION,
    private_service_connect_config=psc_config,
)

Creating PrivateEndpoint
Create PrivateEndpoint backing LRO: projects/721521243942/locations/us-central1/endpoints/3843955322867679232/operations/561406136016699392
PrivateEndpoint created. Resource name: projects/721521243942/locations/us-central1/endpoints/3843955322867679232
To use this PrivateEndpoint in another session:
endpoint = aiplatform.PrivateEndpoint('projects/721521243942/locations/us-central1/endpoints/3843955322867679232')


Alternatively, send an HTTP request to create the endpoint. You need to manually replace ALL the variables below.

In [None]:
# Template only (kept commented out): REST alternative to PrivateEndpoint.create().
# It POSTs an endpoint definition with Private Service Connect enabled and a
# project allowlist to the regional aiplatform.googleapis.com endpoint.
# NOTE(review): the placeholders use mixed styles (${LOCATION} vs {LOCATION});
# replace every one of them by hand with real values before running.
# ! curl -X POST -H "Content-Type: application/json" -H "Authorization: Bearer `gcloud auth print-access-token`" https://${LOCATION}-aiplatform.googleapis.com/v1/projects/${PROJECT_ID}/locations/{LOCATION}/endpoints -d \
# '{ \
#     displayName: "psc-endpoint", \
#     privateServiceConnectConfig: { \
#       enablePrivateServiceConnect: true, \
#       projectAllowlist: ["{PROJECT_ID}"] \
#     }, \
# }'

### Deploy Model

In [10]:
# Deploy the uploaded model to the private endpoint and route all traffic to it.
psc_endpoint.deploy(
    model=model,
    traffic_percentage=100,
    machine_type="e2-standard-8",
)

# Last expression of the cell: show the models now deployed on the endpoint.
psc_endpoint.list_models()

Deploying Model projects/721521243942/locations/us-central1/models/3752849789390159872 to PrivateEndpoint : projects/721521243942/locations/us-central1/endpoints/3843955322867679232
Deploy PrivateEndpoint model backing LRO: projects/721521243942/locations/us-central1/endpoints/3843955322867679232/operations/3189256518587383808
PrivateEndpoint model deployed. Resource name: projects/721521243942/locations/us-central1/endpoints/3843955322867679232


[id: "3250416955961638912"
 model: "projects/721521243942/locations/us-central1/models/3752849789390159872"
 display_name: "tensorflow model"
 create_time {
   seconds: 1739236172
   nanos: 649325000
 }
 dedicated_resources {
   machine_spec {
     machine_type: "e2-standard-8"
   }
   min_replica_count: 1
   max_replica_count: 1
 }
 private_endpoints {
   service_attachment: "projects/p9573c9b9f79e4e12-tp/regions/us-central1/serviceAttachments/gkedpm-d0ab76b2a55711bace1541472ae116"
 }
 model_version_id: "1"
 status {
   available_replica_count: 1
 }]

### Create Forwarding Rule in Consumer Project

First, find the service attachment from the endpoint and deployed model.

In [11]:
# Read the service attachment from the first deployed model on the endpoint;
# the forwarding rule created below targets it.
first_deployed_model = psc_endpoint.list_models()[0]
service_attachment = first_deployed_model.private_endpoints.service_attachment
print(service_attachment)

projects/p9573c9b9f79e4e12-tp/regions/us-central1/serviceAttachments/gkedpm-d0ab76b2a55711bace1541472ae116


Then, create an address and a forwarding rule that targets the service attachment. This example uses a network and subnet named `default2`; replace them with your own VPC network and subnet when running in your VPC.

In [19]:
# Optional reset (kept commented out): uncomment to delete a forwarding rule and
# address left over from a previous run before recreating them below.
# As written, no --region is passed, so gcloud asks interactively for the region.
#! gcloud compute forwarding-rules delete  op-psc-endpoint
#! gcloud compute addresses delete psc-prediction

Did you mean region [us-central1] for address: [psc-prediction] (Y/n)?  ^C


Command killed by keyboard interrupt



In [20]:
# Reserve an address named "psc-prediction" in the subnet, then create the
# forwarding rule "op-psc-endpoint" that sends traffic for that address to the
# endpoint's service attachment. "default2" is the subnet/network used in this
# example; replace both with the subnet and network of your own VPC.
! gcloud compute addresses create psc-prediction \
    --region={LOCATION} \
    --subnet=default2

! gcloud compute forwarding-rules create op-psc-endpoint \
    --network=default2 \
    --address=psc-prediction \
    --target-service-attachment={service_attachment} \
    --region={LOCATION}

Created [https://www.googleapis.com/compute/v1/projects/ai-hangsik/regions/us-central1/addresses/psc-prediction].
Created [https://www.googleapis.com/compute/v1/projects/ai-hangsik/regions/us-central1/forwardingRules/op-psc-endpoint].


Save the IP address above.

In [21]:
IP_ADDRESS = ! gcloud compute forwarding-rules describe op-psc-endpoint --region={LOCATION} --format='value(IPAddress)'
IP_ADDRESS = IP_ADDRESS[0]
print(IP_ADDRESS)

10.128.0.5


## Make Predictions

From this point on, all the code below must be run from a GCP VM in the same VPC and the same region as your PSC endpoint.

If you're using Vertex AI Workbench or Colab Enterprise, you should be good.

If you're creating a GCE VM, please make sure Cloud Platform access scope is enabled.

In [16]:
# Download the requests files:
# copies every object under {BUCKET_URI}/requests/ into the current working
# directory, which is where the prediction cells open REQUEST_FILE from.
! gsutil cp {BUCKET_URI}/requests/* ./

Copying gs://20250211_psc_endpoint/requests/pytorch_request.json...
Copying gs://20250211_psc_endpoint/requests/sklearn_request.json...             
Copying gs://20250211_psc_endpoint/requests/tensorflow_request.json...          
Copying gs://20250211_psc_endpoint/requests/vision_small_request.json...        
- [4 files][ 16.8 KiB/ 16.8 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://20250211_psc_endpoint/requests/xgboost_request.json...
- [5 files][ 16.9 KiB/ 16.9 KiB]                                                
Operation completed over 5 objects/16.9 KiB.                                     


In [26]:
import os

# The prediction is skipped when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    import json

    import urllib3

    # The request is sent to the forwarding rule's raw IP address, so silence
    # urllib3's InsecureRequestWarning for these calls.
    # NOTE(review): presumably certificate verification is skipped when
    # endpoint_override is an IP address - confirm against the SDK.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Load the sample request body that matches the selected model.
    with open(REQUEST_FILE) as json_file:
        data = json.load(json_file)

    # Send the instances through the PSC forwarding rule instead of the
    # public API host.
    response = psc_endpoint.predict(
        instances=data["instances"],
        endpoint_override=IP_ADDRESS,
    )
    print(response)

Prediction(predictions=[[-357.108429], [-171.621658]], deployed_model_id='3250416955961638912', metadata=None, model_version_id='1', model_resource_name='projects/721521243942/locations/us-central1/models/3752849789390159872', explanations=None)


### Predict Requests

Alternatively, you can send HTTP requests directly to the IP address. Make sure to replace all variables in the requests.

In [27]:
# Full resource name of the endpoint (projects/.../locations/.../endpoints/...);
# it is interpolated into the request URL of the curl commands in the next cell.
ENDPOINT_RESOURCE_NAME = psc_endpoint.resource_name
ENDPOINT_RESOURCE_NAME

'projects/721521243942/locations/us-central1/endpoints/3843955322867679232'

In [28]:
import os

if not os.getenv("IS_TESTING"):
    # Predict
    # ! curl --insecure -H "Content-Type: application/json" -H "Authorization: Bearer `gcloud auth print-access-token`"  https://{IP_ADDRESS}/v1/{ENDPOINT_RESOURCE_NAME}:predict -d@{REQUEST_FILE}

    # # RawPredict
    ! curl -v --insecure -H "Content-Type: application/json" -H "Authorization: Bearer `gcloud auth print-access-token`" https://{IP_ADDRESS}/v1/{ENDPOINT_RESOURCE_NAME}:rawPredict -d@{REQUEST_FILE}

*   Trying 10.128.0.5:443...
* Connected to 10.128.0.5 (10.128.0.5) port 443 (#0)
* ALPN, offering h2
* ALPN, offering http/1.1
* successfully set certificate verify locations:
*  CAfile: /etc/ssl/certs/ca-certificates.crt
*  CApath: /etc/ssl/certs
* TLSv1.3 (OUT), TLS handshake, Client hello (1):
* TLSv1.3 (IN), TLS handshake, Server hello (2):
* TLSv1.3 (OUT), TLS change cipher, Change cipher spec (1):
* TLSv1.3 (OUT), TLS handshake, Client hello (1):
* TLSv1.3 (IN), TLS handshake, Server hello (2):
* TLSv1.3 (IN), TLS handshake, Encrypted Extensions (8):
* TLSv1.3 (IN), TLS handshake, Certificate (11):
* TLSv1.3 (IN), TLS handshake, CERT verify (15):
* TLSv1.3 (IN), TLS handshake, Finished (20):
* TLSv1.3 (OUT), TLS handshake, Finished (20):
* SSL connection using TLSv1.3 / TLS_AES_256_GCM_SHA384
* ALPN, server accepted to use h2
* Server certificate:
*  subject: C=US; ST=CA; L=Sunnyvale; O=Google LLC; CN=aiplatform.googleapis.com
*  start date: Nov 20 06:20:58 2023 GMT
*  expire da

### Deploy another model and update traffic split

Deploy another model and update the traffic split to 50:50. After the deployment is done, rerun the prediction multiple times; you should see that the returned `deployed_model_id` values differ between requests.

In [None]:
# Deploy the same `model` object a second time and give this new deployment
# 50% of the traffic.
psc_endpoint.deploy(
    model=model,
    traffic_percentage=50,
    machine_type="e2-standard-8",
)

In [None]:
import os

# Send 1000 predictions and tally which deployed model answered each one.
# Skipped when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    import json

    import urllib3

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Maps deployed_model_id -> number of responses served by that deployment.
    counter = {}
    with open(REQUEST_FILE) as json_file:
        data = json.load(json_file)

    for _ in range(1000):
        response = psc_endpoint.predict(
            instances=data["instances"], endpoint_override=IP_ADDRESS
        )
        served_by = response.deployed_model_id
        counter[served_by] = counter.get(served_by, 0) + 1
    print(counter)

You can update the traffic split with the following command and run the code above again.

In [None]:
import os

# Re-balance traffic between the two deployments seen by the counting cell.
# Requires `counter` from that cell to contain at least two deployed model IDs.
if not os.getenv("IS_TESTING"):
    observed_ids = list(counter)
    deployed_model_id_0 = observed_ids[0]
    deployed_model_id_1 = observed_ids[1]

    # 20% to the first observed deployment, 80% to the second.
    new_split = {deployed_model_id_0: 20, deployed_model_id_1: 80}
    psc_endpoint.update(traffic_split=new_split)

## Cleanup

In [None]:
# Tear down in order: undeploy everything from the endpoint, delete the
# endpoint itself, and finally delete the uploaded model.
psc_endpoint.undeploy_all()
psc_endpoint.delete()
model.delete()

In [None]:
# Remove the networking resources created earlier: first the forwarding rule,
# then the address it was using. --quiet disables gcloud's interactive prompts.
! gcloud compute forwarding-rules delete op-psc-endpoint --region={LOCATION}  --quiet

! gcloud compute addresses delete psc-prediction --region={LOCATION} --quiet

Delete the bucket if needed.

In [None]:
# WARNING: recursively removes everything under BUCKET_URI. Run this only if
# nothing else depends on the contents of the bucket.
! gsutil rm -r {BUCKET_URI}

Optionally, you can use the following code to undeploy all models from, and then delete, all private endpoints if needed.

In [None]:
# WARNING: this loops over EVERY private endpoint returned by
# PrivateEndpoint.list(), not just the one created in this notebook.
# Each endpoint has all of its models undeployed and is then deleted.
for private_endpoint in aiplatform.PrivateEndpoint.list():
    private_endpoint.undeploy_all()
    private_endpoint.delete()