In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Online Prediction PSC based private endpint

* [PSC on Vertex AI](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/get_started_with_psc_private_endpoint.ipynb)

### Configuration

In [2]:
! pip3 install --upgrade --user --quiet google-cloud-aiplatform

[0m

### Initialization

In [3]:
PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [4]:
# Create GCS Bucket
BUCKET_URI = "gs://20250211_psc_endpoint"  # @param {type:"string"}
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://20250211_psc_endpoint/...
ServiceException: 409 A Cloud Storage bucket named '20250211_psc_endpoint' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [5]:
from google.cloud import aiplatform
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

In [6]:
# @title Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

Using this default Service Account: 721521243942-compute@developer.gserviceaccount.com


### Upload and deploy model

In [11]:
# @title Set accelerator.
import datetime
now = datetime.datetime.now()

# Find Vertex AI prediction supported accelerators and regions [here](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute).
MACHINE_TYPE = "g2-standard-12" # @param {type:"string"}
ACCELERATOR_TYPE = "NVIDIA_L4" # @param {type:"string"}
ACCELERATOR_COUNT = 1 # @param {type:"string"}

MODEL_BUCKET_URI ="gs://sllm_0106/llama3.1_8b_inst" # @param {type:"string"}
VLLM_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference.cu121.0-5.ubuntu2204.py310" # @param {type:"string"}

MODEL_ID = "vLLM-Meta-Llama-3.1-8B-Instruct" # @param {type:"string"}
MODEL_DISPLAY_NAME = f"{MODEL_ID}-{now}"
ENDPOINT_PSC_NAME = f"{MODEL_ID}-psc-endpoint-{now}" # @param {type:"string"}

In [8]:
from typing import Tuple
from google.cloud import aiplatform

# See https://docs.vllm.ai/en/latest/serving/engine_args.html

vllm_args = [
    "python",
    "-m",
    "vllm.entrypoints.api_server",
    "--host=0.0.0.0",
    "--port=8080",
    f"--model={MODEL_ID}",
    f"--tensor-parallel-size={ACCELERATOR_COUNT}",
    "--swap-space=16",
    f"--gpu-memory-utilization=0.95",
    f"--max-model-len=8192",
    f"--dtype=auto",
    f"--max-loras=1",
    f"--max-cpu-loras=8",
    f"--max-num-seqs=256",
    "--disable-log-stats",
#     "--trust-remote-code",
#     "--enforce-eager",
#     "--enable-lora",
#     "--model-type=llama",
 ]

env_vars = {
    "MODEL_ID": MODEL_ID,
    "DEPLOY_SOURCE": "notebook",
}

In [9]:
# @title Model upload
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Model#google_cloud_aiplatform_Model_upload

model = aiplatform.Model.upload(
    display_name=MODEL_DISPLAY_NAME,
    artifact_uri=MODEL_BUCKET_URI,
    serving_container_image_uri=VLLM_DOCKER_URI,
    serving_container_args=vllm_args,
    serving_container_ports=[8080],
    # serving_container_predict_route="/generate",
    serving_container_predict_route="/v1/chat/completions",
    serving_container_health_route="/metrics",
    serving_container_environment_variables=env_vars,
    serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
    serving_container_deployment_timeout=7200,
)
print(
    f"Deploying {MODEL_DISPLAY_NAME} on {MACHINE_TYPE} with {ACCELERATOR_COUNT} {ACCELERATOR_TYPE} GPU(s)."
)

Creating Model
Create Model backing LRO: projects/721521243942/locations/us-central1/models/2705762876026519552/operations/1570212452547690496
Model created. Resource name: projects/721521243942/locations/us-central1/models/2705762876026519552@1
To use this Model in another session:
model = aiplatform.Model('projects/721521243942/locations/us-central1/models/2705762876026519552@1')
Deploying vLLM-Meta-Llama-3.1-8B-Instruct-2025-02-11 01:26:32.513605 on g2-standard-12 with 1 NVIDIA_L4 GPU(s).


In [12]:
psc_endpoint = aiplatform.PrivateEndpoint.create(
    display_name=ENDPOINT_PSC_NAME,
    project=PROJECT_ID,
    location=LOCATION,
    private_service_connect_config=aiplatform.PrivateEndpoint.PrivateServiceConnectConfig(
        project_allowlist=[PROJECT_ID],
    ),
)

Creating PrivateEndpoint
Create PrivateEndpoint backing LRO: projects/721521243942/locations/us-central1/endpoints/7111316852524974080/operations/1209924482358050816
PrivateEndpoint created. Resource name: projects/721521243942/locations/us-central1/endpoints/7111316852524974080
To use this PrivateEndpoint in another session:
endpoint = aiplatform.PrivateEndpoint('projects/721521243942/locations/us-central1/endpoints/7111316852524974080')


In [None]:
# endpoint = aiplatform.Endpoint.create(
#         display_name = ENDPOINT_DISPLAY_NAME,
#         dedicated_endpoint_enabled=False,
#     )

In [None]:
model.deploy(
    endpoint=psc_endpoint,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    deploy_request_timeout=1800,
    service_account=SERVICE_ACCOUNT,
)
print("endpoint_name:", psc_endpoint.name)

In [14]:
print("endpoint_name:", psc_endpoint.name)

endpoint_name: 7111316852524974080


In [15]:
psc_endpoint.list_models()

[id: "1439969905758699520"
 model: "projects/721521243942/locations/us-central1/models/2705762876026519552"
 display_name: "vLLM-Meta-Llama-3.1-8B-Instruct-2025-02-11 01:26:32.513605"
 create_time {
   seconds: 1739237451
   nanos: 149682000
 }
 dedicated_resources {
   machine_spec {
     machine_type: "g2-standard-12"
     accelerator_type: NVIDIA_L4
     accelerator_count: 1
   }
   min_replica_count: 1
   max_replica_count: 1
 }
 service_account: "721521243942-compute@developer.gserviceaccount.com"
 private_endpoints {
   service_attachment: "projects/p9573c9b9f79e4e12-tp/regions/us-central1/serviceAttachments/gkedpm-857e734e4abcae27fc309cb9fa21bc"
 }
 model_version_id: "1"
 status {
   available_replica_count: 1
 }]

In [16]:
service_attachment = psc_endpoint.list_models()[0].private_endpoints.service_attachment
print(service_attachment)

projects/p9573c9b9f79e4e12-tp/regions/us-central1/serviceAttachments/gkedpm-857e734e4abcae27fc309cb9fa21bc


In [None]:
#! gcloud compute forwarding-rules delete  op-psc-llm-endpoint
#! gcloud compute addresses delete psc-llm-prediction

In [20]:
! gcloud compute addresses create psc-llm-prediction \
    --region={LOCATION} \
    --subnet=default2

! gcloud compute forwarding-rules create op-psc-llm-endpoint \
    --network=default2 \
    --address=psc-llm-prediction \
    --target-service-attachment={service_attachment} \
    --region={LOCATION}

Created [https://www.googleapis.com/compute/v1/projects/ai-hangsik/regions/us-central1/addresses/psc-llm-prediction].
Created [https://www.googleapis.com/compute/v1/projects/ai-hangsik/regions/us-central1/forwardingRules/op-psc-llm-endpoint].


In [86]:
IP_ADDRESS = ! gcloud compute forwarding-rules describe op-psc-llm-endpoint --region={LOCATION} --format='value(IPAddress)'
IP_ADDRESS = IP_ADDRESS[0]
print(IP_ADDRESS)

10.128.0.7


### Request using PSC connection

In [88]:
from enum import Enum
from pydantic import BaseModel

class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"

class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType

# json_schema = CarDescription.model_json_schema()
json_schema = CarDescription.schema()


In [89]:

prompt = "Generate a JSON with the brand, model and car_type of the most iconic car of Hyundai from the 90's"

prediction_input = {
    "messages": [{
        "role": "user",
        "content": prompt,
    }],
    "guided_json": json_schema
}


In [87]:
ENDPOINT_ID = "7111316852524974080"

psc_endpoint = aiplatform.PrivateEndpoint(ENDPOINT_ID)
response = psc_endpoint.raw_predict(body=json.dumps(prediction_input, indent=2).encode('utf-8'), headers={'Content-Type':'application/json'}, 
                               endpoint_override=IP_ADDRESS)

print(json.loads(response.data)['choices'][0]['message']['content'])

{ "brand": "Ford", "model": "Mustang GT", "car_type": "Coupe" }


### Cleanup

In [None]:
psc_endpoint.undeploy_all()
psc_endpoint.delete()
model.delete()

In [None]:
! gcloud compute forwarding-rules delete op-psc-endpoint --region={LOCATION}  --quiet

! gcloud compute addresses delete psc-prediction --region={LOCATION} --quiet

Delete the bucket if needed.

In [None]:
! gsutil rm -r {BUCKET_URI}

Optionally, you can use the following command to clean up all private endpoint and models if needed.

In [None]:
for pe in aiplatform.PrivateEndpoint.list():
    pe.undeploy_all()
    pe.delete()