In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Custom container deploy using LocalModel

In [1]:
# Switch the kernel's working directory to the custom vLLM serving folder.
# NOTE(review): this is an absolute path on the original author's machine — adjust before re-running elsewhere.
%cd /home/jupyter/llmOps_vertexAI/llm_serving/vllm/custom

/home/jupyter/llmOps_vertexAI/llm_serving/vllm/custom


In [2]:
!pip install --upgrade --user --quiet google-cloud-aiplatform[prediction]

### Configuration

In [5]:
import datetime
import logging
import os

# Surface INFO-level log records in the notebook output.
logging.basicConfig(level=logging.INFO)

# Timestamp taken once at configuration time; it is embedded in MODEL_DISPLAY_NAME.
now = datetime.datetime.now()

# --- Google Cloud project ---
PROJECT_ID = "ai-hangsik"
PROJECT_NUMBER = "721521243942"
REGION = "us-central1"

# --- Model artifacts and naming ---
MODEL_ID = "vLLM-Meta-Llama-3.1-8B-Instruct" # @param {type:"string"}
MODEL_PATH = "gs://20250131_custom_handler/meta-llama/Llama-3.1-8B-Instruct"
MODEL_BUCKET_URI = "gs://sllm_0106/llama3.1_8b_inst" # @param {type:"string"}
MODEL_DISPLAY_NAME = f"{MODEL_ID}-{now}"
ENDPOINT_DISPLAY_NAME = f"{MODEL_ID}-endpoint" # @param {type:"string"}

# --- Serving container image ---
CONTAINER = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference.cu121.0-5.ubuntu2204.py310"

# --- Networking ---
# Name of the VPC network used for peering by the private endpoint.
VPC_NETWORK = "default2"

## Local Model

### Build LocalModel

In [120]:
# @title Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

Using this default Service Account: 721521243942-compute@developer.gserviceaccount.com


In [7]:
# @title Set accelerator.
# Find Vertex AI prediction supported accelerators and regions [here](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute).
MACHINE_TYPE = "g2-standard-12" # @param {type:"string"}
ACCELERATOR_TYPE = "NVIDIA_L4" # @param {type:"string"}
# Declared as an integer form field: the value is a number and is interpolated
# into the --tensor-parallel-size argument of the serving command.
ACCELERATOR_COUNT = 1 # @param {type:"integer"}

In [14]:
from typing import Tuple
from google.cloud import aiplatform

# Command executed inside the serving container.
# See https://docs.vllm.ai/en/latest/serving/engine_args.html for a list of possible arguments with descriptions.
vllm_args = [
    # Entrypoint: run the vLLM API server module.
    "python",
    "-m",
    "vllm.entrypoints.api_server",
    # Network binding; the port matches serving_container_ports of the LocalModel.
    "--host=0.0.0.0",
    "--port=8080",
    # Values taken from the notebook configuration.
    f"--model={MODEL_ID}",
    f"--tensor-parallel-size={ACCELERATOR_COUNT}",
    # Fixed engine settings.
    "--swap-space=16",
    "--gpu-memory-utilization=0.95",
    "--max-model-len=8192",
    "--dtype=auto",
    "--max-loras=1",
    "--max-cpu-loras=8",
    "--max-num-seqs=256",
    "--disable-log-stats",
    # Optional flags, currently disabled:
    # "--trust-remote-code",
    # "--enforce-eager",
    # "--enable-lora",
    # "--model-type=llama",
]

# Environment variables passed to the serving container.
env_vars = {
    "MODEL_ID": MODEL_ID,
    "DEPLOY_SOURCE": "notebook",
    "VERTEX_CPR_MAX_WORKERS": "1",
    "RUST_BACKTRACE": "full",  # for stack trace printing
    "CUDA_MEMORY_FRACTION": "0.93",
}

In [58]:

# Must secure sufficient space on the local machine before building/running the model.
from google.cloud.aiplatform.prediction import LocalModel

# Describe the serving container: image, command, port, routes, environment and limits.
local_model = LocalModel(
    serving_container_image_uri=CONTAINER,
    serving_container_args=vllm_args,
    serving_container_environment_variables=env_vars,
    serving_container_ports=[8080],
    # Alternative predict route: "/generate"
    serving_container_predict_route="/v1/chat/completions",
    serving_container_health_route="/metrics",
    serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
    serving_container_deployment_timeout=7200,
)

In [59]:
# Display the container spec (image, args, env, ports, routes, timeouts) built from the LocalModel arguments.
local_model.get_serving_container_spec()

image_uri: "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference.cu121.0-5.ubuntu2204.py310"
args: "python"
args: "-m"
args: "vllm.entrypoints.api_server"
args: "--host=0.0.0.0"
args: "--port=8080"
args: "--model=vLLM-Meta-Llama-3.1-8B-Instruct"
args: "--tensor-parallel-size=1"
args: "--swap-space=16"
args: "--gpu-memory-utilization=0.95"
args: "--max-model-len=8192"
args: "--dtype=auto"
args: "--max-loras=1"
args: "--max-cpu-loras=8"
args: "--max-num-seqs=256"
args: "--disable-log-stats"
env {
  name: "MODEL_ID"
  value: "vLLM-Meta-Llama-3.1-8B-Instruct"
}
env {
  name: "DEPLOY_SOURCE"
  value: "notebook"
}
env {
  name: "VERTEX_CPR_MAX_WORKERS"
  value: "1"
}
env {
  name: "RUST_BACKTRACE"
  value: "full"
}
env {
  name: "CUDA_MEMORY_FRACTION"
  value: "0.93"
}
ports {
  container_port: 8080
}
predict_route: "/v1/chat/completions"
health_route: "/metrics"
deployment_timeout {
  seconds: 7200
}
shared_memory_size_mb: 16384

### Deploy to local endpoint

In [64]:
import json

# Manual deploy and test: start the serving container locally with the model
# artifacts from MODEL_PATH, then begin serving.
local_endpoint = local_model.deploy_to_local_endpoint(
    artifact_uri=MODEL_PATH,
    gpu_count=-1,
    container_ready_timeout=600,
)
local_endpoint.serve()


In [65]:
# Print the logs of the local serving container to check how startup went.
local_endpoint.print_container_logs()

### Test request

In [76]:
from enum import Enum
from pydantic import BaseModel

# NOTE(review): these classes are documented with comments rather than docstrings on
# purpose — pydantic may copy class docstrings into the generated JSON schema, which
# would change the request payload. Confirm before adding docstrings.

# Allowed values for the `car_type` field of the structured output.
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"

# Shape of the JSON object the model is asked to generate.
class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType

# JSON schema derived from CarDescription; sent as `guided_json` in the request payload.
json_schema = CarDescription.model_json_schema()

In [78]:

# User prompt for the structured-output test.
prompt = "Generate a JSON with the brand, model and car_type of the most iconic car of Hyundai from the 90's"

# Chat-style request body: a single user message plus the JSON schema
# that the generated output has to follow.
prediction_input = dict(
    messages=[dict(role="user", content=prompt)],
    guided_json=json_schema,
)

In [86]:
# Send the JSON payload to the local endpoint's predict route.
response = local_endpoint.predict(
    headers={"Content-Type": "application/json"},
    request=json.dumps(prediction_input, indent=2).encode("utf-8"),
)
# Print the text of the first choice from the chat-completion response.
print(response.json()["choices"][0]["message"]["content"])

{ "brand": "Hyundai", "model": "Elantra", "car_type": "SUV" }


In [90]:
# Reference:
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.prediction.LocalEndpoint#google_cloud_aiplatform_prediction_LocalEndpoint_get_container_status

# First line: response of the health-route check.
print(
    f"Status : {local_endpoint.run_health_check(verbose=True)}"
)
# Second line: status reported for the container itself.
print(
    f"Status : {local_endpoint.get_container_status()}"
)

Status : <Response [200]>
Status : running


### Stop Local Endpoint

In [91]:
# Stop the local endpoint, then print the container logs for a final check.
local_endpoint.stop()
local_endpoint.print_container_logs()

## Model Upload to Vertex AI

### Container push to registry

In [None]:
# Optional step, left disabled: authenticate Docker against Artifact Registry and
# push the LocalModel's serving image. CONTAINER above already points at a registry image.
# !gcloud auth configure-docker us-central1-docker.pkg.dev --quiet
# local_model.push_image()

### Upload model to Vertex AI

In [94]:
from google.cloud import aiplatform

# Register the model in Vertex AI, reusing the container spec of the LocalModel
# and the model artifacts stored at MODEL_PATH.
model = aiplatform.Model.upload(
    local_model=local_model,
    artifact_uri=MODEL_PATH,
    display_name=MODEL_ID,
)

## Public endpoint

### Create public endpoint

In [95]:
# Create the public endpoint. The dedicated-endpoint option is kept below but disabled.
from google.cloud import aiplatform

endpoint = aiplatform.Endpoint.create(
    labels={"sample-key": "sample-value"},
    display_name=f"{MODEL_ID} proxy public test endpoint",
    # dedicated_endpoint_enabled=True,
)

### Model deploy on Vertex AI

In [None]:
# Deploy the uploaded model to the public endpoint.
# The machine shape comes from the "Set accelerator" cell, so the number of attached
# GPUs matches the --tensor-parallel-size value that vllm_args was built with.
endpoint.deploy(
    model=model,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    # Alternative machine shape:
    # machine_type="a2-highgpu-1g",
    # accelerator_type="NVIDIA_TESLA_A100",
    min_replica_count=1,
    max_replica_count=1,
    service_account=SERVICE_ACCOUNT,
    # Optional traffic settings:
    # traffic_percentage=50
    # traffic_split={'a':50, 'b':50}
)

### Test the public endpoint

In [100]:
# Public endpoint predict: send the same chat-completion payload through raw_predict.
from google.cloud import aiplatform

ENDPOINT_ID = "2161931230788976640"
endpoint = aiplatform.Endpoint(ENDPOINT_ID)

response = endpoint.raw_predict(
    headers={"Content-Type": "application/json"},
    body=json.dumps(prediction_input, indent=2).encode("utf-8"),
)
# Print the text of the first choice from the chat-completion response.
print(response.json()["choices"][0]["message"]["content"])

{ "brand": "Hyundai", "model": "Elantra", "car_type": "SUV" }


In [105]:
# Public endpoint health: fetch the endpoint resource through the Vertex AI REST API.
import google.auth
# Import the transport submodule explicitly: `import google.auth` on its own does not
# load `google.auth.transport.requests`, so the attribute lookup below would otherwise
# only work if some other import had already loaded it.
import google.auth.transport.requests
import requests

# Application Default Credentials; refresh() populates creds.token.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

PROJECT_NUMBER = "721521243942"
ENDPOINT_ID = "8123993449985736704"

url = f"https://us-central1-aiplatform.googleapis.com/v1/projects/{PROJECT_NUMBER}/locations/us-central1/endpoints/{ENDPOINT_ID}"

headers = {'Authorization': f'Bearer {creds.token}'}
# A timeout keeps the cell from hanging indefinitely if the API does not answer.
response = requests.get(url, headers=headers, timeout=60)
print(response.text)

{
  "name": "projects/721521243942/locations/us-central1/endpoints/8123993449985736704",
  "displayName": "Llama-3.1-8B-Instruct-TGI proxy private test endpoint2",
  "deployedModels": [
    {
      "id": "8284245070710833152",
      "model": "projects/721521243942/locations/us-central1/models/8490284753663033344",
      "displayName": "Llama-3.1-8B-Instruct-TGI",
      "createTime": "2025-02-01T23:57:27.258150Z",
      "dedicatedResources": {
        "machineSpec": {
          "machineType": "a2-highgpu-1g",
          "acceleratorType": "NVIDIA_TESLA_A100",
          "acceleratorCount": 1
        },
        "minReplicaCount": 1,
        "maxReplicaCount": 1
      },
      "privateEndpoints": {
        "predictHttpUri": "http://8123993449985736704.aiplatform.googleapis.com/v1/models/8284245070710833152:predict",
        "healthHttpUri": "http://8123993449985736704.aiplatform.googleapis.com/v1/models/8284245070710833152"
      },
      "modelVersionId": "1"
    }
  ],
  "etag": "AMEw9yOS

## Private endpoint

### Create private endpoint

In [111]:
# Point the gcloud CLI at the project used by this notebook so that the
# networking commands in the following cells run against it.
PROJECT_ID="ai-hangsik"
!gcloud config set project $PROJECT_ID

Updated property [core/project].


In [112]:
# Name of the reserved IP range that is created and then peered in the next cells.
PEERING_RANGE_NAME="google-reserved-range2"

In [None]:
# Reserve a global /16 address range in VPC_NETWORK with purpose VPC_PEERING.
!gcloud compute addresses create $PEERING_RANGE_NAME \
  --global \
  --prefix-length=16 \
  --description="peering range for Google service" \
  --network=$VPC_NETWORK \
  --purpose=VPC_PEERING

In [114]:
# Connect VPC_NETWORK to servicenetworking.googleapis.com using the reserved range.
!gcloud services vpc-peerings connect \
  --service=servicenetworking.googleapis.com \
  --network=$VPC_NETWORK \
  --ranges=$PEERING_RANGE_NAME \
  --project=$PROJECT_ID

Operation "operations/pssn.p24-721521243942-05dd38b9-5c9c-4d57-a15c-8fb64733cb67" finished successfully.


In [117]:
# Create a private endpoint attached to the peered VPC network.
# Refer to https://cloud.google.com/vertex-ai/docs/predictions/using-private-endpoints
from google.cloud import aiplatform

endpoint = aiplatform.PrivateEndpoint.create(
    labels={"sample-key": "sample-value"},
    network=f"projects/{PROJECT_NUMBER}/global/networks/{VPC_NETWORK}",
    display_name=f"{MODEL_ID} proxy private test endpoint2",
)

### Deploy private endpoint

In [None]:
# Deploy the uploaded model to the private endpoint.
# C3, L4 and TPU are not allowed for private endpoints, hence the A100 machine shape.
# Refer to https://cloud.google.com/vertex-ai/docs/predictions/configure-compute
endpoint.deploy(
    model=model,
    machine_type="a2-highgpu-1g",
    accelerator_type="NVIDIA_TESLA_A100",
    accelerator_count=1,
    min_replica_count=1,
    max_replica_count=1,
    # Optional settings, currently disabled:
    # service_account=SERVICE_ACCOUNT
    # traffic_percentage=50
    # traffic_split={'a':50, 'b':50}
)

### Test private endpoint

In [5]:
# Private endpoint prediction. The serving container does not take the
# `instances=` payload format, so the request goes through raw_predict.
from google.cloud import aiplatform
import json

ENDPOINT_ID = "8123993449985736704"
endpoint = aiplatform.PrivateEndpoint(ENDPOINT_ID)

response = endpoint.raw_predict(
    headers={"Content-Type": "application/json"},
    body=json.dumps(prediction_input, indent=2).encode("utf-8"),
)
# Show the raw response bytes as the cell output.
response.data

b'{"generated_text":"{ \\"activity\\": \\"bike\\", \\"animals\\": [\\"puppy\\",\\"cat\\" ], \\"animals_seen\\": 3, \\"location\\": \\"park\\"}"}'

In [102]:
# Private endpoint health: call the endpoint's health URI.
import google.auth
# Import the transport submodule explicitly: `import google.auth` on its own does not
# load `google.auth.transport.requests`, so the attribute lookup below would otherwise
# only work if some other import had already loaded it.
import google.auth.transport.requests
import requests

# Application Default Credentials; refresh() populates creds.token.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

headers = {'Authorization': f'Bearer {creds.token}'}
# A timeout keeps the cell from hanging indefinitely if the health URI is not
# reachable from the current network.
response = requests.get(endpoint.health_http_uri, headers=headers, timeout=60)
print(response.text)


