In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Custom container deploy using LocalModel

In [4]:
%cd /home/jupyter/llmOps_vertexAI/llm_serving/tgi/custom

/home/jupyter/llmOps_vertexAI/llm_serving/tgi/custom


In [5]:
!pip install --upgrade --user --quiet google-cloud-aiplatform[prediction]

### Configuration

In [7]:
import os
import logging
logging.basicConfig(level=logging.INFO)

PROJECT_ID = "ai-hangsik"
REGION = "us-central1"
PROJECT_NUMBER = "721521243942"
MODEL_PATH = "gs://20250131_custom_handler/meta-llama/Llama-3.1-8B-Instruct"
VPC_NETWORK = "default2" #vpc network name to peering

### Build Docker

In [8]:
CONTAINER = f"us-central1-docker.pkg.dev/{PROJECT_ID}/custom-inference-gpu/tgi-release:latest"
!docker build -f Dockerfile.TGI -t {CONTAINER} .

MODEL_NAME = "Llama-3.1-8B-Instruct-TGI"

Sending build context to Docker daemon  365.1kB
Step 1/4 : FROM us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-4.ubuntu2204.py311
 ---> 9e59d29d5e8a
Step 2/4 : COPY entrypoint.sh entrypoint.sh
 ---> 0026d0f97a41
Step 3/4 : RUN chmod -R 775 entrypoint.sh
 ---> Running in 5a69e3017ba3
Removing intermediate container 5a69e3017ba3
 ---> db60eb8c6268
Step 4/4 : ENTRYPOINT ["./entrypoint.sh"]
 ---> Running in f3d74831d628
Removing intermediate container f3d74831d628
 ---> c926fe3a73f7
Successfully built c926fe3a73f7
Successfully tagged us-central1-docker.pkg.dev/ai-hangsik/custom-inference-gpu/tgi-release:latest


## Local Model

### Build LocalModel

In [9]:

#must secure sufficient space
from google.cloud.aiplatform.prediction import LocalModel
local_model = LocalModel(serving_container_image_uri=CONTAINER,
                         serving_container_environment_variables={
                             "VERTEX_CPR_MAX_WORKERS": "1",
                             "RUST_BACKTRACE": "full", #for stack trace printing,
                             "CUDA_MEMORY_FRACTION": "0.93",
                             #"AIP_PREDICT_ROUTE": "/generate",
                             #"AIP_HEALTH_ROUTE": "/metrics"
                             #"MODEL_ID": f"meta-llama/{MODEL_NAME}"
                             #"PORT": "5000", #server runs on 5000, or 8080 by dafault
                         },
                         #serving_container_ports=[5000], #expose container port, system map is random
                         serving_container_health_route="/metrics",
                         serving_container_predict_route="/generate",
                         serving_container_args=["--num-shard 1"], #We can use both serving_container_environment_variables and serving_container_args
                        )

In [10]:
local_model.get_serving_container_spec()

image_uri: "us-central1-docker.pkg.dev/ai-hangsik/custom-inference-gpu/tgi-release:latest"
args: "--num-shard 1"
env {
  name: "VERTEX_CPR_MAX_WORKERS"
  value: "1"
}
env {
  name: "RUST_BACKTRACE"
  value: "full"
}
env {
  name: "CUDA_MEMORY_FRACTION"
  value: "0.93"
}
predict_route: "/generate"
health_route: "/metrics"

### Deploy to local endpoint

In [11]:
import json
#Manual deploy and test
local_endpoint = local_model.deploy_to_local_endpoint(
    artifact_uri=MODEL_PATH,
    gpu_count=-1,
    container_ready_timeout = 600)
local_endpoint.serve()
local_endpoint.print_container_logs()

### Input data

In [29]:
from pydantic import BaseModel, conint
from typing import List

class Animals(BaseModel):
    location: str
    activity: str
    animals_seen: conint(ge=1, le=5)  # Constrained integer type
    animals: List[str]
    
prompt = "convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park"

prediction_input = {
    "inputs": prompt,
    "parameters": {
        "repetition_penalty": 1.3,
        "grammar": {
            "type": "json",
            "value": Animals.model_json_schema()
        }
    }
}

### Predict to local endpoint

In [None]:
#Run some code here to test monitor
predict_response = local_endpoint.predict(
        request=json.dumps(prediction_input),
        headers={"Content-Type": "application/json"},
    )
print(predict_response.text)
local_endpoint.print_container_logs()

In [None]:
local_endpoint.run_health_check().text

### Stop Local Endpoint

In [16]:
local_endpoint.stop()
local_endpoint.print_container_logs()

## Model Upload to Vertex AI

### Container push to registry

In [17]:
!gcloud auth configure-docker us-central1-docker.pkg.dev --quiet
local_model.push_image()


{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud",
    "us-central1-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
gcloud credential helpers already registered correctly.


  self.stdin = io.open(p2cwrite, 'wb', bufsize)
  self.stdout = io.open(c2pread, 'rb', bufsize)


### Upload model to Vertex AI

In [18]:
from google.cloud import aiplatform
model = aiplatform.Model.upload(
    display_name = MODEL_NAME,
    local_model = local_model,
    artifact_uri = MODEL_PATH,
    #parent_model = prev_model.resource_name,
    #is_default_version=True,
    #serving_container_environment_variables={
    #    "VERTEX_CPR_MAX_WORKERS": "1",
    #    "PORT": "8080", #server runs on 5000, or 8080 by dafault
    #    "RUST_BACKTRACE": "full", #for stack trace printing,
    #},
    #serving_container_ports=[8080],
    #serving_container_args = ["--num-shard 1"]
)

## Public endpoint

### Public endpoint

In [19]:
#Public and dedicated endpoint
from google.cloud import aiplatform
endpoint = aiplatform.Endpoint.create(
    display_name=f"{MODEL_NAME} proxy public test endpoint",
    labels={"sample-key": "sample-value"},
    #dedicated_endpoint_enabled=True,
)

### Model deploy on Vertex AI

In [20]:
endpoint.deploy(
    model = model,
    machine_type="g2-standard-4",
    accelerator_type="NVIDIA_L4",
    #machine_type="a2-highgpu-1g",
    #accelerator_type="NVIDIA_TESLA_A100",
    accelerator_count=1,
    min_replica_count=1,
    max_replica_count=1,
    #service_account=SERVICE_ACCOUNT
    #traffic_percentage=50
    #traffic_split={'a':50, 'b':50}
)

### Test to public dedicated endpoint

In [21]:
#Public and dedicated endpoint predict
from google.cloud import aiplatform
ENDPOINT_ID = "8454163597667336192"
endpoint = aiplatform.Endpoint(ENDPOINT_ID)
response = endpoint.raw_predict(body=json.dumps(prediction_input, indent=2).encode('utf-8'), headers={'Content-Type':'application/json'})
response.text

'{"generated_text":"{ \\"activity\\": \\"bike\\", \\"animals\\": [\\"puppy\\",\\"cat\\" ], \\"animals_seen\\": 3, \\"location\\": \\"park\\"}"}'

## Private endpoint

### Create private endpoint

In [22]:
PROJECT_ID="ai-hangsik"
!gcloud config set project $PROJECT_ID

Updated property [core/project].


In [23]:
PEERING_RANGE_NAME="google-reserved-range2"

In [32]:
!gcloud compute addresses create $PEERING_RANGE_NAME \
  --global \
  --prefix-length=16 \
  --description="peering range for Google service" \
  --network=$VPC_NETWORK \
  --purpose=VPC_PEERING

Created [https://www.googleapis.com/compute/v1/projects/ai-hangsik/global/addresses/google-reserved-range2].


In [35]:
!gcloud services vpc-peerings connect \
  --service=servicenetworking.googleapis.com \
  --network=$VPC_NETWORK \
  --ranges=$PEERING_RANGE_NAME \
  --project=$PROJECT_ID

Operation "operations/pssn.p24-721521243942-b2348908-91ea-44ee-8219-59daa8d4791f" finished successfully.


In [None]:
#Private endpoint
#Refer https://cloud.google.com/vertex-ai/docs/predictions/using-private-endpoints
from google.cloud import aiplatform
endpoint = aiplatform.PrivateEndpoint.create(
    display_name=f"{MODEL_NAME} proxy private test endpoint2",
    network=f"projects/{PROJECT_NUMBER}/global/networks/{VPC_NETWORK}",
    labels={"tgi-key": "tgi-value"},
)

### Deploy private endpoint

In [1]:
#C3, L4, TPU not allowed for private endpoint
#Refer https://cloud.google.com/vertex-ai/docs/training/configure-compute
endpoint.deploy(
    model = model,
    machine_type="a2-highgpu-1g",
    accelerator_type="NVIDIA_TESLA_A100",
    accelerator_count=1,
    min_replica_count=1,
    max_replica_count=1,
    #service_account=SERVICE_ACCOUNT
    #traffic_percentage=50
    #traffic_split={'a':50, 'b':50}
)

NameError: name 'endpoint' is not defined

### Test private endpoint

In [27]:
#Private endpoint with raw predict, TGI does not use instances= so use raw_predict
from google.cloud import aiplatform
import json

ENDPOINT_ID = "5571859836150218752"
endpoint = aiplatform.PrivateEndpoint(ENDPOINT_ID)
response = endpoint.raw_predict(body=json.dumps(prediction_input, indent=2).encode('utf-8'), headers={'Content-Type':'application/json'})
response.data

b'{"generated_text":"{ \\"activity\\": \\"bike\\", \\"animals\\": [\\"puppy\\",\\"cat\\" ], \\"animals_seen\\": 3, \\"location\\": \\"park\\"}"}'

In [28]:
#Private endpoint health
import google.auth
import requests
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

headers = {'Authorization': f'Bearer {creds.token}'}
response = requests.get(endpoint.health_http_uri, headers=headers)
print(response.text)

# HELP tgi_request_success Number of successful requests
# TYPE tgi_request_success counter
tgi_request_success 1

# HELP tgi_request_count Total number of requests
# TYPE tgi_request_count counter
tgi_request_count 1

# HELP tgi_batch_inference_count Inference calls per method (prefill or decode)
# TYPE tgi_batch_inference_count counter
tgi_batch_inference_count{method="decode"} 33
tgi_batch_inference_count{method="prefill"} 1

# HELP tgi_batch_inference_success Number of successful inference calls per method (prefill or decode)
# TYPE tgi_batch_inference_success counter
tgi_batch_inference_success{method="prefill"} 1
tgi_batch_inference_success{method="decode"} 33

# HELP tgi_queue_size Current queue size
# TYPE tgi_queue_size gauge
tgi_queue_size 0

# HELP tgi_batch_current_max_tokens Maximum tokens for the current batch
# TYPE tgi_batch_current_max_tokens gauge
tgi_batch_current_max_tokens 0

# HELP tgi_batch_current_size Current batch size
# TYPE tgi_batch_current_size gauge
tgi_b