In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serving meta-llama/Llama-3.1-8B-Instruct with vLLM on Vertex AI (model in GCS, NVIDIA L4)

* Reference notebook: [model_garden_pytorch_llama3_1_deployment.ipynb](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama3_1_deployment.ipynb)

In [None]:
# @title Install and upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet google-cloud-aiplatform

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/6.9 MB[0m [31m49.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m5.9/6.9 MB[0m [31m85.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.9/6.9 MB[0m [31m84.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# @title Define deployment constants
import datetime
# Timestamp captured once when this cell runs; it is embedded in MODEL_DISPLAY_NAME below.
now = datetime.datetime.now()

# Google Cloud project and region, used by the gcloud commands and by aiplatform.init().
PROJECT_ID="ai-hangsik" # @param {type:"string"}
LOCATION="us-central1"  # @param {type:"string"}

# MODEL_BUCKET_URI is passed as the artifact_uri and VLLM_DOCKER_URI as the serving
# container image when the model is uploaded.
MODEL_BUCKET_URI ="gs://sllm_0106/llama3.1_8b_inst" # @param {type:"string"}
VLLM_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference.cu121.0-5.ubuntu2204.py310" # @param {type:"string"}

# MODEL_ID feeds the vLLM --model argument, the MODEL_ID container environment
# variable, and the prefix of both display names.
MODEL_ID = "Meta-Llama-3.1-8B-Instruct" # @param {type:"string"}
# str(now) is interpolated as-is, so the display name contains a space, colons and microseconds.
MODEL_DISPLAY_NAME = f"{MODEL_ID}-{now}"
# The prediction helper looks the endpoint up by this display name.
ENDPOINT_DISPLAY_NAME = f"{MODEL_ID}-endpoint" # @param {type:"string"}


In [None]:
# @title Authentication
# Sign in to the gcloud CLI with a user account (interactive).
!gcloud auth login
# Create Application Default Credentials (interactive).
# NOTE(review): presumably these are the credentials the aiplatform client picks up — confirm for your runtime.
!gcloud auth application-default login
# Point the gcloud CLI at the project chosen in the constants cell.
!gcloud config set project {PROJECT_ID}

In [None]:
# @title Initialize Vertex AI
from google.cloud import aiplatform

# Set the SDK-wide default project and region used by the aiplatform calls below.
aiplatform.init(
    location=LOCATION,
    project=PROJECT_ID,
)

In [None]:
# @title Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

Using this default Service Account: 721521243942-compute@developer.gserviceaccount.com


In [None]:
# @title Set accelerator.
# Find Vertex AI prediction supported accelerators and regions [here](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute).
# Machine type and GPU used when the model is deployed to the endpoint.
MACHINE_TYPE = "g2-standard-12" # @param {type:"string"}
ACCELERATOR_TYPE = "NVIDIA_L4" # @param {type:"string"}
# Declared as an integer form field so that editing the value in the Colab form
# keeps it an int; it is passed as accelerator_count and as --tensor-parallel-size.
ACCELERATOR_COUNT = 1 # @param {type:"integer"}


In [None]:
from typing import Tuple
from google.cloud import aiplatform

# See https://docs.vllm.ai/en/latest/serving/engine_args.html for a list of possible arguments with descriptions.
# Command line that starts the vLLM API server inside the serving container.
vllm_args = ["python", "-m", "vllm.entrypoints.api_server"]
# Listen on every interface, on the port registered as the serving container port.
vllm_args += ["--host=0.0.0.0", "--port=8080"]
# Model to load; the tensor-parallel size is set to the accelerator count.
vllm_args += [f"--model={MODEL_ID}", f"--tensor-parallel-size={ACCELERATOR_COUNT}"]
# Memory, context-length, LoRA and batching settings, followed by logging.
vllm_args += [
    "--swap-space=16",
    "--gpu-memory-utilization=0.95",
    "--max-model-len=8192",
    "--dtype=auto",
    "--max-loras=1",
    "--max-cpu-loras=8",
    "--max-num-seqs=256",
    "--disable-log-stats",
]
# Optional flags that are left disabled here:
#   --trust-remote-code, --enforce-eager, --enable-lora, --model-type=llama

# Environment variables handed to the serving container when the model is uploaded.
env_vars = dict(MODEL_ID=MODEL_ID, DEPLOY_SOURCE="notebook")

In [None]:
# Register the model in Vertex AI with MODEL_BUCKET_URI as its artifact location
# and the vLLM image, launch arguments and environment as its serving container.
model = aiplatform.Model.upload(
    display_name=MODEL_DISPLAY_NAME,
    artifact_uri=MODEL_BUCKET_URI,
    # Serving container: image, command line and environment.
    serving_container_image_uri=VLLM_DOCKER_URI,
    serving_container_args=vllm_args,
    serving_container_environment_variables=env_vars,
    # HTTP surface of the container; the port matches --port in vllm_args.
    serving_container_ports=[8080],
    serving_container_health_route="/ping",
    serving_container_predict_route="/generate",
    # Resource and timeout settings.
    serving_container_shared_memory_size_mb=16 * 1024,  # 16 GB
    serving_container_deployment_timeout=7200,
)
print(f"Deploying {MODEL_DISPLAY_NAME} on {MACHINE_TYPE} with {ACCELERATOR_COUNT} {ACCELERATOR_TYPE} GPU(s).")

INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/721521243942/locations/us-central1/models/6974234140820373504/operations/125048453859377152
INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/721521243942/locations/us-central1/models/6974234140820373504@1
INFO:google.cloud.aiplatform.models:To use this Model in another session:
INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/721521243942/locations/us-central1/models/6974234140820373504@1')


Deploying Meta-Llama-3.1-8B-Instruct-2025-01-14 07:54:18.528202 on g2-standard-12 with 1 NVIDIA_L4 GPU(s).


In [None]:
# Create the endpoint the model will be deployed to. Each run of this cell calls
# Endpoint.create again with the same display name.
endpoint = aiplatform.Endpoint.create(
    dedicated_endpoint_enabled=False,
    display_name=ENDPOINT_DISPLAY_NAME,
)

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/721521243942/locations/us-central1/endpoints/7602948083722223616/operations/8303585377164197888
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/721521243942/locations/us-central1/endpoints/7602948083722223616
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/721521243942/locations/us-central1/endpoints/7602948083722223616')


In [None]:
# Deploy the uploaded model onto the endpoint using the machine and accelerator
# chosen above, running as the default service account resolved earlier.
model.deploy(
    endpoint=endpoint,
    service_account=SERVICE_ACCOUNT,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    deploy_request_timeout=1800,
)
print("endpoint_name:", endpoint.name)

INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/721521243942/locations/us-central1/endpoints/7602948083722223616
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/721521243942/locations/us-central1/endpoints/7602948083722223616/operations/7972007854599045120
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/721521243942/locations/us-central1/endpoints/7602948083722223616


endpoint_name: 7602948083722223616


In [None]:
import os

def predict_vllm(prompt: str,):
    """Send a single prompt to the deployed vLLM endpoint.

    The endpoint is looked up by ``ENDPOINT_DISPLAY_NAME``. Display names are not
    unique (every run of the endpoint-creation cell creates another endpoint with
    the same name), so the most recently created match is used.

    Args:
        prompt: Text prompt to send to the model.

    Returns:
        The object returned by ``Endpoint.predict``; the generated texts are
        available through its ``predictions`` attribute.

    Raises:
        ValueError: If no endpoint named ``ENDPOINT_DISPLAY_NAME`` exists in the
            project and location configured through ``aiplatform.init``.
    """
    # Filter and sort on the server instead of listing every endpoint.
    matching_endpoints = aiplatform.Endpoint.list(
        filter=f'display_name="{ENDPOINT_DISPLAY_NAME}"',
        order_by="create_time desc",
    )
    if not matching_endpoints:
        # Fail with a clear message rather than building a resource path that
        # ends in ".../endpoints/None".
        raise ValueError(
            f"No endpoint with display name {ENDPOINT_DISPLAY_NAME!r} found in "
            f"project {PROJECT_ID!r}, location {LOCATION!r}. "
            "Create the endpoint and deploy the model first."
        )
    # Endpoint.list already returns Endpoint objects, so use the match directly.
    endpoint = matching_endpoints[0]

    # Request payload: the prompt plus the sampling parameters sent with it.
    instance = {
        "prompt": prompt,
        "max_tokens": 128,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 10,
        "raw_response": False,
    }

    instances = [instance]

    # The endpoint was created with dedicated_endpoint_enabled=False, so the
    # request is not routed through a dedicated endpoint.
    response = endpoint.predict(
        instances=instances,
        use_dedicated_endpoint=False
    )

    return response

In [None]:
# Example request: send one prompt to the deployed endpoint.
prompt = "What is a car?"

response = predict_vllm(prompt=prompt)

# Print each prediction contained in the response.
for prediction in response.predictions:
    print(prediction)

Prompt:
What is a car?
Output:
 A car, short for automobile, is a wheeled motor vehicle used for transporting passengers or goods. The term "car" is used for a wide variety of vehicles, from the smallest city car to the largest luxury car. A car typically has four wheels and is powered by an internal combustion engine or an electric motor. Some common features of cars include a body, wheels, engine, transmission, brakes, and steering. Cars also often have safety features such as airbags, anti-lock braking systems, and rearview mirrors. In this article, we will explore the history, types, and key features of cars.

# History of Cars
