In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Deploy Llama 3.1 8B from GCS with vLLM on Vertex AI
* [model_garden_pytorch_llama3_1_deployment.ipynb](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama3_1_deployment.ipynb)

In [1]:
# @title Install and upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet google-cloud-aiplatform

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Google Cloud project and region used by the cells below.
PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [10]:
# @title Authentication
# Interactive sign-in for the gcloud CLI user.
!gcloud auth login
# Interactive sign-in for Application Default Credentials.
!gcloud auth application-default login
# Make the project chosen in PROJECT_ID the active gcloud project.
!gcloud config set project {PROJECT_ID}

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=pVSvxOFLWskHU2EisB3h5jwdGHPQi3&prompt=consent&token_usage=remote&access_type=offline&code_challenge=0wV1OD2HSTOBwIA3P9R3cKItkhxYLq1mdXZLK8ppnWM&code_challenge_method=S256

Once finished, enter the verification code provided in your browser: 4/0AanRRrulJI1-tAgSbW-C_eDZ21Hie-gZhL9VPL7aKLgdx5y6SstLtlttgZ8h-hm1_k_QMA

You are now logged in as [hangsik@google.com].
Your current project is 

In [14]:
# @title Initialize Vertex AI
from typing import Tuple
from google.cloud import aiplatform

# Initialise the Vertex AI SDK with the project and region configured above.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [12]:
# @title Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)


Using this default Service Account: 721521243942-compute@developer.gserviceaccount.com


In [15]:
# The pre-built serving docker images.
import os

MODEL_ID = "Phi-3.5-MoE-instruct"
model_path_prefix = "microsoft"
# The model id is a "<prefix>/<name>" identifier, not a filesystem path, so it
# is joined with an explicit "/" (os.path.join would produce "\" on Windows).
model_id = f"{model_path_prefix}/{MODEL_ID}"

# Serving container image used by deploy_model_vllm.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240926_1639_RC00"

# vLLM engine and hardware settings passed to deploy_model_vllm further down.
vllm_dtype = "bfloat16"
max_model_len = 131072
accelerator_type = "NVIDIA_L4"
accelerator_count = 8
machine_type = "g2-standard-96"
enable_trust_remote_code = True
gpu_memory_utilization = 0.95

def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    base_model_id: str = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 4096,
    dtype: str = "auto",
    enable_trust_remote_code: bool = False,
    enforce_eager: bool = False,
    enable_lora: bool = False,
    enable_chunked_prefill: bool = False,
    enable_prefix_cache: bool = False,
    host_prefix_kv_cache_utilization_target: float = 0.0,
    max_loras: int = 1,
    max_cpu_loras: int = 8,
    use_dedicated_endpoint: bool = False,
    max_num_seqs: int = 256,
    model_type: str = None,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Create an endpoint, upload a vLLM-served model and deploy it to Vertex AI.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            "<model_name>-endpoint".
        model_id: Value passed to vLLM as --model.
        service_account: Service account used for the deployment.
        base_model_id: Value of the MODEL_ID container environment variable;
            falls back to model_id when empty.
        machine_type: Machine type passed to model.deploy.
        accelerator_type: Accelerator type passed to model.deploy.
        accelerator_count: Accelerator count passed to model.deploy; also used
            as --tensor-parallel-size.
        gpu_memory_utilization: Value of --gpu-memory-utilization.
        max_model_len: Value of --max-model-len.
        dtype: Value of --dtype.
        enable_trust_remote_code: Adds --trust-remote-code when truthy.
        enforce_eager: Adds --enforce-eager when truthy.
        enable_lora: Adds --enable-lora when truthy.
        enable_chunked_prefill: Adds --enable-chunked-prefill when truthy.
        enable_prefix_cache: Adds --enable-prefix-caching when truthy.
        host_prefix_kv_cache_utilization_target: Forwarded as
            --host-prefix-kv-cache-utilization-target only when strictly
            between 0 and 1.
        max_loras: Value of --max-loras.
        max_cpu_loras: Value of --max-cpu-loras.
        use_dedicated_endpoint: Passed as dedicated_endpoint_enabled when the
            endpoint is created.
        max_num_seqs: Value of --max-num-seqs.
        model_type: Forwarded as --model-type when set.

    Returns:
        The (model, endpoint) pair that was created.
    """
    # The endpoint is created first, before the model is uploaded.
    target_endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    base_model_id = base_model_id or model_id

    # See https://docs.vllm.ai/en/latest/models/engine_args.html for a list of possible arguments with descriptions.
    container_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        f"--dtype={dtype}",
        f"--max-loras={max_loras}",
        f"--max-cpu-loras={max_cpu_loras}",
        f"--max-num-seqs={max_num_seqs}",
        "--disable-log-stats",
    ]

    # Boolean switches, appended in this fixed order when their toggle is set.
    switch_table = (
        (enable_trust_remote_code, "--trust-remote-code"),
        (enforce_eager, "--enforce-eager"),
        (enable_lora, "--enable-lora"),
        (enable_chunked_prefill, "--enable-chunked-prefill"),
        (enable_prefix_cache, "--enable-prefix-caching"),
    )
    container_args.extend(flag for toggle, flag in switch_table if toggle)

    if 0 < host_prefix_kv_cache_utilization_target < 1:
        container_args.append(
            f"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}"
        )

    if model_type:
        container_args.append(f"--model-type={model_type}")

    container_env = {"MODEL_ID": base_model_id, "DEPLOY_SOURCE": "notebook"}

    # HF_TOKEN is optional: it may be undefined in the notebook, or empty.
    try:
        hf_token = HF_TOKEN
    except NameError:
        hf_token = None
    if hf_token:
        container_env["HF_TOKEN"] = hf_token

    uploaded_model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_args=container_args,
        serving_container_ports=[8080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
        serving_container_shared_memory_size_mb=16 * 1024,  # 16 GB
        serving_container_deployment_timeout=7200,
    )
    print(
        f"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s)."
    )

    uploaded_model.deploy(
        endpoint=target_endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
        system_labels={"NOTEBOOK_NAME": "model_garden_phi3_deployment.ipynb"},
    )
    print("endpoint_name:", target_endpoint.name)

    return uploaded_model, target_endpoint


use_dedicated_endpoint = True


# Keep handles to the created resources instead of discarding the return value,
# so later cells can reuse or clean up the model and endpoint directly.
phi_model, phi_endpoint = deploy_model_vllm(
    model_name="Phi-3.5-MoE-instruct",
    model_id=model_id,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    max_model_len=max_model_len,
    gpu_memory_utilization=gpu_memory_utilization,
    dtype=vllm_dtype,
    enable_trust_remote_code=enable_trust_remote_code,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/721521243942/locations/us-central1/endpoints/1412046708459700224/operations/6808407893063237632
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/721521243942/locations/us-central1/endpoints/1412046708459700224
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/721521243942/locations/us-central1/endpoints/1412046708459700224')
INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/721521243942/locations/us-central1/models/5745314392501649408/operations/7877449854610309120
INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/721521243942/locations/us-central1/models/5745314392501649408@1
INFO:google.cloud.aiplatform.models:To use this Model in a

Deploying Phi-3.5-MoE-instruct on g2-standard-96 with 8 NVIDIA_L4 GPU(s).


INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/721521243942/locations/us-central1/endpoints/1412046708459700224
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/721521243942/locations/us-central1/endpoints/1412046708459700224/operations/2277786667928518656
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/721521243942/locations/us-central1/endpoints/1412046708459700224


endpoint_name: 1412046708459700224


NameError: name 'models' is not defined

In [None]:
# @title Define deployment constants
import datetime
# Timestamp captured once; it is embedded in the model display name below.
now = datetime.datetime.now()

PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Cloud Storage URI of the model artifacts and the serving container image.
MODEL_BUCKET_URI = "gs://sllm_0106/llama3.1_8b_inst"  # @param {type:"string"}
VLLM_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference.cu121.0-5.ubuntu2204.py310"  # @param {type:"string"}

MODEL_ID = "Meta-Llama-3.1-8B-Instruct"  # @param {type:"string"}
MODEL_DISPLAY_NAME = "-".join((MODEL_ID, str(now)))
ENDPOINT_DISPLAY_NAME = f"{MODEL_ID}-endpoint"  # @param {type:"string"}


In [None]:
# @title Authentication
# Interactive sign-in for the gcloud CLI user.
!gcloud auth login
# Interactive sign-in for Application Default Credentials.
!gcloud auth application-default login
# Make the project chosen in PROJECT_ID the active gcloud project.
!gcloud config set project {PROJECT_ID}

In [None]:
# @title Initialize Vertex AI
from google.cloud import aiplatform

# Initialise the Vertex AI SDK with the project and region configured above.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# @title Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

Using this default Service Account: 721521243942-compute@developer.gserviceaccount.com


In [None]:
# @title Set accelerator.
# Find Vertex AI prediction supported accelerators and regions [here](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute).
MACHINE_TYPE = "g2-standard-12" # @param {type:"string"}
ACCELERATOR_TYPE = "NVIDIA_L4" # @param {type:"string"}
# The count is an int (it feeds --tensor-parallel-size and accelerator_count),
# so the form field is declared as "integer" rather than "string" to keep the
# Colab form from treating the value as text.
ACCELERATOR_COUNT = 1 # @param {type:"integer"}


In [None]:
from typing import Tuple
from google.cloud import aiplatform

# Command line used to start the vLLM API server inside the serving container.
# See https://docs.vllm.ai/en/latest/serving/engine_args.html for a list of possible arguments with descriptions.
# Flags left disabled here: --trust-remote-code, --enforce-eager, --enable-lora, --model-type=llama.
vllm_args = [
    # Launcher
    "python",
    "-m",
    "vllm.entrypoints.api_server",
    # Network binding
    "--host=0.0.0.0",
    "--port=8080",
    # Model and parallelism (one tensor-parallel shard per accelerator)
    f"--model={MODEL_ID}",
    f"--tensor-parallel-size={ACCELERATOR_COUNT}",
    # Engine tuning
    "--swap-space=16",
    "--gpu-memory-utilization=0.95",
    "--max-model-len=8192",
    "--dtype=auto",
    "--max-loras=1",
    "--max-cpu-loras=8",
    "--max-num-seqs=256",
    "--disable-log-stats",
]

# Environment variables handed to the serving container.
env_vars = dict(MODEL_ID=MODEL_ID, DEPLOY_SOURCE="notebook")

In [None]:
# Register the model in Vertex AI: artifacts come from MODEL_BUCKET_URI and the
# serving container is started with the vllm_args / env_vars defined above.
# NOTE(review): vllm_args passes --model=MODEL_ID (a bare name) while the
# artifacts are supplied via artifact_uri — presumably the serving container
# resolves the model from the artifact location; confirm against its docs.
model = aiplatform.Model.upload(
    display_name=MODEL_DISPLAY_NAME,
    artifact_uri=MODEL_BUCKET_URI,
    serving_container_image_uri=VLLM_DOCKER_URI,
    serving_container_args=vllm_args,
    serving_container_ports=[8080],  # matches --port=8080 in vllm_args
    serving_container_predict_route="/generate",
    serving_container_health_route="/ping",
    serving_container_environment_variables=env_vars,
    serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
    serving_container_deployment_timeout=7200,
)
# Announces the target hardware only; the actual deployment is performed by the
# later model.deploy(...) cell.
print(
    f"Deploying {MODEL_DISPLAY_NAME} on {MACHINE_TYPE} with {ACCELERATOR_COUNT} {ACCELERATOR_TYPE} GPU(s)."
)

INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/721521243942/locations/us-central1/models/6974234140820373504/operations/125048453859377152
INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/721521243942/locations/us-central1/models/6974234140820373504@1
INFO:google.cloud.aiplatform.models:To use this Model in another session:
INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/721521243942/locations/us-central1/models/6974234140820373504@1')


Deploying Meta-Llama-3.1-8B-Instruct-2025-01-14 07:54:18.528202 on g2-standard-12 with 1 NVIDIA_L4 GPU(s).


In [None]:
# Create the endpoint the uploaded model is deployed to in the next cell;
# the dedicated-endpoint option is switched off.
endpoint = aiplatform.Endpoint.create(
    display_name=ENDPOINT_DISPLAY_NAME,
    dedicated_endpoint_enabled=False,
)

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/721521243942/locations/us-central1/endpoints/7602948083722223616/operations/8303585377164197888
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/721521243942/locations/us-central1/endpoints/7602948083722223616
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/721521243942/locations/us-central1/endpoints/7602948083722223616')


In [None]:
# Deploy the uploaded model to the endpoint using the machine and accelerator
# settings chosen above, running as SERVICE_ACCOUNT.
model.deploy(
    endpoint=endpoint,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    deploy_request_timeout=1800,
    service_account=SERVICE_ACCOUNT,
)
# Printed for reference; the recorded output shows the numeric endpoint id.
print("endpoint_name:", endpoint.name)

INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/721521243942/locations/us-central1/endpoints/7602948083722223616
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/721521243942/locations/us-central1/endpoints/7602948083722223616/operations/7972007854599045120
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/721521243942/locations/us-central1/endpoints/7602948083722223616


endpoint_name: 7602948083722223616


In [18]:
import os

ENDPOINT_DISPLAY_NAME = "Phi-3.5-MoE-instruct-endpoint"


def predict_vllm(
    prompt: str,
    max_tokens: int = 128,
    temperature: float = 1.0,
    top_p: float = 1.0,
    top_k: int = 10,
):
    """Send one prompt to the endpoint named ENDPOINT_DISPLAY_NAME.

    Args:
        prompt: Text prompt placed in the request instance.
        max_tokens: "max_tokens" field of the request instance.
        temperature: "temperature" field of the request instance.
        top_p: "top_p" field of the request instance.
        top_k: "top_k" field of the request instance.

    Returns:
        The response object returned by endpoint.predict.

    Raises:
        RuntimeError: If no endpoint with that display name exists in the
            project and location the SDK was initialised with.
    """
    # Filter server-side and prefer the most recently created endpoint, so a
    # re-run of the deployment cell does not select an older duplicate.
    matches = aiplatform.Endpoint.list(
        filter=f'display_name="{ENDPOINT_DISPLAY_NAME}"',
        order_by="create_time desc",
    )
    if not matches:
        # Fail with a clear message instead of requesting ".../endpoints/None".
        raise RuntimeError(
            f"No Vertex AI endpoint with display name {ENDPOINT_DISPLAY_NAME!r} was found."
        )
    endpoint = aiplatform.Endpoint(matches[0].resource_name)

    instance = {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "raw_response": False,
    }

    # The endpoint was created with the dedicated-endpoint option enabled, so
    # the request is routed through the dedicated endpoint.
    return endpoint.predict(
        instances=[instance],
        use_dedicated_endpoint=True,
    )

In [19]:
prompt = "What is a car?"

# Send the prompt to the deployed endpoint via the helper defined above.
response = predict_vllm(prompt=prompt)

# Print every prediction in the response.
for prediction in response.predictions:
    print(prediction)

Prompt:
What is a car?
Output:
 A car, also known as an automobile or motorcar, is a wheeled motor vehicle used for transportation. Most definitions of cars say they run primarily on roads, seat one to eight people, have four wheels, and mainly transport people rather than goods. Cars were developed in the late 19th century as a successor to horse-drawn carriages and have since become a major part of modern life.

Cars are powered by internal combustion engines, which burn fuel, or electric motors, which use electricity stored in batteries or other storage devices. They come in various
