In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serving microsoft/Phi-3-mini-4k-instruct with vLLM on Vertex AI (weights from GCS, a3-highgpu-2g with 2x H100)

* Reference notebook: [model_garden_phi3_deployment.ipynb](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_phi3_deployment.ipynb)

In [None]:
# @title Install and upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet google-cloud-aiplatform

## Vertex AI Configuration

In [None]:
# @title Project information
# Google Cloud project passed to aiplatform.init() and to the gcloud commands below.
PROJECT_ID="ai-hangsik" # @param {type:"string"}

# Region passed to aiplatform.init(); the model and endpoint are created here.
# Author's note: a3-highgpu-2g (2 x NVIDIA_H100_80GB) is listed for
# us-west1, asia-southeast1, europe-west4 - verify current availability.
LOCATION="us-west1"  # @param {type:"string"}

In [None]:
# @title Authentication
# NOTE: these commands are interactive and print a sign-in URL and prompt for a
# verification code; clear this cell's output before sharing the notebook.
# Sign in the gcloud CLI itself.
!gcloud auth login
# Second sign-in for Application Default Credentials (presumably what the
# aiplatform client library picks up - confirm for your environment).
!gcloud auth application-default login
# Make PROJECT_ID the active gcloud project ({PROJECT_ID} is interpolated by IPython).
!gcloud config set project {PROJECT_ID}

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=jgHQlijGTgsaZuMgAuolARUhY43bbs&prompt=consent&token_usage=remote&access_type=offline&code_challenge=twvi3qvIkZI5FUj1_26P2nae3z1odvgjUZ-4NmwaV8E&code_challenge_method=S256

Once finished, enter the verification code provided in your browser: 4/0AanRRrsky4dICthpslslKIYBPXa9Ns-5zuHDCVVfE3qdKAbWMORC8e89F339jIfWZHYD3Q

You are now logged in as [hangsik@google.com].
Your current project is 

In [None]:
# @title Initialize Vertex AI
# NOTE(review): Tuple is not referenced anywhere in the visible notebook.
from typing import Tuple
from google.cloud import aiplatform

# Set the default project and region used by the later aiplatform calls
# (Model.upload, Endpoint.create, Endpoint.list).
aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# @title Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)


Using this default Service Account: 721521243942-compute@developer.gserviceaccount.com


## Model Configuration

In [None]:
# @title Define model deployment constants
import datetime
# Captured once so the model and endpoint display names share the same timestamp.
# Re-running this cell changes both names; predict_vllm looks the endpoint up by
# ENDPOINT_DISPLAY_NAME, so re-run it only before creating a new endpoint.
now = datetime.datetime.now()

# https://docs.vllm.ai/en/latest/models/supported_models.html#list-of-supported-models

# Model name: used in the bucket path, the display names, the vLLM --model flag
# and the MODEL_ID container environment variable.
MODEL_ID = "Phi-3-mini-4k-instruct" # @param {type:"string"}
# Cloud Storage location passed as `artifact_uri` when the model is uploaded.
MODEL_BUCKET_URI =f"gs://sllm_checkpoints/{MODEL_ID}" # @param {type:"string"}
# Alternative serving image, kept for reference:
# VLLM_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference.cu121.0-5.ubuntu2204.py310" # @param {type:"string"}
# Serving image actually used for the upload.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240723_0916_RC00" # @param {type:"string"}

# The names embed str(now), so they contain a space, colons and microseconds
# (e.g. "Phi-3-mini-4k-instruct-2025-01-16 15:47:16.705719").
MODEL_DISPLAY_NAME = f"{MODEL_ID}-{now}"
ENDPOINT_DISPLAY_NAME = f"{MODEL_ID}-{now}-endpoint" # @param {type:"string"}

In [None]:
# @title Set accelerator.
# Find Vertex AI prediction supported accelerators and regions [here](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute).
# Machine shape and GPU type/count passed to model.deploy().
MACHINE_TYPE = "a3-highgpu-2g" # @param {type:"string"}
ACCELERATOR_TYPE = "NVIDIA_H100_80GB"  # @param {type:"string"}
# Also reused as the value of vLLM's --tensor-parallel-size flag.
ACCELERATOR_COUNT = 2           # @param {type:"integer"}

In [None]:
# @title Set vLLM args
from typing import Tuple
from google.cloud import aiplatform

# See https://docs.vllm.ai/en/latest/serving/engine_args.html for a list of possible arguments with descriptions.
# Command line passed to the serving container (as serving_container_args) to
# start the vLLM API server. Only the flags that interpolate a notebook constant
# are f-strings; all other flags are plain string literals.
vllm_args = [
    "python",
    "-m",
    "vllm.entrypoints.api_server",
    "--host=0.0.0.0",
    "--port=8080",  # same port as serving_container_ports in the upload cell
    f"--model={MODEL_ID}",
    f"--tensor-parallel-size={ACCELERATOR_COUNT}",
    "--swap-space=16",
    "--gpu-memory-utilization=0.95",
    "--max-model-len=4096",
    "--dtype=auto",
    "--max-loras=1",
    "--max-cpu-loras=8",
    "--max-num-seqs=256",
    "--disable-log-stats",
    "--trust-remote-code",
]

# Environment variables handed to the serving container at upload time.
env_vars = {
    "MODEL_ID": MODEL_ID,
    "DEPLOY_SOURCE": "notebook",
}

## Model upload and deployment

In [None]:
# @title Upload model.
# Create the Vertex AI Model resource: it records where the weights live, which
# serving image to run, and the command/port/routes the container is started with.
model = aiplatform.Model.upload(
    display_name=MODEL_DISPLAY_NAME,
    artifact_uri=MODEL_BUCKET_URI,
    serving_container_image_uri=VLLM_DOCKER_URI,
    serving_container_args=vllm_args,
    serving_container_ports=[8080],  # same port as --port in vllm_args
    serving_container_predict_route="/generate",
    serving_container_health_route="/ping",
    serving_container_environment_variables=env_vars,
    serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
    serving_container_deployment_timeout=7200,  # presumably seconds (2 h) - confirm against the SDK docs
)
# NOTE: this message is printed right after the upload finishes; the deployment
# itself only starts in the "Deploy model" cell.
print(
    f"Deploying {MODEL_DISPLAY_NAME} on {MACHINE_TYPE} with {ACCELERATOR_COUNT} {ACCELERATOR_TYPE} GPU(s)."
)

INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/721521243942/locations/us-west1/models/6748859545893732352/operations/2336455062198419456
INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/721521243942/locations/us-west1/models/6748859545893732352@1
INFO:google.cloud.aiplatform.models:To use this Model in another session:
INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/721521243942/locations/us-west1/models/6748859545893732352@1')


Deploying Phi-3-mini-4k-instruct-2025-01-16 15:47:16.705719 on a3-highgpu-2g with 2 NVIDIA_H100_80GB GPU(s).


In [None]:
# @title Create endpoint
# Create the endpoint resource that the uploaded model is deployed to in the
# next step; it is created as a non-dedicated endpoint.
endpoint = aiplatform.Endpoint.create(
    display_name=ENDPOINT_DISPLAY_NAME,
    dedicated_endpoint_enabled=False,
)

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/721521243942/locations/us-west1/endpoints/4590870064290332672/operations/1964345142986932224
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/721521243942/locations/us-west1/endpoints/4590870064290332672
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/721521243942/locations/us-west1/endpoints/4590870064290332672')


In [None]:
# @title Deploy model.
# Deploy the uploaded model to the endpoint on the machine type and accelerators
# configured earlier, running as the default compute service account.
model.deploy(
    endpoint=endpoint,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    deploy_request_timeout=1800,  # presumably seconds (30 min) - confirm against the SDK docs
    service_account=SERVICE_ACCOUNT,
)
# endpoint.name is the endpoint's numeric ID (see the recorded output), not its display name.
print("endpoint_name:", endpoint.name)

INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/721521243942/locations/us-west1/endpoints/4590870064290332672
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/721521243942/locations/us-west1/endpoints/4590870064290332672/operations/3891885783501504512
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/721521243942/locations/us-west1/endpoints/4590870064290332672


endpoint_name: 4590870064290332672


## Prediction

In [None]:
# @title Predict to endpoint.
def predict_vllm(
    prompt: str,
    max_tokens: int = 128,
    temperature: float = 1.0,
    top_p: float = 1.0,
    top_k: int = 10,
    raw_response: bool = False,
):
    """Send one prompt to the deployed endpoint and return the prediction response.

    The endpoint is found by listing the endpoints visible to the initialized
    aiplatform client and picking the first one whose display name equals the
    notebook-level ``ENDPOINT_DISPLAY_NAME``. This lets the function work in a
    fresh session as long as that constant names an existing endpoint.

    Args:
        prompt: Text sent as the ``"prompt"`` field of the request instance.
        max_tokens: Value of the ``"max_tokens"`` field.
        temperature: Value of the ``"temperature"`` field.
        top_p: Value of the ``"top_p"`` field.
        top_k: Value of the ``"top_k"`` field.
        raw_response: Value of the ``"raw_response"`` field.

    Returns:
        The object returned by ``Endpoint.predict``; callers in this notebook
        read its ``predictions`` attribute.

    Raises:
        ValueError: If no endpoint has the display name ``ENDPOINT_DISPLAY_NAME``.
    """
    # To target an endpoint from an earlier session, set ENDPOINT_DISPLAY_NAME
    # to its display name first, e.g.
    # ENDPOINT_DISPLAY_NAME = "Phi-3.5-MoE-instruct-2025-01-16 11:29:21.744910-endpoint"
    endpoint_id = next(
        (
            candidate.name
            for candidate in aiplatform.Endpoint.list()
            if candidate.display_name == ENDPOINT_DISPLAY_NAME
        ),
        None,
    )
    # Fail here with a clear message instead of building ".../endpoints/None".
    if endpoint_id is None:
        raise ValueError(
            f"No endpoint with display name {ENDPOINT_DISPLAY_NAME!r} was found "
            f"in project {PROJECT_ID!r}, location {LOCATION!r}."
        )

    endpoint = aiplatform.Endpoint(
        f"projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/{endpoint_id}"
    )

    # Request payload: one instance carrying the prompt and sampling settings.
    instance = {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "raw_response": raw_response,
    }

    return endpoint.predict(
        instances=[instance],
        use_dedicated_endpoint=False,
    )

In [None]:
# Smoke-test the deployed endpoint with a single prompt.
prompt = "What is a car?"
response = predict_vllm(prompt=prompt)

# Print each prediction as returned; in the recorded output a prediction is
# text containing both a "Prompt:" and an "Output:" section.
for prediction in response.predictions:
    print(prediction)

Prompt:
What is a car?
Output:
A car is a four-wheeled vehicle powered by an engine which transports individuals from one location to another. Typically, cars have seats for passengers and a driver, and they are designed to operate on roads. The engine in a car can be powered by various types of fuel, such as gasoline, diesel, electricity, or a combination of these. Cars are an essential part of modern life, providing flexibility and convenience for commuting, transporting goods, and leisure activities.
