In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serving meta-llama/Llama-3.1-8B-Instruct with vLLM from GCS on Vertex AI (NVIDIA L4)

* Reference notebook: [model_garden_pytorch_llama3_1_deployment.ipynb](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama3_1_deployment.ipynb)

In [1]:
# @title Install and upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet google-cloud-aiplatform

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# @title Define deployment constants
# Notebook-wide configuration. Lines tagged with `# @param` are rendered as
# editable Colab form fields, so their `NAME = value  # @param` shape is kept.
import datetime
# Timestamp captured when this cell runs; it is embedded in MODEL_DISPLAY_NAME
# below, so re-running the cell produces a new display name.
now = datetime.datetime.now()

# Google Cloud project and region passed to aiplatform.init().
PROJECT_ID="ai-hangsik" # @param {type:"string"}
LOCATION="us-central1"  # @param {type:"string"}

# GCS location passed as `artifact_uri` when the model is uploaded.
MODEL_BUCKET_URI ="gs://sllm_0106/llama3.1_8b_inst" # @param {type:"string"}
# Serving container image passed as `serving_container_image_uri`.
VLLM_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference.cu121.0-5.ubuntu2204.py310" # @param {type:"string"}

# Used for the vLLM `--model` argument, the MODEL_ID container env var and the
# display names below.
MODEL_ID = "Meta-Llama-3.1-8B-Instruct" # @param {type:"string"}
# NOTE(review): `now` is formatted with str(), so the name contains a space,
# colons and microseconds (see the upload cell output).
MODEL_DISPLAY_NAME = f"{MODEL_ID}-{now}"
# predict_vllm() looks the endpoint up again by this display name.
ENDPOINT_DISPLAY_NAME = f"{MODEL_ID}-endpoint" # @param {type:"string"}


In [3]:
# @title Authentication
# Interactive sign-in for the gcloud CLI itself (prints a URL and asks for a
# verification code).
!gcloud auth login
# Second interactive sign-in that stores Application Default Credentials;
# presumably these are what google.auth.default() picks up in later cells.
!gcloud auth application-default login
# Make PROJECT_ID the active project for subsequent gcloud commands.
!gcloud config set project {PROJECT_ID}

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=FzWuOu6Hpa6LtwKU1lOPNpwu2Xb3Wj&prompt=consent&token_usage=remote&access_type=offline&code_challenge=gP_aE6dfx2ZILmzih9fwRgtrgPjYBudxBDQcnPK_Fo8&code_challenge_method=S256

Once finished, enter the verification code provided in your browser: <REDACTED>

You are now logged in as [hangsik@google.com].
Your current project is 

In [4]:
# @title Initialize Vertex AI
from google.cloud import aiplatform

# Configure the SDK with the project and region chosen in the constants cell;
# the Model/Endpoint calls in later cells do not pass them explicitly.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [5]:
# @title Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

Using this default Service Account: 721521243942-compute@developer.gserviceaccount.com


In [6]:
# @title Set accelerator.
# Find Vertex AI prediction supported accelerators and regions [here](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute).
# Machine type and GPU type/count passed to model.deploy().
MACHINE_TYPE = "g2-standard-12" # @param {type:"string"}
ACCELERATOR_TYPE = "NVIDIA_L4" # @param {type:"string"}
# Declared as an integer form field: with type "string", editing the value in
# the Colab form would rewrite it as a quoted string, while it is used as a
# GPU count (accelerator_count and --tensor-parallel-size).
ACCELERATOR_COUNT = 1 # @param {type:"integer"}


In [7]:
from typing import Tuple
from google.cloud import aiplatform

# Command line handed to the serving container (`serving_container_args`).
# See https://docs.vllm.ai/en/latest/serving/engine_args.html for a list of possible arguments with descriptions.
_vllm_launcher = ["python", "-m", "vllm.entrypoints.api_server"]

_vllm_server_flags = [
    "--host=0.0.0.0",
    "--port=8080",  # same port as `serving_container_ports` in the upload cell
]

_vllm_engine_flags = [
    f"--model={MODEL_ID}",
    f"--tensor-parallel-size={ACCELERATOR_COUNT}",  # one shard per GPU
    "--swap-space=16",
    "--gpu-memory-utilization=0.95",
    "--max-model-len=8192",
    "--dtype=auto",
    "--max-loras=1",
    "--max-cpu-loras=8",
    "--max-num-seqs=256",
    "--disable-log-stats",
    # Optional flags, currently disabled:
    #   "--trust-remote-code", "--enforce-eager", "--enable-lora",
    #   "--model-type=llama"
]

vllm_args = _vllm_launcher + _vllm_server_flags + _vllm_engine_flags

# Environment variables set inside the serving container.
env_vars = dict(MODEL_ID=MODEL_ID, DEPLOY_SOURCE="notebook")

In [8]:
# Register the model with Vertex AI: weights are read from MODEL_BUCKET_URI and
# served by the vLLM container configured in the previous cells.
model = aiplatform.Model.upload(
    # What is being registered.
    display_name=MODEL_DISPLAY_NAME,
    artifact_uri=MODEL_BUCKET_URI,
    # How the container is started.
    serving_container_image_uri=VLLM_DOCKER_URI,
    serving_container_args=vllm_args,
    serving_container_environment_variables=env_vars,
    serving_container_shared_memory_size_mb=16 * 1024,  # 16 GB
    serving_container_deployment_timeout=7200,
    # How Vertex AI talks to the container.
    serving_container_ports=[8080],
    serving_container_predict_route="/generate",
    serving_container_health_route="/ping",
)

print(f"Deploying {MODEL_DISPLAY_NAME} on {MACHINE_TYPE} with {ACCELERATOR_COUNT} {ACCELERATOR_TYPE} GPU(s).")

INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/721521243942/locations/us-central1/models/6497556267760877568/operations/7977641752179769344
INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/721521243942/locations/us-central1/models/6497556267760877568@1
INFO:google.cloud.aiplatform.models:To use this Model in another session:
INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/721521243942/locations/us-central1/models/6497556267760877568@1')


Deploying Meta-Llama-3.1-8B-Instruct-2025-01-22 08:04:24.624673 on g2-standard-12 with 1 NVIDIA_L4 GPU(s).


In [9]:
# Create a new (shared, non-dedicated) endpoint that the model is deployed to
# in the next cell. Each run of this cell creates another endpoint.
endpoint = aiplatform.Endpoint.create(display_name=ENDPOINT_DISPLAY_NAME, dedicated_endpoint_enabled=False)

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/721521243942/locations/us-central1/endpoints/944164928422412288/operations/3609150113630388224
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/721521243942/locations/us-central1/endpoints/944164928422412288
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/721521243942/locations/us-central1/endpoints/944164928422412288')


In [10]:
# Deploy the uploaded model onto the endpoint using the machine and GPU
# selected in the accelerator cell, running as the default service account.
model.deploy(
    endpoint=endpoint,
    service_account=SERVICE_ACCOUNT,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    deploy_request_timeout=1800,
)

# The numeric endpoint ID, needed to build request URLs.
print("endpoint_name:", endpoint.name)

INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/721521243942/locations/us-central1/endpoints/944164928422412288
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/721521243942/locations/us-central1/endpoints/944164928422412288/operations/3102495155551207424
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/721521243942/locations/us-central1/endpoints/944164928422412288


endpoint_name: 944164928422412288


In [11]:
import os

def predict_vllm(
    prompt: str,
    max_tokens: int = 128,
    temperature: float = 1.0,
    top_p: float = 1.0,
    top_k: int = 10,
    raw_response: bool = False,
):
    """Send one prompt to the deployed vLLM endpoint and return the prediction.

    The endpoint is looked up by ``ENDPOINT_DISPLAY_NAME``; if several
    endpoints share that display name, the first one returned by
    ``aiplatform.Endpoint.list()`` is used.

    Args:
        prompt: Text sent as the ``prompt`` field of the request instance.
        max_tokens: Value of the ``max_tokens`` field of the instance.
        temperature: Value of the ``temperature`` field of the instance.
        top_p: Value of the ``top_p`` field of the instance.
        top_k: Value of the ``top_k`` field of the instance.
        raw_response: Value of the ``raw_response`` field of the instance.

    Returns:
        The object returned by ``Endpoint.predict``; its ``predictions``
        attribute holds one entry per instance.

    Raises:
        ValueError: If no endpoint with ``ENDPOINT_DISPLAY_NAME`` exists.
    """
    endpoint_id = next(
        (
            candidate.name
            for candidate in aiplatform.Endpoint.list()
            if candidate.display_name == ENDPOINT_DISPLAY_NAME
        ),
        None,
    )
    if endpoint_id is None:
        # Without this guard the code would go on to address ".../endpoints/None"
        # and fail later with a much less helpful error.
        raise ValueError(
            f"No endpoint with display name {ENDPOINT_DISPLAY_NAME!r} found in "
            f"project {PROJECT_ID!r}, location {LOCATION!r}."
        )

    endpoint = aiplatform.Endpoint(f"projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/{endpoint_id}")

    instance = {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "raw_response": raw_response,
    }

    return endpoint.predict(
        instances=[instance],
        use_dedicated_endpoint=False,
    )

In [12]:
# Smoke test: send a single prompt through predict_vllm().
prompt = "What is a car?"

response = predict_vllm(prompt=prompt)

# One prediction is returned per instance sent; print each as-is.
for prediction in response.predictions:
    print(prediction)

Prompt:
What is a car?
Output:
 A car, short for automobile, is a wheeled motor vehicle used for transporting passengers or goods. The term "car" is used for a wide variety of vehicles, from the smallest city car to the largest luxury car. A car typically has four wheels and is powered by an internal combustion engine or an electric motor. Some common features of cars include a body, wheels, engine, transmission, brakes, and steering. Cars also often have safety features such as airbags, anti-lock braking systems, and rearview mirrors. In this article, we will explore the history, types, and key features of cars.

# History of Cars


## Generate JSON Output.

In [22]:
# @title Use RESTful API
# Calls the endpoint's OpenAI-compatible chat completions route directly over
# HTTPS and constrains the answer to a JSON schema via `guided_json`.

from enum import Enum
import warnings

import google.auth
# Imported explicitly: `import google.auth` alone does not guarantee that the
# `google.auth.transport.requests` submodule is loaded.
import google.auth.transport.requests
import requests
from pydantic import BaseModel

warnings.filterwarnings("ignore")

# Application Default Credentials; refresh() fills in `creds.token`.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Taken from the earlier cells instead of hard-coded values, which go stale
# whenever the endpoint is recreated by a fresh run of the notebook.
PROJECT_NUMBER = project_number
ENDPOINT_ID = endpoint.name


class CarType(str, Enum):
    """Allowed values for `CarDescription.car_type`."""

    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"


class CarDescription(BaseModel):
    """Shape of the JSON object the model is asked to produce."""

    brand: str
    model: str
    car_type: CarType


json_schema = CarDescription.model_json_schema()

prediction_input = {
    "model": "vllm-llama",
    "messages": [
        {
            "role": "user",
            "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's"
        }
    ],
    "guided_json": json_schema
}

# Region comes from LOCATION so the URL follows the configured deployment.
url = f"https://{LOCATION}-aiplatform.googleapis.com/v1/projects/{PROJECT_NUMBER}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}/chat/completions"
headers = {'Authorization': f'Bearer {creds.token}'}
# A timeout keeps the cell from hanging forever; raise_for_status() reports
# HTTP errors directly instead of a KeyError on the JSON body below.
response = requests.post(url, json=prediction_input, headers=headers, timeout=120)
response.raise_for_status()
print(response.json()["choices"][0]['message']['content'])


{ "brand": "Ford", "model": "Mustang SVT Cobra", "car_type": "Coupe" }


In [19]:
# @title Use Open AI SDK
# Same guided-JSON request as the REST cell, issued through the OpenAI client
# pointed at the Vertex AI endpoint.
from enum import Enum
import warnings

import google.auth
# Imported explicitly: `import google.auth` alone does not guarantee that the
# `google.auth.transport.requests` submodule is loaded.
import google.auth.transport.requests
import requests
from openai import OpenAI
from pydantic import BaseModel

warnings.filterwarnings("ignore")

# Application Default Credentials; refresh() fills in `creds.token`.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Taken from the earlier cells instead of hard-coded values, which go stale
# whenever the endpoint is recreated by a fresh run of the notebook.
PROJECT_NUMBER = project_number
ENDPOINT_ID = endpoint.name

# The access token is passed as the API key; the base URL is the endpoint
# resource in the configured region.
client = OpenAI(
    base_url=f"https://{LOCATION}-aiplatform.googleapis.com/v1/projects/{PROJECT_NUMBER}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}",
    api_key=creds.token,
)


class CarType(str, Enum):
    """Allowed values for `CarDescription.car_type`."""

    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"


class CarDescription(BaseModel):
    """Shape of the JSON object the model is asked to produce."""

    brand: str
    model: str
    car_type: CarType


json_schema = CarDescription.model_json_schema()

completion = client.chat.completions.create(
    model="vllm",
    messages=[{
        "role": "user",
        "content": "Generate a JSON with the brand, model and car_type of the most iconic car of Hyundai from the 90's",
    }],
    # `guided_json` is not an OpenAI parameter, so it is sent via extra_body.
    extra_body={"guided_json": json_schema},
)
print(completion.choices[0].message.content)



{"brand": "Hyundai", "model": "Elantra", "car_type": "SUV" }
