In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serving meta-llama/Llama-3.1-8B-Instruct with vLLM on Vertex AI (model in GCS, NVIDIA L4)

* Reference notebook: [model_garden_pytorch_llama3_1_deployment.ipynb](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama3_1_deployment.ipynb)

In [1]:
# @title Install and upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet google-cloud-aiplatform

In [4]:
# @title Define deployment constants
import datetime

# Captured once; used below to give every run a distinct model display name.
now = datetime.datetime.now()

PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# GCS location of the model artifacts and the container image that serves them.
MODEL_BUCKET_URI = "gs://sllm_0106/llama3.1_8b_inst"  # @param {type:"string"}
VLLM_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference.cu121.0-5.ubuntu2204.py310"  # @param {type:"string"}

MODEL_ID = "vLLM-Meta-Llama-3.1-8B-Instruct"  # @param {type:"string"}
# Compact timestamp suffix (YYYYmmdd-HHMMSS): str(now) would put spaces, colons
# and microseconds into the display name.
MODEL_DISPLAY_NAME = f"{MODEL_ID}-{now.strftime('%Y%m%d-%H%M%S')}"
ENDPOINT_DISPLAY_NAME = f"{MODEL_ID}-endpoint"  # @param {type:"string"}


In [None]:
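# # @title Authentication
# Optional: uncomment and run the commands below if this environment is not already authenticated to Google Cloud.
# !gcloud auth login
# !gcloud auth application-default login
# !gcloud config set project {PROJECT_ID}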

In [8]:
# @title Initialize Vertex AI
from google.cloud import aiplatform

# Point the SDK at the project and region defined in the constants cell.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
)

In [8]:
# @title Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

Using this default Service Account: 721521243942-compute@developer.gserviceaccount.com


In [3]:
# @title Set accelerator.
# Supported accelerators and regions for Vertex AI prediction:
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute
MACHINE_TYPE = "g2-standard-12"  # @param {type:"string"}
ACCELERATOR_TYPE = "NVIDIA_L4"  # @param {type:"string"}
# Declared as an integer form field: the value is used as a GPU count
# (tensor-parallel size and accelerator_count), so it must stay an int.
ACCELERATOR_COUNT = 1  # @param {type:"integer"}


In [4]:
from typing import Tuple
from google.cloud import aiplatform

# Engine arguments are described at
# https://docs.vllm.ai/en/latest/serving/engine_args.html
vllm_args = [
    # Command that starts the API server module inside the container.
    "python",
    "-m",
    "vllm.entrypoints.api_server",
    # Listen on the same port that is registered as the serving container port.
    "--host=0.0.0.0",
    "--port=8080",
    "--model={}".format(MODEL_ID),
    # Tensor-parallel size follows the number of attached accelerators.
    "--tensor-parallel-size={}".format(ACCELERATOR_COUNT),
    "--swap-space=16",
    "--gpu-memory-utilization=0.95",
    "--max-model-len=8192",
    "--dtype=auto",
    "--max-loras=1",
    "--max-cpu-loras=8",
    "--max-num-seqs=256",
    "--disable-log-stats",
    # Optional flags, currently disabled:
    #   "--trust-remote-code",
    #   "--enforce-eager",
    #   "--enable-lora",
    #   "--model-type=llama",
]

# Environment variables handed to the serving container.
env_vars = dict(
    MODEL_ID=MODEL_ID,
    DEPLOY_SOURCE="notebook",
)

In [5]:
# Create the Vertex AI Model resource: artifacts are read from MODEL_BUCKET_URI
# and served by the vLLM container launched with vllm_args.
model = aiplatform.Model.upload(
    display_name=MODEL_DISPLAY_NAME,
    artifact_uri=MODEL_BUCKET_URI,
    serving_container_image_uri=VLLM_DOCKER_URI,
    serving_container_args=vllm_args,
    serving_container_environment_variables=env_vars,
    serving_container_ports=[8080],
    # Alternative predict route that was tried previously: "/generate".
    serving_container_predict_route="/v1/chat/completions",
    serving_container_health_route="/ping",
    serving_container_shared_memory_size_mb=16 * 1024,  # 16 GB
    serving_container_deployment_timeout=7200,
)
print(f"Deploying {MODEL_DISPLAY_NAME} on {MACHINE_TYPE} with {ACCELERATOR_COUNT} {ACCELERATOR_TYPE} GPU(s).")

Creating Model
Create Model backing LRO: projects/721521243942/locations/us-central1/models/8976884619651579904/operations/8884517843257786368
Model created. Resource name: projects/721521243942/locations/us-central1/models/8976884619651579904@1
To use this Model in another session:
model = aiplatform.Model('projects/721521243942/locations/us-central1/models/8976884619651579904@1')
Deploying vLLM-Meta-Llama-3.1-8B-Instruct-2025-02-05 09:54:47.934801 on g2-standard-12 with 1 NVIDIA_L4 GPU(s).


In [6]:
# Create the endpoint; the uploaded model is deployed onto it in a later cell.
endpoint = aiplatform.Endpoint.create(
    display_name=ENDPOINT_DISPLAY_NAME,
    dedicated_endpoint_enabled=False,
)

Creating Endpoint
Create Endpoint backing LRO: projects/721521243942/locations/us-central1/endpoints/7096328310015131648/operations/5370478680885690368
Endpoint created. Resource name: projects/721521243942/locations/us-central1/endpoints/7096328310015131648
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/721521243942/locations/us-central1/endpoints/7096328310015131648')


In [10]:
# Deploy the uploaded model onto the endpoint with the configured machine,
# accelerator and service account.
model.deploy(
    endpoint=endpoint,
    service_account=SERVICE_ACCOUNT,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    deploy_request_timeout=1800,
)
print("endpoint_name:", endpoint.name)

Deploying model to Endpoint : projects/721521243942/locations/us-central1/endpoints/7096328310015131648
Deploy Endpoint model backing LRO: projects/721521243942/locations/us-central1/endpoints/7096328310015131648/operations/746407763483033600
Endpoint model deployed. Resource name: projects/721521243942/locations/us-central1/endpoints/7096328310015131648
endpoint_name: 7096328310015131648


In [9]:
from typing import List

def predict_vllm(prompt: str,
                 json_schema: dict,
                 ENDPOINT_ID: str):
    """Send a single-turn chat request with a guided-JSON schema to a Vertex AI endpoint.

    Args:
        prompt: Text sent as the content of the single "user" message.
        json_schema: JSON schema placed in the request's "guided_json" field,
            e.g. the value returned by a pydantic model's ``model_json_schema()``.
        ENDPOINT_ID: Identifier passed to ``aiplatform.Endpoint``.

    Returns:
        The response object returned by ``Endpoint.raw_predict``.

    Raises:
        ValueError: If ``ENDPOINT_ID`` is empty or ``None``.
    """
    # Imported locally so the function does not depend on another cell
    # having imported json before it is called.
    import json

    # Fail early with a clear message rather than inside the SDK.
    if not ENDPOINT_ID:
        raise ValueError("ENDPOINT_ID must be a non-empty endpoint ID or resource name.")

    prediction_input = {
        "messages": [{
            "role": "user",
            "content": prompt,
        }],
        "guided_json": json_schema,
    }

    endpoint = aiplatform.Endpoint(ENDPOINT_ID)
    response = endpoint.raw_predict(
        body=json.dumps(prediction_input).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    return response

In [10]:
from enum import Enum
from pydantic import BaseModel

# Allowed values for the car_type field; the str mix-in makes each member a string.
# NOTE(review): "sedan" is lower-case while the other values are capitalised — confirm this is intended.
# Documented with comments rather than a class docstring because a docstring
# may be picked up as a description in the generated JSON schema.
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"

# Structure requested from the model: two string fields plus one CarType value.
class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType

# JSON schema derived from CarDescription; passed to predict_vllm as json_schema.
json_schema = CarDescription.model_json_schema()

In [12]:
import json

prompt = "Generate a JSON with the brand, model and car_type of the most iconic car of Hyundai from the 90's"

# Look the endpoint up by display name; the first match is used.
ENDPOINT_ID = next(
    (
        candidate.name
        for candidate in aiplatform.Endpoint.list()
        if candidate.display_name == ENDPOINT_DISPLAY_NAME
    ),
    None,
)
# ENDPOINT_ID = "7096328310015131648"

# Stop here with a clear message instead of passing None on to the prediction call.
if ENDPOINT_ID is None:
    raise RuntimeError(
        f"No Vertex AI endpoint with display name {ENDPOINT_DISPLAY_NAME!r} was found."
    )

response = predict_vllm(prompt, json_schema, ENDPOINT_ID)
print(response.json()["choices"][0]['message']['content'])


{ "brand": "Hyundai", "model": "Elantra", "car_type": "SUV" }


## Generate JSON output

### Use RESTful API

In [13]:

from enum import Enum
import warnings

import google.auth
# Import the transport submodule explicitly: `import google.auth` alone does not
# guarantee that google.auth.transport.requests is loaded.
import google.auth.transport.requests
from pydantic import BaseModel
import requests

warnings.filterwarnings("ignore")

# Load the default credentials and refresh them so that creds.token is populated.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Project number and endpoint ID of the deployed endpoint; replace with your own values.
PROJECT_NUMBER = "721521243942"
ENDPOINT_ID = "7096328310015131648"

# Allowed values for the car_type field; the str mix-in makes each member a string.
# NOTE(review): "sedan" is lower-case while the other values are capitalised — confirm this is intended.
# Documented with comments rather than a class docstring because a docstring
# may be picked up as a description in the generated JSON schema.
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"

# Structure requested from the model: two string fields plus one CarType value.
class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType

# JSON schema derived from CarDescription; sent in the request body as "guided_json".
json_schema = CarDescription.model_json_schema()

# Chat-completions request body; the schema is passed under "guided_json".
prediction_input = {
    "model": "vllm-llama",
    "messages": [
        {
            "role": "user",
            "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's"
        }
    ],
    "guided_json": json_schema
}

url = f"https://us-central1-aiplatform.googleapis.com/v1/projects/{PROJECT_NUMBER}/locations/us-central1/endpoints/{ENDPOINT_ID}/chat/completions"
headers = {'Authorization': f'Bearer {creds.token}'}
response = requests.post(url, json=prediction_input, headers=headers)
# Surface HTTP errors (expired token, wrong endpoint, ...) directly instead of
# failing below with a KeyError on "choices".
response.raise_for_status()
print(response.json()["choices"][0]['message']['content'])


{"brand": "Ford", "model": "Mustang", "car_type": "Coupe"}


### Use the OpenAI SDK

In [14]:
!pip install -U -q openai

In [15]:
from enum import Enum
import warnings

import google.auth
# Import the transport submodule explicitly: `import google.auth` alone does not
# guarantee that google.auth.transport.requests is loaded.
import google.auth.transport.requests
from openai import OpenAI
from pydantic import BaseModel
import requests

warnings.filterwarnings("ignore")

# Load the default credentials and refresh them so that creds.token is populated.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Project number and endpoint ID of the deployed endpoint; replace with your own values.
PROJECT_NUMBER = "721521243942"
ENDPOINT_ID = "7096328310015131648"

# The OpenAI client is pointed at the Vertex AI endpoint URL and uses the
# refreshed access token as its API key.
client = OpenAI(
    base_url=f"https://us-central1-aiplatform.googleapis.com/v1/projects/{PROJECT_NUMBER}/locations/us-central1/endpoints/{ENDPOINT_ID}",
    api_key=creds.token,
)

# Allowed values for the car_type field; the str mix-in makes each member a string.
# NOTE(review): "sedan" is lower-case while the other values are capitalised — confirm this is intended.
# Documented with comments rather than a class docstring because a docstring
# may be picked up as a description in the generated JSON schema.
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"


# Structure requested from the model: two string fields plus one CarType value.
class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType

# JSON schema derived from CarDescription; sent via extra_body as "guided_json".
json_schema = CarDescription.model_json_schema()

# Single-turn chat request; the schema goes through extra_body under "guided_json".
completion = client.chat.completions.create(
    model="vllm",
    messages=[
        {
            "role": "user",
            "content": "Generate a JSON with the brand, model and car_type of the most iconic car of Hyundai from the 90's",
        },
    ],
    extra_body={"guided_json": json_schema},
)

# Show the text of the first returned choice.
print(completion.choices[0].message.content)



{ "brand": "Hyundai", "model": "Elantra", "car_type": "SUV" }
