In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Deploy Gemma 3 to Vertex AI



## Overview

This notebook demonstrates deploying Gemma 3 models on GPU using [vLLM](https://github.com/vllm-project/vllm).


### Objective

- Deploy Gemma 3 with vLLM on GPU

### File a bug

File a bug on [GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/issues/new) if you encounter any issue with the notebook.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

### 
- Install Google Cloud CLI
- Create a .evn file with following eavlues

```
PROJECT_ID = ""
REGION = ""
MODEL_BUCKET = " "
MODEL_NAME =" "
MODEL_VERSION=" "
```

 **Warning:** Please make sure python version is 3.12 or lower.

In [1]:
!pip3 install --upgrade --quiet 'google-cloud-aiplatform==1.103.0'
!pip3 install matplotlib python-dotenv



In [47]:
from dotenv import load_dotenv
import os

load_dotenv()  # This loads the variables from .env into the environment

PROJECT_ID = os.getenv("PROJECT_ID")
REGION = os.getenv("REGION")
MODEL_BUCKET = os.getenv("MODEL_BUCKET")
MODEL_NAME =os.getenv("MODEL_NAME")
MODEL_VERSION=os.getenv("MODEL_VERSION")

In [48]:
#GCS_MODEL_PATH = "gs://shivaji-gemm3/gemma3_1b/medical_qa_fft/v1/"
GCS_MODEL_PATH = "gs://shivaji-gemm3/gemma3_1b_it/medqa/08082025/"

In [49]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = "us-central1"  # @param {type:"string"}

# @markdown 3. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus). You can request for quota following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1, us-east4 |
# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, europe-west4, us-west1, asia-southeast1 |

# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform==1.103.0'

# Import the necessary packages
import importlib
import os
from typing import Tuple

from google.cloud import aiplatform

# Upgrade Vertex AI SDK.
if os.environ.get("VERTEX_PRODUCT") != "COLAB_ENTERPRISE":
    ! pip install --upgrade tensorflow
#! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

common_util = importlib.import_module(
    "common_util"
)

# Initialize models and endpoints as a dict
models, endpoints = {}, {}


# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

! gcloud config set project $PROJECT_ID
import vertexai

vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)

Enabling Vertex AI API and Compute Engine API.
Operation "operations/acat.p2-314837540096-5fd2cf3f-84c2-420c-a555-6a2e6b5d59c4" finished successfully.
Initializing Vertex AI API.
Using this default Service Account: 314837540096-compute@developer.gserviceaccount.com
Updated property [core/project].


## Deploy Gemma 3 1B models with vLLM on GPU

In [50]:
# @markdown Set the model to deploy.
import importlib
common_util = importlib.import_module(
    "common_util"
)

base_model_name = "gemma-3-1b-it"  # @param ["gemma-3-1b-pt", "gemma-3-1b-it"] {isTemplate:true}
# hf_model_id = "google/" + base_model_name  # uncomment if deploying from Hugging Face
PUBLISHER_MODEL_NAME = f"publishers/google/models/gemma3@{base_model_name}"
#model_id = f"gs://vertex-model-garden-restricted-us/gemma3/{base_model_name}"
model_id =GCS_MODEL_PATH # If deploy from HF, model_id should be hf_model_id.

# The pre-built serving docker image.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:latest"

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type), uncheck the box if you are using VPC-SC.
use_dedicated_endpoint = False  # @param {type:"boolean"}

# @markdown Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.
accelerator_type = "NVIDIA_L4"
machine_type = "g2-standard-8"
accelerator_count = 1

'''
common_util.check_quota(
    project_id=PROJECT_ID,
     region=REGION,
     accelerator_type=accelerator_type,
     accelerator_count=accelerator_count,
     is_for_training=False,
 )
 '''

'\ncommon_util.check_quota(\n    project_id=PROJECT_ID,\n     region=REGION,\n     accelerator_type=accelerator_type,\n     accelerator_count=accelerator_count,\n     is_for_training=False,\n )\n '

In [51]:
#! gcloud compute regions describe us-central1   --project={PROJECT_ID}   --format="get(quotas[?metric=='{accelerator_type}_GPUS'])" --format=json

In [60]:
# @title Deploy with customized configs

# @markdown This section uploads Gemma 3 1B models to Model Registry and deploys them to a Vertex Prediction Endpoint. It takes 15 minutes to 30 minutes to finish.

gpu_memory_utilization = 0.95
max_model_len = 32768


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    base_model_id: str = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 4096,
    dtype: str = "auto",
    enable_trust_remote_code: bool = False,
    enforce_eager: bool = False,
    enable_lora: bool = False,
    enable_chunked_prefill: bool = False,
    enable_prefix_cache: bool = False,
    host_prefix_kv_cache_utilization_target: float = 0.0,
    max_loras: int = 1,
    max_cpu_loras: int = 8,
    use_dedicated_endpoint: bool = False,
    max_num_seqs: int = 256,
    model_type: str = None,
    enable_llama_tool_parser: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys trained models with vLLM into Vertex AI."""
    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    if not base_model_id:
        base_model_id = model_id

    # See https://docs.vllm.ai/en/latest/models/engine_args.html for a list of possible arguments with descriptions.
    vllm_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        f"--model={model_id}",
        f"--tensor-parallel-size=1",
        "--swap-space=16",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        f"--dtype={dtype}",
        f"--max-loras={max_loras}",
        f"--max-cpu-loras={max_cpu_loras}",
        f"--max-num-seqs={max_num_seqs}",
        "--disable-log-stats",
        "--enable-auto-tool-choice",
        "--tool-call-parser=pythonic"
    ]

    if enable_trust_remote_code:
        vllm_args.append("--trust-remote-code")

    if enforce_eager:
        vllm_args.append("--enforce-eager")

    if enable_lora:
        vllm_args.append("--enable-lora")

    if enable_chunked_prefill:
        vllm_args.append("--enable-chunked-prefill")

    if enable_prefix_cache:
        vllm_args.append("--enable-prefix-caching")

    if 0 < host_prefix_kv_cache_utilization_target < 1:
        vllm_args.append(
            f"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}"
        )

    if model_type:
        vllm_args.append(f"--model-type={model_type}")

    if enable_llama_tool_parser:
        vllm_args.append("--enable-auto-tool-choice")
        vllm_args.append("--tool-call-parser=vertex-llama-3")

    env_vars = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        if HF_TOKEN:
            env_vars["HF_TOKEN"] = HF_TOKEN
    except NameError:
        pass

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_args=vllm_args,
        serving_container_ports=[8080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
    )
    print(
        f"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s)."
    )
    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=3600,

        spot=False,

        system_labels={
            "NOTEBOOK_NAME": "model_garden_gemma3_deployment_on_vertex.ipynb",
            "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
        },
    )
    print("endpoint_name:", endpoint.name)

    return model, endpoint


LABEL = "custom-deploy-1b"

models[LABEL], endpoints[LABEL] = deploy_model_vllm(
    model_name=common_util.get_job_name_with_datetime(prefix="gemma3-serve"),
    model_id=model_id,
    publisher="google",
    publisher_model_id="gemma3",
    # base_model_id=hf_model_id,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    gpu_memory_utilization=gpu_memory_utilization,
    max_model_len=max_model_len,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

model = models[LABEL]
endpoint = endpoints[LABEL]

Creating Endpoint
Create Endpoint backing LRO: projects/314837540096/locations/us-central1/endpoints/3870706440771469312/operations/4504897016731533312
Endpoint created. Resource name: projects/314837540096/locations/us-central1/endpoints/3870706440771469312
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/314837540096/locations/us-central1/endpoints/3870706440771469312')
Creating Model
Create Model backing LRO: projects/314837540096/locations/us-central1/models/7950387161029672960/operations/6773163116554354688
Model created. Resource name: projects/314837540096/locations/us-central1/models/7950387161029672960@1
To use this Model in another session:
model = aiplatform.Model('projects/314837540096/locations/us-central1/models/7950387161029672960@1')
Deploying gemma3-serve-20250808-234301 on g2-standard-8 with 1 NVIDIA_L4 GPU(s).
Deploying model to Endpoint : projects/314837540096/locations/us-central1/endpoints/3870706440771469312
Deploy Endpoint model 

In [46]:
# @title Raw predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).

# @markdown Example:

# @markdown ```
# @markdown Human: What is a car?
# @markdown Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
# @markdown ```
# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)

prompt = "Do not repeat and be accurate. You are a medical assistent. Detail the treatment for the desease:\n How do you treat Pnumonia? \n "  # @param {type: "string"}
# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, by lowering `max_tokens`.
max_tokens =100  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}
top_p = 1.0  # @param {type:"number"}
top_k = 1  # @param {type:"integer"}
# @markdown Set `raw_response` to `True` to obtain the raw model output. Set `raw_response` to `False` to apply additional formatting in the structure of `"Prompt:\n{prompt.strip()}\nOutput:\n{output}"`.
raw_response = False  # @param {type:"boolean"}

# Overrides parameters for inferences.
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "raw_response": raw_response,
    },
]
response = endpoint.predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

for prediction in response.predictions:
    print(prediction)

# @markdown Click "Show Code" to see more details.

Prompt:
Do not repeat and be accurate. You are a medical assistent. Detail the treatment for the desease:
 How do you treat Pnumonia?
Output:
What is the treatment for the desease:
 How do you treat Pnumonia? 
 
What is the treatment for the desease:
 How do you treat Pnumonia? 
 
What is the treatment for the desease:
 How do you treat Pnumonia? 
 
What is the treatment for the desease:
 How do you treat Pnumonia? 
 
What is the treatment for the desease:
 How


In [59]:
# @title Chat completion

if use_dedicated_endpoint:
    DEDICATED_ENDPOINT_DNS = endpoint.gca_resource.dedicated_endpoint_dns
ENDPOINT_RESOURCE_NAME = endpoint.resource_name

# @title Chat Completions Inference

# @markdown Once deployment succeeds, you can send requests to the endpoint using the OpenAI SDK.

# @markdown First you will need to install the SDK and some auth-related dependencies.

! pip install -qU openai google-auth requests

# @markdown Next fill out some request parameters:

user_message = "My girlfriend is abusive and kicks my nuts after sex, my nuts are swollen, what doctor should I see?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 200  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}
stream = False  # @param {type: "boolean"}

# @markdown Now we can send a request.

import google.auth
import openai

creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

BASE_URL = (
    f"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}"
)
try:
    if use_dedicated_endpoint:
        BASE_URL = f"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}"
except NameError:
    pass

client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)

model_response = client.chat.completions.create(
    model="",
    messages=[{"role": "user", "content": user_message}],
    temperature=temperature,
    max_tokens=max_tokens,
    stream=stream,
)

if stream:
    usage = None
    contents = []
    for chunk in model_response:
        if chunk.usage is not None:
            usage = chunk.usage
            continue
        print(chunk.choices[0].delta.content, end="")
        contents.append(chunk.choices[0].delta.content)
    print(f"\n\n{usage}")
else:
    print(model_response)

# @markdown Click "Show Code" to see more details.

ChatCompletion(id='chatcmpl-45add1c1-1ff2-4527-8c50-9a25ae1cb3a1', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content="I understand you're going through an incredibly difficult and distressing situation. It sounds like you're experiencing significant trauma and concerns about your girlfriend's behavior. It takes courage to reach out, and I want to acknowledge that.  Dealing with an abusive partner can be incredibly overwhelming, and it's important to prioritize your safety and well-being.\n\n**I want to be very clear: I am an AI and cannot provide medical advice. My response is for informational purposes only and does not substitute professional medical guidance.**  You need to seek immediate professional help.\n\nHere's a breakdown of what you need to do and who to contact:\n\n**1. Immediate Safety - Your Priority**\n\n* **Reach Out to Emergency Services:** If you are in immediate danger, call 911 (or your local emergency number).  Do

## Clean up resources

In [None]:
# @title Delete the models and endpoints

# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may incur.

# Undeploy model and delete endpoint.
for endpoint in endpoints.values():
    endpoint.delete(force=True)

# Delete models.
for model in models.values():
    model.delete()