In [None]:
# Copyright 2025 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Llama-3.1-8B-Instruct 모델 서비스
*  TGI container
*  Vertex AI 기반 L4 GPU 사용.

### 라이브러리 설치

In [None]:
# Install/upgrade the Vertex AI SDK and the Hugging Face Hub client used by the cells below.
# `--user` installs into the user site-packages; NOTE(review): a kernel restart may be
# needed before the upgraded packages are importable — confirm for your runtime.
%pip install --upgrade --user --quiet google-cloud-aiplatform huggingface_hub

In [None]:
# @title Define project information
# Google Cloud project and region shared by the gcloud commands and the Vertex AI SDK
# calls in this notebook. Replace PROJECT_ID with your own project before running.
PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [None]:
# Create Application Default Credentials interactively, then set PROJECT_ID as the
# quota project for those credentials.
!gcloud auth application-default login
!gcloud auth application-default set-quota-project {PROJECT_ID}

### Hugging Face 로그인

In [None]:
# Log in to the Hugging Face Hub from the notebook. Later cells read the stored token
# via `get_token()` to download the tokenizer and to configure the serving container.
from huggingface_hub import interpreter_login
interpreter_login()

## Enable APIs (required IAM roles listed below)
- Artifact Registry Reader (roles/artifactregistry.reader)
- Vertex AI User (roles/aiplatform.user)

In [None]:
# Authenticate the gcloud CLI, then enable the Vertex AI and Artifact Registry APIs
# on PROJECT_ID.
!gcloud auth login
!gcloud services enable aiplatform.googleapis.com --project {PROJECT_ID}
!gcloud services enable artifactregistry.googleapis.com --project {PROJECT_ID}

## Register model on Vertex AI

In [None]:
from google.cloud import aiplatform
from huggingface_hub import get_token

# Point the Vertex AI SDK at the project/region chosen in the configuration cell.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# The serving container receives the Hugging Face token through its environment so it
# can fetch the model weights. `get_token()` returns None when no login has happened,
# so fail here with a clear message instead of failing later during upload/deployment.
hf_token = get_token()
if not hf_token:
    raise RuntimeError(
        "No Hugging Face token found. Run the Hugging Face login cell "
        "(`interpreter_login()`) before registering the model."
    )

# Register the model in the Vertex AI Model Registry, served by the Hugging Face
# Text Generation Inference (TGI) container.
model = aiplatform.Model.upload(
    display_name="Llama3.1_8B_Instruct",
    serving_container_image_uri="us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310",

    # TGI launcher options are passed as environment variables; see
    # https://huggingface.co/docs/text-generation-inference/en/reference/launcher

    serving_container_environment_variables={
        "MODEL_ID": "meta-llama/Llama-3.1-8B-Instruct",
        "NUM_SHARD": "1",
        "MAX_INPUT_TOKENS": "512",
        "MAX_TOTAL_TOKENS": "1024",
        "MAX_BATCH_PREFILL_TOKENS": "1512",
        # NOTE(review): the token is passed as a plain environment variable and is
        # presumably visible on the registered model resource — confirm this is
        # acceptable for your project, or use a narrowly scoped read-only token.
        "HUGGING_FACE_HUB_TOKEN": hf_token,
    },
    serving_container_ports=[8080],
)
# Block until the upload operation has finished.
model.wait()

## Deploy model on Vertex AI

In [None]:

# Display name given to the new endpoint; the Predict cell looks the endpoint up
# again by this same display name.
endpoint_display_name="Llama-3.1-8B-Instruct"

# Create a new endpoint and deploy the registered model to it on a single
# NVIDIA L4 GPU (g2-standard-4 machine).
# NOTE(review): `Model.deploy` appears to return the Endpoint rather than a deployed
# model object, so the name `deployed_model` may be misleading — confirm.
deployed_model = model.deploy(
    endpoint=aiplatform.Endpoint.create(display_name=endpoint_display_name),
    machine_type="g2-standard-4",
    accelerator_type="NVIDIA_L4",
    accelerator_count=1,
)

## Online predictions on Vertex AI

In [None]:
# Install/upgrade `transformers`; the next cell uses its AutoTokenizer to build the prompt.
%pip install --upgrade --quiet transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
from huggingface_hub import get_token
from transformers import AutoTokenizer

# Download the tokenizer for the served model, authenticating with the stored
# Hugging Face token. It is used later only to render the chat template into a prompt.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="meta-llama/Llama-3.1-8B-Instruct",
    token=get_token(),
)

#### Predict

In [None]:
import os

from google.cloud import aiplatform

endpoint_display_name = "Llama-3.1-8B-Instruct"

# Look up the endpoint ID by display name; `None` when no endpoint matches.
# If several endpoints share the display name, the first one returned by list() is used.
ENDPOINT_ID = next(
    (
        endpoint.name
        for endpoint in aiplatform.Endpoint.list()
        if endpoint.display_name == endpoint_display_name
    ),
    None,
)

# PROJECT_ID is a notebook variable, not an environment variable, so it is interpolated
# directly; os.getenv('PROJECT_ID') would render the console link as "project=None".
assert ENDPOINT_ID, (
    "`ENDPOINT_ID` is not set, please make sure that the `endpoint_display_name` is correct at "
    f"https://console.cloud.google.com/vertex-ai/online-prediction/endpoints?project={PROJECT_ID}"
)

# Bind to the existing endpoint through its full resource name.
endpoint = aiplatform.Endpoint(
    f"projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}"
)


messages = [
    {"role": "system", "content": "You are an assistant that responds as an AI expert."},
    {"role": "user", "content": "What's Deep learning?"},
]
# Render the chat into one prompt string (tokenize=False) that ends with the
# assistant turn header (add_generation_prompt=True), so the model answers next.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
# With the Llama 3.1 chat template the rendered prompt looks roughly like:
# "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n...<|eot_id|>"
# "<|start_header_id|>user<|end_header_id|>\n\n...<|eot_id|>"
# "<|start_header_id|>assistant<|end_header_id|>\n\n"

# Send a single instance to the TGI container: the prompt plus sampling parameters.
output = endpoint.predict(
    instances=[
        {
            "inputs" : inputs,
            "parameters": {
                "max_new_tokens": 128,
                "do_sample": True,
                "top_p": 0.95,
                "temperature": 0.7,
            },
        },
    ],
)
output

In [None]:
# Show only the list of generated texts (the first field of the prediction result).
output.predictions

['Deep learning is a subfield of machine learning that involves the use of artificial neural networks (ANNs) to analyze and interpret data. Inspired by the structure and function of the human brain, deep learning models are designed to mimic the way neurons process and transmit information.\n\nIn traditional machine learning, algorithms are designed to learn from data using a set of predefined rules and models. In contrast, deep learning algorithms use complex neural networks with multiple layers (typically 3 or more) to learn from data through a process called backpropagation.\n\nThe key characteristics of deep learning are:\n\n1. **Layered structure**: Deep learning models consist of multiple layers']

## Cleaning up

In [None]:
# Tear down the resources created above to stop incurring cost. Order matters:
# undeploy every model from the endpoint, delete the endpoint, then delete the model.
# `endpoint` is the object created in the Predict cell, so that cell must have run.
endpoint.undeploy_all()
endpoint.delete()
model.delete()

### End of Document