In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serving meta-llama/Llama-3.1-8B-Instruct with TGI on Vertex AI (model artifacts in GCS, NVIDIA L4 GPU)


In [1]:
# @title Install Vertex AI SDK and other required packages
%pip install --upgrade --user --quiet google-cloud-aiplatform \
                                      huggingface_hub[hf_transfer] \
                                      transformers

Note: you may need to restart the kernel to use updated packages.


In [5]:
# @title Define constants
import datetime

# Captured once per run; it is appended to the display name below.
now = datetime.datetime.now()

# Google Cloud project, region and storage locations.
PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
BUCKET_URI = "gs://sllm_0104"  # @param {type:"string"}
ARTIFACT_URI = "gs://sllm_0104/llama3.1_8b_inst"  # @param {type:"string"}

# Model naming: the display name is "<MODEL_ID>-<timestamp of this run>".
MODEL_ID = "TGI-Meta-Llama-3.1-8B-Instruct"  # @param {type:"string"}
MODEL_DISPLAY_NAME = "-".join((MODEL_ID, str(now)))

# Serving container image used when the model is uploaded.
CONTAINER_URI = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311"  # @param {type:"string"}

In [6]:
# @title GCP Authentication

# Authenticate against GCP with OAuth. The flow only runs when the
# `google.colab` module is already loaded; otherwise this cell does nothing.
import sys

in_colab = "google.colab" in sys.modules
if in_colab:
    from google.colab import auth

    auth.authenticate_user(project_id=PROJECT_ID)

## Model upload and deploy

In [7]:
# @title Initialize on Vertex AI
import os
from google.cloud import aiplatform

# Configure the SDK with the project, region and staging bucket defined in the constants cell.
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

In [8]:
# @title Upload a model
# Environment variables passed to the serving container; every value is a string.
tgi_env = {
    "NUM_SHARD": "1",
    "MAX_INPUT_TOKENS": "512",
    "MAX_TOTAL_TOKENS": "1024",
    "MAX_BATCH_PREFILL_TOKENS": "1512",
}

# Register the artifacts at ARTIFACT_URI, paired with the serving image, as a Vertex AI Model.
model = aiplatform.Model.upload(
    display_name=MODEL_DISPLAY_NAME,
    artifact_uri=ARTIFACT_URI,
    serving_container_image_uri=CONTAINER_URI,
    serving_container_environment_variables=tgi_env,
    serving_container_ports=[8080],
)

# Wait for the upload operation to complete before moving on.
model.wait()

Creating Model
Create Model backing LRO: projects/721521243942/locations/us-central1/models/2125355976491008/operations/7706720987584135168
Model created. Resource name: projects/721521243942/locations/us-central1/models/2125355976491008@1
To use this Model in another session:
model = aiplatform.Model('projects/721521243942/locations/us-central1/models/2125355976491008@1')


In [9]:
# @title Create an endpoint
# The endpoint's display name is the model display name plus an "-endpoint" suffix;
# the "From a different session" cell looks the endpoint up by this same name.
endpoint = aiplatform.Endpoint.create(
    display_name=f"{MODEL_DISPLAY_NAME}-endpoint",
)

Creating Endpoint
Create Endpoint backing LRO: projects/721521243942/locations/us-central1/endpoints/5904422521133858816/operations/2433005823933284352
Endpoint created. Resource name: projects/721521243942/locations/us-central1/endpoints/5904422521133858816
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/721521243942/locations/us-central1/endpoints/5904422521133858816')


In [10]:
# @title Deploy model

# Deployment reference: https://cloud.google.com/vertex-ai/docs/general/deployment

# Machine shape for the deployment: one NVIDIA L4 on a g2-standard-4 machine.
serving_hardware = {
    "machine_type": "g2-standard-4",
    "accelerator_type": "NVIDIA_L4",
    "accelerator_count": 1,
}

# The returned object is what later cells call `.predict()` and `.undeploy_all()` on.
deployed_model = model.deploy(endpoint=endpoint, **serving_hardware)

Deploying model to Endpoint : projects/721521243942/locations/us-central1/endpoints/5904422521133858816
Deploy Endpoint model backing LRO: projects/721521243942/locations/us-central1/endpoints/5904422521133858816/operations/6738447067699478528
Endpoint model deployed. Resource name: projects/721521243942/locations/us-central1/endpoints/5904422521133858816


## Online predictions on Vertex AI

In [23]:
# @title Within the same session
# Prompt sent to the deployed model.
inputs = "What's machine learning?"

# Each instance carries the prompt under "inputs" and generation settings under "parameters".
output = deployed_model.predict(
    instances=[
        {
            "inputs": inputs,
            "parameters": {
                "max_new_tokens": 256,
                "do_sample": True,
                "top_p": 0.95,
                # Sampling is enabled, so the temperature must be strictly positive:
                # TGI rejects `temperature <= 0` during request validation.
                # 0.7 matches the value used in the "From a different session" cell.
                "temperature": 0.7,
            },
        },
    ]
)
print(output.predictions[0])

 And how does it work?
In this course, I'll introduce you to the foundations of machine learning, a subfield of artificial intelligence (AI). You'll learn about the fundamental concepts, the typical machine learning workflow, and key techniques and algorithms. 
Upon completing this course, you'll be able to:
1. Define machine learning and explain its role in AI
2. Identify the main types of machine learning (supervised, unsupervised, and reinforcement learning)
3. Explain the machine learning workflow, from problem formulation to model deployment
4. Define key machine learning concepts (e.g., bias, variance, overfitting, underfitting)
5. Understand the advantages and challenges of machine learning
6. Familiarize yourself with common machine learning algorithms and techniques (e.g., linear regression, decision trees, clustering)
7. Develop a basic understanding of evaluation metrics (e.g., accuracy, precision, recall, F1 score)

By the end of this course, you'll have a solid grasp of th

In [22]:
# @title From a different session
import os
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION)

# NOTE: MODEL_DISPLAY_NAME embeds the timestamp taken in the constants cell, so after
# re-running that cell in a new session this value will not match the endpoint created earlier.
endpoint_display_name = f"{MODEL_DISPLAY_NAME}-endpoint"  # TODO: change to your endpoint display name

# Scan the Vertex AI Endpoints visible to the initialized project/location and keep the
# name of the first one whose display name matches; None when nothing matches.
ENDPOINT_ID = next(
    (
        candidate.name
        for candidate in aiplatform.Endpoint.list()
        if candidate.display_name == endpoint_display_name
    ),
    None,
)
# Fail early with a link to the console page for the project configured above.
assert ENDPOINT_ID, (
    "`ENDPOINT_ID` is not set, please make sure that the `endpoint_display_name` is correct at "
    f"https://console.cloud.google.com/vertex-ai/online-prediction/endpoints?project={PROJECT_ID}"
)

# Re-attach to the endpoint through its full resource name and send one prediction request.
endpoint = aiplatform.Endpoint(f"projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}")
output = endpoint.predict(
    instances=[
        {
            "inputs": "What's machine learning?",
            "parameters": {
                "max_new_tokens": 128,
                "do_sample": True,
                "top_p": 0.95,
                "temperature": 0.7,
            },
        },
    ],
)
print(output.predictions[0])

 [2/4]
What is machine learning?
Simply put, machine learning is a type of artificial intelligence (AI) that enables computers to learn from experience without being explicitly programmed.
Imagine you're trying to learn how to ride a bike. At first, you don't know how, and you might even have trouble staying on. But with practice, you get better and better, making adjustments as you go along. Eventually, you can ride a bike with ease!
Machine learning works in a similar way. It's a type of learning where the computer system gets better at a task through experience, and this experience comes in the form of data


In [None]:
# @title Resource clean-up
# Tear down in dependency order: undeploy everything from the endpoint object returned
# by `model.deploy()`, delete that endpoint object, then delete the uploaded model.
for teardown in (deployed_model.undeploy_all, deployed_model.delete, model.delete):
    teardown()

In [None]:
!gcloud storage rm -r $BUCKET_URI