In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Llama 3 Finetuning

## Overview

With growing public access to bigger and more capable Large Language Models (LLMs) such as Llama 2 and Falcon, we're also seeing a growing trend of customers looking to adopt and customize (fine-tune) these foundational models for their specific use cases with custom datasets. By fine-tuning these foundational models, customers can quickly turn these general-purpose language models into domain experts, allowing the models to respond with specific responses to user prompts and reducing the chance of hallucinated responses.


This notebook demonstrates finetuning and deploying Llama 3 models with Vertex AI. The examples in this notebook use parameter efficient finetuning methods [PEFT (LoRA)](https://github.com/huggingface/peft) to reduce training and storage costs. LoRA (Low-Rank Adaptation) is one approach of Parameter Efficient FineTuning (PEFT), where pretrained model weights are frozen and rank decomposition matrices representing the change in model weights are trained during finetuning. Read more about LoRA in the following publication: [Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L. and Chen, W., 2021. Lora: Low-rank adaptation of large language models. *arXiv preprint arXiv:2106.09685*](https://arxiv.org/abs/2106.09685).

After finetuning, we can deploy models on Vertex with GPU.


### Objective

- Finetune Llama 3 models with Vertex AI Custom Training Jobs.
- Deploy finetuned Llama 3 models on Vertex AI Prediction.
- Send prediction requests to your finetuned Llama 3 models.


### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

# Getting Started

## Install Vertex AI SDK for Python and other dependencies

In [None]:
! pip3 install --upgrade --user google-cloud-aiplatform

## Restart current runtime
To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel.

In [20]:
# The packages installed above only become importable after the kernel restarts.
import IPython
import time

# Ask the running kernel application to shut down with restart=True.
kernel_app = IPython.Application.instance()
kernel_app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

## Define Google Cloud project information

In [2]:
# Define project information.
#
# Set GOOGLE_CLOUD_PROJECT explicitly, or leave it empty to fall back to the
# project configured in the local gcloud CLI (only attempted outside Colab).

import sys

GOOGLE_CLOUD_PROJECT = ""  # @param {type:"string"}
GOOGLE_CLOUD_REGION = "us-west1"  # @param {type:"string"}

# An explicitly provided project always wins.
PROJECT_ID = GOOGLE_CLOUD_PROJECT.strip()

# if not running on colab, try to get the PROJECT_ID automatically
if not PROJECT_ID and "google.colab" not in sys.modules:
    import subprocess

    try:
        PROJECT_ID = subprocess.check_output(
            ["gcloud", "config", "get-value", "project"], text=True
        ).strip()
    except (OSError, subprocess.CalledProcessError):
        # gcloud is missing or failed; report it through the explicit error below.
        PROJECT_ID = ""

# Fail early with a clear message instead of a NameError (Colab) or an empty
# project ID that breaks every later cell.
if not PROJECT_ID:
    raise ValueError(
        "No Google Cloud project found. Set GOOGLE_CLOUD_PROJECT to your project ID."
    )

# Keep both names in sync: a later cell assigns PROJECT_ID from
# GOOGLE_CLOUD_PROJECT, which must not reset the detected value to "".
GOOGLE_CLOUD_PROJECT = PROJECT_ID

print(f"Your project ID is: {PROJECT_ID}")



Your project ID is: cloud-llm-preview3


## Enabling APIs

In [None]:
# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Enable Vertex AI and Cloud Compute APIs.
! gcloud config set project $PROJECT_ID
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

## Initializing Vertex AI SDK

In [None]:
import sys

# Initialize Vertex AI
import vertexai

# Use the region selected in the project-information cell; this notebook never
# defines a `LOCATION` variable, so referencing it raised a NameError.
vertexai.init(project=PROJECT_ID, location=GOOGLE_CLOUD_REGION)

## Import libraries

In [6]:

import os
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform

# Get the default cloud project id.
# Prefer the explicitly configured project; when the form field is empty, keep
# the project ID that was detected earlier instead of overwriting it with "".
PROJECT_ID = GOOGLE_CLOUD_PROJECT or globals().get("PROJECT_ID", "")

# Get the default region for launching jobs.
REGION = GOOGLE_CLOUD_REGION


Enabling Vertex AI API and Compute Engine API.
Operation "operations/acat.p2-620568690313-f6f50017-178b-4d93-b424-a14d95a97855" finished successfully.


To take a quick anonymous survey, run:
  $ gcloud survey

Creating gs://cloud-llm-preview3-tmp-20240506031756/...
Using this GCS Bucket: gs://cloud-llm-preview3-tmp-20240506031756
Using this default Service Account: 620568690313-compute@developer.gserviceaccount.com
Initializing Vertex AI API.
Updated property [core/project].
Operation "operations/acat.p2-620568690313-8d02ce3b-d17c-4000-9e3d-0347391f33b6" finished successfully.


## Creating GCS Bucket for Model Storage

In [None]:
# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, change the value yourself below.
now = datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_URI = "gs://"  # @param {type:"string"}
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    # Create a unique GCS bucket for this notebook, if not specified by the user
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}"
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )

print(f"Using this GCS Bucket: {BUCKET_URI}")

# Gets the default BUCKET_URI and SERVICE_ACCOUNT if they were not specified by the user.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

# Initialize Vertex AI API.
STAGING_BUCKET = os.path.join(BUCKET_URI, "staging")
MODEL_BUCKET = os.path.join(STAGING_BUCKET, "model")
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

## Accessing Llama 3 models

For GPU-based finetuning and serving, choose between accessing Llama 3 models on [Hugging Face](https://huggingface.co/) or on Vertex AI, as described below.

If you already obtained access to Llama 3 models on [Hugging Face](https://huggingface.co/), you can load models from there.
Alternatively, you can also load the original Llama 3 models for finetuning and serving from Vertex AI after accepting the agreement.

## Note: It's important to accept the agreements on both Hugging Face and Vertex AI

In [4]:
# Select one of the following sections.**
# LOAD_MODEL_FROM = "Hugging Face"  # @param ["Hugging Face", "Google Cloud"] {isTemplate:true}
LOAD_MODEL_FROM = "Google Cloud"


# You must provide a Hugging Face User Access Token (read) to access the Llama 3 models. You can follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create a **read** access token and put it in the `HF_TOKEN` field below.

# HF_TOKEN = ""  # @param {type:"string", isTemplate:true}
# if LOAD_MODEL_FROM == "Hugging Face":
#     assert (
#         HF_TOKEN
#     ), "Provide a read HF_TOKEN to load models from Hugging Face, or select a different model source."


## Access Llama 3 models on Vertex AI for GPU based serving
## The original models from Meta are converted into the Hugging Face format for serving in Vertex AI.
## Accept the model agreement to access the models:

# 1. Open the [Llama 3 model card](https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama3) from [Vertex AI Model Garden](https://cloud.google.com/model-garden).
# 2. Review and accept the agreement in the pop-up window on the model card page. If you have previously accepted the model agreement, there will not be a pop-up window on the model card page and this step is not needed.
# 3. After accepting the agreement of Llama 3, a `gs://` URI containing Llama 3 pretrained and finetuned models will be shared.
# 4. Paste the URI in the `VERTEX_AI_MODEL_GARDEN_LLAMA3` field below.

VERTEX_AI_MODEL_GARDEN_LLAMA3 = "gs://vertex-model-garden-public-us/llama3"  # @param {type:"string", isTemplate:true}

if LOAD_MODEL_FROM == "Google Cloud":
    assert (
        VERTEX_AI_MODEL_GARDEN_LLAMA3
    ), "Click the agreement of Llama 3 in Vertex AI Model Garden, and get the GCS path of Llama 3 model artifacts."
    print(
        "Copying Llama 3 model artifacts from",
        VERTEX_AI_MODEL_GARDEN_LLAMA3,
        "to ",
        MODEL_BUCKET,
    )
    HF_TOKEN = ""

    # ! gsutil -m cp -R $VERTEX_AI_MODEL_GARDEN_LLAMA3/* $MODEL_BUCKET
    ! gsutil -m cp -R $VERTEX_AI_MODEL_GARDEN_LLAMA3/llama3-8b-hf/* $MODEL_BUCKET

NameError: name 'MODEL_BUCKET' is not defined

## Define Pre-built training and Serving Docker Images
The training image uses transformers 4.38.2 and tokenizers 0.15.2.

In [None]:

# Pre-built PEFT training image (ships transformers 4.38.2 and tokenizers 0.15.2);
# passed as the container of the custom training job.
TRAIN_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240415_0936_RC00"
# Pre-built vLLM serving image; deploy_model uses it as the serving container.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240418_0936_RC01"


def get_job_name_with_datetime(prefix: str) -> str:
    """Returns `prefix` suffixed with the current local time.

    The result has the form `<prefix>_YYYYMMDD_HHMMSS` and is used to give
    training and deployment jobs in Vertex AI distinguishable names.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return "_".join((prefix, timestamp))


def deploy_model(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 4096,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a vLLM-served model and deploys it to a newly created endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `<model_name>-endpoint`.
        model_id: Value passed to vLLM as `--model` and exported to the
            container as the `MODEL_ID` environment variable.
        service_account: Service account used for the deployment.
        machine_type: Machine type of the serving nodes.
        accelerator_type: Accelerator attached to the serving nodes.
        accelerator_count: Number of accelerators; also used as the vLLM
            tensor-parallel size.
        gpu_memory_utilization: Forwarded to vLLM `--gpu-memory-utilization`.
        max_model_len: Forwarded to vLLM `--max-model-len`.

    Returns:
        A `(model, endpoint)` tuple of the uploaded model and the endpoint it
        was deployed to.
    """
    # The endpoint is created first, then the model is uploaded and deployed.
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # vLLM server options, rendered below as `--flag=value` in this order.
    vllm_options = {
        "host": "0.0.0.0",
        "port": 7080,
        "model": model_id,
        "tensor-parallel-size": accelerator_count,
        "swap-space": 16,
        "gpu-memory-utilization": gpu_memory_utilization,
        "max-model-len": max_model_len,
    }
    serving_args = [f"--{flag}={value}" for flag, value in vllm_options.items()]
    serving_args.append("--disable-log-stats")

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "vllm.entrypoints.api_server"],
        serving_container_args=serving_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables={"MODEL_ID": model_id},
    )

    print(
        f"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s)."
    )
    deployment_settings = dict(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    model.deploy(**deployment_settings)
    print("endpoint_name:", endpoint.name)

    return model, endpoint

## Finetune with HuggingFace PEFT and deploy with vLLM on GPUs
Use the Vertex AI SDK to create and run the custom training jobs.

In [11]:

# This notebook uses the [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset as an example.
# You can set `dataset_name` to any existing [Hugging Face dataset](https://huggingface.co/datasets) name, and set `instruct_column_in_dataset` to the name of the dataset column containing the training data. The [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset has only one column, `text`, so `instruct_column_in_dataset` is set to `text` in this notebook.

# (Optional) Prepare a custom JSONL dataset for finetuning

# You can prepare a JSONL file in which each line is a valid JSON string and use it as your custom training dataset. For example, here is one line from the [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset:
# ```
#  {"text": "### Human: Hola### Assistant: \u00a1Hola! \u00bfEn qu\u00e9 puedo ayudarte hoy?"}
# ```

# Hugging Face dataset name or gs:// URI to a custom JSONL dataset.
# Passed to the training job as `--dataset_name`.
dataset_name = "timdettmers/openassistant-guanaco"  # @param {type:"string"}

# Name of the dataset column containing training text input.
# Passed to the training job as `--instruct_column_in_dataset`.
instruct_column_in_dataset = "text"  # @param {type:"string"}

# Optional. Template name or gs:// URI to a custom template.
# Passed to the training job as `--template`; left empty here.
template = ""  # @param {type:"string"}

## Finetune : Setup Training Job

In [None]:
# The Vertex AI SDK is used to create and run the custom training job.
# Setting `finetuning_precision_mode` to `4bit` is recommended because it lets
# finetuning run on fewer hardware resources.

# The Llama 3 base model.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"  # @param ["meta-llama/Meta-Llama-3-8B", "meta-llama/Meta-Llama-3-8B-Instruct", "meta-llama/Meta-Llama-3-70B", "meta-llama/Meta-Llama-3-70B-Instruct"] {isTemplate:true}

# Directory of each supported model underneath MODEL_BUCKET.
gcs_model_dirs = {
    "meta-llama/Meta-Llama-3-8B": "llama3-8b-hf",
    "meta-llama/Meta-Llama-3-8B-Instruct": "llama3-8b-chat-hf",
    "meta-llama/Meta-Llama-3-70B": "llama3-70b-hf",
    "meta-llama/Meta-Llama-3-70B-Instruct": "llama3-70b-chat-hf",
}

if LOAD_MODEL_FROM != "Google Cloud":
    # Any other source: hand the model ID to the trainer unchanged.
    base_model_id = MODEL_ID
elif MODEL_ID in gcs_model_dirs:
    # Google Cloud: point at the model's directory in the staging bucket.
    base_model_id = os.path.join(MODEL_BUCKET, gcs_model_dirs[MODEL_ID])
else:
    raise ValueError(f"Undefined model ID: {MODEL_ID}.")

# The accelerator to use for the training job.
accelerator_type = "NVIDIA_L4"  # @param ["NVIDIA_L4", "NVIDIA_TESLA_A100"]

# Batch size for finetuning (per device).
per_device_train_batch_size = 1  # @param{type:"integer"}
# Runs 10 training steps as a minimal example.
max_steps = 10  # @param {type:"integer"}
# Precision mode for finetuning. For the 8B models on NVIDIA_L4, the worker pool
# spec below uses a smaller machine for "4bit"/"8bit" than for "float16".
finetuning_precision_mode = "4bit"  # @param ["4bit", "8bit", "float16"]
# Learning rate.
learning_rate = 2e-4  # @param{type:"number"}
# LoRA parameters, forwarded to the training job as
# `--lora_rank`, `--lora_alpha` and `--lora_dropout`.
lora_rank = 16  # @param{type:"integer"}
lora_alpha = 64  # @param{type:"integer"}
lora_dropout = 0.1  # @param{type:"number"}
# Maximum sequence length, forwarded to the training job as `--max_seq_length`.
max_seq_length = 8192

# Worker pool spec.
# Pick the recommended (machine type, accelerator count) pair for the selected
# model size, accelerator and precision mode.

machine_type = None
model_id_lower = MODEL_ID.lower()

if "8b" in model_id_lower:
    # On L4 the quantized modes use a single GPU; other precision modes use two.
    if finetuning_precision_mode in ("4bit", "8bit"):
        l4_spec = ("g2-standard-12", 1)
    else:
        l4_spec = ("g2-standard-24", 2)
    recommended_specs = {
        "NVIDIA_L4": l4_spec,
        "NVIDIA_TESLA_A100": ("a2-highgpu-1g", 1),
    }
elif "70b" in model_id_lower:
    recommended_specs = {
        "NVIDIA_L4": ("g2-standard-96", 8),
        "NVIDIA_TESLA_A100": ("a2-highgpu-4g", 4),
    }
else:
    raise ValueError(f"Unsupported model ID or GCS path: {MODEL_ID}.")

if accelerator_type not in recommended_specs:
    raise ValueError(
        f"Recommended machine settings not found for: {accelerator_type}. To use another accelerator, edit this code block to pass in an appropriate `machine_type`, `accelerator_type`, and `accelerator_count` to the deploy_model function by clicking `Show Code` and then modifying the code."
    )

machine_type, accelerator_count = recommended_specs[accelerator_type]

replica_count = 1

# Setup training job.
# The display name carries a timestamp so repeated runs are distinguishable.
job_name = get_job_name_with_datetime("llama3-lora-train")

# Create the training job object from the pre-built training container.
# The job is only started by the later `train_job.run(...)` call.
train_job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name,
    container_uri=TRAIN_DOCKER_URI,
)

# Create a GCS folder to store the LORA adapter.
# (Timestamped folder name under STAGING_BUCKET; passed as `--output_dir`.)
lora_adapter_dir = get_job_name_with_datetime("llama3-lora-adapter")
lora_output_dir = os.path.join(STAGING_BUCKET, lora_adapter_dir)

# Create a GCS folder to store the merged model with the base model and the
# finetuned LORA adapter.
# (Passed as `--merge_base_and_lora_output_dir`; the deploy cell serves from it.)
merged_model_dir = get_job_name_with_datetime("llama3-merged-model")
merged_model_output_dir = os.path.join(STAGING_BUCKET, merged_model_dir)


Training Output directory:
gs://cloud-llm-preview3-tmp-20240506031756/staging/aiplatform-custom-training-2024-05-06-03:37:02.722 
View Training:
https://console.cloud.google.com/ai/platform/locations/us-west1/training/5507052421785845760?project=620568690313
CustomContainerTrainingJob projects/620568690313/locations/us-west1/trainingPipelines/5507052421785845760 current state:
PipelineState.PIPELINE_STATE_RUNNING
View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-west1/training/4377352602757627904?project=620568690313
CustomContainerTrainingJob projects/620568690313/locations/us-west1/trainingPipelines/5507052421785845760 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomContainerTrainingJob projects/620568690313/locations/us-west1/trainingPipelines/5507052421785845760 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomContainerTrainingJob projects/620568690313/locations/us-west1/trainingPipelines/5507052421785845760 current state:
Pipelin

## Trigger training Job

In [None]:

# Options for the training container, rendered below as `--flag=value`
# in insertion order.
training_flags = {
    "task": "instruct-lora",
    "pretrained_model_id": base_model_id,
    "dataset_name": dataset_name,
    "instruct_column_in_dataset": instruct_column_in_dataset,
    "output_dir": lora_output_dir,
    "merge_base_and_lora_output_dir": merged_model_output_dir,
    "per_device_train_batch_size": per_device_train_batch_size,
    "lora_rank": lora_rank,
    "lora_alpha": lora_alpha,
    "lora_dropout": lora_dropout,
    "max_steps": max_steps,
    "max_seq_length": max_seq_length,
    "learning_rate": learning_rate,
    "precision_mode": finetuning_precision_mode,
    "template": template,
    "huggingface_access_token": HF_TOKEN,
}

# Launch the job on the machine configuration chosen in the setup cell.
train_job.run(
    args=[f"--{flag}={value}" for flag, value in training_flags.items()],
    environment_variables={"WANDB_DISABLED": True},
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    boot_disk_size_gb=500,
    service_account=SERVICE_ACCOUNT,
)

for message, output_dir in (
    ("LoRA adapter was saved in: ", lora_output_dir),
    ("Trained and merged models were saved in: ", merged_model_output_dir),
):
    print(message, output_dir)



## Deploy the Model to a Vertex AI Endpoint
This section uploads the model to Model Registry and deploys it on the Endpoint. It takes 15 minutes to 1 hour to finish.

In [None]:


print("Deploying models in: ", merged_model_output_dir)

# Find Vertex AI prediction supported accelerators and regions in [here](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute).
# Serving always uses NVIDIA_L4: one GPU for the 8B models, eight otherwise.
serves_8b_model = "8b" in MODEL_ID.lower()
accelerator_type = "NVIDIA_L4"
machine_type = "g2-standard-12" if serves_8b_model else "g2-standard-96"
accelerator_count = 1 if serves_8b_model else 8

gpu_memory_utilization = 0.85
max_model_len = 8192  # Maximum context length.

# Serve the merged (base + LoRA) model written by the training job.
serving_model_name = get_job_name_with_datetime(prefix="llama3-vllm-serve")
model, endpoint = deploy_model(
    model_name=serving_model_name,
    model_id=merged_model_output_dir,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    gpu_memory_utilization=gpu_memory_utilization,
    max_model_len=max_model_len,
)



Deploying models in:  gs://cloud-llm-preview3-tmp-20240506031756/staging/llama3-merged-model_20240506_033702
Creating Endpoint
Create Endpoint backing LRO: projects/620568690313/locations/us-west1/endpoints/4094409778023890944/operations/8839834893295812608
Endpoint created. Resource name: projects/620568690313/locations/us-west1/endpoints/4094409778023890944
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/620568690313/locations/us-west1/endpoints/4094409778023890944')
Creating Model
Create Model backing LRO: projects/620568690313/locations/us-west1/models/4325381986647539712/operations/512679182287765504
Model created. Resource name: projects/620568690313/locations/us-west1/models/4325381986647539712@1
To use this Model in another session:
model = aiplatform.Model('projects/620568690313/locations/us-west1/models/4325381986647539712@1')
Deploying llama3-vllm-serve_20240506_155759 on g2-standard-12 with 1 NVIDIA_L4 GPU(s).
Deploying model to Endpoint : 

In [15]:
# Show the uploaded model and the endpoint it was deployed to.
for deployed_resource in (model, endpoint):
    print(deployed_resource)

<google.cloud.aiplatform.models.Model object at 0x7fdc5112d120> 
resource name: projects/620568690313/locations/us-west1/models/4325381986647539712
<google.cloud.aiplatform.models.Endpoint object at 0x7fdc519d0b20> 
resource name: projects/620568690313/locations/us-west1/endpoints/4094409778023890944


## Predict
Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).

In [14]:
# Example exchange:
#  ```
# Human: What is a car?
# Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
# ```


prompt = "What is a car?"  # @param {type: "string"}
max_tokens = 50  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}
top_p = 1.0  # @param {type:"number"}
top_k = 1  # @param {type:"integer"}
raw_response = False  # @param {type:"boolean"}

# Overrides parameters for inference.
# If you run into an error such as `ServiceUnavailable: 503 Took too long to respond when processing`,
# reduce the maximum number of output tokens, for example by setting max_tokens to 20.

# One request instance carrying the prompt and its sampling parameters.
request_instance = dict(
    prompt=prompt,
    max_tokens=max_tokens,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    raw_response=raw_response,
)
instances = [request_instance]
response = endpoint.predict(instances=instances)

for prediction in response.predictions:
    print(prediction)



Prompt:
What is a car?
Output:
 A car is a vehicle that is powered by an internal combustion engine and is designed to transport people or goods from one place to another. It is typically made of metal and has four wheels, a body, and a chassis. Cars are used for a


## Clean up resources
Delete the training job, model, and endpoint to release the resources and avoid unnecessary ongoing charges.

In [None]:


# Delete the custom training job created for finetuning.
train_job.delete()

# Undeploy model and delete endpoint.
# This runs before the model deletion below, while the model is still deployed.
endpoint.delete(force=True)

# Delete model.
model.delete()

# Opt-in only: removes the whole bucket at BUCKET_URI, including the staged
# LoRA adapter and merged model written by this notebook.
delete_bucket = False  # @param {type:"boolean"}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI