In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Evaluate a Translation Model
* [Evaluate a Translation Model](https://colab.sandbox.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_translation.ipynb#scrollTo=5e_7VOHBer8D)
* In this tutorial, you will learn how to use the Vertex AI Python SDK for [Gen AI Evaluation Service](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-overview) to measure the translation quality of your LLM responses using [BLEU](https://en.wikipedia.org/wiki/BLEU), [MetricX](https://github.com/google-research/metricx) and [COMET](https://unbabel.github.io/COMET/html/index.html).

In [1]:
# @title Install Vertex AI Python SDK
%pip install --user --quiet google-cloud-aiplatform[evaluation]

Note: you may need to restart the kernel to use updated packages.


In [2]:
# @title Define constants

# Google Cloud project used for authentication and for vertexai.init().
PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
# Region passed to vertexai.init().
LOCATION = "us-central1"  # @param {type:"string"}
# Experiment name handed to EvalTask when the evaluation is run.
EXPERIMENT_NAME = "my-eval-task-experiment"  # @param {type:"string"}

In [3]:
# @title GCP Authentication

import sys

# The OAuth flow below is only attempted when the google.colab module is
# already loaded, i.e. when the notebook runs inside Colab. The import stays
# inside the guard so the cell does not require google.colab elsewhere.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user(project_id=PROJECT_ID)

In [12]:
# @title Initialize Vertex AI SDK
import vertexai

# Point the SDK at the project and region chosen in the constants cell.
vertexai.init(
    project=PROJECT_ID,
    location=LOCATION,
)

### Import libraries

In [13]:
# General
import pandas as pd

# Show up to 1000 characters per cell so the long source/response/reference
# sentences are not truncated. The fully qualified option name is used because
# pandas warns that shorthand names can break when similar options are added.
pd.set_option("display.max_colwidth", 1000)

# Main
from vertexai import evaluation
from vertexai.evaluation.metrics import pointwise_metric

In [14]:
# @title Helper functions
from IPython.display import Markdown, display


def display_eval_result(eval_result, metrics=None, model_name=None, rows=0):
    """Display the evaluation results.

    Renders the summary metrics of an evaluation result and, optionally, the
    leading rows of its per-row metrics table as notebook output.

    Args:
        eval_result: Object exposing ``summary_metrics`` (a mapping from metric
            name to value) and ``metrics_table`` (a DataFrame-like object
            supporting ``columns``, ``filter`` and ``head``).
        metrics: Optional iterable of metric-name substrings. When non-empty,
            only columns whose name contains at least one of these substrings
            are kept, in both the summary and the row-based table.
        model_name: Optional label. When it is not None, a heading containing
            it is displayed before the results.
        rows: Number of leading rows of the metrics table to display. The
            row-based section is skipped when this is not greater than 0.
    """
    if model_name is not None:
        display(Markdown(f"## Eval Result for {model_name}"))

    # One-row frame whose columns are the summary metric names.
    metrics_df = pd.DataFrame.from_dict(
        eval_result.summary_metrics, orient="index"
    ).T
    metrics_table = eval_result.metrics_table

    if metrics:
        # Materialize once so that a one-shot iterable can be scanned for
        # every column of both tables.
        wanted = tuple(metrics)

        def _matching(columns):
            """Return the column names containing any requested substring."""
            return [
                column
                for column in columns
                if any(selected in column for selected in wanted)
            ]

        metrics_df = metrics_df.filter(_matching(metrics_df.columns))
        metrics_table = metrics_table.filter(_matching(metrics_table.columns))

    # Display the summary metrics
    display(Markdown("### Summary Metrics"))
    display(metrics_df)
    if rows > 0:
        # Display samples from the metrics table
        display(Markdown("### Row-based Metrics"))
        display(metrics_table.head(rows))

# Set up eval metrics for your data.

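You can evaluate the translation quality of your data generated from an LLM using the metrics below. This notebook computes COMET and MetricX; BLEU is supported but is not enabled in the metrics list that follows.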
- [BLEU](https://en.wikipedia.org/wiki/BLEU)
- [COMET](https://unbabel.github.io/COMET/html/index.html)
- [MetricX](https://github.com/google-research/metricx)

In [19]:
# Translation metrics applied to every row of the evaluation dataset.
comet_metric = pointwise_metric.Comet()
metricx_metric = pointwise_metric.MetricX()

# "bleu" is left disabled here; presumably adding the string "bleu" to this
# list enables it as well -- verify against the SDK before relying on it.
metrics = [comet_metric, metricx_metric]

# Prepare your dataset

Evaluate stored generative AI model responses in an evaluation dataset.

In [20]:
# Original sentences that were translated.
sources = [
    "80명 탄 여객기, 강풍 등 악천후 속 눈 쌓인 활주로에 착륙하다가 사고",
    "지난달 워싱턴 여객기-헬기 충돌 참사 등 최근 북미서 항공사고 잇달아",
]

# Stored translations to be scored.
responses = [
    "The accident occurred while the passenger plane with 80 passengers on board was landing on a runway covered in strong winds and snow.",
    "Aviation accidents continue in North America, including a passenger plane and helicopter crash in Washington last month.",
]

# Reference translations the responses are compared against.
references = [
    "Passenger plane carrying 80 people crashes while landing on snow-covered runway in severe weather including strong winds",
    "A series of recent aviation accidents in North America, including the Washington plane-helicopter collision last month",
]

# One row per example; the column order is source, response, reference.
eval_dataset = pd.DataFrame(
    dict(source=sources, response=responses, reference=references)
)

# Run evaluation

With the evaluation dataset and metrics defined, you can run evaluation for an `EvalTask` on different models and applications, and many other use cases.

In [21]:
# Bundle the dataset and metrics into a task recorded under EXPERIMENT_NAME.
eval_task = evaluation.EvalTask(
    dataset=eval_dataset,
    metrics=metrics,
    experiment=EXPERIMENT_NAME,
)

# No model is passed: the dataset already contains the responses to score.
eval_result = eval_task.evaluate()

Associating projects/721521243942/locations/us-central1/metadataStores/default/contexts/my-eval-task-experiment-f3e95a84-75f8-4a47-a67a-acc85efd9397 to Experiment: my-eval-task-experiment


Computing metrics with a total of 4 Vertex Gen AI Evaluation Service API requests.


100%|██████████| 4/4 [00:07<00:00,  1.95s/it]

All 4 metric requests are successfully computed.
Evaluation Took:7.8091191470157355 seconds





You can view the summary metrics and row-based metrics for each response in the `EvalResult`.


In [22]:
# Show the summary plus the first two rows of the per-row metrics table.
display_eval_result(eval_result=eval_result, rows=2)

### Summary Metrics

Unnamed: 0,row_count,comet/mean,comet/std,metricx/mean,metricx/std
0,2.0,0.865841,0.009575,3.087396,1.369784


### Row-based Metrics

Unnamed: 0,source,response,reference,comet/score,metricx/score
0,"80명 탄 여객기, 강풍 등 악천후 속 눈 쌓인 활주로에 착륙하다가 사고",The accident occurred while the passenger plane with 80 passengers on board was landing on a runway covered in strong winds and snow.,Passenger plane carrying 80 people crashes while landing on snow-covered runway in severe weather including strong winds,0.85907,2.118812
1,지난달 워싱턴 여객기-헬기 충돌 참사 등 최근 북미서 항공사고 잇달아,"Aviation accidents continue in North America, including a passenger plane and helicopter crash in Washington last month.","A series of recent aviation accidents in North America, including the Washington plane-helicopter collision last month",0.872612,4.055979


# Clean up

Delete the `ExperimentRun` created by the evaluation.

In [11]:
from google.cloud import aiplatform

# The run and experiment names are read from the evaluation result's metadata.
experiment_run = aiplatform.ExperimentRun(
    run_name=eval_result.metadata["experiment_run"],
    experiment=eval_result.metadata["experiment"],
)
experiment_run.delete()

Experiment run c9a3c1eb-6ce6-4f5a-859e-1869b921d6e0 skipped backing tensorboard run deletion.
To delete backing tensorboard run, execute the following:
tensorboard_run_artifact = aiplatform.metadata.artifact.Artifact(artifact_name=f"my-eval-task-experiment-c9a3c1eb-6ce6-4f5a-859e-1869b921d6e0-tb-run")
tensorboard_run_resource = aiplatform.TensorboardRun(tensorboard_run_artifact.metadata["resourceName"])
tensorboard_run_resource.delete()
tensorboard_run_artifact.delete()
Deleting Context : projects/721521243942/locations/us-central1/metadataStores/default/contexts/my-eval-task-experiment-c9a3c1eb-6ce6-4f5a-859e-1869b921d6e0
Context deleted. . Resource name: projects/721521243942/locations/us-central1/metadataStores/default/contexts/my-eval-task-experiment-c9a3c1eb-6ce6-4f5a-859e-1869b921d6e0
Deleting Context resource: projects/721521243942/locations/us-central1/metadataStores/default/contexts/my-eval-task-experiment-c9a3c1eb-6ce6-4f5a-859e-1869b921d6e0
Delete Context backing LRO: projec