In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Evaluate a Translation Model
* Reference notebook: [Evaluate a Translation Model](https://colab.sandbox.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_translation.ipynb#scrollTo=5e_7VOHBer8D)
* In this tutorial, you will learn how to use the Vertex AI Python SDK for the [Gen AI Evaluation Service](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-overview) to measure the translation quality of LLM responses using [BLEU](https://en.wikipedia.org/wiki/BLEU), [MetricX](https://github.com/google-research/metricx), and [COMET](https://unbabel.github.io/COMET/html/index.html).

In [1]:
# @title Install Vertex AI Python SDK
%pip install --upgrade --user --quiet google-cloud-aiplatform[evaluation]

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[0m

In [9]:
# @title Define constants

# Settings consumed by the authentication, SDK initialization and evaluation
# cells. Replace PROJECT_ID with your own Google Cloud project before running.
PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
EXPERIMENT_NAME = "my-eval-task-experiment"  # @param {type:"string"}

In [3]:
# @title GCP Authentication

# Use OAuth to access the GCP environment.
# The login is only attempted when the `google.colab` module is already
# loaded, i.e. when this notebook is running inside Colab.
# NOTE(review): outside Colab no authentication happens here; presumably
# credentials must already be configured in the environment - confirm.
import sys
if "google.colab" in sys.modules:
    # Imported inside the branch because the package is Colab-specific.
    from google.colab import auth
    auth.authenticate_user(project_id=PROJECT_ID)

In [None]:
# @title Initialize Vertex AI SDK
import vertexai

# Hand the project and region from the constants cell to the SDK.
# PROJECT_ID and LOCATION must be defined before this cell runs.
vertexai.init(project=PROJECT_ID, location=LOCATION)

### Import libraries

In [4]:
# General
import pandas as pd

# Main
from vertexai import evaluation
from vertexai.evaluation.metrics import pointwise_metric

In [5]:
# @title Helper functions
from IPython.display import Markdown, display


def display_eval_result(eval_result, metrics=None, model_name=None, rows=0):
    """Display the summary metrics and, optionally, row-level metrics.

    Args:
        eval_result: Object exposing ``summary_metrics`` (a mapping of metric
            name to value) and ``metrics_table`` (expected to be a pandas
            DataFrame with one row per evaluated example).
        metrics: Optional selectors used to narrow the output. A column is
            kept when its name contains any selector as a substring. Selectors
            may be strings or objects exposing a ``metric_name`` attribute; a
            single string is treated as one selector. When ``None`` or empty,
            nothing is filtered. Note that filtering also drops non-metric
            columns (for example the input text columns) from the row table.
        model_name: Optional label. When given, a heading naming it is
            rendered before the results.
        rows: Number of leading rows of the per-example table to show. The
            row table is skipped when this is ``None`` or not positive.
    """
    if model_name is not None:
        display(Markdown(f"## Eval Result for {model_name}"))

    # The summary is a flat mapping; transpose it into a single-row frame so
    # each metric becomes a column.
    summary_df = pd.DataFrame.from_dict(
        eval_result.summary_metrics, orient="index"
    ).T
    metrics_table = eval_result.metrics_table

    if metrics:
        # A bare string would otherwise be iterated character by character.
        if isinstance(metrics, str):
            metrics = [metrics]
        # Metric objects are matched by name; plain strings pass through.
        selectors = [str(getattr(m, "metric_name", m)) for m in metrics]

        def _is_selected(column):
            """Return True when the column name contains any selector."""
            return any(selector in str(column) for selector in selectors)

        summary_df = summary_df.filter(
            [column for column in summary_df.columns if _is_selected(column)]
        )
        metrics_table = metrics_table.filter(
            [column for column in metrics_table.columns if _is_selected(column)]
        )

    # Display the summary metrics
    display(Markdown("### Summary Metrics"))
    display(summary_df)
    if rows and rows > 0:
        # Display samples from the metrics table
        display(Markdown("### Row-based Metrics"))
        display(metrics_table.head(rows))

# Set up eval metrics for your data.

You can evaluate the quality of LLM-generated translations using the following metrics:
- [BLEU](https://en.wikipedia.org/wiki/BLEU)
- [COMET](https://unbabel.github.io/COMET/html/index.html)
- [MetricX](https://github.com/google-research/metricx)

In [6]:
# Metrics computed by the evaluation run below. BLEU is requested by its
# string name; COMET and MetricX are created from their metric classes with
# no arguments.
metrics = [
    "bleu",
    pointwise_metric.Comet(),
    pointwise_metric.MetricX(),
]

# Prepare your dataset

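This tutorial evaluates stored (already generated) generative AI model responses. The evaluation dataset has one row per example with three columns: `source` (the original text), `response` (the model's translation), and `reference` (the expected translation).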

In [7]:
# German source sentences.
sources = [
    "Dem Feuer konnte Einhalt geboten werden",
    "Schulen und Kindergärten wurden eröffnet.",
]

# Stored English translations to be scored.
responses = [
    "The fire could be stopped",
    "Schools and kindergartens were open",
]

# English reference translations the responses are compared against.
references = [
    "They were able to control the fire.",
    "Schools and kindergartens opened",
]

# Assemble one row per example; the three lists are aligned by position.
eval_dataset = pd.DataFrame(
    list(zip(sources, responses, references)),
    columns=["source", "response", "reference"],
)

# Run evaluation

With the evaluation dataset and metrics defined, create an `EvalTask` and call `evaluate()` to run the evaluation. The same `EvalTask` pattern can also be used to evaluate different models, applications, and many other use cases.

In [10]:
# Combine the dataset and metrics into an evaluation task associated with
# the experiment named by EXPERIMENT_NAME.
eval_task = evaluation.EvalTask(
    dataset=eval_dataset, metrics=metrics, experiment=EXPERIMENT_NAME
)
# No model is passed to evaluate().
# NOTE(review): presumably the stored `response` column is scored as-is - confirm.
eval_result = eval_task.evaluate()

INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/721521243942/locations/us-central1/metadataStores/default/contexts/my-eval-task-experiment-5345f3ad-6072-4d90-b1fe-6e455978d220 to Experiment: my-eval-task-experiment


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 6 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 6/6 [00:09<00:00,  1.60s/it]
INFO:vertexai.evaluation._evaluation:All 6 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:9.639209558999937 seconds


You can view the summary metrics and row-based metrics for each response in the `EvalResult`.


In [11]:
# Render the summary metrics and the first two rows of the per-example table.
display_eval_result(eval_result, rows=2)

### Summary Metrics

Unnamed: 0,row_count,bleu/mean,bleu/std,comet/mean,comet/std,metricx/mean,metricx/std
0,2.0,0.22813,0.239717,0.905142,0.094164,3.514046,0.629374


### Row-based Metrics

Unnamed: 0,source,response,reference,bleu/score,comet/score,metricx/score
0,Dem Feuer konnte Einhalt geboten werden,The fire could be stopped,They were able to control the fire.,0.058625,0.838558,3.069011
1,Schulen und Kindergärten wurden eröffnet.,Schools and kindergartens were open,Schools and kindergartens opened,0.397635,0.971726,3.95908


# Clean up

Delete the `ExperimentRun` created by the evaluation.

In [None]:
from google.cloud import aiplatform

# Look up the run using the run and experiment names recorded in the
# evaluation result's metadata, then delete it.
# Requires `eval_result` from the evaluation cell above.
# NOTE(review): only the run is deleted here; whether the experiment itself
# remains is not shown in this notebook - confirm if full cleanup is needed.
aiplatform.ExperimentRun(
    run_name=eval_result.metadata["experiment_run"],
    experiment=eval_result.metadata["experiment"],
).delete()