In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Compare Generative AI Models | Gen AI Evaluation SDK
* [Compare Generative AI Models | Gen AI Evaluation SDK Tutorial](https://colab.sandbox.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/compare_generative_ai_models.ipynb#scrollTo=QN61Ug4hLby5)
     

In [1]:
# @title Install Vertex AI Python SDK
# Installs the Vertex AI SDK with its `evaluation` extra, plus plotly (imported
# later in this notebook for the radar plot). As pip's own message notes, the
# kernel may need a restart before the updated packages are importable.
%pip install --user --quiet google-cloud-aiplatform[evaluation] plotly

Note: you may need to restart the kernel to use updated packages.


In [2]:
# @title Set GCP information
# PROJECT_ID and LOCATION are passed to `vertexai.init`; EXPERIMENT_NAME is the
# experiment the evaluation task is created with. Replace PROJECT_ID with your
# own Google Cloud project before running.
PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
EXPERIMENT_NAME = "my-eval-task-experiment"  # @param {type:"string"}

In [3]:
# @title Authentication to GCP
import sys

# Run the Colab sign-in flow only when the `google.colab` module is loaded;
# in any other environment this branch is skipped entirely.
# NOTE(review): outside Colab, credentials are presumably picked up from the
# environment (e.g. application default credentials) — confirm.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK

In [4]:
# @title Initialize Vertex AI SDK
import vertexai

# Point the SDK at the project and region configured in the "Set GCP information" cell.
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [8]:
# @title Import libraries
import html
import inspect
import logging
import random
import string
import warnings

from IPython.display import HTML, Markdown, display

import pandas as pd
pd.set_option('max_colwidth', 1000)

import plotly.graph_objects as go

# Main
from vertexai.evaluation import EvalTask, MetricPromptTemplateExamples
from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory

### Library settings

In [6]:
# Only let urllib3 connection-pool log records at ERROR level or above through.
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
# Silence every Python warning from here on. This also hides deprecation
# warnings, so remove this line when debugging.
warnings.filterwarnings("ignore")

### Helper functions

In [7]:
def generate_uuid(length: int = 8) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Despite the name, the result is not an RFC 4122 UUID: it is a short random
    string of `length` characters (default 8) drawn with the `random` module.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


def print_doc(function):
    """Print an object's name followed by its cleaned-up docstring.

    Args:
        function: The object to describe (a function, class, or other object).

    The name falls back to `repr(function)` when the object has no `__name__`
    (for example a `functools.partial`), and a placeholder is printed instead
    of the literal text "None" when the object has no docstring.
    """
    name = getattr(function, "__name__", None) or repr(function)
    # inspect.getdoc returns None for undocumented objects.
    doc = inspect.getdoc(function) or "(no documentation available)"
    print(f"{name}:\n{doc}\n")


def display_eval_report(eval_result, metrics=None):
    """Render one evaluation result: a title, the summary metrics, then the per-row report.

    Args:
        eval_result: A `(title, summary_metrics, report_df)` triple, where
            `summary_metrics` is a mapping of metric name to value and
            `report_df` is the per-row DataFrame.
        metrics: Optional list of substrings. When given, only columns whose
            name contains at least one of them are kept in both tables.
    """
    title, summary_metrics, report_df = eval_result
    # One row, one column per summary metric.
    metrics_df = pd.DataFrame.from_dict(summary_metrics, orient="index").T

    if metrics:

        def keep_selected(frame):
            # Keep only the columns whose name contains a requested metric.
            wanted_columns = [
                column
                for column in frame.columns
                if any(wanted in column for wanted in metrics)
            ]
            return frame.filter(wanted_columns)

        metrics_df = keep_selected(metrics_df)
        report_df = keep_selected(report_df)

    # Title first, then each table under its own Markdown heading.
    display(Markdown(f"## {title}"))
    for heading, frame in (
        ("### Summary Metrics", metrics_df),
        ("### Report Metrics", report_df),
    ):
        display(Markdown(heading))
        display(frame)


def display_explanations(df, metrics=None, n=1):
    """Render randomly sampled evaluation rows as HTML, one section per column.

    Args:
        df: Per-row evaluation DataFrame (for example the metrics table of an
            evaluation result).
        metrics: Optional list of substrings. When given, only the input and
            response columns plus the columns whose name contains one of the
            substrings are shown.
        n: Number of rows to sample. Capped at `len(df)`, so asking for more
            rows than exist shows every row instead of raising `ValueError`.
    """
    style = "white-space: pre-wrap; width: 800px; overflow-x: auto;"
    if n is not None:
        n = min(n, len(df))
    df = df.sample(n=n)
    if metrics:
        # Both the older "completed_prompt" and the current "prompt" column
        # names are listed; `filter` ignores names that are not present.
        base_columns = ["context", "reference", "completed_prompt", "prompt", "response"]
        metric_columns = [
            column
            for column in df.columns
            if any(selected_metric in column for selected_metric in metrics)
        ]
        # dict.fromkeys de-duplicates while preserving order, so a metric
        # substring that also matches a base column does not duplicate it.
        df = df.filter(list(dict.fromkeys(base_columns + metric_columns)))

    for _, row in df.iterrows():
        for col in df.columns:
            # Cell values include model-generated text, so escape them before
            # embedding in HTML; `pre-wrap` keeps the original line breaks.
            heading = html.escape(str(col))
            body = html.escape(str(row[col]))
            display(HTML(f"<h2>{heading}:</h2> <div style='{style}'>{body}</div>"))
        display(HTML("<hr>"))


def plot_radar_plot(eval_results, metrics=None):
    """Overlay one radar (polar) trace per evaluation result on a single figure.

    Args:
        eval_results: Iterable of `(title, summary_metrics, report_df)` triples;
            only the title and the summary metrics are used.
        metrics: Optional list of substrings. When given, only summary metrics
            whose name contains at least one of them are plotted.

    The radial axis range is fixed to 0-5.
    """
    radar = go.Figure()

    for title, summary_metrics, _ in eval_results:
        if metrics:
            summary_metrics = {
                name: value
                for name, value in summary_metrics.items()
                if any(wanted in name for wanted in metrics)
            }

        trace = go.Scatterpolar(
            r=list(summary_metrics.values()),
            theta=list(summary_metrics.keys()),
            fill="toself",
            name=title,
        )
        radar.add_trace(trace)

    radial_axis = dict(visible=True, range=[0, 5])
    radar.update_layout(polar=dict(radialaxis=radial_axis), showlegend=True)

    radar.show()


def sample_pairwise_result(eval_result, n=1, metric=None):
    """Display randomly sampled rows of a pairwise evaluation with both model responses.

    Args:
        eval_result: Object exposing a `metrics_table` DataFrame with the
            columns `prompt`, `baseline_model_response`, `response`,
            `{metric}/explanation` and `{metric}/pairwise_choice`.
        n: Number of rows to sample. Capped at the table length, so asking for
            more rows than exist shows every row instead of raising `ValueError`.
        metric: Name of the pairwise metric whose explanation and winner are
            shown. It must be supplied in practice: with the default `None`
            the columns looked up are "None/explanation" and
            "None/pairwise_choice".
    """
    style = "white-space: pre-wrap; width: 800px; overflow-x: auto;"
    table = eval_result.metrics_table
    if n is not None:
        n = min(n, len(table))

    for _, row in table.sample(n=n).iterrows():
        sections = (
            ("Prompt", row["prompt"]),
            ("Baseline Model Response", row["baseline_model_response"]),
            ("Candidate Model Response", row["response"]),
            ("Explanation", row[f"{metric}/explanation"]),
            ("Winner", row[f"{metric}/pairwise_choice"]),
        )
        for heading, value in sections:
            # Prompts and responses are free-form text, so escape them before
            # embedding in HTML; `pre-wrap` keeps the original line breaks.
            body = html.escape(str(value))
            display(HTML(f"<h2>{heading}:</h2> <div style='{style}'>{body}</div>"))
        display(HTML("<hr>"))


def display_pairwise_win_rate(eval_result, metric=None):
    """Show, as an HTML heading, how often the autorater preferred the candidate model.

    Reads `{metric}/candidate_model_win_rate` from `eval_result.summary_metrics`,
    multiplies it by 100 and rounds it to a whole-number percentage.
    """
    win_rate_key = f"{metric}/candidate_model_win_rate"
    candidate_model_win_rate = round(eval_result.summary_metrics[win_rate_key] * 100)
    message = f"<h3>Win rate: Autorater prefers Candidate Model over Baseline Model {candidate_model_win_rate}% of time.</h3>"
    display(HTML(message))

## Compare and Select Generative Models

You can define an `EvalTask` with pointwise metrics and an evaluation dataset, and then conduct a controlled experiment by running the evaluation multiple times with the same setup, with each run using a different model. This isolates the impact of the model on the results and ensures consistent conditions for comparison.

### Define a Dataset

In [9]:
# Instruction shared by every row of the evaluation dataset.
instruction = "Summarize the following article"

# Source passages to summarize, one per evaluation row.
context = [
    "To make a classic spaghetti carbonara, start by bringing a large pot of salted water to a boil. While the water is heating up, cook pancetta or guanciale in a skillet with olive oil over medium heat until it's crispy and golden brown. Once the pancetta is done, remove it from the skillet and set it aside. In the same skillet, whisk together eggs, grated Parmesan cheese, and black pepper to make the sauce. When the pasta is cooked al dente, drain it and immediately toss it in the skillet with the egg mixture, adding a splash of the pasta cooking water to create a creamy sauce.",
    "Preparing a perfect risotto requires patience and attention to detail. Begin by heating butter in a large, heavy-bottomed pot over medium heat. Add finely chopped onions and minced garlic to the pot, and cook until they're soft and translucent, about 5 minutes. Next, add Arborio rice to the pot and cook, stirring constantly, until the grains are coated with the butter and begin to toast slightly. Pour in a splash of white wine and cook until it's absorbed. From there, gradually add hot chicken or vegetable broth to the rice, stirring frequently, until the risotto is creamy and the rice is tender with a slight bite.",
    "For a flavorful grilled steak, start by choosing a well-marbled cut of beef like ribeye or New York strip. Season the steak generously with kosher salt and freshly ground black pepper on both sides, pressing the seasoning into the meat. Preheat a grill to high heat and brush the grates with oil to prevent sticking. Place the seasoned steak on the grill and cook for about 4-5 minutes on each side for medium-rare, or adjust the cooking time to your desired level of doneness. Let the steak rest for a few minutes before slicing against the grain and serving.",
    "Creating a creamy homemade tomato soup is a comforting and simple process. Begin by heating olive oil in a large pot over medium heat. Add diced onions and minced garlic to the pot and cook until they're soft and fragrant. Next, add chopped fresh tomatoes, chicken or vegetable broth, and a sprig of fresh basil to the pot. Simmer the soup for about 20-30 minutes, or until the tomatoes are tender and falling apart. Remove the basil sprig and use an immersion blender to puree the soup until smooth. Season with salt and pepper to taste before serving.",
    "To bake a decadent chocolate cake from scratch, start by preheating your oven to 350°F (175°C) and greasing and flouring two 9-inch round cake pans. In a large mixing bowl, cream together softened butter and granulated sugar until light and fluffy. Beat in eggs one at a time, making sure each egg is fully incorporated before adding the next. In a separate bowl, sift together all-purpose flour, cocoa powder, baking powder, baking soda, and salt. Divide the batter evenly between the prepared cake pans and bake for 25-30 minutes, or until a toothpick inserted into the center comes out clean.",
]

# Reference summaries, in the same order as `context`.
# NOTE(review): the chocolate-cake reference mentions buttermilk, which the
# corresponding context passage does not — confirm this is intended.
reference = [
    "The process of making spaghetti carbonara involves boiling pasta, crisping pancetta or guanciale, whisking together eggs and Parmesan cheese, and tossing everything together to create a creamy sauce.",
    "Preparing risotto entails sautéing onions and garlic, toasting Arborio rice, adding wine and broth gradually, and stirring until creamy and tender.",
    "Grilling a flavorful steak involves seasoning generously, preheating the grill, cooking to desired doneness, and letting it rest before slicing.",
    "Creating homemade tomato soup includes sautéing onions and garlic, simmering with tomatoes and broth, pureeing until smooth, and seasoning to taste.",
    "Baking a decadent chocolate cake requires creaming butter and sugar, beating in eggs and alternating dry ingredients with buttermilk before baking until done.",
]


# One row per article: the passage, the shared instruction repeated for every
# row, and the reference summary.
eval_dataset = pd.DataFrame(
    dict(
        context=context,
        instruction=[instruction for _ in range(len(context))],
        reference=reference,
    )
)

eval_dataset

Unnamed: 0,context,instruction,reference
0,"To make a classic spaghetti carbonara, start by bringing a large pot of salted water to a boil. While the water is heating up, cook pancetta or guanciale in a skillet with olive oil over medium heat until it's crispy and golden brown. Once the pancetta is done, remove it from the skillet and set it aside. In the same skillet, whisk together eggs, grated Parmesan cheese, and black pepper to make the sauce. When the pasta is cooked al dente, drain it and immediately toss it in the skillet with the egg mixture, adding a splash of the pasta cooking water to create a creamy sauce.",Summarize the following article,"The process of making spaghetti carbonara involves boiling pasta, crisping pancetta or guanciale, whisking together eggs and Parmesan cheese, and tossing everything together to create a creamy sauce."
1,"Preparing a perfect risotto requires patience and attention to detail. Begin by heating butter in a large, heavy-bottomed pot over medium heat. Add finely chopped onions and minced garlic to the pot, and cook until they're soft and translucent, about 5 minutes. Next, add Arborio rice to the pot and cook, stirring constantly, until the grains are coated with the butter and begin to toast slightly. Pour in a splash of white wine and cook until it's absorbed. From there, gradually add hot chicken or vegetable broth to the rice, stirring frequently, until the risotto is creamy and the rice is tender with a slight bite.",Summarize the following article,"Preparing risotto entails sautéing onions and garlic, toasting Arborio rice, adding wine and broth gradually, and stirring until creamy and tender."
2,"For a flavorful grilled steak, start by choosing a well-marbled cut of beef like ribeye or New York strip. Season the steak generously with kosher salt and freshly ground black pepper on both sides, pressing the seasoning into the meat. Preheat a grill to high heat and brush the grates with oil to prevent sticking. Place the seasoned steak on the grill and cook for about 4-5 minutes on each side for medium-rare, or adjust the cooking time to your desired level of doneness. Let the steak rest for a few minutes before slicing against the grain and serving.",Summarize the following article,"Grilling a flavorful steak involves seasoning generously, preheating the grill, cooking to desired doneness, and letting it rest before slicing."
3,"Creating a creamy homemade tomato soup is a comforting and simple process. Begin by heating olive oil in a large pot over medium heat. Add diced onions and minced garlic to the pot and cook until they're soft and fragrant. Next, add chopped fresh tomatoes, chicken or vegetable broth, and a sprig of fresh basil to the pot. Simmer the soup for about 20-30 minutes, or until the tomatoes are tender and falling apart. Remove the basil sprig and use an immersion blender to puree the soup until smooth. Season with salt and pepper to taste before serving.",Summarize the following article,"Creating homemade tomato soup includes sautéing onions and garlic, simmering with tomatoes and broth, pureeing until smooth, and seasoning to taste."
4,"To bake a decadent chocolate cake from scratch, start by preheating your oven to 350°F (175°C) and greasing and flouring two 9-inch round cake pans. In a large mixing bowl, cream together softened butter and granulated sugar until light and fluffy. Beat in eggs one at a time, making sure each egg is fully incorporated before adding the next. In a separate bowl, sift together all-purpose flour, cocoa powder, baking powder, baking soda, and salt. Divide the batter evenly between the prepared cake pans and bake for 25-30 minutes, or until a toothpick inserted into the center comes out clean.",Summarize the following article,"Baking a decadent chocolate cake requires creaming butter and sugar, beating in eggs and alternating dry ingredients with buttermilk before baking until done."


### Create an Evaluation Task

#### Documentation

Documentation and example usage for the `EvalTask` class:

In [10]:
# Print the `EvalTask` docstring using the `print_doc` helper defined earlier in this notebook.
print_doc(EvalTask)


EvalTask:
A class representing an EvalTask.

An Evaluation Tasks is defined to measure the model's ability to perform a
certain task in response to specific prompts or inputs. Evaluation tasks must
contain an evaluation dataset, and a list of metrics to evaluate. Evaluation
tasks help developers compare prompt templates, track experiments, compare
models and their settings, and assess the quality of the model's generated
text.

Dataset Details:

    Default dataset column names:
        * prompt_column_name: "prompt"
        * reference_column_name: "reference"
        * response_column_name: "response"
        * baseline_model_response_column_name: "baseline_model_response"

    Requirement for different use cases:
      * Bring-your-own-response (BYOR): You already have the data that you
          want to evaluate stored in the dataset. Response column name can be
          customized by providing `response_column_name` parameter, or in the
          `metric_column_mapping`. For BYOR

#### Design text prompt template

For more task-specific prompt guidance, see https://cloud.google.com/vertex-ai/generative-ai/docs/text/text-prompts.

In [11]:
# `{instruction}` and `{context}` are filled from the `eval_dataset` columns of
# the same name when the prompts are assembled for each row.
# NOTE(review): the context passages already end with a period, so the
# assembled prompt contains ".. Summary:" — confirm whether the extra "." is intended.
prompt_template = "{instruction}. Article: {context}. Summary:"
prompt_template

'{instruction}. Article: {context}. Summary:'

#### Define metrics


In [12]:
# Pointwise metrics computed for every generated response. Each one shows up in
# the results as a `<metric>/score` and a `<metric>/explanation` column.
metrics = [
    MetricPromptTemplateExamples.Pointwise.TEXT_QUALITY,
    MetricPromptTemplateExamples.Pointwise.FLUENCY,
    MetricPromptTemplateExamples.Pointwise.SAFETY,
    MetricPromptTemplateExamples.Pointwise.VERBOSITY,
]

#### Define EvalTask & Experiment


In [13]:
# Bundle the dataset and the metrics into a single task so that every model is
# evaluated under the same setup; runs are associated with EXPERIMENT_NAME.
summarization_eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=metrics,
    experiment=EXPERIMENT_NAME,
)

### Compare Gemini-1.5-Pro with Gemini-1.0-Pro

Evaluate the **Gemini-1.5-Pro** and **Gemini-1.0-Pro** models on the text summarization task defined above using the Gen AI Evaluation Service SDK.

#### Model settings


In [14]:
# Generation parameters shared by both models, so that only the model differs
# between evaluation runs.
generation_config = dict(max_output_tokens=128, temperature=0.4)

# Every harm category listed here gets the BLOCK_NONE threshold.
safety_settings = {
    category: HarmBlockThreshold.BLOCK_NONE
    for category in (
        HarmCategory.HARM_CATEGORY_UNSPECIFIED,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
    )
}

# NOTE(review): confirm that both model IDs below are still served in your
# project and region before running.

# Gemini-1.5-Pro
gemini_model_15 = GenerativeModel(
    "gemini-1.5-pro",
    safety_settings=safety_settings,
    generation_config=generation_config,
)

# Gemini-1.0-Pro
gemini_model_1 = GenerativeModel(
    "gemini-1.0-pro",
    safety_settings=safety_settings,
    generation_config=generation_config,
)

In [15]:
# Short label -> model under comparison. The evaluation loop embeds each label
# in the experiment run name and in the report title.
models = {
    "gemini-pro-1-5": gemini_model_15,
    "gemini-pro-1-0": gemini_model_1,
}

#### Running evaluation

In [16]:
# Collected as (title, summary_metrics, metrics_table) tuples, the shape the
# display and plotting helpers unpack.
eval_results = []
# One random suffix shared by all runs of this execution, so the runs for the
# different models can be matched up in the experiment.
run_id = generate_uuid()

for model_name, model in models.items():
    experiment_run_name = f"eval-{model_name}-{run_id}"

    # Same task and prompt template for every model; only `model` changes.
    eval_result = summarization_eval_task.evaluate(
        model=model,
        prompt_template=prompt_template,
        experiment_run_name=experiment_run_name,
    )

    eval_results.append(
        (f"Model {model_name}", eval_result.summary_metrics, eval_result.metrics_table)
    )

Associating projects/721521243942/locations/us-central1/metadataStores/default/contexts/my-eval-task-experiment-eval-gemini-pro-1-5-asmswwc0 to Experiment: my-eval-task-experiment


Logging Eval Experiment metadata: {'prompt_template': '{instruction}. Article: {context}. Summary:', 'model_name': 'publishers/google/models/gemini-1.5-pro', 'max_output_tokens': 128, 'temperature': 0.4, 'HARM_CATEGORY_UNSPECIFIED': 'BLOCK_NONE', 'HARM_CATEGORY_DANGEROUS_CONTENT': 'BLOCK_NONE', 'HARM_CATEGORY_HATE_SPEECH': 'BLOCK_NONE', 'HARM_CATEGORY_HARASSMENT': 'BLOCK_NONE', 'HARM_CATEGORY_SEXUALLY_EXPLICIT': 'BLOCK_NONE'}
Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
Generating a total of 5 responses from Gemini model gemini-1.5-pro.


100%|██████████| 5/5 [00:01<00:00,  2.76it/s]

All 5 responses are successfully generated from Gemini model gemini-1.5-pro.
Multithreaded Batch Inference took: 1.8189898220007308 seconds.
Computing metrics with a total of 20 Vertex Gen AI Evaluation Service API requests.



100%|██████████| 20/20 [00:22<00:00,  1.11s/it]

All 20 metric requests are successfully computed.
Evaluation Took:22.18446291098371 seconds





Associating projects/721521243942/locations/us-central1/metadataStores/default/contexts/my-eval-task-experiment-eval-gemini-pro-1-0-asmswwc0 to Experiment: my-eval-task-experiment


Logging Eval Experiment metadata: {'prompt_template': '{instruction}. Article: {context}. Summary:', 'model_name': 'publishers/google/models/gemini-1.0-pro', 'max_output_tokens': 128, 'temperature': 0.4, 'HARM_CATEGORY_UNSPECIFIED': 'BLOCK_NONE', 'HARM_CATEGORY_DANGEROUS_CONTENT': 'BLOCK_NONE', 'HARM_CATEGORY_HATE_SPEECH': 'BLOCK_NONE', 'HARM_CATEGORY_HARASSMENT': 'BLOCK_NONE', 'HARM_CATEGORY_SEXUALLY_EXPLICIT': 'BLOCK_NONE'}
Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
Generating a total of 5 responses from Gemini model gemini-1.0-pro.


100%|██████████| 5/5 [00:01<00:00,  2.95it/s]

All 5 responses are successfully generated from Gemini model gemini-1.0-pro.
Multithreaded Batch Inference took: 1.6976503909972962 seconds.
Computing metrics with a total of 20 Vertex Gen AI Evaluation Service API requests.



100%|██████████| 20/20 [00:21<00:00,  1.10s/it]

All 20 metric requests are successfully computed.
Evaluation Took:21.992216491984436 seconds





#### Evaluation Report

In [17]:
# Show the summary and per-row report for each evaluated model in turn.
# Note: this loop rebinds `eval_result` to the (title, summary, table) tuples.
for eval_result in eval_results:
    display_eval_report(eval_result)

## Model gemini-pro-1-5

### Summary Metrics

Unnamed: 0,row_count,text_quality/mean,text_quality/std,fluency/mean,fluency/std,safety/mean,safety/std,verbosity/mean,verbosity/std
0,5.0,5.0,0.0,5.0,0.0,1.0,0.0,0.8,0.447214


### Report Metrics

Unnamed: 0,context,instruction,reference,prompt,response,text_quality/explanation,text_quality/score,fluency/explanation,fluency/score,safety/explanation,safety/score,verbosity/explanation,verbosity/score
0,"To make a classic spaghetti carbonara, start by bringing a large pot of salted water to a boil. While the water is heating up, cook pancetta or guanciale in a skillet with olive oil over medium heat until it's crispy and golden brown. Once the pancetta is done, remove it from the skillet and set it aside. In the same skillet, whisk together eggs, grated Parmesan cheese, and black pepper to make the sauce. When the pasta is cooked al dente, drain it and immediately toss it in the skillet with the egg mixture, adding a splash of the pasta cooking water to create a creamy sauce.",Summarize the following article,"The process of making spaghetti carbonara involves boiling pasta, crisping pancetta or guanciale, whisking together eggs and Parmesan cheese, and tossing everything together to create a creamy sauce.","Summarize the following article. Article: To make a classic spaghetti carbonara, start by bringing a large pot of salted water to a boil. While the water is heating up, cook pancetta or guanciale in a skillet with olive oil over medium heat until it's crispy and golden brown. Once the pancetta is done, remove it from the skillet and set it aside. In the same skillet, whisk together eggs, grated Parmesan cheese, and black pepper to make the sauce. When the pasta is cooked al dente, drain it and immediately toss it in the skillet with the egg mixture, adding a splash of the pasta cooking water to create a creamy sauce.. Summary:","This article provides a concise guide to making classic spaghetti carbonara. It emphasizes cooking pancetta or guanciale until crispy, then using the rendered fat to create a creamy sauce by whisking eggs, Parmesan cheese, and black pepper. The hot pasta is tossed directly into this sauce along with a bit of pasta water, ensuring a silky texture. 
\n","Coherence: The response presents the steps involved in making spaghetti carbonara clearly and logically.\nFluency: The response demonstrates excellent fluency, using appropriate vocabulary and grammar.\nInstruction following: The response accurately summarizes the provided text about preparing spaghetti carbonara.\nGroundedness: The response stays entirely grounded, deriving all information from the provided text.\nVerbosity: The response provides a concise and effective summary without unnecessary details, presenting all the key steps involved in the process.",5.0,"STEP 1: Assess grammar correctness: The response is free of grammatical errors. The punctuation, capitalization, and sentence structures are correct. STEP 2: Assess word choice and flow: The word choice is specific, precise, and appropriate for explaining a cooking process, such as ""rendered fat,"" ""creamy sauce,"" and ""silky texture."" The flow is smooth and logical, guiding the reader through the steps. STEP 3: Assess overall cohesion: The summary is well-organized and coherent, with a clear beginning, middle, and end. It uses transition words like ""then"" and ""along with"" effectively to connect the steps in the process. The concise and focused nature of the writing enhances readability.",5.0,"The response is safe as it accurately summarizes how to make spaghetti carbonara. It does not contain any toxic language or content that promotes harm, discrimination, or exploitation, including hate speech, harassment, sexually explicit material, or dangerous activities.",1.0,"STEP 1: Assess completeness: The response covers all the main points of the recipe, including cooking the pancetta, making the sauce, and combining it with the pasta.\nSTEP 2: Assess conciseness: While generally concise, there might be minor improvements in word choice. For example, ""It emphasizes cooking..."" could be more direct. 
The phrase ""ensuring a silky texture"" is slightly redundant as the creamy texture has already been mentioned.\nSTEP 3: Assess overall balance: The response provides a good summary but could be slightly more concise by removing a few words, like ""directly"" and perhaps combining the last two sentences into one. Therefore, it veers slightly towards verbosity but not enough to be considered overly wordy.",1.0
1,"Preparing a perfect risotto requires patience and attention to detail. Begin by heating butter in a large, heavy-bottomed pot over medium heat. Add finely chopped onions and minced garlic to the pot, and cook until they're soft and translucent, about 5 minutes. Next, add Arborio rice to the pot and cook, stirring constantly, until the grains are coated with the butter and begin to toast slightly. Pour in a splash of white wine and cook until it's absorbed. From there, gradually add hot chicken or vegetable broth to the rice, stirring frequently, until the risotto is creamy and the rice is tender with a slight bite.",Summarize the following article,"Preparing risotto entails sautéing onions and garlic, toasting Arborio rice, adding wine and broth gradually, and stirring until creamy and tender.","Summarize the following article. Article: Preparing a perfect risotto requires patience and attention to detail. Begin by heating butter in a large, heavy-bottomed pot over medium heat. Add finely chopped onions and minced garlic to the pot, and cook until they're soft and translucent, about 5 minutes. Next, add Arborio rice to the pot and cook, stirring constantly, until the grains are coated with the butter and begin to toast slightly. Pour in a splash of white wine and cook until it's absorbed. From there, gradually add hot chicken or vegetable broth to the rice, stirring frequently, until the risotto is creamy and the rice is tender with a slight bite.. Summary:","Making perfect risotto is a labor of love! Start by sautéing chopped onions and garlic in butter until softened. Add Arborio rice and toast it lightly. Deglaze the pot with white wine, then gradually stir in hot broth until the risotto is creamy and the rice is cooked al dente. 
\n","Coherence: The response presents ideas in a logical and organized manner, with clear transitions, making it easy to follow and understand.\nFluency: The text flows smoothly and naturally, adhering to grammatical rules and using appropriate vocabulary.\nInstruction following: The response demonstrates a clear understanding of the task instructions, satisfying all of the instruction's requirements.\nGroundedness: The response contains information included only in the context. The response does not reference any outside information.\nVerbosity: The response is appropriately concise, providing sufficient detail without using complex language to thoroughly address the prompt without being overly wordy or excessively brief.",5.0,"STEP 1: The response uses correct grammar and demonstrates proper sentence structure, punctuation, and capitalization. STEP 2: Word choice is varied, accurate, and appropriate for the context. The summary includes precise cooking terms like ""sautéing,"" ""deglaze,"" and ""al dente."" Phrases such as ""labor of love"" add a descriptive element while keeping the tone concise and engaging. The flow is natural and easy to follow, creating an effective summary of the risotto-making process. Transitions between steps are implied and enhance readability. STEP 3: The response presents a cohesive summary. Information is presented logically and concisely, summarizing the key actions involved in preparing risotto.",5.0,The response is safe. It provides a summary of making risotto and does not contain any unsafe content.,1.0,"STEP 1: The response includes all the essential steps for making risotto. It covers the ingredients, order of operations, and desired outcome. STEP 2: The response is generally concise, but some phrases could be slightly more efficient. For example, ""a labor of love"" is a slightly verbose way to introduce the process. 
STEP 3: Overall, the response provides a good summary but could benefit from minor trimming of unnecessary phrases. The response includes most of the details, but omits a couple of details, such as using a large, heavy-bottomed pot and cooking times. While slightly verbose, the response is still easily understood. Therefore, it is rated as somewhat verbose.",1.0
2,"For a flavorful grilled steak, start by choosing a well-marbled cut of beef like ribeye or New York strip. Season the steak generously with kosher salt and freshly ground black pepper on both sides, pressing the seasoning into the meat. Preheat a grill to high heat and brush the grates with oil to prevent sticking. Place the seasoned steak on the grill and cook for about 4-5 minutes on each side for medium-rare, or adjust the cooking time to your desired level of doneness. Let the steak rest for a few minutes before slicing against the grain and serving.",Summarize the following article,"Grilling a flavorful steak involves seasoning generously, preheating the grill, cooking to desired doneness, and letting it rest before slicing.","Summarize the following article. Article: For a flavorful grilled steak, start by choosing a well-marbled cut of beef like ribeye or New York strip. Season the steak generously with kosher salt and freshly ground black pepper on both sides, pressing the seasoning into the meat. Preheat a grill to high heat and brush the grates with oil to prevent sticking. Place the seasoned steak on the grill and cook for about 4-5 minutes on each side for medium-rare, or adjust the cooking time to your desired level of doneness. Let the steak rest for a few minutes before slicing against the grain and serving.. Summary:","To grill a delicious steak, choose a well-marbled cut like ribeye or New York strip. Season generously with salt and pepper, pressing it into the meat. Preheat your grill to high, oil the grates, and cook the steak for 4-5 minutes per side for medium-rare. Adjust cooking time as needed. Let the steak rest before slicing and serving. \n","Coherence: The response presents information logically and uses clear transitions.\nFluency: The writing is smooth, natural, and grammatically correct. 
\nInstruction Following: The response follows all instructions, summarizing the article effectively.\nGroundedness: The response is fully grounded, using only information from the context.\nVerbosity: The response is perfectly concise, providing all key information without being overly verbose.\nOverall, the response is an excellent summary that fulfills all instructions.",5.0,"STEP 1: Assess grammar correctness: The response is grammatically correct and free from errors. STEP 2: Assess word choice and flow: The response uses clear and concise language with no awkward phrasing. It uses words that fit the context appropriately, and the ideas flow logically. STEP 3: Assess overall cohesion: The entire summary reads smoothly, logically, and has a natural, seamless flow with smooth transitions between sentences. Each sentence logically leads to the next, creating a cohesive overall summary of how to grill steak.",5.0,"The response provides a safe summary of how to grill a steak. It does not contain any dangerous, harmful, or explicit content.",1.0,"STEP 1: The response covers all the key points for grilling a flavorful steak, such as choosing the right cut, seasoning, preheating the grill, and letting the steak rest. All necessary information is present.\nSTEP 2: The response is already quite concise. However, some phrases could be slightly more compact, like changing ""Preheat your grill to high"" to ""Preheat grill to high"" or omitting ""Adjust cooking time as needed"" as it's implied.\nSTEP 3: Overall, the balance is good, leaning slightly towards wordiness but not excessively. The slight trimming of words won't significantly impact the clarity and will enhance the summary's conciseness.",1.0
3,"Creating a creamy homemade tomato soup is a comforting and simple process. Begin by heating olive oil in a large pot over medium heat. Add diced onions and minced garlic to the pot and cook until they're soft and fragrant. Next, add chopped fresh tomatoes, chicken or vegetable broth, and a sprig of fresh basil to the pot. Simmer the soup for about 20-30 minutes, or until the tomatoes are tender and falling apart. Remove the basil sprig and use an immersion blender to puree the soup until smooth. Season with salt and pepper to taste before serving.",Summarize the following article,"Creating homemade tomato soup includes sautéing onions and garlic, simmering with tomatoes and broth, pureeing until smooth, and seasoning to taste.","Summarize the following article. Article: Creating a creamy homemade tomato soup is a comforting and simple process. Begin by heating olive oil in a large pot over medium heat. Add diced onions and minced garlic to the pot and cook until they're soft and fragrant. Next, add chopped fresh tomatoes, chicken or vegetable broth, and a sprig of fresh basil to the pot. Simmer the soup for about 20-30 minutes, or until the tomatoes are tender and falling apart. Remove the basil sprig and use an immersion blender to puree the soup until smooth. Season with salt and pepper to taste before serving.. Summary:","This article provides a simple recipe for creamy homemade tomato soup. You start by sautéing onions and garlic in olive oil. Chopped tomatoes, broth (chicken or vegetable), and basil are then simmered until the tomatoes break down. After removing the basil, an immersion blender creates a smooth consistency. Finally, the soup is seasoned to taste with salt and pepper. 
\n","Coherence: The response is coherent and easy to follow, logically presenting the steps involved in making tomato soup.\nFluency: The response demonstrates excellent fluency with natural language and proper grammar.\nInstruction Following: The response accurately summarizes the provided recipe, fulfilling the prompt's instructions.\nGroundedness: The response stays entirely grounded, only including information present in the provided text.\nVerbosity: The response maintains appropriate conciseness, effectively summarizing the recipe without unnecessary detail.\nOverall, the response is exceptionally well-written and closely adheres to the prompt's requirements.",5.0,"STEP 1: Grammar correctness: The response demonstrates perfect grammar, with accurate punctuation, capitalization, and sentence structure. There are no grammatical errors present.\nSTEP 2: Word choice and flow: The word choices are accurate and concise. The sentences flow logically and smoothly, creating a clear and easy-to-understand summary of the recipe. Terms such as ""sautéing"" and ""creates a smooth consistency"" effectively convey the cooking process.\nSTEP 3: Overall cohesion: The response exhibits excellent cohesion. The steps are presented in a logical order, mirroring the original recipe instructions. The use of transitional words like ""then"" and ""finally"" further enhances the flow and readability of the summary.\nOverall, the response demonstrates complete fluency in its language use, resulting in a concise and well-written summary.",5.0,"The response is safe. It provides a summary of how to make tomato soup. It does not contain any unsafe content such as hate speech, harassment, dangerous or sexually explicit content.",1.0,"STEP 1: The response includes all the necessary steps and ingredients, including the type of broth, the simmering duration, and the seasoning instructions. 
Therefore, it effectively conveys the core process of the recipe.\nSTEP 2: The response uses precise language such as ""sautéing onions and garlic"" and ""immersion blender creates a smooth consistency,"" which add richness to the summary without unnecessary verbosity. The sentences are structured effectively, making each step in the recipe easily digestible.\nSTEP 3: The response achieves an excellent balance between detail and conciseness. It captures the essence of each step without going into excessive detail about quantities or cooking times, making it efficient and informative.",0.0
4,"To bake a decadent chocolate cake from scratch, start by preheating your oven to 350°F (175°C) and greasing and flouring two 9-inch round cake pans. In a large mixing bowl, cream together softened butter and granulated sugar until light and fluffy. Beat in eggs one at a time, making sure each egg is fully incorporated before adding the next. In a separate bowl, sift together all-purpose flour, cocoa powder, baking powder, baking soda, and salt. Divide the batter evenly between the prepared cake pans and bake for 25-30 minutes, or until a toothpick inserted into the center comes out clean.",Summarize the following article,"Baking a decadent chocolate cake requires creaming butter and sugar, beating in eggs and alternating dry ingredients with buttermilk before baking until done.","Summarize the following article. Article: To bake a decadent chocolate cake from scratch, start by preheating your oven to 350°F (175°C) and greasing and flouring two 9-inch round cake pans. In a large mixing bowl, cream together softened butter and granulated sugar until light and fluffy. Beat in eggs one at a time, making sure each egg is fully incorporated before adding the next. In a separate bowl, sift together all-purpose flour, cocoa powder, baking powder, baking soda, and salt. Divide the batter evenly between the prepared cake pans and bake for 25-30 minutes, or until a toothpick inserted into the center comes out clean.. Summary:","This article provides a concise guide to baking a classic chocolate cake. It outlines the steps starting from oven preheating and pan preparation, through combining wet ingredients (butter, sugar, eggs) and dry ingredients (flour, cocoa powder, leavening agents, salt) separately, to dividing the batter and baking until done. 
\n","Coherence: The response is coherent and well-organized, presenting the steps in a logical order.\nFluency: The response is fluent and uses appropriate vocabulary.\nInstruction following: The response accurately summarizes the main points of the article.\nGroundedness: The response stays grounded in the provided text, not referencing any outside information.\nVerbosity: The response is concise and to the point, summarizing the article effectively without unnecessary detail.\nOverall, the response effectively summarizes the given text, demonstrating clear coherence, fluency, and adherence to instructions. It provides a concise overview of the chocolate cake recipe without adding any extraneous information. Therefore, it receives a rating of 5, indicating a very good response.",5.0,"STEP 1: Grammar correctness: The response is free of grammatical errors. The sentence structure is clear and easy to understand. Punctuation and capitalization are used correctly.\nSTEP 2: Word choice and flow: Word choices are accurate and appropriate for a recipe summary. The sentences flow logically and smoothly, with good use of transition words like ""starting from"" and ""through.""\nSTEP 3: Cohesion: The entire response is cohesive and reads well. It accurately summarizes the main steps of the recipe in a clear and concise manner.\nOverall, the response demonstrates a high level of fluency, making it easy to understand and follow.",5.0,"The response is safe as it only describes the steps of baking a chocolate cake and does not contain any unsafe content such as hate speech, harassment, dangerous content, or sexually explicit content.",1.0,"STEP 1: The response provides all the necessary steps involved in baking a chocolate cake. It covers preheating, pan preparation, mixing wet ingredients, mixing dry ingredients, dividing the batter, and baking. STEP 2: It's concise. It provides enough detail, but it could be slightly more concise. 
It states the wet and dry ingredients in parentheses, but it would be more concise to just say ""combine wet and dry ingredients."" STEP 3: The response is easy to read, well-organized, and clear. It strikes a relatively good balance between providing enough detail and being concise.",1.0


## Model gemini-pro-1-0

### Summary Metrics

Unnamed: 0,row_count,text_quality/mean,text_quality/std,fluency/mean,fluency/std,safety/mean,safety/std,verbosity/mean,verbosity/std
0,5.0,3.4,1.140175,3.2,1.095445,1.0,0.0,1.0,0.0


### Report Metrics

Unnamed: 0,context,instruction,reference,prompt,response,text_quality/explanation,text_quality/score,fluency/explanation,fluency/score,safety/explanation,safety/score,verbosity/explanation,verbosity/score
0,"To make a classic spaghetti carbonara, start by bringing a large pot of salted water to a boil. While the water is heating up, cook pancetta or guanciale in a skillet with olive oil over medium heat until it's crispy and golden brown. Once the pancetta is done, remove it from the skillet and set it aside. In the same skillet, whisk together eggs, grated Parmesan cheese, and black pepper to make the sauce. When the pasta is cooked al dente, drain it and immediately toss it in the skillet with the egg mixture, adding a splash of the pasta cooking water to create a creamy sauce.",Summarize the following article,"The process of making spaghetti carbonara involves boiling pasta, crisping pancetta or guanciale, whisking together eggs and Parmesan cheese, and tossing everything together to create a creamy sauce.","Summarize the following article. Article: To make a classic spaghetti carbonara, start by bringing a large pot of salted water to a boil. While the water is heating up, cook pancetta or guanciale in a skillet with olive oil over medium heat until it's crispy and golden brown. Once the pancetta is done, remove it from the skillet and set it aside. In the same skillet, whisk together eggs, grated Parmesan cheese, and black pepper to make the sauce. When the pasta is cooked al dente, drain it and immediately toss it in the skillet with the egg mixture, adding a splash of the pasta cooking water to create a creamy sauce.. Summary:","## Spaghetti Carbonara Summary:\n\n**Ingredients:**\n\n* Pancetta or guanciale\n* Olive oil\n* Eggs\n* Grated Parmesan cheese\n* Black pepper\n* Pasta (spaghetti recommended)\n* Salt\n\n**Instructions:**\n\n1. Boil a large pot of salted water.\n2. While the water heats, cook pancetta/guanciale in olive oil until crispy. Remove and set aside.\n3. Whisk together eggs, cheese, and pepper in the same skillet.\n4. 
Cook pasta al dente, drain, and immediately toss with egg mixture and a splash of pasta water to create a creamy","Coherence: The response is organized logically, presenting the steps in the correct order. However, the last sentence is cut off abruptly, disrupting the flow and clarity.\nFluency: The writing is generally fluent, but the incomplete sentence impacts overall fluency negatively.\nInstruction following: The response attempts to summarize the recipe but deviates from a standard summary format by listing ingredients separately. While the instructions are present, the format is not ideal for a summary.\nGroundedness: All information provided is grounded in the given text.\nVerbosity: While concise, the abrupt ending makes the response feel incomplete and slightly less effective in conveying the entire process.",3.0,"STEP 1: Assess grammar correctness: The response has a few grammatical issues, such as an incomplete sentence at the end of the instructions (""to create a creamy..."") and a missing conjunction between ""crispy"" and ""golden brown"" in the first paragraph of the original article. STEP 2: Assess word choice and flow: The response uses clear and concise language to convey the cooking process. While it mostly flows well, some transitions could be improved. For instance, jumping directly from ""Remove and set aside"" to ""Whisk together..."" feels abrupt. STEP 3: Assess overall cohesion: The response maintains a consistent tone and provides a logical sequence of steps, but the incomplete sentence weakens its overall cohesion.",3.0,"The response provides a safe and accurate summary of the provided recipe. It does not contain any harmful, unsafe, or inappropriate content.",1.0,"The response provides a summary of the carbonara recipe. While the instructions are relatively concise and easy to follow, the inclusion of a separate ingredient list contributes to some verbosity. 
The ingredient list could be omitted without losing the core information, as the ingredients are implicitly mentioned within the cooking instructions. Although organized, the structure contributes to a slightly verbose response due to its separation into distinct sections. While clear, the response could be more concise by integrating the ingredients into the instructions themselves (e.g., ""Cook pancetta/guanciale in olive oil..."").",1.0
1,"Preparing a perfect risotto requires patience and attention to detail. Begin by heating butter in a large, heavy-bottomed pot over medium heat. Add finely chopped onions and minced garlic to the pot, and cook until they're soft and translucent, about 5 minutes. Next, add Arborio rice to the pot and cook, stirring constantly, until the grains are coated with the butter and begin to toast slightly. Pour in a splash of white wine and cook until it's absorbed. From there, gradually add hot chicken or vegetable broth to the rice, stirring frequently, until the risotto is creamy and the rice is tender with a slight bite.",Summarize the following article,"Preparing risotto entails sautéing onions and garlic, toasting Arborio rice, adding wine and broth gradually, and stirring until creamy and tender.","Summarize the following article. Article: Preparing a perfect risotto requires patience and attention to detail. Begin by heating butter in a large, heavy-bottomed pot over medium heat. Add finely chopped onions and minced garlic to the pot, and cook until they're soft and translucent, about 5 minutes. Next, add Arborio rice to the pot and cook, stirring constantly, until the grains are coated with the butter and begin to toast slightly. Pour in a splash of white wine and cook until it's absorbed. From there, gradually add hot chicken or vegetable broth to the rice, stirring frequently, until the risotto is creamy and the rice is tender with a slight bite.. Summary:","## Risotto Recipe Summary:\n\n**Key steps:**\n\n1. **Sauté aromatics:** Heat butter in a pot, then cook chopped onions and minced garlic until softened (5 minutes).\n2. **Toast rice:** Add Arborio rice and stir until coated and lightly toasted.\n3. **Deglaze with wine:** Pour in white wine and cook until absorbed.\n4. 
**Gradually add broth:** Slowly add hot broth, stirring frequently, until the rice is creamy and tender with a slight bite.\n\n**Important details:**\n\n* Use a large, heavy-bottomed pot.\n* Stir frequently throughout","Coherence: The response presents the information logically, following the steps of the recipe. Fluency: The language is smooth and easy to understand, using clear and concise phrasing. Instruction Following: The response accurately summarizes the given article, including all the essential steps. Groundedness: All information in the response is directly from the article, with no outside information added. Verbosity: The response is well-balanced, offering enough detail without being overly verbose or too brief. It effectively condenses the process into key steps and important details.",5.0,"STEP 1: Assess grammar correctness: The response uses correct grammar, punctuation, and capitalization throughout. There are no grammatical errors.\nSTEP 2: Assess word choice and flow: The word choice is precise and appropriate for a recipe summary. Terms like ""sauté,"" ""aromatics,"" ""deglaze,"" and ""heavy-bottomed pot"" are used correctly, indicating culinary knowledge. The numbered steps and bullet points create a clear and logical flow, making the instructions easy to follow.\nSTEP 3: Assess overall cohesion: The response presents a cohesive and concise summary of the risotto-making process. The formatting enhances readability, and the content is well-organized, covering all the essential steps.",5.0,"The response is safe. It summarizes how to cook risotto, which is not harmful or dangerous. It doesn't contain any hate speech, harassment, or sexually explicit content.",1.0,"STEP 1: The response covers all the necessary information and key points from the source text. All the steps required to make a perfect risotto are explained. 
STEP 2: While the response is well-organized, it could be more concise by combining steps 1 and 2 as 'Sautee aromatics and toast rice: Heat butter, cook onions and garlic until softened (5 minutes), add rice, and stir until coated and lightly toasted.' or not list out the 'Important details' section which essentially repeats the key instruction. This would streamline the instructions and reduce redundancy. STEP 3: The response leans towards slight verbosity due to the repetition of instructions/details. It provides comprehensive information but could be slightly more concise by integrating those instructions into the numbered list of instructions/steps.",1.0
2,"For a flavorful grilled steak, start by choosing a well-marbled cut of beef like ribeye or New York strip. Season the steak generously with kosher salt and freshly ground black pepper on both sides, pressing the seasoning into the meat. Preheat a grill to high heat and brush the grates with oil to prevent sticking. Place the seasoned steak on the grill and cook for about 4-5 minutes on each side for medium-rare, or adjust the cooking time to your desired level of doneness. Let the steak rest for a few minutes before slicing against the grain and serving.",Summarize the following article,"Grilling a flavorful steak involves seasoning generously, preheating the grill, cooking to desired doneness, and letting it rest before slicing.","Summarize the following article. Article: For a flavorful grilled steak, start by choosing a well-marbled cut of beef like ribeye or New York strip. Season the steak generously with kosher salt and freshly ground black pepper on both sides, pressing the seasoning into the meat. Preheat a grill to high heat and brush the grates with oil to prevent sticking. Place the seasoned steak on the grill and cook for about 4-5 minutes on each side for medium-rare, or adjust the cooking time to your desired level of doneness. Let the steak rest for a few minutes before slicing against the grain and serving.. Summary:","## Grilling the Perfect Steak: A Summary\n\nThis guide provides a simple yet effective method for grilling a flavorful steak. 
Here's a breakdown of the key steps:\n\n**Preparation:**\n\n* **Choose a well-marbled cut:** Opt for ribeye or New York strip for optimal flavor and tenderness.\n* **Season generously:** Use kosher salt and freshly ground black pepper on both sides, pressing it into the meat.\n\n**Grilling:**\n\n* **Preheat the grill to high heat.**\n* **Brush the grates with oil to prevent sticking.**\n* **Cook the steak for 4-5 minutes per","Instruction Following: The response follows all instructions in the prompt.\nCoherence: The response is well-organized and easy to follow, maintaining a logical flow of information.\nFluency: The text flows smoothly, using clear language and correct grammar.\nGroundedness: All information presented is grounded in the provided text.\nVerbosity: Although the response is slightly verbose, the added details enhance understanding and maintain appropriate conciseness. It attempts to create section headers but fails to fully execute them and ultimately makes the response less concise than it could be. The response ends abruptly and is incomplete. There is also an odd capitalization choice at the beginning of the ""Grilling"" section.",4.0,"STEP 1: Assess grammar correctness: The response has an incomplete sentence at the end. There is also awkward phrasing present, such as ""pressing it into the meat."" STEP 2: Assess word choice and flow: While the response uses clear words, the overall structure feels a bit disjointed, particularly with the incomplete sentence. Some word choices could be more concise. STEP 3: Assess overall cohesion: The response starts well, but the incomplete sentence breaks the flow and makes the summary feel unfinished. The excessive use of bolding also disrupts the reading experience.",3.0,"The response is safe as it provides instructions on grilling a steak. 
It does not contain any unsafe content such as hate speech, harassment, dangerous content, or sexually explicit content.",1.0,"STEP 1: Assess completeness: The response covers all the essential steps for grilling a steak, from choosing the cut to letting it rest. STEP 2: Assess conciseness: While the response is well-organized with bullet points, it uses slightly more words than necessary. Phrases like ""Here's a breakdown of the key steps"" and the section headings add to the word count without significantly enhancing understanding. It also cuts off mid-sentence at the end. STEP 3: Assess overall balance: The response could be more concise by eliminating redundant phrases and combining some sentences. For instance, the seasoning instructions could be a single point. The formatting adds to the verbosity.",1.0
3,"Creating a creamy homemade tomato soup is a comforting and simple process. Begin by heating olive oil in a large pot over medium heat. Add diced onions and minced garlic to the pot and cook until they're soft and fragrant. Next, add chopped fresh tomatoes, chicken or vegetable broth, and a sprig of fresh basil to the pot. Simmer the soup for about 20-30 minutes, or until the tomatoes are tender and falling apart. Remove the basil sprig and use an immersion blender to puree the soup until smooth. Season with salt and pepper to taste before serving.",Summarize the following article,"Creating homemade tomato soup includes sautéing onions and garlic, simmering with tomatoes and broth, pureeing until smooth, and seasoning to taste.","Summarize the following article. Article: Creating a creamy homemade tomato soup is a comforting and simple process. Begin by heating olive oil in a large pot over medium heat. Add diced onions and minced garlic to the pot and cook until they're soft and fragrant. Next, add chopped fresh tomatoes, chicken or vegetable broth, and a sprig of fresh basil to the pot. Simmer the soup for about 20-30 minutes, or until the tomatoes are tender and falling apart. Remove the basil sprig and use an immersion blender to puree the soup until smooth. Season with salt and pepper to taste before serving.. Summary:","## Creamy Homemade Tomato Soup Summary:\n\nThis recipe provides a simple and comforting way to create a delicious creamy tomato soup. \n\n**Here's a breakdown of the steps:**\n\n1. **Sauté:** Heat olive oil in a pot and sauté diced onions and minced garlic until softened and fragrant.\n2. **Simmer:** Add chopped tomatoes, broth, and a basil sprig to the pot. Simmer for 20-30 minutes, or until tomatoes are tender.\n3. **Puree:** Remove the basil sprig and use an immersion blender to puree the soup until smooth.\n4. **Season:**","Coherence: The response presents the steps involved in making the soup in a logical order. 
The numbered format enhances coherence. Fluency: While the individual sentences are fluent, the final bullet point ""Season"" is incomplete, disrupting the flow. Instruction Following: The response summarizes the provided recipe but does not follow the format requested. It should not have the title ""Creamy Homemade Tomato Soup Summary:"" or the ""Here's a breakdown of the steps:"" introduction. The text should only contain the summary. Groundedness: The response is grounded, providing a summary based only on the given recipe. Verbosity: The response is slightly verbose due to the added title and introduction but adequately concise in its summary of the soup-making process.",3.0,"STEP 1: Assess grammar correctness:\nThe response has some minor grammatical errors. For example, the last bullet point is incomplete, indicating a missing step in the seasoning process. The word ""Here's"" should be ""Here is.""\n\nSTEP 2: Assess word choice and flow:\nThe word choice is sufficient to convey the steps of the process. It can be improved with more descriptive words to make the summary more informative. While the steps flow logically, there are no connecting phrases that create a smoother transition between steps. The last step, ""season"" is incomplete, leaving the reader hanging. It appears there is a step missing.\n\nSTEP 3: Assess overall cohesion:\nOverall, the summary presents the process steps of the tomato soup recipe. The numbered bullet format is clear, which helps the reader understand the logical flow of the recipe. The incomplete step disrupts cohesion.\n\n",3.0,"The response is safe. It contains a recipe for creamy homemade tomato soup. It doesn't contain any hate speech, harassment, dangerous content, or sexually explicit content.",1.0,"STEP 1: Assess completeness: The response covers all the main points of the recipe adequately. It describes the ingredients, cooking methods, and the order of steps accurately. 
No necessary information is missing.\nSTEP 2: Assess conciseness: While the response is well-organized and easy to follow, it could be more concise. The introduction sentence ('This recipe provides a simple and comforting way to create a delicious creamy tomato soup.') could be removed to get straight to the point. Separating steps into Sauté, Simmer, Puree and Season is okay, but it's not strictly necessary. \nSTEP 3: Assess overall balance: Due to the extra introduction sentence and formatting, the response appears slightly more verbose than ideal. The core information is present, but the additional text makes it read longer than it needs to be. It's somewhat closer to ideal than too verbose.",1.0
4,"To bake a decadent chocolate cake from scratch, start by preheating your oven to 350°F (175°C) and greasing and flouring two 9-inch round cake pans. In a large mixing bowl, cream together softened butter and granulated sugar until light and fluffy. Beat in eggs one at a time, making sure each egg is fully incorporated before adding the next. In a separate bowl, sift together all-purpose flour, cocoa powder, baking powder, baking soda, and salt. Divide the batter evenly between the prepared cake pans and bake for 25-30 minutes, or until a toothpick inserted into the center comes out clean.",Summarize the following article,"Baking a decadent chocolate cake requires creaming butter and sugar, beating in eggs and alternating dry ingredients with buttermilk before baking until done.","Summarize the following article. Article: To bake a decadent chocolate cake from scratch, start by preheating your oven to 350°F (175°C) and greasing and flouring two 9-inch round cake pans. In a large mixing bowl, cream together softened butter and granulated sugar until light and fluffy. Beat in eggs one at a time, making sure each egg is fully incorporated before adding the next. In a separate bowl, sift together all-purpose flour, cocoa powder, baking powder, baking soda, and salt. Divide the batter evenly between the prepared cake pans and bake for 25-30 minutes, or until a toothpick inserted into the center comes out clean.. Summary:","## Summary of Chocolate Cake Recipe:\n\n**Ingredients:**\n\n* Softened butter\n* Granulated sugar\n* Eggs\n* All-purpose flour\n* Cocoa powder\n* Baking powder\n* Baking soda\n* Salt\n\n**Instructions:**\n\n1. Preheat oven to 350°F (175°C).\n2. Grease and flour two 9-inch round cake pans.\n3. Cream together softened butter and granulated sugar until light and fluffy.\n4. Beat in eggs one at a time, ensuring each is fully incorporated before adding the next.\n5. 
In a separate bowl, sift","Coherence: The response is organized logically, presenting the baking steps sequentially. Fluency: The writing is mostly fluent and grammatically correct. Some sentences are incomplete. Instruction following: The response attempts to summarize the recipe, but instead of a concise summary, it partially rewrites the recipe with an ingredients list and incomplete instructions. Groundedness: The response is grounded, using only information present in the text. Verbosity: The response is not concise, unnecessarily listing ingredients and rewriting the recipe steps. It should have provided a brief summary rather than reproducing parts of the recipe.",2.0,"STEP 1: Assess grammar correctness: There is an incomplete sentence at the end ""In a separate bowl, sift"". Also, there is awkward phrasing with the use of bolded words for ingredients and steps in the instructions. STEP 2: Assess word choice and flow: The word choice is generally accurate. The flow is disrupted by the incomplete sentences. STEP 3: Assess overall cohesion: The response has decent structure and includes the most important steps, but is ultimately not cohesive due to the incomplete sentence and formatting errors.",2.0,"The response is safe. It contains instructions on how to bake a chocolate cake, which is a harmless activity. It does not contain any hate speech, harassment, dangerous content, or sexually explicit content.",1.0,"The response is somewhat verbose. While it accurately summarizes the baking process, it separates the ingredients from the instructions which introduces a bit of redundancy by having to name ingredients like butter, sugar, and eggs more than once. It would have been more concise to integrate the ingredients into the instructions themselves. 
Furthermore, it repeats some details from the original text, such as ""ensuring each is fully incorporated."" An ideal summary would integrate ingredient mentions within the steps and avoid such repetitions to enhance conciseness. It could also simplify by saying, ""Mix dry ingredients in a separate bowl.""",1.0


### Explanations for model-based metrics

In [18]:
# Show the "fluency" explanations for every collected evaluation result.
for eval_result in eval_results:
    # NOTE(review): index 2 is presumably the per-row report table of each
    # entry — confirm against the cell that builds `eval_results`.
    report_table = eval_result[2]
    display_explanations(report_table, metrics=["fluency"])

### Compare Eval Results


In [46]:
# Plot the `<metric>/mean` summary values of the collected evaluation results.
plot_radar_plot(
    eval_results,
    metrics=[
        "fluency/mean",
        "text_quality/mean",
        "safety/mean",
        "verbosity/mean",
    ],
)

In [47]:
# Show the runs recorded for this evaluation task's experiment as a table
# (one row per run with its state, parameters, and summary metrics).
summarization_eval_task.display_runs()

Unnamed: 0,experiment_name,run_name,run_type,state,param.HARM_CATEGORY_DANGEROUS_CONTENT,param.model_name,param.HARM_CATEGORY_HATE_SPEECH,param.prompt_template,param.HARM_CATEGORY_HARASSMENT,param.temperature,...,metric.row_count,metric.pairwise_text_quality/baseline_model_win_rate,metric.verbosity/mean,metric.fluency/std,metric.safety/std,metric.text_quality/std,metric.verbosity/std,metric.safety/mean,metric.fluency/mean,metric.text_quality/mean
0,my-eval-task-experiment,47db0e56-c71c-42bf-b59f-0a41f51a06b9,system.ExperimentRun,COMPLETE,BLOCK_NONE,publishers/google/models/gemini-1.5-pro,BLOCK_NONE,{instruction}. Article: {context}. Summary:,BLOCK_NONE,0.4,...,5.0,0.2,,,,,,,,
1,my-eval-task-experiment,eval-gemini-pro-1-0-t2ljx00s,system.ExperimentRun,COMPLETE,BLOCK_NONE,publishers/google/models/gemini-1.0-pro,BLOCK_NONE,{instruction}. Article: {context}. Summary:,BLOCK_NONE,0.4,...,5.0,,1.0,0.547723,0.0,0.707107,0.0,1.0,2.6,3.0
2,my-eval-task-experiment,eval-gemini-pro-1-5-t2ljx00s,system.ExperimentRun,COMPLETE,BLOCK_NONE,publishers/google/models/gemini-1.5-pro,BLOCK_NONE,{instruction}. Article: {context}. Summary:,BLOCK_NONE,0.4,...,5.0,,0.8,0.0,0.0,0.447214,0.447214,1.0,5.0,4.8
3,my-eval-task-experiment,53ab9075-09e9-4a87-abe3-74db8cbdf3a8,system.ExperimentRun,COMPLETE,,,,,,,...,3.0,,,,,0.11547,,,,1.066667
4,my-eval-task-experiment,987acf22-ba44-47ac-bd39-5f394b7b0409,system.ExperimentRun,COMPLETE,,,,,,,...,3.0,,,,,0.57735,,,,1.333333
5,my-eval-task-experiment,0bf33c25-3658-45a9-afcc-84240f4c1572,system.ExperimentRun,COMPLETE,,,,,,,...,3.0,,,,,0.057735,,,,1.033333
6,my-eval-task-experiment,718426ec-0198-432e-93e6-b455bcdab005,system.ExperimentRun,COMPLETE,,,,,,,...,3.0,,,,,1.0,,,,0.0
7,my-eval-task-experiment,5f94393d-9562-4885-b0d8-dbec25b6a4fe,system.ExperimentRun,COMPLETE,,,,,,,...,3.0,,,,,0.57735,,,,1.666667
8,my-eval-task-experiment,4b6552b9-cd69-42d3-8d02-cf091a66cedb,system.ExperimentRun,FAILED,,,,,,,...,,,,,,,,,,
9,my-eval-task-experiment,bdc715cd-bd2a-425b-a240-845d832c6bbf,system.ExperimentRun,FAILED,,,,,,,...,,,,,,,,,,


## Compare Two Models Side-by-side (SxS)

To directly compare two models, you can define a `PairwiseMetric` within an `EvalTask` run. This approach allows for a head-to-head assessment of the models' performance.

In [30]:
from vertexai.evaluation import PairwiseMetric

In [31]:
# Side-by-side comparison uses two Gemini models that differ only in model name:
#   baseline_model  -> "gemini-1.0-pro" (the reference side of the comparison)
#   candidate_model -> "gemini-1.5-pro" (the model being evaluated against it)
# Both are built with the notebook's shared generation_config and
# safety_settings, so the model name is the only difference between them.
baseline_model, candidate_model = (
    GenerativeModel(
        model_name,
        generation_config=generation_config,
        safety_settings=safety_settings,
    )
    for model_name in ("gemini-1.0-pro", "gemini-1.5-pro")
)

In [50]:
# @title Create a "Pairwise Text Quality" metric
# Fetch the SDK's example judge prompt for the "pairwise_text_quality" metric.
text_quality_prompt_template = MetricPromptTemplateExamples.get_prompt_template(
    "pairwise_text_quality"
)

# Print (rather than echo) the template so its newlines render as real line
# breaks and the rubric can be read as-is.
print(text_quality_prompt_template)

# The pairwise metric bundles the judge prompt with the baseline model; the
# candidate model is not set here -- it is supplied later to `evaluate()`.
pairwise_text_quality_metric = PairwiseMetric(
    metric="pairwise_text_quality",
    metric_prompt_template=text_quality_prompt_template,
    baseline_model=baseline_model,
)



# Instruction
You are an expert evaluator. Your task is to evaluate the quality of the responses generated by two AI models. We will provide you with the user input and a pair of AI-generated responses (Response A and Response B). You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.
You will first judge responses individually, following the Rating Rubric and Evaluation Steps. Then you will give step-by-step explanations for your judgment, compare the results to declare the winner based on the Rating Rubric and Evaluation Steps.

# Evaluation
## Metric Definition
You will be assessing the Text Quality of each model's response, which measures how effectively the text conveys clear, accurate, and engaging information that directly addresses the user's prompt, considering factors like fluency, coherence, relevance, and conciseness.

## Criteria
Coherence: The re

In [51]:
# @title Evaluation tasks
# Build the evaluation task for the side-by-side comparison: it runs over
# `eval_dataset` with the pairwise text-quality metric as its only metric, and
# is attached to the experiment named by EXPERIMENT_NAME.
pairwise_text_quality_eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=[pairwise_text_quality_metric],
    experiment=EXPERIMENT_NAME,
)


In [52]:
# Echo the prompt template that will be passed to `evaluate()` below, so the
# {instruction} and {context} placeholders are visible before the run.
prompt_template

'{instruction}. Article: {context}. Summary:'

In [53]:
# @title Specify candidate model for pairwise comparison
pairwise_text_quality_result = pairwise_text_quality_eval_task.evaluate(
    model=candidate_model,
    prompt_template=prompt_template,
)

INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/721521243942/locations/us-central1/metadataStores/default/contexts/my-eval-task-experiment-bc7aad1f-7ce6-459e-9dcd-6d2935feb003 to Experiment: my-eval-task-experiment


INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': '{instruction}. Article: {context}. Summary:', 'model_name': 'publishers/google/models/gemini-1.5-pro', 'max_output_tokens': 128, 'temperature': 0.4, 'HARM_CATEGORY_UNSPECIFIED': 'BLOCK_NONE', 'HARM_CATEGORY_DANGEROUS_CONTENT': 'BLOCK_NONE', 'HARM_CATEGORY_HATE_SPEECH': 'BLOCK_NONE', 'HARM_CATEGORY_HARASSMENT': 'BLOCK_NONE', 'HARM_CATEGORY_SEXUALLY_EXPLICIT': 'BLOCK_NONE'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Generating a total of 5 responses from Gemini model gemini-1.5-pro.
100%|██████████| 5/5 [00:02<00:00,  2.14it/s]
INFO:vertexai.evaluation._evaluation:All 5 responses are successfully generated from Gemini model gemini-1.5-pro.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference too

In [54]:
# Render the pairwise run as a report. The helper is called with a single
# tuple argument: (report title, aggregate summary metrics, per-row table).
display_eval_report(
    (
        "Side-by-side EvalTask",
        pairwise_text_quality_result.summary_metrics,
        pairwise_text_quality_result.metrics_table,
    )
)

## Side-by-side EvalTask

### Summary Metrics

Unnamed: 0,row_count,pairwise_text_quality/candidate_model_win_rate,pairwise_text_quality/baseline_model_win_rate
0,5.0,0.8,0.2


### Report Metrics

Unnamed: 0,context,instruction,reference,prompt,response,baseline_model_response,pairwise_text_quality/explanation,pairwise_text_quality/pairwise_choice
0,"To make a classic spaghetti carbonara, start b...",Summarize the following article,The process of making spaghetti carbonara invo...,Summarize the following article. Article: To m...,This article provides a concise guide to makin...,## Spaghetti Carbonara Summary:\n\n**Ingredien...,CANDIDATE response is a better summary of the ...,CANDIDATE
1,Preparing a perfect risotto requires patience ...,Summarize the following article,Preparing risotto entails sautéing onions and ...,Summarize the following article. Article: Prep...,Making perfect risotto is a labor of love! Sta...,## Risotto Recipe Summary:\n\nThis summary out...,CANDIDATE response provides a concise and accu...,CANDIDATE
2,"For a flavorful grilled steak, start by choosi...",Summarize the following article,Grilling a flavorful steak involves seasoning ...,Summarize the following article. Article: For ...,"To grill a delicious steak, choose a well-marb...",## Grilled Steak Summary:\n\n**Ingredients:**\...,CANDIDATE response is a concise and accurate s...,CANDIDATE
3,Creating a creamy homemade tomato soup is a co...,Summarize the following article,Creating homemade tomato soup includes sautéin...,Summarize the following article. Article: Crea...,This short article provides a simple recipe fo...,## Creamy Homemade Tomato Soup Summary:\n\nThi...,BASELINE response is slightly better because i...,BASELINE
4,To bake a decadent chocolate cake from scratch...,Summarize the following article,Baking a decadent chocolate cake requires crea...,Summarize the following article. Article: To b...,This article provides a concise guide to bakin...,## Baking a Decadent Chocolate Cake from Scrat...,"CANDIDATE response provides a complete, concis...",CANDIDATE


In [55]:
# Show the per-row metrics table on its own. This is the same frame that was
# passed to display_eval_report in the previous cell.
pairwise_text_quality_result.metrics_table

Unnamed: 0,context,instruction,reference,prompt,response,baseline_model_response,pairwise_text_quality/explanation,pairwise_text_quality/pairwise_choice
0,"To make a classic spaghetti carbonara, start b...",Summarize the following article,The process of making spaghetti carbonara invo...,Summarize the following article. Article: To m...,This article provides a concise guide to makin...,## Spaghetti Carbonara Summary:\n\n**Ingredien...,CANDIDATE response is a better summary of the ...,CANDIDATE
1,Preparing a perfect risotto requires patience ...,Summarize the following article,Preparing risotto entails sautéing onions and ...,Summarize the following article. Article: Prep...,Making perfect risotto is a labor of love! Sta...,## Risotto Recipe Summary:\n\nThis summary out...,CANDIDATE response provides a concise and accu...,CANDIDATE
2,"For a flavorful grilled steak, start by choosi...",Summarize the following article,Grilling a flavorful steak involves seasoning ...,Summarize the following article. Article: For ...,"To grill a delicious steak, choose a well-marb...",## Grilled Steak Summary:\n\n**Ingredients:**\...,CANDIDATE response is a concise and accurate s...,CANDIDATE
3,Creating a creamy homemade tomato soup is a co...,Summarize the following article,Creating homemade tomato soup includes sautéin...,Summarize the following article. Article: Crea...,This short article provides a simple recipe fo...,## Creamy Homemade Tomato Soup Summary:\n\nThi...,BASELINE response is slightly better because i...,BASELINE
4,To bake a decadent chocolate cake from scratch...,Summarize the following article,Baking a decadent chocolate cake requires crea...,Summarize the following article. Article: To b...,This article provides a concise guide to bakin...,## Baking a Decadent Chocolate Cake from Scrat...,"CANDIDATE response provides a complete, concis...",CANDIDATE


In [56]:
# Inspect an individual comparison for the "pairwise_text_quality" metric.
# NOTE(review): sample_pairwise_result is a helper defined elsewhere in this
# notebook; presumably `n` is the number of rows to sample -- confirm there.
sample_pairwise_result(
    pairwise_text_quality_result, metric="pairwise_text_quality", n=1
)

In [57]:
# Visualize the outcome of the side-by-side run for "pairwise_text_quality".
# NOTE(review): display_pairwise_win_rate is defined elsewhere in this notebook;
# presumably it plots candidate vs. baseline win rates -- confirm there.
display_pairwise_win_rate(pairwise_text_quality_result, metric="pairwise_text_quality")