In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

 # Gen AI Evaluation Service           

In [1]:
# @title Install Vertex AI Python SDK
%pip install --upgrade --user --quiet google-cloud-aiplatform[evaluation]

In [2]:
# @title Set GCP information
PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
EXPERIMENT_NAME = "my-eval-task-experiment"  # @param {type:"string"}

In [3]:
# @title Authentication to GCP
import sys

if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK

In [4]:
# @title Initialize Vertex AI SDK
import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

In [5]:
# @title Import libraries
import pandas as pd

# Main
from vertexai.evaluation import EvalTask, PointwiseMetric, PointwiseMetricPromptTemplate

In [20]:
# @title Evaluation prompt
metric_prompt_template = """

  # Instruction
  You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.
  We will provide you with the user input and an AI-generated responses.
  You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.
  You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step-by-step explanations for your rating, and only choose ratings from the Rating Rubric.

  # Evaluation
  ## Metric Definition
  You will be assessing fluency, which measures language mastery of the model's response based on the user prompt.

  ## Criteria
  Fluency: The text is free of grammatical errors, employs varied sentence structures, and maintains a consistent tone and style, resulting in a smooth and natural flow that is easy to understand.

  ## Rating Rubric
  5: (Completely fluent). The response is free of grammatical errors, demonstrates nuanced word choice, and has a natural, seamless flow.
  4: (Mostly fluent). The response has very few, if any, minor grammatical errors. Word choice is clear, and sentences generally flow well.
  3: (Somewhat fluent). The response has grammatical errors present, which may cause some difficulty for the reader. Word choice is mostly appropriate, but some awkward phrasing or word repetition may exist.
  2: (Somewhat inarticulate). The response has frequent grammatical errors make the writing difficult to understand. Sentence structure is often awkward, and there's little sense of flow.
  1: (Inarticulate). The response is riddled with grammatical issues, rendering it incomprehensible in parts. Word choices may be very limited or inaccurate.

  ## Evaluation Steps
  STEP 1: Assess grammar correctness: Identify any specific errors in the response's sentence structure, verb usage, subject-verb agreement, punctuation, and capitalization.
  STEP 2: Assess word choice and flow: Examine the response's sentence structure and how the writing moves from one idea to the next. Are words accurate and well-suited to the context?
  STEP 3: Assess overall cohesion: Does the entire response read logically and smoothly, with appropriate transitions?

  # User Inputs and AI-generated Response
  ## User Inputs
  ### Prompt
  {prompt}

  ## AI-generated Response
  {response}

"""


In [21]:
text_quality = PointwiseMetric(
    metric="text_quality",
    metric_prompt_template=metric_prompt_template,
)

In [23]:
# @title Prepare evaluation dataset
data = [
    # An example of good text_quality
    ["Life is a rollercoaster, full of ups and downs, but it's the thrill that keeps us coming back for more!", " "],
    # An example of medium text_quality
    ["The weather is nice today, not too hot, not too cold.", " "],
    # An example of poor text_quality
    ["The weather is, you know, whatever.", " "],
]

eval_dataset = pd.DataFrame(data, columns=['prompt', 'response'])
eval_dataset

Unnamed: 0,prompt,response
0,"Life is a rollercoaster, full of ups and downs...",
1,"The weather is nice today, not too hot, not to...",
2,"The weather is, you know, whatever.",


In [24]:
# @title Run evaluation
eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=[text_quality],
    experiment=EXPERIMENT_NAME
)

eval_result = eval_task.evaluate()

INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/721521243942/locations/us-central1/metadataStores/default/contexts/my-eval-task-experiment-53ab9075-09e9-4a87-abe3-74db8cbdf3a8 to Experiment: my-eval-task-experiment


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 3 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 3/3 [00:05<00:00,  1.81s/it]
INFO:vertexai.evaluation._evaluation:All 3 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:5.45297934600012 seconds


In [25]:
# @title Results
eval_result.metrics_table

Unnamed: 0,prompt,response,text_quality/explanation,text_quality/score
0,"Life is a rollercoaster, full of ups and downs...",,STEP 1: Assess grammar correctness: The respo...,1.2
1,"The weather is nice today, not too hot, not to...",,STEP 1: Assess grammar correctness: The respon...,1.0
2,"The weather is, you know, whatever.",,STEP 1: Assess grammar correctness: The AI mod...,1.0


In [None]:
# @title Clean up
from google.cloud import aiplatform

aiplatform.ExperimentRun(
    run_name=eval_result.metadata["experiment_run"],
    experiment=eval_result.metadata["experiment"],
).delete()