## Prepare Eval Dataset

In [1]:
import pandas as pd
import os
# Respect credentials that are already configured in the environment; fall back
# to the original local service-account key only when the variable is unset.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/renoldnatasasmita/Datalabs/Project/GenaiSpecialization/sa.json",
)

# The source file is parsed as JSON Lines (one JSON record per line).
PATH = 'final_qa_val.jsonl'
examples = pd.read_json(path_or_buf=PATH, lines=True)
# Helper that extracts the question and the answer from one dataset row
def extract_qa(row):
    """Split one row's ``contents`` into a question and an optional answer.

    Args:
        row: A mapping (e.g. a DataFrame row) whose ``'contents'`` entry is a
            sequence of turns; each turn holds its text at ``['parts'][0]['text']``.

    Returns:
        pd.Series with two entries: ``'content'`` (text of the first turn) and
        ``'response_a'`` (text of the second turn, or ``None`` when there is
        only one turn).
    """
    turns = row['contents']
    # The first turn carries the user's question.
    user_text = turns[0]['parts'][0]['text']
    # The second turn, when present, carries the answer.
    if len(turns) > 1:
        answer_text = turns[1]['parts'][0]['text']
    else:
        answer_text = None
    return pd.Series({'content': user_text, 'response_a': answer_text})

# Run the extraction on every row, then append the new columns to the original DataFrame.
qa_df = examples.apply(func=extract_qa, axis="columns")
examples = pd.concat(objs=[examples, qa_df], axis="columns")

In [2]:
# Preview the first rows to check the extracted 'content' and 'response_a' columns.
examples.head()

Unnamed: 0,contents,content,response_a
0,"[{'role': 'user', 'parts': [{'text': 'What is ...",What is the purpose of hydraulic storage in beds?,Hydraulic storage in beds uses a lift mechanis...
1,"[{'role': 'user', 'parts': [{'text': ""Can you ...",Can you explain the term 'Engineered Wood'?,'Engineered Wood' refers to a composite materi...
2,"[{'role': 'user', 'parts': [{'text': 'What doe...",What does GSM in bed linens mean?,"GSM stands for grams per square meter, indicat..."
3,"[{'role': 'user', 'parts': [{'text': 'How does...",How does a recliner mechanism work in chairs?,A recliner mechanism allows the chair's backre...
4,"[{'role': 'user', 'parts': [{'text': 'What is ...",What is the difference between PVC and PU leat...,PVC leather is more durable and water-resistan...


In [3]:
# errors='ignore' keeps this cell re-runnable: on a second execution the raw
# 'contents' column is already gone and a plain drop would raise KeyError.
examples = examples.drop(columns=['contents'], errors='ignore')
# Copy the DataFrame index into an explicit 'id' column.
examples['id'] = examples.index
# Empty-string placeholder; overwritten with the tuned model's answers further down.
examples['response_b'] = ""
examples.head()

Unnamed: 0,content,response_a,id,response_b
0,What is the purpose of hydraulic storage in beds?,Hydraulic storage in beds uses a lift mechanis...,0,
1,Can you explain the term 'Engineered Wood'?,'Engineered Wood' refers to a composite materi...,1,
2,What does GSM in bed linens mean?,"GSM stands for grams per square meter, indicat...",2,
3,How does a recliner mechanism work in chairs?,A recliner mechanism allows the chair's backre...,3,
4,What is the difference between PVC and PU leat...,PVC leather is more durable and water-resistan...,4,


In [4]:
# Call Fine Tuned Model
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, SafetySetting, Part

# Generation parameters passed along with every message sent to the tuned model.
generation_config = dict(
    max_output_tokens=8192,
    temperature=1,
    top_p=0.95,
)

# One SafetySetting per harm category, each with its block threshold set to OFF.
safety_settings = [
    SafetySetting(
        category=harm_category,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    )
    for harm_category in (
        SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
]

# Lazily created client for the tuned endpoint, shared by every call below.
_tuned_model = None


def _get_tuned_model():
    """Initialise Vertex AI once and return the cached tuned-model client."""
    global _tuned_model
    if _tuned_model is None:
        vertexai.init(project="844021890758", location="asia-southeast1")
        _tuned_model = GenerativeModel(
            "projects/844021890758/locations/asia-southeast1/endpoints/5071110355023822848",
        )
    return _tuned_model


def multiturn_generate_content(question):
    """Send one question to the tuned endpoint and return the answer text.

    A fresh chat session is started for every call, so no history is shared
    between questions. The SDK initialisation and model construction are done
    only on the first call instead of once per question.

    Args:
        question: Prompt text forwarded unchanged to ``chat.send_message``.

    Returns:
        The ``text`` of the first part of the first candidate in the response.

    Note:
        Relies on the module-level ``generation_config`` and ``safety_settings``.
        Presumably raises ``IndexError`` when the response contains no
        candidates or parts — TODO confirm against the SDK.
    """
    chat = _get_tuned_model().start_chat()
    ans = chat.send_message(question, generation_config=generation_config, safety_settings=safety_settings)
    return ans.candidates[0].content.parts[0].text

# Ask the tuned model every question in the 'content' column and store its answers.
examples['response_b'] = examples['content'].map(multiturn_generate_content)
examples.head()

Unnamed: 0,content,response_a,id,response_b
0,What is the purpose of hydraulic storage in beds?,Hydraulic storage in beds uses a lift mechanis...,0,"There's no established concept of ""hydraulic s..."
1,Can you explain the term 'Engineered Wood'?,'Engineered Wood' refers to a composite materi...,1,"Engineered wood, also known as composite wood,..."
2,What does GSM in bed linens mean?,"GSM stands for grams per square meter, indicat...",2,GSM in bed linens stands for **Grams per Squar...
3,How does a recliner mechanism work in chairs?,A recliner mechanism allows the chair's backre...,3,A recliner mechanism allows a chair to smoothl...
4,What is the difference between PVC and PU leat...,PVC leather is more durable and water-resistan...,4,PVC and PU leather are both synthetic material...


In [5]:
# Rename to the evaluation schema: question -> 'prompt', dataset answer -> 'reference',
# tuned-model answer -> 'response'. The 'id' column is left unchanged.
eval_dataset = examples.rename(
    columns=dict(content="prompt", response_a="reference", response_b="response")
)
eval_dataset.head()

Unnamed: 0,prompt,reference,id,response
0,What is the purpose of hydraulic storage in beds?,Hydraulic storage in beds uses a lift mechanis...,0,"There's no established concept of ""hydraulic s..."
1,Can you explain the term 'Engineered Wood'?,'Engineered Wood' refers to a composite materi...,1,"Engineered wood, also known as composite wood,..."
2,What does GSM in bed linens mean?,"GSM stands for grams per square meter, indicat...",2,GSM in bed linens stands for **Grams per Squar...
3,How does a recliner mechanism work in chairs?,A recliner mechanism allows the chair's backre...,3,A recliner mechanism allows a chair to smoothl...
4,What is the difference between PVC and PU leat...,PVC leather is more durable and water-resistan...,4,PVC and PU leather are both synthetic material...


In [6]:
# Written as JSON Lines (one record per line); the evaluation section reads this file back with lines=True.
eval_dataset.to_json("eval_dataset.json", orient="records", lines=True)

## Start EVAL

In [1]:
import pandas as pd
import os
# Respect credentials that are already configured in the environment; fall back
# to the original local service-account key only when the variable is unset.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/renoldnatasasmita/Datalabs/Project/GenaiSpecialization/sa.json",
)
# Reload the prepared evaluation dataset (stored as JSON Lines).
eval_dataset = pd.read_json("eval_dataset.json", lines=True)
eval_dataset.head()

Unnamed: 0,prompt,reference,id,response
0,What is the purpose of hydraulic storage in beds?,Hydraulic storage in beds uses a lift mechanis...,0,"There's no established concept of ""hydraulic s..."
1,Can you explain the term 'Engineered Wood'?,'Engineered Wood' refers to a composite materi...,1,"Engineered wood, also known as composite wood,..."
2,What does GSM in bed linens mean?,"GSM stands for grams per square meter, indicat...",2,GSM in bed linens stands for **Grams per Squar...
3,How does a recliner mechanism work in chairs?,A recliner mechanism allows the chair's backre...,3,A recliner mechanism allows a chair to smoothl...
4,What is the difference between PVC and PU leat...,PVC leather is more durable and water-resistan...,4,PVC and PU leather are both synthetic material...


In [2]:
from vertexai.evaluation import (
    EvalTask,
    PairwiseMetric,
    PairwiseMetricPromptTemplate,
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
    MetricPromptTemplateExamples
)

In [3]:
# Two kinds of metrics: the ones selected by name ("bleu", "rouge_l_sum") and the
# built-in pointwise prompt templates for fluency and coherence.
metrics = ["bleu", "rouge_l_sum"] + [
    MetricPromptTemplateExamples.Pointwise.FLUENCY,
    MetricPromptTemplateExamples.Pointwise.COHERENCE,
]

In [4]:
# Bundle the dataset and the metric list into a single evaluation task tied to
# the named experiment.
# NOTE(review): the experiment name says "pairwise", but the metrics defined for
# this task are pointwise only — confirm the name is intentional.
eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=metrics,
    experiment="demo3-pairwise-experiment-eval-fine-tuned-model"
)

In [5]:
import vertexai
# NOTE(review): the dataset-preparation section initialises Vertex AI with project
# "844021890758"; presumably that is the project number of this same project — confirm.
vertexai.init(project="dla-gen-ai-specialization", location="asia-southeast1")

In [6]:
# Compute the configured metrics for the dataset and record the run under the experiment.
# NOTE(review): the run name is fixed, so re-running this cell may clash with an
# already existing run of the same name — confirm.
eval_result = eval_task.evaluate(
    experiment_run_name="demo3-pairwise-experiment-eval-fine-tuned-model"
)

Associating projects/844021890758/locations/asia-southeast1/metadataStores/default/contexts/demo3-pairwise-experiment-eval-fine-tuned-model-demo3-pairwise-experiment-eval-fine-tuned-model to Experiment: demo3-pairwise-experiment-eval-fine-tuned-model


Computing metrics with a total of 200 Vertex Gen AI Evaluation Service API requests.


100%|██████████| 200/200 [13:22<00:00,  4.01s/it]

All 200 metric requests are successfully computed.
Evaluation Took:802.615517875005 seconds





In [7]:
# Per-row results: the input columns plus a score per metric (and an explanation for fluency and coherence).
eval_result.metrics_table

Unnamed: 0,prompt,reference,id,response,bleu/score,rouge_l_sum/score,fluency/explanation,fluency/score,coherence/explanation,coherence/score
0,What is the purpose of hydraulic storage in beds?,Hydraulic storage in beds uses a lift mechanis...,0,"There's no established concept of ""hydraulic s...",0.006745,0.138365,STEP 1: Assess grammar correctness: The respon...,5.0,STEP 1: The purpose of the prompt is to unders...,5.0
1,Can you explain the term 'Engineered Wood'?,'Engineered Wood' refers to a composite materi...,1,"Engineered wood, also known as composite wood,...",0.010385,0.078029,STEP 1: Assess grammar correctness: The respon...,5.0,The AI response demonstrates a completely cohe...,5.0
2,What does GSM in bed linens mean?,"GSM stands for grams per square meter, indicat...",2,GSM in bed linens stands for **Grams per Squar...,0.060371,0.5,STEP 1: Assess grammar correctness: The respon...,5.0,STEP 1: The purpose of the prompt is to unders...,5.0
3,How does a recliner mechanism work in chairs?,A recliner mechanism allows the chair's backre...,3,A recliner mechanism allows a chair to smoothl...,0.01086,0.056795,STEP 1: Assess grammar correctness: The respon...,5.0,STEP 1: The purpose is to explain how a reclin...,5.0
4,What is the difference between PVC and PU leat...,PVC leather is more durable and water-resistan...,4,PVC and PU leather are both synthetic material...,0.012718,0.099217,STEP 1: Assess grammar correctness: The respon...,5.0,STEP 1: The purpose of the prompt is to unders...,5.0
5,What is the advantage of tempered glass in fur...,Tempered glass is stronger and safer than regu...,5,Tempered glass offers several advantages when ...,0.022773,0.12987,STEP 1: Assess grammar correctness: The respon...,5.0,STEP 1: The purpose is to inform the user abou...,5.0
6,How does particle board differ from MDF in cab...,"Particle board is lighter and less expensive, ...",6,Particle board and MDF (medium-density fiberbo...,0.003327,0.067416,STEP 1: Assess grammar correctness: The respon...,5.0,STEP 1: The purpose of the response is to info...,5.0
7,What does the term 'Pocket Spring Mattress' re...,A pocket spring mattress contains individually...,7,A pocket spring mattress refers to a mattress ...,0.130406,0.315789,STEP 1: Assess grammar correctness: The respon...,5.0,STEP 1: The purpose of the response is to defi...,5.0
8,What is the use of coir in mattresses?,"Coir, made from coconut fibers, is used in mat...",8,"Coir, a natural fiber extracted from the husk ...",0.023049,0.118919,STEP 1: Assess grammar correctness: The respon...,5.0,STEP 1: The purpose of the response is to expl...,5.0
9,How do you maintain wooden furniture for long-...,"To maintain wooden furniture, avoid direct sun...",9,Maintaining wooden furniture for long-term dur...,0.004289,0.06639,STEP 1: Assess grammar correctness: The respon...,5.0,The AI response demonstrates a completely cohe...,5.0


In [8]:
# Experiment name and run name associated with this evaluation result.
eval_result.metadata

{'experiment': 'demo3-pairwise-experiment-eval-fine-tuned-model',
 'experiment_run': 'demo3-pairwise-experiment-eval-fine-tuned-model'}

In [9]:
# Aggregate statistics: row count plus mean and standard deviation for each metric.
eval_result.summary_metrics

{'row_count': 50,
 'bleu/mean': 0.021755236418,
 'bleu/std': 0.026951018089257355,
 'rouge_l_sum/mean': 0.12202883160000003,
 'rouge_l_sum/std': 0.08553704473144383,
 'fluency/mean': 5.0,
 'fluency/std': 0.0,
 'coherence/mean': 4.96,
 'coherence/std': 0.282842712474619}

In [10]:
# Save the per-row results; with pandas' defaults the DataFrame index is written as the first, unnamed CSV column.
eval_result.metrics_table.to_csv("eval_result.csv")