## 0. Set Up Environment

In [None]:
!pip install ragas
!pip install tonic-validate
!pip install mlflow
!pip install openai
!pip install tiktoken
!pip install anthropic

Collecting ragas
  Downloading ragas-0.1.7-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m943.0 kB/s[0m eta [36m0:00:00[0m
Collecting datasets (from ragas)
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken (from ragas)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain (from ragas)
  Downloading langchain-0.1.20-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core (from ragas)
  Downloading langchain_core-0.1.52-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━

In [None]:
import getpass
import os
import re
from collections import Counter

import mlflow
import openai
import pandas as pd
from anthropic import Anthropic
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_correctness, answer_similarity
from tonic_validate import ValidateScorer, Benchmark
from tonic_validate.metrics import AnswerSimilarityMetric

# Take API keys from the environment. A key is requested interactively (without
# echo) only when it is missing or empty, so an already-exported key is never
# overwritten and no secret has to be typed into the notebook source.
for _key_name in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass.getpass(f"{_key_name}: ")

In [None]:
# Evaluation samples, aligned by position across the three lists.
# Both samples ask the same question.
question_list = ["2024년 1월, 2월, 3월 각각의 평균 조달금리와 응찰률이 어떻게 되나요?"] * 2

# Answers under evaluation, one per question.
generated_answer_list = [
    "2024년 1월의 평균 조달금리는 3.27%, 응찰률은 333%입니다. 2월의 평균 조달금리는 3.36%, 응찰률은 335%입니다. 3월의 평균 조달금리는 3.32%, 응찰률은 334%입니다[2].",
    "2024년 1월, 2월, 3월의 평균 조달 금리는 각각 3.57%, 3.52%, 3.32% 입니다. 응찰률은 각각 271%, 285%, 334% 입니다.",
]

# Reference answers the generated answers are compared against.
target_answer_list = [
    "2024년 1월의 평균 조달금리는 3.27%, 응찰률은 333이며, 2월의 평균 조달금리는 3.36%, 응찰률은 335이며, 3월의 평균 조달금리는 3.32%, 응찰률은 334입니다.",
    "2024년 1월의 평균 조달금리는 3.27%, 응찰률은 333%입니다. 2월의 평균 조달금리는 3.36%, 응찰률은 335%입니다. 3월의 평균 조달금리는 3.32%, 응찰률은 334%입니다[2].",
]

## 1. RAGAS : answer_correctness

In [None]:
# Assemble the three aligned sample lists into the columns handed to ragas.
data_samples = dict(
    question=question_list,
    answer=generated_answer_list,
    ground_truth=target_answer_list,
)
dataset = Dataset.from_dict(data_samples)

# Score every sample with the answer_correctness metric and view the result as a DataFrame.
ragas_score = evaluate(dataset, metrics=[answer_correctness]).to_pandas()

# Binarise the scores: strictly greater than 0.6 becomes "O", everything else "X".
ragas_answer_correctness = (
    ragas_score["answer_correctness"]
    .gt(0.6)
    .map({True: "O", False: "X"})
    .tolist()
)
print(ragas_answer_correctness)

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

['O', 'O']


## 2. TonicAI : answer_similarity

In [None]:
def get_llm_response(question_index):
    """Build the response dict for the sample at ``question_index``.

    The answer is looked up by position in the module-level
    ``generated_answer_list``; the context list is a single empty string.

    Returns:
        dict with keys ``"llm_answer"`` and ``"llm_context_list"``.
    """
    answer_text = generated_answer_list[question_index]
    empty_context = [""]
    return dict(llm_answer=answer_text, llm_context_list=empty_context)


# Only the "answer_similarity" score is read below, so restrict the scorer to that
# single metric instead of running every default metric for each sample.
scorer = ValidateScorer([AnswerSimilarityMetric()])

tonic_answer_similarity = []
for i, question in enumerate(question_list):
    # One single-question benchmark per sample: the questions are not unique, so the
    # matching generated answer has to be selected by position, not by question text.
    benchmark = Benchmark(questions=[question], answers=[target_answer_list[i]])
    # Bind the index as a default argument so the callback does not depend on the
    # value of the loop variable at the time it is eventually called.
    run = scorer.score(benchmark, lambda q, idx=i: get_llm_response(idx))
    tonic_score = run.run_data[0].scores["answer_similarity"]
    # Guard against a missing (None) score: count it as "X" rather than raising
    # on the comparison and losing the verdicts collected so far.
    is_similar = tonic_score is not None and tonic_score > 3.0
    tonic_answer_similarity.append("O" if is_similar else "X")

print(tonic_answer_similarity)

Retrieving responses: 100%|██████████| 1/1 [00:00<00:00, 5159.05it/s]
Scoring responses:   0%|          | 0/1 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Scoring responses: 100%|██████████| 1/1 [00:07<00:00,  7.23s/it]
Retrieving responses: 100%|██████████| 1/1 [00:00<00:00, 4826.59it/s]
Scoring responses:   0%|          | 0/1 [00:00<?, ?it/s]INFO:openai._base_client:Retrying request to /

['O', 'O']


## 3. MLflow

In [None]:
# Static evaluation table: one row per sample, predictions already generated.
mlflow_eval_data = pd.DataFrame(
    {
        "inputs": question_list,
        "predictions": generated_answer_list,
        "ground_truth": target_answer_list,
    }
)

with mlflow.start_run() as run:
    # Evaluate the pre-computed predictions against the ground truth with the two
    # GenAI metrics added on top of the default evaluator.
    results = mlflow.evaluate(
        data=mlflow_eval_data,
        targets="ground_truth",
        predictions="predictions",
        extra_metrics=[
            mlflow.metrics.genai.answer_similarity(),
            mlflow.metrics.genai.answer_correctness(),
        ],
        evaluators="default",
    )

    eval_table = results.tables["eval_results_table"]

# Binarise both per-row score columns the same way: strictly greater than 3 is "O".
mlflow_answer_similarity, mlflow_answer_correctness = (
    ["O" if score > 3 else "X" for score in eval_table[column].tolist()]
    for column in ("answer_similarity/v1/score", "answer_correctness/v1/score")
)

print(mlflow_answer_similarity)
print(mlflow_answer_correctness)

2024/05/15 23:45:43 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2024/05/15 23:45:43 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

['O', 'X']
['O', 'X']


## 4. Allganize Eval : claude3-opus

In [None]:
client = Anthropic()

# Judge prompt: shows the question, reference answer and generated answer, and asks
# the model to reply with 1 (answers match) or 0 (they do not).
EVAL_PROMPT = """
question = \"\"\"
{question}
\"\"\"

target_answer = \"\"\"
{target_answer}
\"\"\"

generated_answer = \"\"\"
{generated_answer}
\"\"\"

question을 참조해서 target_answer와 generated_answer가 관련있는지 체크해줘.
두개의 답변이 일치하면 1 일치하지 않으면 0으로 답변해줘.
반드시 1 혹은 0으로 답변해줘.
"""

# A standalone 0 or 1: not glued to other word characters and not part of a decimal.
_VERDICT_PATTERN = re.compile(r"(?<![\w.])[01](?!\w|\.\d)")


def _parse_binary_verdict(reply_text):
    """Extract the judge's integer verdict from its reply text.

    A reply that is a plain integer is converted directly. Otherwise the first
    standalone ``0`` or ``1`` in the text is used, so replies such as ``"1."``
    or a digit followed by an explanation are still accepted.

    Raises:
        ValueError: if no verdict can be found in ``reply_text``.
    """
    try:
        return int(reply_text)
    except ValueError:
        pass

    match = _VERDICT_PATTERN.search(reply_text)
    if match is None:
        raise ValueError(f"Could not find a 0/1 verdict in model reply: {reply_text!r}")
    return int(match.group())


claude3_opus_verdicts = []
for question, target_answer, generated_answer in zip(question_list, target_answer_list, generated_answer_list):
    eval_prompt = EVAL_PROMPT.format(
        target_answer=target_answer,
        generated_answer=generated_answer,
        question=question,
    )
    message = client.messages.create(
        max_tokens=100,
        # Temperature 0 keeps the judge's verdicts as repeatable as possible between runs.
        temperature=0,
        messages=[{"role": "user", "content": eval_prompt}],
        model="claude-3-opus-20240229",
    )
    claude3_opus_verdicts.append(_parse_binary_verdict(message.content[0].text))

# Map the integer verdicts onto the "O"/"X" labels used by the other evaluators.
claude3_opus_result = ["O" if verdict == 1 else "X" for verdict in claude3_opus_verdicts]
print(claude3_opus_result)

INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


['X', 'X']


## 5. Ensemble

In [None]:
def vote(lst, threshold):
    """Return "O" when at least ``threshold`` entries of ``lst`` equal "O", else "X"."""
    approvals = sum(1 for verdict in lst if verdict == "O")
    return "O" if approvals >= threshold else "X"


# Majority vote per sample: "O" when at least 3 of the 5 evaluator verdicts are "O".
total_result = [
    vote(
        [
            ragas_answer_correctness[idx],
            tonic_answer_similarity[idx],
            mlflow_answer_correctness[idx],
            mlflow_answer_similarity[idx],
            claude3_opus_result[idx],
        ],
        threshold=3,
    )
    for idx in range(len(claude3_opus_result))
]

print(total_result)

['O', 'X']
