In [1]:
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from pathlib import Path
import json
from transformers import AutoModelForCausalLM, AutoTokenizer,BloomForCausalLM
from deepeval.models.base_model import DeepEvalBaseLLM
from helpers.text_utils import TextUtils
from deepeval.metrics import GEval,HallucinationMetric,AnswerRelevancyMetric
from deepeval.test_case import LLMTestCaseParams
import re
from deepeval.dataset import EvaluationDataset
from langchain_openai import AzureChatOpenAI
from deepeval.models.base_model import DeepEvalBaseLLM
from helpers.call_openai import choosed_gpt4_key
from deepeval.metrics import BaseMetric

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class AzureOpenAI(DeepEvalBaseLLM):
    def __init__(
        self,
        model
    ):
        self.model = model

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        # print(chat_model.invoke(prompt).content)
        res=chat_model.invoke(prompt).content
        print(f"{res=}")
        return res

    async def a_generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        print(f"{res.content=}")
        return res.content

    def get_model_name(self):
        return "Custom Azure OpenAI Model"

llm_api = choosed_gpt4_key()["api"]
# Replace these with real values
custom_model = AzureChatOpenAI(
    openai_api_version="2023-12-01-preview",
    azure_deployment=llm_api["model_id"],
    azure_endpoint=llm_api["endpoint"],
    openai_api_key=llm_api["key1"],
)
llm1=AzureOpenAI(model=custom_model)

In [3]:
geval_question_undertsanding_metric = GEval(
    model=llm1,
    name="Question Understanding with Tasks Steps Accuracy",
    # criteria="Task Coherence - determine if the tasks key value given in actual output is coherent with the question given in the input.",
    # criteria="LanguageTaskUnderstanding - determine if the steps returned in the tasks key value in the output is correct and accurate for the question given in the input. ",
    criteria="Question Understanding with Tasks Steps Accuracy - determine if it is able to understand the input question given in any language well and has provided accurate steps to answer the question",
    threshold=0.5,
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

geval_format_metric = GEval(
    model=llm1,
    name="Output Format",
    # criteria="Task Coherence - determine if the tasks key value given in actual output is coherent with the question given in the input.",
    criteria="""Output Format - determine if the output matches the following JSON schema
    
    {
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "properties": {
    "question": {
      "type": "string"
    },
    "tasks": {
      "type": "array"
    },
    "can_i_answer": {
      "type": "boolean"
    }
  },
  "required": ["question", "tasks", "can_i_answer"]
}""",
    threshold=0.7,
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)


# hallucination_metric = HallucinationMetric(threshold=0.5,model=llm1)
answerrelevancy_metric = AnswerRelevancyMetric(
    threshold=0.7,
    model=llm1,
    include_reason=True
)
'''
class CustomMetric(BaseMetric):
    def __init__(self, threshold=0.5):
        self.threshold = threshold
        self.async_mode = False

    def measure(self, test_case: LLMTestCase):
        # Custom evaluation logic goes here
        input_question = test_case.input['question']
        actual_can_i_answer = test_case.actual_output['can_i_answer']
        actual_tasks = test_case.actual_output['tasks']

        # Your custom evaluation logic
        question_accuracy = 1 if input_question == test_case.actual_output['question'] else 0
        can_i_answer_accuracy = 1 if actual_can_i_answer == test_case.input['can_i_answer'] else 0
        tasks_accuracy = None if actual_can_i_answer else (1 if actual_tasks == test_case.input['tasks'] else 0)

        total_score = (question_accuracy + can_i_answer_accuracy + tasks_accuracy) / 3

        # Set self.success and self.score
        self.score = total_score
        self.success = total_score >= self.threshold

        # Set a reason for the score returned
        if self.success:
            self.reason = "Custom metric score is above threshold."
        else:
            self.reason = "Custom metric score is below threshold."

        return self.score

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "CustomMetric"

custom_metric = CustomMetric(threshold=0.7)
'''

'\nclass CustomMetric(BaseMetric):\n    def __init__(self, threshold=0.5):\n        self.threshold = threshold\n        self.async_mode = False\n\n    def measure(self, test_case: LLMTestCase):\n        # Custom evaluation logic goes here\n        input_question = test_case.input[\'question\']\n        actual_can_i_answer = test_case.actual_output[\'can_i_answer\']\n        actual_tasks = test_case.actual_output[\'tasks\']\n\n        # Your custom evaluation logic\n        question_accuracy = 1 if input_question == test_case.actual_output[\'question\'] else 0\n        can_i_answer_accuracy = 1 if actual_can_i_answer == test_case.input[\'can_i_answer\'] else 0\n        tasks_accuracy = None if actual_can_i_answer else (1 if actual_tasks == test_case.input[\'tasks\'] else 0)\n\n        total_score = (question_accuracy + can_i_answer_accuracy + tasks_accuracy) / 3\n\n        # Set self.success and self.score\n        self.score = total_score\n        self.success = total_score >= self.thr

In [4]:
# input={"question":"\u00bfExisten incentivos para la instalaci\u00f3n de estaciones de carga de veh\u00edculos el\u00e9ctricos en Italia en 2023?"}

# # input=json.dumps(input).strip()

# prompt_template = f"""<<SYS>> Being an honest and smart assistant talented in breaking down questions into actionable items, you're charged with interpreting a JSON-formatted question. Your output must be a JSON object articulated with two keys: can_i_answer (indicating true if the inquiry is answerable using internal capabilities, or false if it requires external resources) and tasks, delineating the series of steps to answer the question with external aids if can_i_answer is false. <<SYS>> [INST] {input} [/INST] """


# output={"question":"\u00bfExisten incentivos para la instalaci\u00f3n de estaciones de carga de veh\u00edculos el\u00e9ctricos en Italia en 2023?","tasks":["RESEARCH: Investigate whether there are incentives for installing electric vehicle charging stations in Italy in 2023"],"can_i_answer":False}

# actual_output=get_llm_response(prompt_template)

import pandas as pd
directory_path = Path("eval_dataset_for_evaluation_metric")
testing_file_path = directory_path / "testing.json"
'''
df=pd.read_json(testing_file_path)

# print(df)
df['Status'] = None
df['Status'] = None
'''

dataset = EvaluationDataset()
dataset.add_test_cases_from_json_file(
    file_path=testing_file_path,
    input_key_name="questions",
    actual_output_key_name="actual_output",
    expected_output_key_name="expected_output",
    # context_key_name="expected_output",
    # retrieval_context_key_name="expected_output",
)

# results=evaluate(dataset, [answerrelevancy_metric,geval_metric,custom_metric])
results=evaluate(dataset, [answerrelevancy_metric,geval_question_undertsanding_metric,geval_format_metric])
print(results)
# df = pd.DataFrame()
# for i in results:
#     print(i.input)
#     print(i.actual_output)
#     print(i.context)
#     # print(i.metrics)
#     print(i.success)
#     for j in i.metrics:
#         print(j.score)
#         print(j.reason)




# print(f"Evaluation Metric Results:")
# print(results)
# for index, row in df.iterrows():
#     test_case1 = LLMTestCase(
#         input=row["questions"],
#         # Replace this with the actual output from your LLM application
#         actual_output=row["actual_output"],
#         context=[json.dumps(row["expected_output"])]
#     )
#     results=evaluate([test_case1], [answerrelevancy_metric,hallucination_metric,geval_metric])
    

Evaluating test cases...
Event loop is already running. Applying nest_asyncio patch to allow async execution...




Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: Custom Azure OpenAI Model, reason: The score is 1.00 because the output is directly addressing the input with precision and without including any irrelevant statements. It's spot on! Keep up the good work!, error: None)
  - ✅ Question Understanding with Tasks Steps Accuracy (GEval) (score: 0.8, threshold: 0.5, strict: False, evaluation model: Custom Azure OpenAI Model, reason: The assistant correctly identified the language of the question and its subject matter, acknowledging that the latest electric vehicle tax credits for 2023 can't be answered using internal capabilities. The output is in the correct JSON structure with the necessary keys: 'can_i_answer' and 'tasks.' However, the task list could include more detailed steps for external research to fully address the question, such as specifying the need to check government or authoritative financial websites, which would make the 

[TestResult(success=True, metrics=[<deepeval.metrics.answer_relevancy.answer_relevancy.AnswerRelevancyMetric object at 0x7f11b97b62d0>, <deepeval.metrics.g_eval.g_eval.GEval object at 0x7f11adc28390>, <deepeval.metrics.g_eval.g_eval.GEval object at 0x7f11adc2b490>], input="<<SYS>> Being an honest and smart assistant talented in breaking down questions into actionable items, you're charged with interpreting a JSON-formatted question. Your output must be a JSON object articulated with two keys: can_i_answer (indicating true if the inquiry is answerable using internal capabilities, or false if it requires external resources) and tasks, delineating the series of steps to answer the question with external aids if can_i_answer is false. <<SYS>> [INST] {'question': 'What are the latest electric vehicle tax credits available in 2023?'} [/INST] ", actual_output='{\\"question\\":\\"What are the latest electric vehicle tax credits available in 2023?\\",\\"tasks\\":[\\"RESEARCH: Look up the latest

In [5]:
from datetime import datetime
from collections import defaultdict

# print(type(i.success))
# data={"input":[result.input for result in result],"actual_output":[result.actual_output for result in result],"context":[result.context for result in result]}
# data1={ x:y for x, y in zip(("input","actual_output","context"),[i.input,i.actual_output,i.context] for i in results)} | {"overall_status": ["Passed" for i in results if i.success else "Failed"]} | {x:y for x, y in zip((f"{j.__name__} status",f"{j.__name__} score",f"{j.__name__} reason"),(j.input,j.actual_output,j.context)) for i in results for j in i.metrics}

new_dict = defaultdict(list)
# print(data)


for i in results:
    new_dict["input"].append(i.input)
    new_dict["actual_output"].append(i.actual_output)
    new_dict["context"].append(i.context)
    if i.success:
        new_dict["overall_status"].append("Paased")
    else:
        new_dict["overall_status"].append("Failed")
    for j in i.metrics:
        new_dict[f"{j.__name__} status"].append(j.is_successful())
        new_dict[f"{j.__name__} score"].append(j.score)
        new_dict[f"{j.__name__} reason"].append(j.reason)


# print(new_dict)
df = pd.DataFrame(new_dict)

current_time = datetime.now().strftime("%d%m%Y%H%M%S")

csv_path=directory_path / f"eval_test_run_{current_time}.csv"
df.to_csv(csv_path,sep="|",index=False,header=True,mode="w")

In [6]:
# for i in results:
#     print(i.input)
#     print(i.actual_output)
#     print(i.context)
#     # print(i.metrics)
#     print(i.success)
#     for j in i.metrics:
#         print(j.score)
#         print(j.reason)
# print(list(results))