In [21]:
from deepeval.models.base_model import DeepEvalBaseLLM
from openai import OpenAI
from dotenv import load_dotenv
import os


env_file = "./.env"

load_dotenv(dotenv_path=env_file)

model = OpenAI(api_key=os.getenv("OPENAI_KEY"))

# define the custom evaluation model class
class CustomEvalModel(DeepEvalBaseLLM):
    def __init__(
        self,
        model
    ):
        self.model = model

    def load_model(self):
        return self.model
    
    def generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = chat_model.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            model="gpt-4o-mini-2024-07-18",
            response_format={"type": "json_object"}
        )

        return res.choices[0].message.content

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    def get_model_name(self):
        return "Custom Azure OpenAI Model"


In [22]:
from llama_index.core import PromptTemplate
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase
import json
import backoff


format_str = PromptTemplate(
    "Respond in a JSON format as shown below:\n"
    "{{\n"
    "\t“plan”: string, // Write down a step-by-step plan on how to solve the task given the information above.\n"
    "\t“rationale”: string, // Provide a concise explanation of whether and why the configuration options depend on each other due to value-equality.\n"
    "\t“uncertainty”: integer, // Rate your certainty of this dependency on a scale from 0 (completely uncertain) to 10 (absolutely certain), given the context, plan, and rationale.\n"
    "\t“isDependency”: boolean // True if a dependency exists, or False otherwise.\n"
    "}}"
)


@backoff.on_exception(backoff.expo, Exception, max_tries=3)
def get_context_relevance_score(input_str: str, response_str: str, context_str: str):
    context_relevancy_metric = ContextualRelevancyMetric(
        threshold=0.7,
        model=custom_eval_model,
        include_reason=True
    )

    test_case = LLMTestCase(
        input=input_str,
        actual_output=response_str,
        retrieval_context=[context_str]
    )

    context_relevancy_metric.measure(test_case)

    print("Relevance Score: ", context_relevancy_metric.score)
    print("Reason: ", context_relevancy_metric.reason)

    return context_relevancy_metric.score

def compute_context_relevance(file_name: str):

    with open(file_name, "r", encoding="utf-8") as src:
        data = json.load(src)


    for entry in data:

        input_str = f"{entry['task_str']}\n\n{format_str.format()}"
        context_str = entry["context_str"]
        response_str = entry["response"]

        context_relevance_score = get_context_relevance_score(
            input_str=input_str,
            response_str=response_str,
            context_str=context_str
        )
        
        entry["context_relevance_score"] = context_relevance_score


        for context in entry["context"]:
            relevance_score = get_context_relevance_score(
                input_str=input_str,
                response_str=response_str,
                context_str=context["content"]
            )
            
            context["relevance_score"] = relevance_score


        break


    with open(f"../data/results/{config_str}/all_dependencies_all_gpt-3.5-turbo-0125_test.json", "w", encoding="utf-8") as dest:
        json.dump(data, dest, indent=2)


custom_eval_model = CustomEvalModel(model=model)
model_names = ["gpt-40-2024-05-13", "gpt-3.5-turbo-0125", "llama3:8b", "llama3:70b"]
config_str = "config1"

#for model_name in model_names:
    #file_name = f"../data/results/{config_str}/all_dependencies_all_{model_name}.json"
    #compute_context_relevance(file_name=file_name, eval_model=custom_eval_model)


compute_context_relevance(file_name=f"../data/results/{config_str}/all_dependencies_all_gpt-3.5-turbo-0125.json")


ValidationError: 1 validation error for ContextualRelevancyVerdict
verdict
  Field required [type=missing, input_value={'plan': 'Compare the mod...0, 'isDependency': True}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.7/v/missing