In [4]:
from llama_index.core import PromptTemplate
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval.models.base_model import DeepEvalBaseLLM
from openai import OpenAI
from dotenv import dotenv_values
from pydantic import ValidationError
import os
import json
import backoff

format_str = PromptTemplate(
    "Respond in a JSON format as shown below:\n"
    "{{\n"
    "\t“plan”: string, // Write down a step-by-step plan on how to solve the task given the information above.\n"
    "\t“rationale”: string, // Provide a concise explanation of whether and why the configuration options depend on each other due to value-equality.\n"
    "\t“uncertainty”: integer, // Rate your certainty of this dependency on a scale from 0 (completely uncertain) to 10 (absolutely certain), given the context, plan, and rationale.\n"
    "\t“isDependency”: boolean // True if a dependency exists, or False otherwise.\n"
    "}}"
)

env_file = "../.env"

config = dotenv_values(dotenv_path=env_file)

print(config["OPENAI_KEY"])

os.environ["OPENAI_API_KEY"] = config["OPENAI_KEY"]


@backoff.on_exception(backoff.expo, ValidationError, max_tries=3)
def get_context_relevance_score(input_str: str, response_str: str, context_str: str):
    context_relevancy_metric = ContextualRelevancyMetric(
        threshold=0.5,
        model="gpt-4o-mini",
        include_reason=False
    )

    test_case = LLMTestCase(
        input=input_str,
        actual_output=response_str,
        retrieval_context=[context_str]
    )

    context_relevancy_metric.measure(test_case)

    print("Relevance Score: ", context_relevancy_metric.score)
    print("Reason: ", context_relevancy_metric.reason)

    return context_relevancy_metric.score

def compute_context_relevance(file_name: str):

    with open(file_name, "r", encoding="utf-8") as src:
        data = json.load(src)


    for entry in data[:5]:

        input_str = f"{entry['task_str']}\n\n{format_str.format()}"
        context_str = entry["context_str"]
        response_str = entry["response"]

        context_relevance_score = get_context_relevance_score(
            input_str=input_str,
            response_str=response_str,
            context_str=context_str
        )
        
        entry["context_relevance_score"] = context_relevance_score


        for context in entry["context"]:
            relevance_score = get_context_relevance_score(
                input_str=input_str,
                response_str=response_str,
                context_str=context["content"]
            )
            
            context["relevance_score"] = relevance_score

        break


    with open(f"../data/results/{config_str}/all_dependencies_all_gpt-3.5-turbo-0125_test.json", "w", encoding="utf-8") as dest:
        json.dump(data, dest, indent=2)


model_names = ["gpt-40-2024-05-13", "gpt-3.5-turbo-0125", "llama3:8b", "llama3:70b"]
config_str = "config1"

#for model_name in model_names:
    #file_name = f"../data/results/{config_str}/all_dependencies_all_{model_name}.json"
    #compute_context_relevance(file_name=file_name, eval_model=custom_eval_model)


compute_context_relevance(file_name=f"../data/results/{config_str}/all_dependencies_all_gpt-3.5-turbo-0125.json")


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


Relevance Score:  0.0
Reason:  None


Relevance Score:  0.0
Reason:  None


Relevance Score:  0.0
Reason:  None


Relevance Score:  0.0
Reason:  None


Relevance Score:  0.0
Reason:  None


Relevance Score:  0.0
Reason:  None
