In [16]:
import pandas as pd
import json

# Load the stored evaluation results (a JSON document) into `data`.
RESULTS_PATH = "../data/evaluation/results/apollo_dependencies_tech-docs.json"

with open(RESULTS_PATH, mode="r", encoding="utf-8") as src:
    data = json.loads(src.read())

In [17]:
from deepeval.models.base_model import DeepEvalBaseLLM
from openai import OpenAI

import os

# Connection settings for the OpenAI-compatible proxy server.
model_name = "gpt-4-0125-preview"
base_url = "http://172.26.92.115"

# The API key is a credential and must not live in the notebook source, where
# it ends up in version control and in every shared copy. It is read from the
# environment instead; the key that was previously committed here should be
# treated as leaked and rotated.
api_key = os.environ.get("EVAL_PROXY_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the EVAL_PROXY_API_KEY environment variable to the proxy server's "
        "API key before running this cell."
    )

# define OpenAI model using the credentials of the proxy server
model = OpenAI(base_url=base_url, api_key=api_key)


# define the custom evaluation model class
class CustomEvalModel(DeepEvalBaseLLM):
    """Evaluation model that forwards prompts to a pre-built chat client.

    The wrapped ``model`` object must expose ``chat.completions.create``
    (the notebook passes an ``OpenAI`` client). The model identifier sent
    with each request is read from the module-level ``model_name``.
    """

    def __init__(
        self,
        model
    ):
        """Store the client used for all completions.

        Args:
            model: Client object exposing ``chat.completions.create``.
        """
        self.model = model

    def load_model(self):
        """Return the client object passed to the constructor."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: Text sent as the content of one ``user`` message.

        Returns:
            The ``message.content`` of the first choice in the response.
        """
        chat_model = self.load_model()
        res = chat_model.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            # `model_name` is the module-level setting, not an instance attribute.
            model=model_name
        )

        return res.choices[0].message.content

    async def a_generate(self, prompt: str) -> str:
        """Asynchronous counterpart of :meth:`generate`.

        ``generate`` performs a blocking request. Calling it directly inside
        a coroutine would stall the event loop for the whole round trip, so
        the call is handed to the loop's default executor and awaited, which
        lets other coroutines make progress in the meantime.

        Args:
            prompt: Text sent as the content of one ``user`` message.

        Returns:
            The same value :meth:`generate` returns for ``prompt``.
        """
        import asyncio

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.generate, prompt)

    def get_model_name(self):
        """Return the fixed display name reported for this model."""
        return "Custom Azure OpenAI Model"


# Instantiate the custom evaluation model around the OpenAI client (`model`)
# created earlier in this cell; this instance is what the DeepEval metrics
# receive through their `model=` argument.
custom_eval_model = CustomEvalModel(model=model)


In [18]:
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase
from dotenv import dotenv_values
import os

# Read the project's .env file and publish its OPENAI_KEY entry under the
# OPENAI_API_KEY environment variable.
config = dotenv_values(dotenv_path="../.env")
os.environ['OPENAI_API_KEY'] = config["OPENAI_KEY"]

# The three metrics are configured identically: same pass threshold, same
# custom evaluation model, and reasons enabled.
shared_metric_settings = {
    "threshold": 0.7,
    "model": custom_eval_model,
    "include_reason": True,
}

answer_relevancy_metric = AnswerRelevancyMetric(**shared_metric_settings)
faithfulness_metric = FaithfulnessMetric(**shared_metric_settings)
context_relevancy_metric = ContextualRelevancyMetric(**shared_metric_settings)


# Score every stored response with the three metrics.
#
# Besides printing the scores, each test case's scores and reasons are kept in
# `evaluation_records`. The metric objects are reused across iterations, so
# their `.score` / `.reason` values are overwritten on every pass and would
# otherwise be lost (the metrics are created with include_reason=True, but the
# reasons were never read).
evaluation_records = []

for response in data:
    test_case = LLMTestCase(
        input=response["input"],
        actual_output=response["response"],
        # NOTE(review): assumes response["context"] is a sequence of strings,
        # not a single string (which would be split into characters) - confirm.
        retrieval_context=list(response["context"])
    )

    context_relevancy_metric.measure(test_case)
    faithfulness_metric.measure(test_case)
    answer_relevancy_metric.measure(test_case)

    print("Context Relevancy: ", context_relevancy_metric.score)
    print("Answer Relevancy: ", answer_relevancy_metric.score)
    print("Faithfulness: ", faithfulness_metric.score)

    evaluation_records.append({
        "input": response["input"],
        "context_relevancy": context_relevancy_metric.score,
        "context_relevancy_reason": getattr(context_relevancy_metric, "reason", None),
        "answer_relevancy": answer_relevancy_metric.score,
        "answer_relevancy_reason": getattr(answer_relevancy_metric, "reason", None),
        "faithfulness": faithfulness_metric.score,
        "faithfulness_reason": getattr(faithfulness_metric, "reason", None),
    })


Context Relevancy:  0.3333333333333333
Answer Relevancy:  1.0
Faithfulness:  1.0


Context Relevancy:  0.0
Answer Relevancy:  1.0
Faithfulness:  1.0


Context Relevancy:  0.0
Answer Relevancy:  1.0
Faithfulness:  1.0


Context Relevancy:  0.0
Answer Relevancy:  1.0
Faithfulness:  1.0


Context Relevancy:  0.0
Answer Relevancy:  1.0
Faithfulness:  1.0


In [10]:
from typing import List
import pandas as pd
import json

def _rating_label(rating):
    """Map a baseline rating to True/False, or None if it is neither."""
    text = str(rating).strip().lower()
    if text == "true":
        return True
    if text == "false":
        return False
    return None


def measure_validation_metrics(outputs: List, df_baseline: pd.DataFrame):
    """Compare predicted dependency flags against baseline ratings.

    ``outputs`` and the rows of ``df_baseline`` are paired positionally with
    ``zip`` (extra items on the longer side are ignored). For each pair the
    prediction is the truthiness of ``isDependency`` in the JSON document held
    in ``response.response``, and the expected label is the row's ``rating``.

    Ratings are compared by exact label rather than with ``in ("True")``:
    ``("True")`` is a plain string, so ``in`` was a substring test that matched
    values such as ``""`` or ``"T"`` and raised ``TypeError`` for boolean
    ratings. Booleans and the strings "True"/"False" (any case, surrounding
    whitespace ignored) are accepted; rows with any other rating are skipped.

    Args:
        outputs: Objects whose ``response`` attribute is a JSON string
            containing an ``isDependency`` entry.
        df_baseline: Frame with a ``rating`` column.

    Returns:
        Dict with the keys ``true_positives``, ``false_positives``,
        ``true_negatives``, ``false_negatives`` and ``accuracy``. ``accuracy``
        is 0.0 when no pair could be scored (instead of raising
        ``ZeroDivisionError``).
    """
    tp = fp = tn = fn = 0

    for response, baseline in zip(outputs, df_baseline.to_dict("records")):

        response_dict = json.loads(response.response)

        expected = _rating_label(baseline["rating"])
        if expected is None:
            continue

        # NOTE(review): a string value such as "false" is truthy here -
        # confirm the responses always carry a JSON boolean.
        predicted = bool(response_dict["isDependency"])

        if expected and predicted:
            tp += 1
        elif expected:
            fn += 1
        elif predicted:
            fp += 1
        else:
            tn += 1

    scored = tp + fp + tn + fn
    accuracy = (tp + tn) / scored if scored else 0.0

    # Precision, recall and F1 are not reported: their denominators
    # (tp + fp, tp + fn) can be zero.
    print("Accuracy", accuracy)
    print("TP", tp)
    print("FP", fp)
    print("TN", tn)
    print("FN", fn)

    return {
        "true_positives": tp,
        "false_positives": fp,
        "true_negatives": tn,
        "false_negatives": fn,
        "accuracy": accuracy
    }