In [None]:
from cval import CVal

# Paths (relative to this notebook) of the environment and configuration files.
env_file = "../.env"
config_file = "../config.toml"

# Initialise CVal from both files; later cells use `cval` for config and queries.
cval = CVal.init(config_file=config_file, env_file=env_file)

In [None]:
from data import Dependency

# Example dependency: a Dockerfile EXPOSE port paired with a Spring-Boot
# `server.port` option that carries the same value.
dep_fields = {
    "project": "piggymetrics",
    "dependency_category": "value-equality",
    # Option on the Docker side.
    "option_name": "EXPOSE",
    "option_value": "8080",
    "option_type": "PORT",
    "option_file": "Dockerfile",
    "option_technology": "Docker",
    # Option on the Spring-Boot side.
    "dependent_option_name": "server.port",
    "dependent_option_value": "8080",
    "dependent_option_file": "application.yml",
    "dependent_option_type": "PORT",
    "dependent_option_technology": "Spring-Boot",
}
dep = Dependency(**dep_fields)




In [None]:
import pandas as pd

eval_data_file = "../data/test_data/dependencies_without_rules.csv"

# The evaluation file is semicolon-separated.
df = pd.read_csv(eval_data_file, delimiter=";")

# Work on the first ten rows only.
df_sample = df.iloc[:10]
df_sample

In [None]:
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase
from dotenv import dotenv_values
import os

# Read the key/value pairs from the .env file and export the OpenAI key
# under the OPENAI_API_KEY environment variable.
config = dotenv_values(dotenv_path="../.env")
os.environ["OPENAI_API_KEY"] = config["OPENAI_KEY"]

# All three metrics are configured with the same threshold, model and reason flag.
shared_metric_options = {
    "threshold": 0.7,
    "model": "gpt-4o",
    "include_reason": True,
}

answer_relevancy_metric = AnswerRelevancyMetric(**shared_metric_options)
faithfulness_metric = FaithfulnessMetric(**shared_metric_options)
context_relevancy_metric = ContextualRelevancyMetric(**shared_metric_options)

In [None]:
from typing import List
import pandas as pd
import json

def measure_validation_metrics(outputs: List, df_baseline: pd.DataFrame):
    """Compare the model's dependency verdicts with the baseline ratings.

    Each item of ``outputs`` is paired (via ``zip``) with one row of
    ``df_baseline``; surplus items on the longer side are ignored.

    Args:
        outputs: Objects whose ``response`` attribute is a JSON string that
            contains an ``isDependency`` entry (interpreted by truthiness).
        df_baseline: Frame with a ``rating`` column. A rating counts as
            positive when it is boolean ``True`` or the text "True", and as
            negative when it is boolean ``False`` or the text "False"
            (case-insensitive, surrounding whitespace ignored). Rows with any
            other rating are skipped.

    Returns:
        Dict with the keys ``true_positives``, ``false_positives``,
        ``true_negatives``, ``false_negatives`` and ``accuracy``. ``accuracy``
        is ``0.0`` when no row could be rated.
    """
    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0

    for response, baseline in zip(outputs, df_baseline.to_dict("records")):
        response_dict = json.loads(response.response)

        # Normalise the rating to text first: pandas may already have parsed
        # the column into booleans, and a plain `in "True"` check would be a
        # substring test that also matches fragments such as "rue" or "".
        rating = str(baseline["rating"]).strip().lower()
        if rating == "true":
            expected = True
        elif rating == "false":
            expected = False
        else:
            # Neither a positive nor a negative rating: not part of the tally.
            continue

        predicted = bool(response_dict["isDependency"])

        if expected and predicted:
            true_positives += 1
        elif expected:
            false_negatives += 1
        elif predicted:
            false_positives += 1
        else:
            true_negatives += 1

    rated = true_positives + true_negatives + false_positives + false_negatives
    # Guard against an empty tally so the function never divides by zero.
    accuracy = (true_positives + true_negatives) / rated if rated else 0.0

    print("Accuracy", accuracy)
    print("TP", true_positives)
    print("FP", false_positives)
    print("TN", true_negatives)
    print("FN", false_negatives)

    return {
        "true_positives": true_positives,
        "false_positives": false_positives,
        "true_negatives": true_negatives,
        "false_negatives": false_negatives,
        "accuracy": accuracy
    }

In [None]:
from typing import List
import pandas as pd


def measure_llm_netrics(outputs: List):
    """Score every response with the three module-level DeepEval metrics.

    For each response an ``LLMTestCase`` is built from ``response.input``,
    ``response.response`` and the content of every entry in
    ``response.source_nodes``. The module-level ``context_relevancy_metric``,
    ``faithfulness_metric`` and ``answer_relevancy_metric`` are then run on
    it and their ``score`` attributes collected.

    Args:
        outputs: Response objects exposing ``input``, ``response`` and
            ``source_nodes`` (each node providing ``node.get_content()``).

    Returns:
        Dict with the average ``answer_relevance``, ``faithfulness`` and
        ``context_relevance`` over all responses. Every average is ``0.0``
        when ``outputs`` is empty.
    """

    def _mean(scores: List) -> float:
        # An empty score list would otherwise raise ZeroDivisionError.
        return sum(scores) / len(scores) if scores else 0.0

    faithfulness_scores = []
    answer_relevancy_scores = []
    context_relevancy_scores = []

    for response in outputs:

        test_case = LLMTestCase(
            input=response.input,
            actual_output=response.response,
            retrieval_context=[source_node.node.get_content() for source_node in response.source_nodes]
        )

        # The metric objects are shared, so each score is read right after
        # its own measure() call, before the next response is measured.
        context_relevancy_metric.measure(test_case)
        faithfulness_metric.measure(test_case)
        answer_relevancy_metric.measure(test_case)

        context_relevancy_scores.append(context_relevancy_metric.score)
        faithfulness_scores.append(faithfulness_metric.score)
        answer_relevancy_scores.append(answer_relevancy_metric.score)

    context_relevance = _mean(context_relevancy_scores)
    answer_relevance = _mean(answer_relevancy_scores)
    faithfulness = _mean(faithfulness_scores)

    print("Context Relevancy: ", context_relevancy_scores, context_relevance)
    print("Answer Relevancy: ", answer_relevancy_scores, answer_relevance)
    print("Faithfulness: ", faithfulness_scores, faithfulness)

    return {
        "answer_relevance": answer_relevance,
        "faithfulness": faithfulness,
        "context_relevance": context_relevance
    }

In [None]:
from data import Dependency
import mlflow
import datetime

# Experiment settings: the index queried, the MLflow experiment name and a
# timestamp that makes each run name distinct.
exp_name = "preliminary-experiments"
index_name = "all"
date_time = f"{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}"

mlflow.set_experiment(experiment_name=exp_name)

with mlflow.start_run(run_name=f"{exp_name}_{date_time}"):

    # Record the CVal configuration as the parameters of this run.
    mlflow.log_params(cval.config)

    outputs = []

    # One query per sampled row.
    for record in df_sample.to_dict("records"):
        dependency = Dependency(
            project=record["project"],
            option_name=record["option_name"],
            option_value=record["option_value"],
            # Only the part after the last "." of the type is passed on.
            option_type=record["option_type"].split(".")[-1],
            option_file=record["option_file"],
            option_technology=record["option_technology"],
            dependent_option_name=record["dependent_option_name"],
            dependent_option_value=record["dependent_option_value"],
            dependent_option_type=record["dependent_option_type"].split(".")[-1],
            dependent_option_file=record["dependent_option_file"],
            dependent_option_technology=record["dependent_option_technology"],
        )

        response = cval.query(dependency=dependency, index_name=index_name)
        outputs.append(response)

    llm_metrics = measure_llm_netrics(outputs=outputs)
    validation_metrics = measure_validation_metrics(outputs=outputs, df_baseline=df_sample)

    # Collect what was asked and what came back, next to the baseline rating.
    results = [output.response for output in outputs]
    inputs = [output.input for output in outputs]
    rating = df_sample["rating"]

    # Write the per-row results to CSV and attach inputs/outputs to the run.
    df_results = pd.DataFrame.from_dict(
        data={"inputs": inputs, "outputs": results, "isDependency": rating}
    )
    df_results.to_csv(f"../data/results/test_{index_name}.csv", index=False)
    mlflow.log_table(
        data={"inputs": inputs, "outputs": results},
        artifact_file="results.json",
    )

    

In [None]:
# Show the averaged LLM metric scores computed in the experiment run cell.
print(llm_metrics)

In [None]:
# Show the confusion counts and accuracy computed in the experiment run cell.
print(validation_metrics)