In [13]:
from cval import CVal

# Locations of the TOML configuration and the .env file, both relative to
# the notebook's working directory.
config_file, env_file = "../config.toml", "../.env"

# Shared CVal instance; later cells call `cval.query(...)` on it.
cval = CVal.init(config_file=config_file, env_file=env_file)

In [14]:
import pandas as pd

# Evaluation data: a semicolon-separated CSV, one candidate dependency per row.
eval_data_file = "../data/eval/dependencies_without_rules.csv"
df = pd.read_csv(eval_data_file, sep=";")

# Restrict the run to the first ten rows.
df_sample = df.head(10)

df_sample

Unnamed: 0,link_str,project,option_name,option_value,option_file,option_type,option_technology,dependent_option_name,dependent_option_value,dependent_option_file,dependent_option_type,dependent_option_technology,pre_rating,rating
0,mall::::mall-search/pom.xml::::project::::depe...,mall,project.dependencies.dependency_mall-mbg.groupId,com.macro.mall,mall-search/pom.xml,ConfigType.NAME,maven,project.parent_mall.groupId,com.macro.mall,mall-security/pom.xml,ConfigType.NAME,maven,False,Unsure
1,mall::::mall-demo/pom.xml::::project::::depend...,mall,project.dependencies.dependency_spring-boot-st...,org.springframework.boot,mall-demo/pom.xml,ConfigType.NAME,maven,project.dependencies.dependency_spring-boot-st...,org.springframework.boot,mall-portal/pom.xml,ConfigType.NAME,maven,True,True
2,mall::::mall-admin/src/main/resources/applicat...,mall,spring.datasource.druid.stat-view-servlet.logi...,druid,mall-admin/src/main/resources/application-dev.yml,ConfigType.VERSION_NUMBER,spring,spring.datasource.druid.stat-view-servlet.logi...,druid,mall-search/src/main/resources/application-pro...,ConfigType.VERSION_NUMBER,spring,True,True
3,mall::::mall-mbg/pom.xml::::project::::groupId...,mall,project.groupId,com.macro.mall,mall-mbg/pom.xml,ConfigType.NAME,maven,project.parent_mall.groupId,com.macro.mall,mall-mbg/pom.xml,ConfigType.NAME,maven,True,True
4,mall::::mall-portal/pom.xml::::project::::depe...,mall,project.dependencies.dependency_spring-boot-st...,org.springframework.boot,mall-portal/pom.xml,ConfigType.NAME,maven,project.dependencies.dependency_spring-boot-co...,org.springframework.boot,pom.xml,ConfigType.NAME,maven,True,True
5,apollo::::apollo-biz/src/test/resources/applic...,apollo,spring.h2.console.enabled,true,apollo-biz/src/test/resources/application.prop...,ConfigType.BOOLEAN,spring,project.build.plugins.plugin_maven-jar-plugin....,true,pom.xml,ConfigType.IP_ADDRESS,maven,False,False
6,apollo::::apollo-portal/pom.xml::::project::::...,apollo,project.dependencies.dependency_apollo-openapi...,com.ctrip.framework.apollo,apollo-portal/pom.xml,ConfigType.NAME,maven,project.dependencyManagement.dependencies.depe...,com.ctrip.framework.apollo,pom.xml,ConfigType.NAME,maven,False,True
7,apollo::::apollo-assembly/pom.xml::::project::...,apollo,project.dependencies.dependency_apollo-portal....,apollo-portal,apollo-assembly/pom.xml,ConfigType.NAME,maven,COPY.src,apollo-portal,apollo-portal/src/main/docker/Dockerfile,ConfigType.PATH,docker,False,False
8,apollo::::apollo-assembly/pom.xml::::project::...,apollo,project.parent_apollo.relativePath,../pom.xml,apollo-assembly/pom.xml,ConfigType.PATH,maven,project.parent_apollo.relativePath,../pom.xml,apollo-portal/pom.xml,ConfigType.PATH,maven,,True
9,apollo::::apollo-portal/src/test/resources/app...,apollo,spring.h2.console.enabled,true,apollo-portal/src/test/resources/application.p...,ConfigType.BOOLEAN,spring,project.build.plugins.plugin_maven-jar-plugin....,true,pom.xml,ConfigType.IP_ADDRESS,maven,False,False


In [15]:
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase
from dotenv import dotenv_values
import os

# Read the key/value pairs from the local .env file and publish the OpenAI key
# through the process environment (presumably picked up by the "gpt-4" backed
# metrics configured further down -- confirm against the deepeval docs).
config = dotenv_values(dotenv_path="../.env")
os.environ.update(OPENAI_API_KEY=config["OPENAI_API_KEY"])


from deepeval.models.base_model import DeepEvalBaseLLM
from llama_index.llms.ollama import Ollama

# Ollama-served "llama3:8b" model with temperature 0.0.
# NOTE(review): the metrics visible in this notebook are configured with
# model="gpt-4", so this instance looks unused by them -- confirm intent.
model = Ollama(
    model="llama3:8b",
    temperature=0.0,
)

class LlamaModel(DeepEvalBaseLLM):
    """Adapter that exposes a wrapped LLM through the ``DeepEvalBaseLLM`` interface.

    The wrapped object has to offer ``complete(prompt)`` and an awaitable
    ``acomplete(prompt)``, each returning a result with a ``.text`` attribute.
    """

    def __init__(
        self,
        model
    ):
        """Store the LLM to delegate to; the base-class constructor is not invoked."""
        self.model = model

    def load_model(self):
        """Return the wrapped LLM instance."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Complete ``prompt`` synchronously and return the generated text."""
        return self.load_model().complete(prompt).text

    async def a_generate(self, prompt: str) -> str:
        """Complete ``prompt`` asynchronously and return the generated text."""
        completion = await self.load_model().acomplete(prompt)
        return completion.text

    def get_model_name(self):
        """Return the display name reported for this model."""
        return "Custom Llama3 model"


# All three metrics share the same settings: judged by "gpt-4", a 0.7
# threshold, and a textual reason attached to every score.
answer_relevancy_metric = AnswerRelevancyMetric(model="gpt-4", threshold=0.7, include_reason=True)
faithfulness_metric = FaithfulnessMetric(model="gpt-4", threshold=0.7, include_reason=True)
context_relevancy_metric = ContextualRelevancyMetric(model="gpt-4", threshold=0.7, include_reason=True)

In [16]:
from data import Dependency
import mlflow
import datetime
import json

index_name = "so-posts"
exp_name = "dependencies_without_rules"
# Timestamp suffix so that every run gets its own name.
date_time = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

mlflow.set_experiment(experiment_name=exp_name)

with mlflow.start_run(run_name=f"{exp_name}_{date_time}"):

    # One query response per sampled row, kept in row order.
    output = []

    for record in df_sample.to_dict("records"):
        # The *_type columns hold values such as "ConfigType.NAME"; only the
        # part after the last dot is passed on.
        dependency = Dependency(
            project=record["project"],
            option_name=record["option_name"],
            option_value=record["option_value"],
            option_file=record["option_file"],
            option_type=record["option_type"].rsplit(".", 1)[-1],
            option_technology=record["option_technology"],
            dependent_option_name=record["dependent_option_name"],
            dependent_option_value=record["dependent_option_value"],
            dependent_option_file=record["dependent_option_file"],
            dependent_option_type=record["dependent_option_type"].rsplit(".", 1)[-1],
            dependent_option_technology=record["dependent_option_technology"],
        )

        response = cval.query(dependency=dependency, index_name=index_name)
        output.append(response)

    # Log every prompt together with the corresponding answer as a table artifact.
    inputs = [item.input for item in output]
    results = [item.response for item in output]
    mlflow.log_table(
        data={
            "inputs": inputs,
            "outputs": results,
        },
        artifact_file="results.json",
    )

    



Batches: 100%|██████████| 1/1 [00:00<00:00,  5.66it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.84it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.54it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.87it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00,  4.86it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.86it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00,  6.13it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00,  6.05it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.91it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.87it/s]


In [17]:
# Score every query response with the three deepeval metrics.
# Each list receives one score per response, in the order of `output`.
faithfulness_scores = []
answer_relevancy_scores = []
context_relevancy_scores = []

for response in output:

    # The retrieval context is the text content of every retrieved source node.
    test_case = LLMTestCase(
        input=response.input,
        actual_output=response.response,
        retrieval_context=[source_node.node.get_content() for source_node in response.source_nodes]
    )

    context_relevancy_metric.measure(test_case)
    faithfulness_metric.measure(test_case)
    answer_relevancy_metric.measure(test_case)

    context_relevancy_scores.append(context_relevancy_metric.score)
    faithfulness_scores.append(faithfulness_metric.score)
    answer_relevancy_scores.append(answer_relevancy_metric.score)

    # The answer is intentionally not parsed as JSON here: the parsed value was
    # never used in this cell, and a non-JSON answer would have aborted the
    # scoring loop.

    # Running progress: all scores so far plus their mean. The lists are
    # non-empty at this point, so the divisions are safe.
    print("Context Relevancy: ", context_relevancy_scores, sum(context_relevancy_scores)/len(context_relevancy_scores))
    print("Answer Relevancy: ", answer_relevancy_scores, sum(answer_relevancy_scores)/len(answer_relevancy_scores))
    print("Faithfulness: ", faithfulness_scores, sum(faithfulness_scores)/len(faithfulness_scores))

Context Relevancy:  [0.0] 0.0
Answer Relevancy:  [1.0] 1.0
Faithfulness:  [1.0] 1.0


In [None]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Confusion-matrix bookkeeping: every list receives a 1 per matching sample, so
# sum(...) yields the count. `accuracy` receives 1 for a correct prediction and
# 0 for a wrong one.
true_positives = []
true_negatives = []
false_positives = []
false_negatives = []
accuracy = []
skipped = 0

for response, baseline in zip(output, df_sample.to_dict("records")):
    response_dict = json.loads(response.response)
    predicted = bool(response_dict["isDependency"])

    # A baseline rating of "True" or "Unsure" counts as a dependency, "False"
    # as no dependency. The comparison with "False" must be an equality test:
    # `x in ("False")` is a substring test against a plain string and raises
    # TypeError for non-string ratings (e.g. NaN from an empty CSV field).
    baseline_rating = baseline["rating"]
    if baseline_rating in ("True", "Unsure"):
        expected = True
    elif baseline_rating == "False":
        expected = False
    else:
        # No usable baseline label -> the sample cannot be scored.
        skipped += 1
        continue

    if expected and predicted:
        accuracy.append(1)
        true_positives.append(1)
    elif not expected and not predicted:
        accuracy.append(1)
        true_negatives.append(1)
    elif expected and not predicted:
        accuracy.append(0)
        false_negatives.append(1)
    else:
        accuracy.append(0)
        false_positives.append(1)


# The ratios are guarded against zero denominators, e.g. TP + FP == 0 when no
# sample is predicted as a dependency; an undefined ratio is reported as 0.0.
precision = _safe_ratio(sum(true_positives), sum(true_positives) + sum(false_positives))
recall = _safe_ratio(sum(true_positives), sum(true_positives) + sum(false_negatives))
f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

print("Accuracy", _safe_ratio(sum(accuracy), len(accuracy)))
print("TP", sum(true_positives))
print("FP", sum(false_positives))
print("TN", sum(true_negatives))
print("FN", sum(false_negatives))
print("Precision", precision)
print("Recall", recall)
print("F1 Score: ", f1_score)
if skipped:
    print("Skipped (no usable rating)", skipped)

Accuracy 0.3
TP 0
FP 0
TN 3
FN 7


In [None]:
# Store the raw answers next to the baseline ratings. Note that the
# "isDependency" column holds the rating taken from df_sample, not the
# value parsed from the answer.
results = [item.response for item in output]
rating = df_sample["rating"]

df_results = pd.DataFrame.from_dict(
    data={"outputs": results, "isDependency": rating}
)
df_results.to_csv(f"../data/results/test_{index_name}.csv", index=False)
