In [1]:
from cval import CVal

# Relative locations of the TOML configuration and the .env file.
config_file = "../config.toml"
env_file = "../.env"

# Create the CVal instance used for all queries in this notebook.
cval = CVal.init(config_file=config_file, env_file=env_file)

  from tqdm.autonotebook import tqdm


In [2]:
import pandas as pd

# Evaluation data is stored as a semicolon-separated CSV file.
eval_data_file = "../data/eval/dependencies_without_rules.csv"
df = pd.read_csv(eval_data_file, sep=";")

# Restrict the experiment to the first five rows.
df_sample = df.iloc[:5]

df_sample

Unnamed: 0,link_str,project,option_name,option_value,option_file,option_type,option_technology,dependent_option_name,dependent_option_value,dependent_option_file,dependent_option_type,dependent_option_technology,pre_rating,rating,Notes
0,mall::::mall-admin/src/main/resources/applicat...,mall,spring.datasource.druid.stat-view-servlet.logi...,druid,mall-admin/src/main/resources/application-dev.yml,ConfigType.PASSWORD,spring,spring.datasource.druid.stat-view-servlet.logi...,druid,mall-search/src/main/resources/application-pro...,ConfigType.PASSWORD,spring,False,True,"borderline case, false if each service has it..."
1,mall::::mall-mbg/pom.xml::::project::::groupId...,mall,project.groupId,com.macro.mall,mall-mbg/pom.xml,ConfigType.NAME,maven,project.parent_mall.groupId,com.macro.mall,mall-mbg/pom.xml,ConfigType.NAME,maven,True,True,
2,apollo::::apollo-biz/src/test/resources/applic...,apollo,spring.h2.console.enabled,true,apollo-biz/src/test/resources/application.prop...,ConfigType.BOOLEAN,spring,project.build.plugins.plugin_maven-jar-plugin....,true,pom.xml,ConfigType.IP_ADDRESS,maven,False,False,
3,apollo::::apollo-assembly/pom.xml::::project::...,apollo,project.dependencies.dependency_apollo-portal....,apollo-portal,apollo-assembly/pom.xml,ConfigType.NAME,maven,COPY.src,apollo-portal,apollo-portal/src/main/docker/Dockerfile,ConfigType.PATH,docker,False,False,
4,apollo::::apollo-assembly/pom.xml::::project::...,apollo,project.parent_apollo.relativePath,../pom.xml,apollo-assembly/pom.xml,ConfigType.PATH,maven,project.parent_apollo.relativePath,../pom.xml,apollo-portal/pom.xml,ConfigType.PATH,maven,,True,


In [6]:
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase
from dotenv import dotenv_values
import os

# Read the key/value pairs of the .env file into a dict.
config = dotenv_values(dotenv_path="../.env")

# dotenv_values maps a key to None when the .env entry has no value, and
# os.environ only accepts strings, so validate the key before exporting it.
openai_key = config.get("OPENAI_KEY")
if not openai_key:
    raise KeyError("OPENAI_KEY is missing or empty in ../.env")
# NOTE(review): presumably consumed by the "gpt-4o" metrics configured later
# in this notebook -- confirm against the deepeval documentation.
os.environ['OPENAI_API_KEY'] = openai_key


from deepeval.models.base_model import DeepEvalBaseLLM
from llama_index.llms.ollama import Ollama

# Ollama-backed llama3:70b client with temperature 0.0 and format="json".
model = Ollama(model="llama3:70b", temperature=0.0, format="json")

class LlamaModel(DeepEvalBaseLLM):
    """Adapter that exposes a wrapped LLM object through DeepEvalBaseLLM.

    The wrapped object must provide ``complete(prompt)`` and
    ``acomplete(prompt)``; the value each returns must have a ``.text``
    attribute.
    """

    def __init__(self, model):
        # Only stores the wrapped model; the base-class initialiser is not called.
        self.model = model

    def load_model(self):
        """Return the wrapped model object."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Complete ``prompt`` synchronously and return the completion text."""
        completion = self.load_model().complete(prompt)
        return completion.text

    async def a_generate(self, prompt: str) -> str:
        """Complete ``prompt`` asynchronously and return the completion text."""
        llm = self.load_model()
        return (await llm.acomplete(prompt)).text

    def get_model_name(self):
        """Return the fixed display name of this model."""
        return "Custom Llama3 model"


# NOTE(review): llama_model is created here, but the three metrics below are
# configured with model="gpt-4o" rather than with this wrapper.
llama_model = LlamaModel(model=model)

# Settings shared by all three metrics.
_METRIC_SETTINGS = {
    "threshold": 0.7,
    "model": "gpt-4o",
    "include_reason": True,
}

answer_relevancy_metric = AnswerRelevancyMetric(**_METRIC_SETTINGS)
faithfulness_metric = FaithfulnessMetric(**_METRIC_SETTINGS)
context_relevancy_metric = ContextualRelevancyMetric(**_METRIC_SETTINGS)

In [4]:
from data import Dependency
import mlflow
import datetime
import json

index_name = "so-posts"
exp_name = "dependencies_without_rules"
date_time = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")


def _short_type(qualified_type):
    """Return the text after the last dot, e.g. "ConfigType.PATH" -> "PATH"."""
    return qualified_type.split(".")[-1]


def _record_to_dependency(record):
    """Build a Dependency from one row (as a dict) of the evaluation data."""
    return Dependency(
        project=record["project"],
        option_name=record["option_name"],
        option_value=record["option_value"],
        option_type=_short_type(record["option_type"]),
        option_file=record["option_file"],
        option_technology=record["option_technology"],
        dependent_option_name=record["dependent_option_name"],
        dependent_option_value=record["dependent_option_value"],
        dependent_option_type=_short_type(record["dependent_option_type"]),
        dependent_option_file=record["dependent_option_file"],
        dependent_option_technology=record["dependent_option_technology"],
    )


mlflow.set_experiment(experiment_name=exp_name)

with mlflow.start_run(run_name=f"{exp_name}_{date_time}"):

    # Responses are collected in row order; later cells read this list.
    output = []

    for record in df_sample.to_dict("records"):
        response = cval.query(
            dependency=_record_to_dependency(record),
            index_name=index_name,
        )
        output.append(response)
        print("Response: ", response)

    # Log the query inputs and raw responses as a table artifact of the run.
    inputs = [item.input for item in output]
    results = [item.response for item in output]
    mlflow.log_table(
        data={"inputs": inputs, "outputs": results},
        artifact_file="results.json",
    )

    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.38it/s]


Response:  { 
  "plan": "1. Identify the configuration options in question: 'spring.datasource.druid.stat-view-servlet.login-password' in 'mall-admin/src/main/resources/application-dev.yml' and 'mall-search/src/main/resources/application-prod.yml'.\n2. Verify the values of these configuration options: both are set to 'druid'.\n3. Determine if there is any context or documentation indicating that these values must be identical for the system to function correctly.\n4. Evaluate if the identical values are coincidental or if they are required to be the same due to a functional dependency.\n5. Conclude whether a value-equality dependency exists based on the findings.",
  "rationale": "The configuration options 'spring.datasource.druid.stat-view-servlet.login-password' in both 'mall-admin' and 'mall-search' have the same value 'druid'. However, there is no context or documentation provided that indicates these values must be identical for the system to function correctly. The identical valu

Batches: 100%|██████████| 1/1 [00:00<00:00, 16.58it/s]


Response:  { 
  "plan": "1. Identify the configuration options in question: project.groupId and project.parent_mall.groupId, both located in mall-mbg/pom.xml. \n2. Verify the values of these configuration options: both have the value 'com.macro.mall'. \n3. Determine the roles of these configuration options within the Maven project structure. \n4. Assess whether these options need to have identical values for the project to function correctly. \n5. Consider the Maven project hierarchy and inheritance mechanisms to understand if the values must be the same due to Maven's dependency management and project inheritance.",
  "rationale": "In Maven, the groupId is a unique identifier for a project, and it is common for a child project to inherit the groupId from its parent project. The fact that both project.groupId and project.parent_mall.groupId have the same value 'com.macro.mall' suggests that the child project is inheriting the groupId from the parent project. This is a typical Maven con

Batches: 100%|██████████| 1/1 [00:00<00:00, 17.64it/s]


Response:  { 
  "plan": "1. Identify the purpose and context of the configuration option 'spring.h2.console.enabled' in the Spring application. \n2. Identify the purpose and context of the configuration option 'project.build.plugins.plugin_maven-jar-plugin.configuration.archive.manifest.addDefaultSpecificationEntries' in the Maven build process. \n3. Determine if there is any functional or operational overlap between enabling the H2 console in a Spring application and adding default specification entries to the manifest in a Maven build. \n4. Assess if the values being 'true' for both configurations are coincidental or if they must be identical for the application to function correctly. \n5. Conclude whether a value-equality dependency exists based on the analysis.",
  "rationale": "The configuration option 'spring.h2.console.enabled' is a Spring Boot property that enables the H2 database console, which is used for database management and debugging. On the other hand, 'project.build.pl

Batches: 100%|██████████| 1/1 [00:00<00:00, 13.46it/s]


Response:  {
  "plan": "1. Identify the configuration option 'project.dependencies.dependency_apollo-portal.artifactId' in the Maven configuration file 'apollo-assembly/pom.xml'.\n2. Identify the configuration option 'COPY.src' in the Dockerfile located at 'apollo-portal/src/main/docker/Dockerfile'.\n3. Compare the values of these two configuration options to check if they are identical.\n4. Determine if there is any functional or operational dependency between these two options based on their values and usage context.\n5. Conclude whether a value-equality dependency exists between these two configuration options.",
  "rationale": "The configuration option 'project.dependencies.dependency_apollo-portal.artifactId' in the Maven configuration file specifies the artifactId for a dependency, which is 'apollo-portal'. The configuration option 'COPY.src' in the Dockerfile specifies a path, which also includes 'apollo-portal'. While both values contain 'apollo-portal', they serve different pu

Batches: 100%|██████████| 1/1 [00:00<00:00, 21.06it/s]


Response:  { 
  "plan": "1. Identify the configuration options in question: project.parent_apollo.relativePath in apollo-assembly/pom.xml and project.parent_apollo.relativePath in apollo-portal/pom.xml. \n2. Verify the values of these configuration options: both are set to '../pom.xml'. \n3. Understand the purpose of the 'relativePath' configuration in Maven, which is to specify the path to the parent POM file relative to the current POM file. \n4. Determine if the functionality of one configuration option is directly dependent on the value of the other. \n5. Assess if the identical values are coincidental or if they must be the same for the build process to work correctly.",
  "rationale": "The 'relativePath' configuration in Maven specifies the path to the parent POM file. Both configuration options have the same value '../pom.xml', which indicates that they are pointing to the same parent POM file. This is a common practice in multi-module Maven projects where multiple modules share

In [7]:
def _mean_or_nan(scores):
    """Return the arithmetic mean of ``scores``, or NaN when the list is empty."""
    return sum(scores) / len(scores) if scores else float("nan")


faithfulness_scores = []
answer_relevancy_scores = []
context_relevancy_scores = []

# Score every collected response with the three metrics. The ground-truth
# rows are not needed here, so the loop iterates over the responses only.
for response in output:

    test_case = LLMTestCase(
        input=response.input,
        actual_output=response.response,
        retrieval_context=[source_node.node.get_content() for source_node in response.source_nodes]
    )

    # Each metric object keeps the score of its most recent measure() call,
    # so the score is read right after measuring.
    context_relevancy_metric.measure(test_case)
    faithfulness_metric.measure(test_case)
    answer_relevancy_metric.measure(test_case)

    context_relevancy_scores.append(context_relevancy_metric.score)
    faithfulness_scores.append(faithfulness_metric.score)
    answer_relevancy_scores.append(answer_relevancy_metric.score)

# Per-response scores followed by their mean (NaN if nothing was scored).
print("Context Relevancy: ", context_relevancy_scores, _mean_or_nan(context_relevancy_scores))
print("Answer Relevancy: ", answer_relevancy_scores, _mean_or_nan(answer_relevancy_scores))
print("Faithfulness: ", faithfulness_scores, _mean_or_nan(faithfulness_scores))

Context Relevancy:  [0.0, 0.0, 0.0, 0.0, 0.0] 0.0
Answer Relevancy:  [0.6153846153846154, 0.8181818181818182, 0.3125, 0.7142857142857143, 0.9166666666666666] 0.6754037629037629
Faithfulness:  [1.0, 1.0, 1.0, 0.8, 1.0] 0.96


In [9]:
import re

import json  # imported here so this cell does not rely on an earlier cell's import


def _rating_label(raw_rating):
    """Normalise a ground-truth rating to "True" or "False".

    Returns None when the rating is neither (for example a missing value),
    so the caller can skip the row instead of mis-counting it.
    """
    label = str(raw_rating).strip()
    return label if label in ("True", "False") else None


true_positives = []
true_negatives = []
false_positives = []
false_negatives = []
accuracy = []

for response, baseline in zip(output, df_sample.to_dict("records")):

    response_dict = json.loads(response.response)
    predicted = bool(response_dict["isDependency"])

    # Compare by exact equality: `rating in ("True")` is a substring test on
    # the string "True" (the parentheses do not make a tuple), which matches
    # "" or "T" and raises TypeError for non-string ratings.
    expected = _rating_label(baseline["rating"])
    if expected is None:
        # No clear True/False ground truth for this row; leave it out.
        continue

    # Exactly one of the four outcomes is recorded per row.
    if expected == "True" and predicted:
        accuracy.append(1)
        true_positives.append(1)
    elif expected == "False" and not predicted:
        accuracy.append(1)
        true_negatives.append(1)
    elif expected == "True" and not predicted:
        accuracy.append(0)
        false_negatives.append(1)
    else:
        accuracy.append(0)
        false_positives.append(1)


#precision = sum(true_positives)/(sum(true_positives)+sum(false_positives))
#recall = sum(true_positives)/(sum(true_positives)+sum(false_negatives))
#f1_score = 2 * (precision * recall) / (precision + recall)

# NaN instead of ZeroDivisionError when no row could be evaluated.
print("Accuracy", sum(accuracy)/len(accuracy) if accuracy else float("nan"))
print("TP", sum(true_positives))
print("FP", sum(false_positives))
print("TN", sum(true_negatives))
print("FN", sum(false_negatives))
#print("Precision", precision)
#print("Recall", recall)
#print("F1 Score: ", f1_score)

Accuracy 0.8
TP 2
FP 0
TN 2
FN 1


In [None]:
#inputs = [x.input for x in output]
# Raw response text of every query.
results = []
for item in output:
    results.append(item.response)

# Note: the "isDependency" column written below holds the ground-truth
# rating from the evaluation data, not the model's prediction.
rating = df_sample["rating"]

df_results = pd.DataFrame.from_dict(data={"outputs": results, "isDependency": rating})
df_results.to_csv(f"../data/results/test_{index_name}.csv", index=False)


In [None]:
# Show the input and raw response of the first query only (nothing if empty).
for first_response in output[:1]:
    print(first_response.input)
    print(first_response.response)