**Qualitative Analysis of Validation Responses**

In [18]:
import json
import os
import warnings

import pandas as pd


def get_df_stats(model_name: str) -> pd.DataFrame:
    """Annotate the baseline dependencies with one model's validation verdicts.

    Reads the model's answers from
    ``../data/evaluation/without/all_dependencies_without_{model_name}.json``
    and the baseline table from ``../data/evaluation/all_dependencies.csv``,
    pairs them positionally and adds five columns to the baseline frame:
    ``response_rating``, ``classification``, ``plan``, ``rationale`` and
    ``uncertainty``.

    ``classification`` compares the model's ``isDependency`` answer with the
    row's ``final_rating``:

    * rating "true":       TP if the model says dependency, else FN
    * rating "false":      FP if the model says dependency, else TN
    * rating "borderline": TP if the model says dependency, else TN

    Rows whose answer cannot be used (missing, invalid JSON, no
    ``isDependency`` field) keep empty values in all five columns, and rows
    with any other rating keep an empty ``classification``. A single warning
    lists the rows whose answer could not be parsed.

    Args:
        model_name: Model identifier used in the JSON file name.

    Returns:
        The baseline DataFrame with the five columns added.

    Raises:
        ValueError: If the JSON file holds fewer entries than the baseline
            has rows.
        AssertionError: If an entry's "index" does not match its row index.
    """
    data_file = f"../data/evaluation/without/all_dependencies_without_{model_name}.json"
    baseline_file = "../data/evaluation/all_dependencies.csv"

    with open(data_file, "r", encoding="utf-8") as src:
        data = json.load(src)

    df = pd.read_csv(baseline_file)

    # zip() below would silently stop early; fail with a readable message instead
    # of the opaque length-mismatch error raised by the column assignment.
    if len(data) < len(df):
        raise ValueError(
            f"{data_file} has {len(data)} entries but {baseline_file} has {len(df)} rows"
        )

    columns = ("response_rating", "classification", "plan", "rationale", "uncertainty")
    records = []
    unparsed_rows = []

    for entry, (index, row) in zip(data, df.iterrows()):
        assert entry["index"] == index, (
            f"entry index {entry['index']} does not match baseline row {index}"
        )

        response_dict = _parse_response(entry)
        if response_dict is None:
            # Keep the row (all fields empty) so the columns stay aligned with
            # the baseline and no value leaks over from the previous entry.
            unparsed_rows.append(index)
            records.append(dict.fromkeys(columns))
            continue

        is_dependency = response_dict["isDependency"]
        records.append(
            {
                "response_rating": str(is_dependency),
                "classification": _classify(is_dependency, row["final_rating"]),
                "plan": response_dict.get("plan"),
                "rationale": response_dict.get("rationale"),
                "uncertainty": response_dict.get("uncertainty"),
            }
        )

    if unparsed_rows:
        warnings.warn(
            f"{model_name}: {len(unparsed_rows)} response(s) could not be parsed "
            f"and were left unclassified (rows {unparsed_rows})",
            stacklevel=2,
        )

    for column in columns:
        df[column] = [record[column] for record in records]

    return df


def _parse_response(entry):
    """Return the decoded answer dict of ``entry``, or ``None`` if it is unusable.

    The raw answer is ``entry["responses"][0]`` when a "responses" key exists
    and ``entry["response"]`` otherwise. It is unusable when it is missing, is
    not valid JSON, or is not a JSON object with an "isDependency" field.
    """
    try:
        raw = entry["responses"][0] if "responses" in entry else entry["response"]
        parsed = json.loads(raw)
    except (json.JSONDecodeError, KeyError, IndexError, TypeError):
        return None
    if not isinstance(parsed, dict) or "isDependency" not in parsed:
        return None
    return parsed


def _classify(is_dependency, rating):
    """Map a model verdict and a baseline rating to TP/TN/FP/FN (``None`` if unknown)."""
    label = str(rating).lower()
    if label == "borderline":
        # Borderline ratings never count as an error of the model.
        return "TP" if is_dependency else "TN"
    if label == "true":
        return "TP" if is_dependency else "FN"
    if label == "false":
        return "FP" if is_dependency else "TN"
    return None


# Models whose validation responses are analysed in this cell.
model_names = [
    "gpt-3.5-turbo-0125",
    "gpt-4o-2024-05-13",
    "llama3:8b",
    "llama3:70b",
]

# Classify each model's responses and store the annotated baseline as one CSV per model.
for name in model_names:
    output_file = f"../data/evaluation/analysis/without/{name}.csv"
    df = get_df_stats(model_name=name)
    df.to_csv(output_file, index=False)


In [19]:
import pandas as pd

model_names = [
    'llama3:70b',
    'gpt-4o-2024-05-13',
    'gpt-3.5-turbo-0125',
    'llama3:8b',
]

df_base = pd.read_csv("../data/evaluation/all_dependencies.csv")

# Collect every model's classification column side by side on the baseline table.
# The values are copied positionally (as a plain list), not aligned on the index.
for name in model_names:
    file_name = f"../data/evaluation/analysis/without/{name}.csv"
    df_model = pd.read_csv(file_name)
    classification = df_model["classification"].tolist()
    df_base[f"{name}_classification"] = classification

df_base.to_csv("../data/evaluation/analysis/without/all.csv", index=False)

# Number of rows in the combined table.
print(df_base.shape[0])


500


In [20]:
import pandas as pd

df = pd.read_csv("../data/evaluation/analysis/without/all.csv")

# One classification column per evaluated model.
model_columns = [
    'llama3:70b_classification',
    'gpt-4o-2024-05-13_classification',
    'gpt-3.5-turbo-0125_classification',
    'llama3:8b_classification',
]

# Per row, count how many models were wrong (FP/FN) and how many were right (TP/TN).
# isin() on the whole frame is vectorised, so no row-wise apply with a lambda is needed.
df['FP_FN_count'] = df[model_columns].isin(['FP', 'FN']).sum(axis=1)
df['TP_TN_count'] = df[model_columns].isin(['TP', 'TN']).sum(axis=1)

# Rows that at least one model got wrong can be selected with df[df['FP_FN_count'] > 0].

# Write the two count columns back into the combined file (this overwrites the input).
df.to_csv('../data/evaluation/analysis/without/all.csv', index=False)

In [21]:
import pandas as pd
from collections import Counter

df_all = pd.read_csv('../data/evaluation/analysis/without/all.csv')

# Per-row tallies of wrong (FP/FN) and right (TP/TN) model verdicts.
failure_count = df_all["FP_FN_count"].tolist()
success_count = df_all["TP_TN_count"].tolist()

# Distribution of those tallies: how many rows have 0, 1, 2, ... models wrong / right.
failure_counter = Counter(failure_count)
success_counter = Counter(success_count)

# Size checks, one value per line.
# NOTE(review): the third value is the number of distinct keys in success_counter,
# not the number of rows -- confirm this is intended rather than len(success_count).
print(len(df_all), len(failure_count), len(success_counter), sep="\n")

print("Failure counter: ", failure_counter)
print("Success counter: ", success_counter)

500
500
5
Failure counter:  Counter({0: 167, 1: 159, 2: 115, 3: 54, 4: 5})
Success counter:  Counter({4: 167, 3: 159, 2: 115, 1: 54, 0: 5})


In [24]:
import pandas as pd
from collections import Counter

baseline_file = "../data/evaluation/all_dependencies.csv"
df = pd.read_csv(baseline_file)

# Tally how often each value of the "problem_category" column occurs in the baseline.
# Missing categories are kept and show up under the nan key.
categories = df["problem_category"].tolist()
category_counter = Counter(categories)

# Last expression of the cell: displayed as the cell output.
category_counter

Counter({'Maven Dependency': 80,
         'Project Inheritance': 67,
         'Boolean': 57,
         'Generic Names': 29,
         'Maven Properties': 22,
         'POM Version': 20,
         'Others': 19,
         'Datasource': 18,
         'Port': 16,
         'Maven Plugin': 15,
         nan: 15,
         'Different Dockerfiles': 13,
         'Dependency Scope': 11,
         'Project Structure': 11,
         'Packaging Format': 11,
         'Project Aggregation': 10,
         'Module as Dependency': 10,
         'Logging Level': 10,
         'Maven Plugins': 9,
         'Project Version': 8,
         'Dependency': 7,
         'Different Services': 7,
         'Number': 6,
         'JAVA Version': 6,
         'Health Monitoring': 3,
         'Executable': 3,
         'Eureka': 2,
         'Encoding': 2,
         'Version': 2,
         'URL Matching': 2,
         'Environment': 2,
         'Logging': 2,
         'Networks': 2,
         'Commands': 1,
         'project Version': 1,
  