**Qualitative Analysis of Validation Responses**

In [53]:
import json
import pandas as pd


def _parse_response(entry):
    """Return the decoded response object of ``entry``, or None if unusable.

    The raw response is taken from ``entry["responses"][0]`` when the entry has
    a "responses" key and from ``entry["response"]`` otherwise. None is returned
    when the raw value is not valid JSON, does not decode to a JSON object, or
    lacks the "isDependency" key.
    """
    raw = entry["responses"][0] if "responses" in entry else entry["response"]
    try:
        parsed = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return None
    if not isinstance(parsed, dict) or "isDependency" not in parsed:
        return None
    return parsed


def _classify(is_dependency, rating):
    """Map a model answer and a baseline rating to "TP"/"TN"/"FP"/"FN".

    The rating is compared case-insensitively via ``str(rating).lower()``.
    A "borderline" rating counts the model answer as correct either way
    (TP when it says dependency, TN otherwise). Any other rating than
    true/false/borderline yields None.
    """
    rating_key = str(rating).lower()
    if rating_key == "borderline":
        return "TP" if is_dependency else "TN"
    if rating_key == "true":
        # Baseline says the dependency is real.
        return "TP" if is_dependency else "FN"
    if rating_key == "false":
        # Baseline says the dependency is not real.
        return "FP" if is_dependency else "TN"
    return None


def get_df_stats(model_name: str):
    """Join one model's validation responses with the rated baseline.

    Reads the responses from
    ``../data/evaluation/without/all_dependencies_without_{model_name}.json``
    and the baseline from ``../data/evaluation/all_dependencies.csv``, pairs
    them by position, and adds the columns ``response_rating``,
    ``classification``, ``plan``, ``rationale`` and ``uncertainty``.

    Rows whose response cannot be interpreted (invalid JSON, not a JSON
    object, or no "isDependency" key) get None in all five added columns
    instead of silently reusing the previous row's answer. Rows whose
    ``final_rating`` is not true/false/borderline get None as classification.
    Missing "plan"/"rationale"/"uncertainty" keys are stored as None.

    Args:
        model_name: Model identifier used to build the response file name.

    Returns:
        The baseline DataFrame with the five added columns; always one row
        per baseline row.

    Raises:
        ValueError: If the response file and the baseline differ in length.
        AssertionError: If an entry's "index" does not match its row index.
    """
    data_file = f"../data/evaluation/without/all_dependencies_without_{model_name}.json"
    baseline_file = "../data/evaluation/all_dependencies.csv"

    with open(data_file, "r", encoding="utf-8") as src:
        data = json.load(src)

    df = pd.read_csv(baseline_file)

    # zip() would silently truncate; fail early with a readable message instead.
    if len(data) != len(df):
        raise ValueError(
            f"{data_file} has {len(data)} entries but {baseline_file} has {len(df)} rows"
        )

    added = {
        "response_rating": [],
        "classification": [],
        "plan": [],
        "rationale": [],
        "uncertainty": [],
    }

    for entry, (index, row) in zip(data, df.iterrows()):

        assert entry["index"] == index, f"entry index {entry['index']} != row index {index}"

        response_dict = _parse_response(entry)

        if response_dict is None:
            # Keep the row aligned with the baseline but mark it as unusable.
            for values in added.values():
                values.append(None)
            continue

        is_dependency = response_dict["isDependency"]

        added["response_rating"].append(str(is_dependency))
        added["classification"].append(_classify(is_dependency, row["final_rating"]))
        added["plan"].append(response_dict.get("plan"))
        added["rationale"].append(response_dict.get("rationale"))
        added["uncertainty"].append(response_dict.get("uncertainty"))

    for column, values in added.items():
        df[column] = values

    return df


# Models whose validation responses are analysed in this notebook.
model_names = [
    "gpt-3.5-turbo-0125",
    "gpt-4o-2024-05-13",
    "llama3:8b",
    "llama3:70b",
]

# Build the per-model analysis table and persist it next to the others.
for name in model_names:
    df = get_df_stats(model_name=name)
    print(df.columns)
    output_file = f"../data/evaluation/analysis/without/{name}.csv"
    df.to_csv(output_file, index=False)


Index(['dependency_category', 'link_str', 'project', 'option_name',
       'option_value', 'option_type', 'option_file', 'option_technology',
       'dependent_option_name', 'dependent_option_value',
       'dependent_option_type', 'dependent_option_file',
       'dependent_option_technology', 'config_type', 'rating', 'final_rating',
       'final_category', 'category', 'explanation', 'response_rating',
       'classification', 'plan', 'rationale', 'uncertainty'],
      dtype='object')
Index(['dependency_category', 'link_str', 'project', 'option_name',
       'option_value', 'option_type', 'option_file', 'option_technology',
       'dependent_option_name', 'dependent_option_value',
       'dependent_option_type', 'dependent_option_file',
       'dependent_option_technology', 'config_type', 'rating', 'final_rating',
       'final_category', 'category', 'explanation', 'response_rating',
       'classification', 'plan', 'rationale', 'uncertainty'],
      dtype='object')
Index(['dependency

In [54]:
import pandas as pd

model_names = [
    'llama3:70b',
    'gpt-4o-2024-05-13',
    'gpt-3.5-turbo-0125',
    'llama3:8b',
]

# Start from the rated baseline and add one classification column per model.
df_base = pd.read_csv("../data/evaluation/all_dependencies.csv")

for name in model_names:
    file_name = f"../data/evaluation/analysis/without/{name}.csv"
    df_model = pd.read_csv(file_name)

    # Values are copied positionally (as a plain list), not aligned by index.
    classification = df_model["classification"].to_list()
    df_base[f"{name}_classification"] = classification

df_base.to_csv("../data/evaluation/analysis/without/all.csv", index=False)

print(len(df_base))


500


In [55]:
import pandas as pd

df = pd.read_csv("../data/evaluation/analysis/without/all.csv")

# Per-model classification columns that are tallied for every dependency.
model_columns = [
    'llama3:70b_classification',
    'gpt-4o-2024-05-13_classification',
    'gpt-3.5-turbo-0125_classification',
    'llama3:8b_classification',
]

# For each row, count how many models were wrong (FP/FN) and how many were right (TP/TN).
model_labels = df[model_columns]
df['FP_FN_count'] = model_labels.isin(['FP', 'FN']).sum(axis=1)
df['TP_TN_count'] = model_labels.isin(['TP', 'TN']).sum(axis=1)

# Write the enriched table back to the same file it was read from.
df.to_csv('../data/evaluation/analysis/without/all.csv', index=False)

In [56]:
import pandas as pd
from collections import Counter

df_all = pd.read_csv('../data/evaluation/analysis/without/all.csv')

# Number of models that failed / succeeded on each dependency, as plain lists.
failure_count = df_all["FP_FN_count"].to_list()
success_count = df_all["TP_TN_count"].to_list()

# Distribution: how many dependencies had 0, 1, 2, ... failing (or succeeding) models.
failure_counter, success_counter = Counter(failure_count), Counter(success_count)

# Sanity output: row count, tally count, and number of distinct success tallies.
for size in (len(df_all), len(failure_count), len(success_counter)):
    print(size)

print("Failure counter: ", failure_counter)
print("Success counter: ", success_counter)

500
500
5
Failure counter:  Counter({0: 167, 1: 159, 2: 115, 3: 54, 4: 5})
Success counter:  Counter({4: 167, 3: 159, 2: 115, 1: 54, 0: 5})


In [57]:
import pandas as pd
from collections import Counter

baseline_file = "../data/evaluation/all_dependencies.csv"
df = pd.read_csv(baseline_file)

# Lower-case the category labels so differently-cased spellings are counted together.
categories = [label.lower() for label in df["final_category"].to_list()]

category_counter = Counter(categories)
category_counter

Counter({'maven dependency': 91,
         'project inheritance': 82,
         'boolean': 57,
         'others': 48,
         'version numbers': 44,
         'generic names': 29,
         'maven properties': 22,
         'independent components': 20,
         'datasource': 19,
         'port': 16,
         'maven plugin': 15,
         'project structure': 11,
         'packaging format': 11,
         'project aggregation': 10,
         'module as dependency': 10,
         'maven plugins': 9,
         'integers': 6})

In [58]:
file_name = "../data/evaluation/analysis/without/gpt-4o-2024-05-13.csv"
df = pd.read_csv(file_name)

# Restrict to the rows labelled exactly "Maven Dependency" (case-sensitive match).
is_maven_dependency = df['final_category'].eq("Maven Dependency")
df_category = df.loc[is_maven_dependency]

# Tally TP/TN/FP/FN for that category.
rating_counts = df_category['classification'].value_counts()
rating_counts

classification
TN    53
TP    22
FN    15
FP     1
Name: count, dtype: int64

In [60]:
from typing import List
import pandas as pd

def get_category_stats(category_names: List, model_name: str):
    """Print a per-category table of TP/TN/FP/FN counts for one model.

    Reads ``../data/evaluation/analysis/without/{model_name}.csv`` and, for each
    name in ``category_names``, counts the ``classification`` values of the rows
    whose lower-cased ``final_category`` equals ``str(category_name)``.
    A label that does not occur in a category is shown as "-".

    Args:
        category_names: Category names to report, compared against the
            lower-cased ``final_category`` column.
        model_name: Model identifier used to build the analysis file name.

    Returns:
        None; the table is printed.
    """
    analysis = pd.read_csv(f"../data/evaluation/analysis/without/{model_name}.csv")
    lowered_category = analysis['final_category'].str.lower()

    labels = ("TP", "TN", "FP", "FN")
    table = {"category_name": category_names}
    table.update({label: [] for label in labels})

    for category_name in category_names:
        in_category = analysis[lowered_category == str(category_name)]
        counts = in_category['classification'].value_counts().to_dict()

        # One cell per label; "-" marks a label with no occurrences.
        for label in labels:
            table[label].append(counts.get(label, "-"))

    print(pd.DataFrame(table))




model_names = [
    'gpt-4o-2024-05-13',
    'gpt-3.5-turbo-0125',
    'llama3:70b',
    'llama3:8b',
]

# Report every category seen in the baseline, in the counter's key order.
category_names = list(category_counter.keys())

for model_name in model_names:
    print("Model: ", model_name)
    get_category_stats(model_name=model_name, category_names=category_names)

Model:  gpt-4o-2024-05-13
             category_name  TP  TN FP  FN
0          version numbers   9  16  3  16
1                  boolean   -  57  -   -
2      project inheritance  34  10  6  32
3         maven properties  18   -  -   4
4                   others   9  38  1   -
5      project aggregation   8   -  -   2
6         maven dependency  22  53  1  15
7        project structure   7   -  3   1
8            maven plugins   1   7  -   1
9             maven plugin   -  11  -   4
10                    port   5  11  -   -
11    module as dependency   9   -  -   1
12           generic names   -  28  1   -
13        packaging format   -  11  -   -
14              datasource   5  14  -   -
15  independent components   -  20  -   -
16                integers   -   6  -   -
Model:  gpt-3.5-turbo-0125
             category_name  TP  TN  FP  FN
0          version numbers  15   4  15  10
1                  boolean   -  55   2   -
2      project inheritance  33   7   9  33
3         maven pro