In [2]:
from typing import List
import pandas as pd
import json
import os

def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _extract_is_dependency(entry) -> bool:
    """Read the model's ``isDependency`` verdict from one result entry.

    The entry carries the raw model answer either as the first element of
    ``entry["responses"]`` or as ``entry["response"]``; the answer is a JSON
    document with an ``isDependency`` field.

    Raises:
        json.JSONDecodeError, KeyError, IndexError, TypeError: if the entry
            does not have that shape (invalid JSON, missing key, empty
            ``responses`` list, non-string answer, non-object JSON payload).
    """
    response = entry["responses"][0] if "responses" in entry else entry["response"]
    verdict = json.loads(response, strict=False)["isDependency"]

    # A verdict emitted as the *string* "false" would be truthy in Python and
    # get scored as a positive, so map the two textual booleans explicitly.
    if isinstance(verdict, str):
        normalized = verdict.strip().lower()
        if normalized in ("true", "false"):
            return normalized == "true"
    return bool(verdict)


def compute_validation_effectiveness(baseline_file: str, model_names: List, result_dir: str, index_name: str):
    """Score each model's dependency validations against the rated baseline.

    For every model, the file
    ``{result_dir}/all_dependencies_{index_name}_{model_name}.json`` is loaded
    and its entries are paired *positionally* with the rows of the baseline
    CSV (no de-duplication is applied to the baseline). Each pair is
    classified from the baseline's ``final_rating`` column and the model's
    ``isDependency`` verdict:

    * rating ``true``  + verdict true  -> true positive
    * rating ``false`` + verdict true  -> false positive
    * rating ``false`` + verdict false -> true negative
    * rating ``true``  + verdict false -> false negative
    * rating ``borderline`` -> always counted as correct (TP if the verdict
      is true, TN otherwise)

    Ratings are compared case-insensitively; rows with any other rating are
    not counted. Entries whose response cannot be parsed are counted as
    skipped. Per-model counts and metrics are printed, and accuracy,
    precision, recall and F1 are written to
    ``{result_dir}/validation_effectiveness.csv``. A metric whose denominator
    is zero is reported as 0.0 instead of raising ``ZeroDivisionError``.

    Args:
        baseline_file: Path to the baseline CSV; must have a ``final_rating`` column.
        model_names: Model identifiers used to build the result file names.
        result_dir: Directory holding the per-model JSON files; also receives the CSV.
        index_name: Index identifier used in the result file names.
    """
    df_baseline = pd.read_csv(baseline_file)

    results = []

    for model_name in model_names:

        results_file = f"{result_dir}/all_dependencies_{index_name}_{model_name}.json"

        with open(results_file, "r", encoding="utf-8") as src:
            data = json.load(src)

        # zip() below stops at the shorter input, so make a mismatch visible.
        if len(data) != len(df_baseline):
            print(
                f"Warning: {results_file} has {len(data)} entries but the baseline has "
                f"{len(df_baseline)} rows; only the first {min(len(data), len(df_baseline))} "
                f"pairs are evaluated."
            )

        tp = fp = tn = fn = 0
        skipped = 0

        for entry, (_, row) in zip(data, df_baseline.iterrows()):
            rating = str(row["final_rating"]).lower()

            try:
                is_dependency = _extract_is_dependency(entry)
            except (json.JSONDecodeError, KeyError, IndexError, TypeError):
                # Malformed or missing model answer: report it, don't abort the run.
                skipped += 1
                continue

            if rating == "borderline":
                # Borderline baseline ratings accept either verdict as correct.
                if is_dependency:
                    tp += 1
                else:
                    tn += 1
            elif rating == "true":
                # TP: validated as a dependency and it is one.
                # FN: rejected although it is a dependency.
                if is_dependency:
                    tp += 1
                else:
                    fn += 1
            elif rating == "false":
                # FP: validated as a dependency although it is not one.
                # TN: rejected and it is not a dependency.
                if is_dependency:
                    fp += 1
                else:
                    tn += 1
            # Any other rating (e.g. a missing value) is left uncounted.

        accuracy = _safe_ratio(tp + tn, tp + fp + tn + fn)

        print(f"Num Failures: {fp + fn}")

        print("Model: ", model_name)
        print("File: ", results_file)
        print("TP", tp)
        print("FP", fp)
        print("TN", tn)
        print("FN", fn)
        print("Sum correct response format: ", sum([tp, fp, tn, fn]))
        print("Entries skipped: ", skipped)

        precision = _safe_ratio(tp, tp + fp)
        recall = _safe_ratio(tp, tp + fn)
        f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

        print("Accuracy", accuracy)
        print("Precision", precision)
        print("Recall", recall)
        print("F1 Score: ", f1_score)

        results.append({
            "model_name": model_name,
            "file": results_file,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1-score": f1_score
        })

    df_results = pd.DataFrame(results)
    df_results.to_csv(f"{result_dir}/validation_effectiveness.csv", index=False)


# Score every model's validation run for the "all" index against the rated
# baseline; the summary CSV is written into the config2 results directory.
compute_validation_effectiveness(
    baseline_file="../data/results/all_dependencies_updated.csv",
    result_dir="../data/results/config2",
    index_name="all",
    model_names=[
        "gpt-4o-2024-05-13",
        "gpt-3.5-turbo-0125",
        "gpt-3.5-turbo-0125_advanced",
        "llama3:70b",
        "llama3:8b",
        "llama3.1:70b_advanced",
        # "llama3.1:70b" is currently left out of the comparison
    ],
)

Num Failures: 116
Model:  gpt-4o-2024-05-13
File:  ../data/results/config2/all_dependencies_all_gpt-4o-2024-05-13.json
TP 114
FP 31
TN 270
FN 85
Sum correct response format:  500
Entries skipped:  0
Accuracy 0.768
Precision 0.7862068965517242
Recall 0.5728643216080402
F1 Score:  0.6627906976744186
Num Failures: 185
Model:  gpt-3.5-turbo-0125
File:  ../data/results/config2/all_dependencies_all_gpt-3.5-turbo-0125.json
TP 155
FP 141
TN 160
FN 44
Sum correct response format:  500
Entries skipped:  0
Accuracy 0.63
Precision 0.5236486486486487
Recall 0.7788944723618091
F1 Score:  0.6262626262626263
Num Failures: 146
Model:  gpt-3.5-turbo-0125_advanced
File:  ../data/results/config2/all_dependencies_all_gpt-3.5-turbo-0125_advanced.json
TP 144
FP 91
TN 210
FN 55
Sum correct response format:  500
Entries skipped:  0
Accuracy 0.708
Precision 0.6127659574468085
Recall 0.7236180904522613
F1 Score:  0.663594470046083
Num Failures: 152
Model:  llama3:70b
File:  ../data/results/config2/all_dependenci

In [3]:
import pandas as pd


# Summary CSV written by compute_validation_effectiveness when it is called
# with result_dir="../data/results/config2".
file_path = "../data/results/config2/validation_effectiveness.csv"

# One row per evaluated model: model_name, file, accuracy, precision, recall, f1-score.
df = pd.read_csv(file_path)

# Bare last expression so the notebook renders the table as the cell output.
df

Unnamed: 0,model_name,file,accuracy,precision,recall,f1-score
0,gpt-4o-2024-05-13,../data/results/config2/all_dependencies_all_g...,0.768,0.786207,0.572864,0.662791
1,gpt-3.5-turbo-0125,../data/results/config2/all_dependencies_all_g...,0.63,0.523649,0.778894,0.626263
2,gpt-3.5-turbo-0125_advanced,../data/results/config2/all_dependencies_all_g...,0.708,0.612766,0.723618,0.663594
3,llama3:70b,../data/results/config2/all_dependencies_all_l...,0.696,0.588679,0.78392,0.672414
4,llama3:8b,../data/results/config2/all_dependencies_all_l...,0.538835,0.475758,0.902299,0.623016
5,llama3.1:70b_advanced,../data/results/config2/all_dependencies_all_l...,0.752,0.686567,0.693467,0.69
