**Compute Validation Effectiveness for LLM with Retrieval-Augmented Generation**

In [9]:
import pandas as pd
import json

baseline = "../data/evaluation/all_dependencies.csv"
model_name = "gpt-4o-2024-05-13" # gpt-3.5-turbo-0125 gpt-4o-2024-05-13 llama3:8b llama3:70b
results_file = f"../data/evaluation/config6/all_dependencies_all_{model_name}_100.json"

# Baseline rows whose index is at or above this bound are not scored.
MAX_ROWS = 100


def extract_is_dependency(entry):
    """Return the ``isDependency`` value stored in one result entry.

    The raw response is taken from ``entry["responses"][0]`` when the entry
    has a ``"responses"`` key, otherwise from ``entry["response"]``. It is
    parsed as JSON and its ``"isDependency"`` member is returned unchanged.

    Raises:
        json.JSONDecodeError: the response text is not valid JSON.
        LookupError: a required key or list position is missing.
        TypeError: the response is not text, or the parsed JSON is not an
            object that can be indexed by key.
    """
    if "responses" in entry:
        response = entry["responses"][0]
    else:
        response = entry["response"]
    return json.loads(response)["isDependency"]


def classify_outcome(is_dependency, rating):
    """Map one (LLM verdict, baseline rating) pair to a confusion-matrix cell.

    Returns ``"TP"``, ``"FP"``, ``"TN"`` or ``"FN"``, or ``None`` when the
    rating is none of "Borderline" (exact match), "true" or "false"
    (case-insensitive after ``str()``).

    A "Borderline" rating counts as correct whatever the verdict: a positive
    verdict is booked as TP and a negative one as TN.

    NOTE(review): ``is_dependency`` is evaluated by truthiness, so a
    non-boolean value such as the string "false" counts as a positive verdict.
    """
    label = str(rating)
    if label == "Borderline":
        return "TP" if is_dependency else "TN"

    truth = label.lower()
    if truth == "true":
        # TP: validated as correct and actually correct.
        # FN: validated as incorrect although actually correct.
        return "TP" if is_dependency else "FN"
    if truth == "false":
        # FP: validated as correct although actually incorrect.
        # TN: validated as incorrect and actually incorrect.
        return "FP" if is_dependency else "TN"
    return None


def safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# True when this cell runs in a notebook kernel or as a script; keeps the
# helpers above importable without touching the data files.
if __name__ == "__main__":
    df_baseline = pd.read_csv(baseline)

    with open(results_file, "r", encoding="utf-8") as src:
        data = json.load(src)

    true_positives = []
    true_negatives = []
    false_positives = []
    false_negatives = []
    accuracy = []
    outcome_lists = {
        "TP": true_positives,
        "FP": false_positives,
        "TN": true_negatives,
        "FN": false_negatives,
    }

    for entry, (index, row) in zip(data, df_baseline.iterrows()):

        if index >= MAX_ROWS:
            continue

        rating = row["final_rating"]
        try:
            isDependency = extract_is_dependency(entry)
        except (json.JSONDecodeError, LookupError, TypeError) as error:
            # A response that cannot be read is skipped outright; falling back
            # to a verdict parsed for an earlier entry would miscount it.
            print(f"Skipping row {index}: {type(error).__name__}: {error}")
            continue

        outcome = classify_outcome(isDependency, rating)
        if outcome is None:
            continue
        outcome_lists[outcome].append(1)
        accuracy.append(1 if outcome in ("TP", "TN") else 0)

    tp_count = sum(true_positives)
    fp_count = sum(false_positives)
    tn_count = sum(true_negatives)
    fn_count = sum(false_negatives)

    print("Model: ", model_name)
    print("File: ", results_file)
    print("TP", tp_count)
    print("FP", fp_count)
    print("TN", tn_count)
    print("FN", fn_count)
    print("Sum correct response format: ", sum([tp_count, fp_count, tn_count, fn_count]))

    # Each ratio falls back to 0.0 instead of raising when its denominator is
    # empty (e.g. no positive verdicts at all).
    precision = safe_ratio(tp_count, tp_count + fp_count)
    recall = safe_ratio(tp_count, tp_count + fn_count)
    f1_score = safe_ratio(2 * (precision * recall), precision + recall)

    print("Accuracy", safe_ratio(sum(accuracy), len(accuracy)))
    print("Precision", precision)
    print("Recall", recall)
    print("F1 Score: ", f1_score)

Model:  gpt-4o-2024-05-13
File:  ../data/evaluation/config6/all_dependencies_all_gpt-4o-2024-05-13_100.json
TP 25
FP 8
TN 51
FN 16
Sum correct response format:  100
Accuracy 0.76
Precision 0.7575757575757576
Recall 0.6097560975609756
F1 Score:  0.6756756756756757


In [27]:
import pandas as pd
import json

baseline = "../data/evaluation/all_dependencies.csv"
model_name = "gpt-4o-2024-05-13" # gpt-3.5-turbo-0125 gpt-4o-2024-05-13 llama3:8b llama3:70b
results_file = "../data/evaluation/config2/all_dependencies_all_results.json"


def extract_model_verdict(entry, model):
    """Return the ``isDependency`` value that ``model`` produced for one entry.

    The raw response is read from ``entry["responses"][model]``, parsed as
    JSON, and its ``"isDependency"`` member is returned unchanged.

    Raises:
        json.JSONDecodeError: the response text is not valid JSON.
        LookupError: ``"responses"``, ``model`` or ``"isDependency"`` is missing.
        TypeError: the response is not text, or the parsed JSON is not an
            object that can be indexed by key.
    """
    response = entry["responses"][model]
    return json.loads(response)["isDependency"]


def classify_outcome(is_dependency, rating):
    """Map one (LLM verdict, baseline rating) pair to a confusion-matrix cell.

    Returns ``"TP"``, ``"FP"``, ``"TN"`` or ``"FN"``, or ``None`` when the
    rating is none of "Borderline" (exact match), "true" or "false"
    (case-insensitive after ``str()``).

    A "Borderline" rating counts as correct whatever the verdict: a positive
    verdict is booked as TP and a negative one as TN.

    NOTE(review): ``is_dependency`` is evaluated by truthiness, so a
    non-boolean value such as the string "false" counts as a positive verdict.
    """
    label = str(rating)
    if label == "Borderline":
        return "TP" if is_dependency else "TN"

    truth = label.lower()
    if truth == "true":
        # TP: validated as correct and actually correct.
        # FN: validated as incorrect although actually correct.
        return "TP" if is_dependency else "FN"
    if truth == "false":
        # FP: validated as correct although actually incorrect.
        # TN: validated as incorrect and actually incorrect.
        return "FP" if is_dependency else "TN"
    return None


def safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# True when this cell runs in a notebook kernel or as a script; keeps the
# helpers above importable without touching the data files.
if __name__ == "__main__":
    df_baseline = pd.read_csv(baseline)

    with open(results_file, "r", encoding="utf-8") as src:
        data = json.load(src)

    true_positives = []
    true_negatives = []
    false_positives = []
    false_negatives = []
    accuracy = []
    outcome_lists = {
        "TP": true_positives,
        "FP": false_positives,
        "TN": true_negatives,
        "FN": false_negatives,
    }

    for entry, (index, row) in zip(data, df_baseline.iterrows()):

        rating = row["final_rating"]
        try:
            isDependency = extract_model_verdict(entry, model_name)
        except (json.JSONDecodeError, LookupError, TypeError) as error:
            # A response that cannot be read is skipped outright; falling back
            # to a verdict parsed for an earlier entry would miscount it.
            print(f"Skipping row {index}: {type(error).__name__}: {error}")
            continue

        outcome = classify_outcome(isDependency, rating)
        if outcome is None:
            continue
        outcome_lists[outcome].append(1)
        accuracy.append(1 if outcome in ("TP", "TN") else 0)

    tp_count = sum(true_positives)
    fp_count = sum(false_positives)
    tn_count = sum(true_negatives)
    fn_count = sum(false_negatives)

    print("Model: ", model_name)
    print("File: ", results_file)
    print("TP", tp_count)
    print("FP", fp_count)
    print("TN", tn_count)
    print("FN", fn_count)
    print("Sum correct response format: ", sum([tp_count, fp_count, tn_count, fn_count]))

    # Each ratio falls back to 0.0 instead of raising when its denominator is
    # empty (e.g. no positive verdicts at all).
    precision = safe_ratio(tp_count, tp_count + fp_count)
    recall = safe_ratio(tp_count, tp_count + fn_count)
    f1_score = safe_ratio(2 * (precision * recall), precision + recall)

    print("Accuracy", safe_ratio(sum(accuracy), len(accuracy)))
    print("Precision", precision)
    print("Recall", recall)
    print("F1 Score: ", f1_score)

Model:  gpt-4o-2024-05-13
File:  ../data/evaluation/config2/all_dependencies_all_results.json
TP 115
FP 27
TN 286
FN 72
Sum correct response format:  500
Accuracy 0.802
Precision 0.8098591549295775
Recall 0.6149732620320856
F1 Score:  0.6990881458966566
