**Compute Validation Effectiveness for LLM with Retrieval-Augmented Generation**

In [8]:
import pandas as pd
import json

# Inputs for this cell: the baseline ratings and the raw answers of one model.
baseline = "../data/evaluation/all_dependencies.csv"
# Selectable models: gpt-3.5-turbo-0125 gpt-4o-2024-05-13 llama3:8b llama3:70b
model_name = "llama3:8b"
results_file = f"../data/evaluation/config13/all_dependencies_all_{model_name}.json"

# Baseline ratings, read later row by row via the "final_rating" column.
df_baseline = pd.read_csv(filepath_or_buffer=baseline)

# Model answers: a JSON document that the evaluation loop iterates entry by entry.
with open(results_file, mode="r", encoding="utf-8") as src:
    data = json.loads(src.read())


# Confusion-matrix bookkeeping: every evaluated entry appends a 1 to exactly one
# of the four outcome lists and a 1 (correct) or 0 (wrong) to `accuracy`.
true_positives = []
true_negatives = []
false_positives = []
false_negatives = []
accuracy = []

# zip() stops at the shorter input, so a length mismatch would silently drop
# entries or pair answers with the wrong rating; make that visible.
if len(data) != len(df_baseline):
    print(f"Warning: {len(data)} result entries vs. {len(df_baseline)} baseline rows; "
          "only the common prefix is evaluated.")

for entry, (index, row) in zip(data, df_baseline.iterrows()):

    rating = str(row["final_rating"]).lower()
    try:
        if "responses" in entry:
            response = entry["responses"][0]
        else:
            response = entry["response"]
        response_dict = json.loads(response)
        isDependency = response_dict["isDependency"]
    except KeyError as error:
        # The answer is valid JSON but lacks the expected key: skip the entry.
        print(error)
        continue
    except (json.JSONDecodeError, TypeError, IndexError) as error:
        # The answer could not be parsed (or has an unexpected shape). There is
        # no verdict to score, so skip it instead of reusing `response_dict`
        # from a previous iteration.
        print(f"Skipping entry {index}: unusable response ({error})")
        continue

    # JSON strings such as "false" are truthy in Python; map the two literal
    # spellings to real booleans so they are not miscounted as positives.
    if isinstance(isDependency, str) and isDependency.strip().lower() in ("true", "false"):
        isDependency = isDependency.strip().lower() == "true"
    isDependency = bool(isDependency)

    if rating == "borderline":
        # Borderline dependencies accept either verdict as correct.
        accuracy.append(1)
        if isDependency:
            true_positives.append(1)
        else:
            true_negatives.append(1)
    elif rating == "true":
        if isDependency:
            # TP: The LLM validates a dependency as correct and the dependency is correct
            accuracy.append(1)
            true_positives.append(1)
        else:
            # FN: The LLM validates a dependency as incorrect, but the dependency is actually correct
            accuracy.append(0)
            false_negatives.append(1)
    elif rating == "false":
        if isDependency:
            # FP: The LLM validates a dependency as correct, but the dependency is actually incorrect
            accuracy.append(0)
            false_positives.append(1)
        else:
            # TN: The LLM validates a dependency as incorrect and the dependency is incorrect
            accuracy.append(1)
            true_negatives.append(1)
    # Any other rating value is ignored, as before.


tp = sum(true_positives)
fp = sum(false_positives)
tn = sum(true_negatives)
fn = sum(false_negatives)

print("Model: ", model_name)
print("File: ", results_file)
print("TP", tp)
print("FP", fp)
print("TN", tn)
print("FN", fn)
print("Sum correct response format: ", sum([tp, fp, tn, fn]))

# Each ratio falls back to 0.0 when its denominator is empty (e.g. the model
# never answered "true"), instead of raising ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

print("Accuracy", sum(accuracy) / len(accuracy) if accuracy else 0.0)
print("Precision", precision)
print("Recall", recall)
print("F1 Score: ", f1_score)

'isDependency'
'isDependency'
Model:  llama3:8b
File:  ../data/evaluation/config13/all_dependencies_all_llama3:8b.json
TP 172
FP 231
TN 72
FN 23
Sum correct response format:  498
Accuracy 0.4899598393574297
Precision 0.4267990074441687
Recall 0.882051282051282
F1 Score:  0.5752508361204013


In [6]:
from typing import List
import pandas as pd
import json
import os

def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _extract_is_dependency(entry) -> bool:
    """Read the model's verdict from one result entry.

    The raw answer is taken from ``entry["responses"][0]`` when a "responses"
    key exists, otherwise from ``entry["response"]``; it is parsed as JSON and
    its "isDependency" value is returned as a boolean.

    Raises:
        KeyError: "response" or "isDependency" is missing.
        ValueError: the answer is not valid JSON (``json.JSONDecodeError``).
        TypeError, IndexError: the answer has an unexpected shape (not a
            string, not a JSON object, empty "responses" list).
    """
    raw = entry["responses"][0] if "responses" in entry else entry["response"]
    verdict = json.loads(raw)["isDependency"]
    # JSON strings such as "false" are truthy in Python; map the two literal
    # spellings to real booleans so they are not miscounted as positives.
    if isinstance(verdict, str) and verdict.strip().lower() in ("true", "false"):
        return verdict.strip().lower() == "true"
    return bool(verdict)


def compute_validation_effectiveness(baseline_file: str, model_names: List[str], result_dir: str, index_name: str) -> None:
    """Score each model's dependency verdicts against the baseline ratings.

    For every model the file
    ``{result_dir}/all_dependencies_{index_name}_{model_name}.json`` is loaded
    and its entries are paired positionally with the rows of ``baseline_file``.
    Each pair is classified as TP/FP/TN/FN from the "final_rating" column
    ("true", "false", compared case-insensitively) and the model's
    "isDependency" answer; a "borderline" rating accepts either answer as
    correct. Entries whose answer cannot be parsed or lacks "isDependency" are
    reported and skipped. Rows with any other rating are ignored.

    Args:
        baseline_file: CSV file with a "final_rating" column.
        model_names: Model identifiers used to build the result file names.
        result_dir: Directory holding the result files; the summary is written
            there as well.
        index_name: Index identifier used in the result file names.

    Side effects:
        Prints the counts and metrics per model and writes
        ``{result_dir}/validation_effectiveness.csv`` with the columns
        model_name, file, accuracy, precision, recall, f1-score.
    """
    df_baseline = pd.read_csv(baseline_file)
    # Read the ratings once; they are the same for every model.
    ratings = [(index, str(value).lower()) for index, value in df_baseline["final_rating"].items()]

    results = []

    for model_name in model_names:

        results_file = f"{result_dir}/all_dependencies_{index_name}_{model_name}.json"

        with open(results_file, "r", encoding="utf-8") as src:
            data = json.load(src)

        # zip() stops at the shorter input, so a length mismatch would silently
        # drop entries or pair answers with the wrong rating; make it visible.
        if len(data) != len(ratings):
            print(f"Warning: {len(data)} result entries vs. {len(ratings)} baseline rows; "
                  "only the common prefix is evaluated.")

        tp = fp = tn = fn = 0

        for entry, (index, rating) in zip(data, ratings):
            try:
                is_dependency = _extract_is_dependency(entry)
            except KeyError as error:
                # Valid JSON without the expected key: skip the entry.
                print(error)
                continue
            except (ValueError, TypeError, IndexError) as error:
                # Unparseable answer: there is no verdict to score, so skip it
                # rather than reusing the verdict of a previous entry.
                print(f"Skipping entry {index}: unusable response ({error})")
                continue

            if rating == "borderline":
                # Borderline dependencies accept either verdict as correct.
                if is_dependency:
                    tp += 1
                else:
                    tn += 1
            elif rating == "true":
                if is_dependency:
                    tp += 1  # validated as correct, and it is correct
                else:
                    fn += 1  # validated as incorrect, but it is correct
            elif rating == "false":
                if is_dependency:
                    fp += 1  # validated as correct, but it is incorrect
                else:
                    tn += 1  # validated as incorrect, and it is incorrect

        evaluated = tp + fp + tn + fn
        accuracy = _safe_ratio(tp + tn, evaluated)

        print("Model: ", model_name)
        print("File: ", results_file)
        print("TP", tp)
        print("FP", fp)
        print("TN", tn)
        print("FN", fn)
        print("Sum correct response format: ", evaluated)

        # Ratios fall back to 0.0 when their denominator is empty (e.g. a model
        # that never answers "true") instead of raising ZeroDivisionError.
        precision = _safe_ratio(tp, tp + fp)
        recall = _safe_ratio(tp, tp + fn)
        f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

        print("Accuracy", accuracy)
        print("Precision", precision)
        print("Recall", recall)
        print("F1 Score: ", f1_score)

        results.append({
            "model_name": model_name,
            "file": results_file,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1-score": f1_score
        })

    df_results = pd.DataFrame(results)
    df_results.to_csv(f"{result_dir}/validation_effectiveness.csv", index=False)


# Evaluate the four models of the "config2" run against the shared baseline.
compute_validation_effectiveness(
    index_name="all",
    result_dir="../data/evaluation/config2",
    model_names=[
        "gpt-3.5-turbo-0125",
        "gpt-4o-2024-05-13",
        "llama3:70b",
        "llama3:8b",
    ],
    baseline_file="../data/evaluation/all_dependencies.csv",
)

Model:  gpt-3.5-turbo-0125
File:  ../data/evaluation/config2/all_dependencies_all_gpt-3.5-turbo-0125.json
TP 169
FP 90
TN 186
FN 55
Sum correct response format:  500
Accuracy 0.71
Precision 0.6525096525096525
Recall 0.7544642857142857
F1 Score:  0.6997929606625258
Model:  gpt-4o-2024-05-13
File:  ../data/evaluation/config2/all_dependencies_all_gpt-4o-2024-05-13.json
TP 126
FP 16
TN 275
FN 83
Sum correct response format:  500
Accuracy 0.802
Precision 0.8873239436619719
Recall 0.6028708133971292
F1 Score:  0.7179487179487181
Model:  llama3:70b
File:  ../data/evaluation/config2/all_dependencies_all_llama3:70b.json
TP 184
FP 72
TN 202
FN 42
Sum correct response format:  500
Accuracy 0.772
Precision 0.71875
Recall 0.8141592920353983
F1 Score:  0.7634854771784233
'isDependency'
'isDependency'
'isDependency'
'isDependency'
'isDependency'
'isDependency'
'isDependency'
'isDependency'
'isDependency'
Model:  llama3:8b
File:  ../data/evaluation/config2/all_dependencies_all_llama3:8b.json
TP 215
FP

In [9]:
import pandas as pd

# Load the metrics table stored for config11 and render it as the cell output.
file_path = "../data/evaluation/config11/validation_effectiveness.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

df

Unnamed: 0,model_name,file,accuracy,precision,recall,f1-score
0,gpt-3.5-turbo-0125,../data/evaluation/config11/all_dependencies_a...,0.632,0.52069,0.770408,0.621399
1,gpt-4o-2024-05-13,../data/evaluation/config11/all_dependencies_a...,0.812,0.828571,0.623656,0.711656
2,llama3:70b,../data/evaluation/config11/all_dependencies_a...,0.686,0.574713,0.765306,0.656455
3,llama3:8b,../data/evaluation/config11/all_dependencies_a...,0.481855,0.429257,0.90404,0.582114
