In [5]:
from collections import Counter
import pandas as pd
import json

def compute_metrics(df: pd.DataFrame, model_name: str) -> None:
    """Print a failure summary comparing LLM verdicts against ground-truth ratings.

    For every row, the ``responses`` value is parsed as JSON and its
    ``isDependency`` entry (evaluated by truthiness) is compared with the
    row's ``rating`` (matched case-insensitively against "true"/"false"
    after ``str()``). Rows whose response cannot be parsed, or that lack the
    ``isDependency`` key, are counted as format failures and skipped.

    Args:
        df: Frame providing the ``rating`` and ``responses`` columns. The
            ``final_failure_category`` column is read only for rows that end
            up as false positives or false negatives.
        model_name: Label interpolated into the printed summary lines.

    Returns:
        None. The summary (format failures, row count, FP + FN count and a
        ``Counter`` of failure categories) is written to stdout.

    Raises:
        AssertionError: If some row is neither a format failure nor one of
            TP/FP/TN/FN, i.e. its rating is not "true"/"false".
    """
    tp = fp = tn = fn = 0
    format_failure_count = 0
    categories = []

    for _, row in df.iterrows():
        rating = row["rating"]
        response = row["responses"]

        try:
            is_dependency = json.loads(response, strict=False)["isDependency"]
        except (ValueError, TypeError, KeyError):
            # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
            # TypeError: response is not str/bytes, or the payload is not a mapping.
            # KeyError: the payload has no "isDependency" entry.
            # Deliberately NOT a bare except, so Ctrl-C / SystemExit still propagate.
            format_failure_count += 1
            continue

        # NOTE(review): truthiness is used here, so a JSON *string* such as
        # "false" would count as a positive verdict -- confirm that responses
        # always carry real JSON booleans.
        predicted_dependency = bool(is_dependency)
        actual = str(rating).lower()

        if actual == "true":
            if predicted_dependency:
                # TP: the LLM accepts a dependency that is actually correct.
                tp += 1
            else:
                # FN: the LLM rejects a dependency that is actually correct.
                fn += 1
                categories.append(row["final_failure_category"])
        elif actual == "false":
            if predicted_dependency:
                # FP: the LLM accepts a dependency that is actually incorrect.
                fp += 1
                categories.append(row["final_failure_category"])
            else:
                # TN: the LLM rejects a dependency that is actually incorrect.
                tn += 1
        # Any other rating value is left uncounted and trips the check below.

    classified = tp + tn + fp + fn + format_failure_count
    assert classified == len(df), (
        f"{len(df) - classified} row(s) have a rating that is neither 'true' nor 'false'"
    )

    print(f"Num format failures: {format_failure_count}")
    print(f"Num Failures for RAG config2 {model_name}: {len(df)}")
    print(f"Num Failures for RAG config2 with specialized prompt and few-shot: {model_name}: {fn + fp}")
    print(Counter(categories))

In [9]:
# Model whose recorded failures are analysed.
# Alternative used previously: "gpt-3.5-turbo-0125"
model_name = "llama3:70b"

# Load that model's failure file and print the summary for it.
df = pd.read_csv(
    f"../data/analysis/failures_{model_name}.csv"
)
compute_metrics(model_name=model_name, df=df)

Num format failures: 0
Num Failures for RAG config2 llama3:70b: 152
Num Failures for RAG config2 with specialized prompt and few-shot: llama3:70b: 58
Counter({'Inheritance and Overrides': 29, 'Configuration Consistency': 24, 'Resource Sharing': 2, 'Inferring Dependencies': 2, 'Port Mapping': 1})
