In [13]:
from collections import Counter
import pandas as pd
import json

def compute_metrics(df: pd.DataFrame, model_name: str) -> None:
    """Print a confusion-matrix based failure summary for one model.

    For every row, the ``responses`` cell is parsed as JSON and its
    ``isDependency`` value (the LLM verdict) is compared with the row's
    ``rating`` (compared case-insensitively against "true" / "false").
    Rows where verdict and rating disagree (false positives and false
    negatives) are counted as failures and their ``final_failure_category``
    is tallied.

    Three lines are printed: the number of rows in ``df``, the number of
    disagreeing rows, and a ``Counter`` of their failure categories.

    Args:
        df: Frame with ``rating`` and ``responses`` columns.
            ``final_failure_category`` is read only for rows where verdict
            and rating disagree.
        model_name: Label interpolated into the printed summary lines.

    Raises:
        AssertionError: If some row's rating is neither "true" nor "false".
        json.JSONDecodeError: If a ``responses`` cell is not valid JSON.
        KeyError: If a parsed response has no ``isDependency`` key.
    """
    tp = fp = fn = tn = 0
    categories = []
    unclassified = []  # index labels of rows whose rating is not true/false

    for idx, row in df.iterrows():
        rating = str(row["rating"]).strip().lower()

        # strict=False lets json accept raw control characters inside strings.
        response_dict = json.loads(row["responses"], strict=False)
        is_dependency = response_dict["isDependency"]

        # A verdict emitted as the *string* "false" is truthy in Python and
        # would be miscounted as a positive verdict, so normalise it first.
        if isinstance(is_dependency, str):
            is_dependency = is_dependency.strip().lower() not in ("", "false")

        if rating == "true":
            if is_dependency:
                # TP: the LLM accepts a dependency that is actually correct
                tp += 1
            else:
                # FN: the LLM rejects a dependency that is actually correct
                fn += 1
                categories.append(row["final_failure_category"])
        elif rating == "false":
            if is_dependency:
                # FP: the LLM accepts a dependency that is actually incorrect
                fp += 1
                categories.append(row["final_failure_category"])
            else:
                # TN: the LLM rejects a dependency that is actually incorrect
                tn += 1
        else:
            unclassified.append(idx)

    # Every row must land in exactly one confusion-matrix cell.
    assert tp + tn + fp + fn == len(df), (
        f"rows with a rating other than true/false: {unclassified}"
    )

    print(f"Num Failures for RAG config2 {model_name}: {len(df)}")
    print(f"Num Failures for RAG config2 with specialized prompt and few-shot: {model_name}: {fn + fp}")
    print(Counter(categories))

In [16]:
# Model whose recorded failures are analysed below.
# Alternative run: "gpt-4o-2024-05-13"
model_name = "gpt-3.5-turbo-0125"

# Load that model's failure cases and print the summary.
df = pd.read_csv(
    f"../data/analysis/failures_{model_name}.csv"
)
compute_metrics(
    df=df,
    model_name=model_name,
)

Num Failures for RAG config2 gpt-3.5-turbo-0125: 185
Num Failures for RAG config2 with specialized prompt and few-shot: gpt-3.5-turbo-0125: 82
Counter({'Configuration Consistency': 30, 'Inheritance and Overrides': 28, 'Inferring Dependencies': 6, 'Resource Sharing': 6, 'Exclusive Dependencies': 6, 'Port Mapping': 5, 'Ambiguous Option Values': 1})
