**Qualitative Analysis of Validation Responses**

In [6]:
from typing import List, Dict
import tiktoken

def get_context_length(entry: Dict, encoding_name: str = "cl100k_base") -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(entry["context_str"]))
    return num_tokens


def get_source_types(entry: Dict) -> List[str]:
    context = entry["context"]
    source_types = [x["index"] for x in context]
    return source_types

In [70]:
import json
import pandas as pd
import os


def get_stats(model_name: str):

    data_file = f"../data/evaluation/without/all_dependencies_without_{model_name}.json"
    baseline_file = "../data/evaluation/all_dependencies.csv"
    output_dir = "../data/evaluation/analysis"
    output_file = os.path.join(output_dir, "without_" + model_name + "_stats.csv")

    with open(data_file, "r", encoding="utf-8") as src:
        data = json.load(src)

    df = pd.read_csv(baseline_file)

    stats = []

    for entry, (index, row) in zip(data, df.iterrows()):
        
        if "responses" in entry:
            response = entry["responses"][0]
        else:
            response = entry["response"]

        try:
            if "responses" in entry:
                response = entry["responses"][0]
            else:
                response = entry["response"]
            response_dict = json.loads(response)
            isDependency = response_dict["isDependency"]
        except json.JSONDecodeError as error:
            print(error)
        except KeyError:
            return

        rating = row["final_rating"]
        
        # checl borderline cases
        if str(rating) == "Borderline":
            continue
            #errors.append({
            #    "index": index,
            #    "model_name": model_name,
            #    "rating": "borderline",
            #    "response_rating": str(isDependency),
            #    "term": "None"
            #})

        
        # FP: The LLM validates a dependency as correct, but the dependency is actually incorrect
        if isDependency and str(rating).lower() == "false":
            stats.append({
                "index": index,
                "rating": rating,
                "response_rating": str(isDependency),
                model_name: "FP",
            })

        # FN: The LLM validates a dependency as incorrect, but the dependency is actually correct
        if not isDependency and  str(rating).lower() == "true":
            stats.append({
                "index": index,
                "rating": rating,
                "response_rating": str(isDependency),
                model_name: "FN",
            })


        # TP: The LLM validates a dependency as correct and the dependency is correct
        if isDependency and str(rating).lower() == "true":
            stats.append({
                "index": index,
                "rating": rating,
                "response_rating": str(isDependency),
                model_name: "TP",
            })


        # TN: The LLM validates a dependency as incorrect and the dependency is incorrect
        if not isDependency and str(rating).lower() == "false":
            stats.append({
                "index": index,
                "rating": rating,
                "response_rating": str(isDependency),
                model_name: "TN",
            })


    df_stats = pd.DataFrame(stats)   
    df_stats.to_csv(output_file, index=False)


model_names = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13", "llama3:8b", "llama3:70b"]
for name in model_names:
    get_stats(model_name=name)
    

Invalid control character at: line 2 column 236 (char 237)
Invalid control character at: line 2 column 333 (char 334)
Invalid control character at: line 2 column 190 (char 191)
Invalid control character at: line 2 column 241 (char 242)
Invalid control character at: line 2 column 140 (char 141)


In [59]:
import pandas as pd

df_gpt4o = pd.read_csv("../data/evaluation/analysis/without_gpt-4o-2024-05-13_stats.csv")
df_gpt35_turbo = pd.read_csv("../data/evaluation/analysis/without_gpt-3.5-turbo-0125_stats.csv")
df_llama70b = pd.read_csv("../data/evaluation/analysis/without_llama3:70b_stats.csv")
#df_llama8b = pd.read_csv("../data/evaluation/analysis/without_llama3:8b_stats.csv")


df_all = df_gpt4o

df_all["gpt-3.5-turbo-0125"] = df_gpt35_turbo["gpt-3.5-turbo-0125"]
df_all["llama3:70b"] = df_llama70b["llama3:70b"]
#df_baseline["llama3:8b"] = df_llama8b["llama3:8b""]

df_all.to_csv("../data/evaluation/analysis/all.csv", index=False)

In [66]:
import pandas as pd

df = pd.read_csv("../data/evaluation/analysis/all.csv")

# List of model columns to check
model_columns = ['llama3:70b', 'gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125']

# Create a boolean mask where True indicates the presence of 'FP' or 'FN' in any of the specified model columns
mask = df[model_columns].apply(lambda x: x.isin(['FP', 'FN']), axis=1).any(axis=1)

# Filter the dataframe using the mask
filtered_df_all = df[mask]

filtered_df_all['FP_FN_count'] = filtered_df_all[model_columns].apply(lambda row: row.isin(['FP', 'FN']).sum(), axis=1)

sorted_df_all =  filtered_df_all.sort_values(by='FP_FN_count', ascending=False)

# Save the filtered dataframe if needed
sorted_df_all.to_csv('../data/evaluation/analysis/failures.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_all['FP_FN_count'] = filtered_df_all[model_columns].apply(lambda row: row.isin(['FP', 'FN']).sum(), axis=1)


In [67]:
import pandas as pd
from collections import Counter

df = pd.read_csv('../data/evaluation/analysis/failures.csv')

count = df["FP_FN_count"].to_list()

counter = Counter(count)

counter

Counter({1: 160, 2: 77, 3: 17})

In [69]:
import pandas as pd
import json

df_failures = pd.read_csv('../data/evaluation/analysis/failures.csv')
indices = df["index"].to_list()

with open("../data/evaluation/config1/all_dependencies_all.json", "r", encoding="utf-8") as src:
    data = json.load(src)


dependencies = []

for entry in data:
    if entry["index"] in indices:
        dependencies.append(
            {
                "index": entry["index"],
                "dependency": entry["dependency"]
            }
        )

with open('../data/evaluation/analysis/failures.json', "w", encoding="utf-8") as dest:
    json.dump(dependencies, dest, indent=2)
