**Qualitative Analysis of Validation Responses**

In [6]:
from typing import List, Dict
import tiktoken

def get_context_length(entry: Dict, encoding_name: str = "cl100k_base") -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(entry["context_str"]))
    return num_tokens


def get_source_types(entry: Dict) -> List[str]:
    context = entry["context"]
    source_types = [x["index"] for x in context]
    return source_types

In [14]:
import json
import pandas as pd
import os


def get_df_stats(model_name: str):

    data_file = f"../data/evaluation/without/all_dependencies_without_{model_name}.json"
    baseline_file = "../data/evaluation/all_dependencies.csv"
    output_dir = "../data/evaluation/analysis/without"

    with open(data_file, "r", encoding="utf-8") as src:
        data = json.load(src)

    df = pd.read_csv(baseline_file)

    stats = []

    for entry, (index, row) in zip(data, df.iterrows()):
        
        if "responses" in entry:
            response = entry["responses"][0]
        else:
            response = entry["response"]

        try:
            if "responses" in entry:
                response = entry["responses"][0]
            else:
                response = entry["response"]
            response_dict = json.loads(response)
            isDependency = response_dict["isDependency"]
        except json.JSONDecodeError as error:
            pass
        except KeyError:
            return

        rating = row["final_rating"]
        
        # checl borderline cases
        if str(rating) == "Borderline":
            continue
            #errors.append({
            #    "index": index,
            #    "model_name": model_name,
            #    "rating": "borderline",
            #    "response_rating": str(isDependency),
            #    "term": "None"
            #})

        
        # FP: The LLM validates a dependency as correct, but the dependency is actually incorrect
        if isDependency and str(rating).lower() == "false":
            stats.append({
                "index": index,
                "rating": rating,
                "response_rating": str(isDependency),
                model_name: "FP",
            })

        # FN: The LLM validates a dependency as incorrect, but the dependency is actually correct
        if not isDependency and  str(rating).lower() == "true":
            stats.append({
                "index": index,
                "rating": rating,
                "response_rating": str(isDependency),
                model_name: "FN",
            })


        # TP: The LLM validates a dependency as correct and the dependency is correct
        if isDependency and str(rating).lower() == "true":
            stats.append({
                "index": index,
                "rating": rating,
                "response_rating": str(isDependency),
                model_name: "TP",
            })


        # TN: The LLM validates a dependency as incorrect and the dependency is incorrect
        if not isDependency and str(rating).lower() == "false":
            stats.append({
                "index": index,
                "rating": rating,
                "response_rating": str(isDependency),
                model_name: "TN",
            })


    df_stats = pd.DataFrame(stats)   
    return df_stats


model_names = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13", "llama3:8b", "llama3:70b"]

dfs = []

for name in model_names:
    dfs.append(get_df_stats(model_name=name))
    

df_base = dfs[0]
df_base = df_base.drop("response_rating", axis=1)

for model_name, df in zip(model_names, dfs):
    df_base[model_name] = df[model_name]


df_base.to_csv("../data/evaluation/analysis/without/all.csv", index=False)

**Merge entry with corresponding dependencies**

In [15]:
import pandas as pd
import json

df = pd.read_csv('../data/evaluation/analysis/without/all.csv')

with open("../data/evaluation/config5/all_dependencies_all.json", "r", encoding="utf-8") as src:
    data = json.load(src)


merged_data = []

for entry in df.to_dict("records"):
    dependency = next(filter(lambda x: x["index"] == entry["index"], data))

    for key, value in dependency["dependency"].items():
        if key not in ["dependency_type", "dependency_category", "dependency_level"]:
            entry[key] = value

    merged_data.append(entry)


df_merged = pd.DataFrame(merged_data)


df_merged.to_csv('../data/evaluation/analysis/without/all.csv', index=False)

**Get all failures**

In [45]:
import pandas as pd

df = pd.read_csv("../data/evaluation/analysis/without/all.csv")

# List of model columns to check
model_columns = ['llama3:70b', 'gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:8b']

# Create a boolean mask where True indicates the presence of 'FP' or 'FN' in any of the specified model columns
mask = df[model_columns].apply(lambda x: x.isin(['FP', 'FN']), axis=1).any(axis=1)

# Filter the dataframe using the mask
filtered_df_all = df[mask]

filtered_df_all['FP_FN_count'] = filtered_df_all[model_columns].apply(lambda row: row.isin(['FP', 'FN']).sum(), axis=1)

#sorted_df_all =  filtered_df_all.sort_values(by='FP_FN_count', ascending=False)

# Save the filtered dataframe if needed
filtered_df_all.to_csv('../data/evaluation/analysis/without/failures.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_all['FP_FN_count'] = filtered_df_all[model_columns].apply(lambda row: row.isin(['FP', 'FN']).sum(), axis=1)


**Get all successess**

In [46]:
import pandas as pd

df = pd.read_csv("../data/evaluation/analysis/without/all.csv")

# List of model columns to check
model_columns = ['llama3:70b', 'gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:8b']

# Create a boolean mask where True indicates the presence of 'FP' or 'FN' in any of the specified model columns
mask = df[model_columns].apply(lambda x: x.isin(['TP', 'TN']), axis=1).any(axis=1)

# Filter the dataframe using the mask
filtered_df_all = df[mask]

filtered_df_all['TP_TN_count'] = filtered_df_all[model_columns].apply(lambda row: row.isin(['TP', 'TN']).sum(), axis=1)

#sorted_df_all =  filtered_df_all.sort_values(by='FP_FN_count', ascending=False)

# Save the filtered dataframe if needed
filtered_df_all.to_csv('../data/evaluation/analysis/without/successess.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_all['TP_TN_count'] = filtered_df_all[model_columns].apply(lambda row: row.isin(['TP', 'TN']).sum(), axis=1)


**Create failure and success counter**

In [47]:
import pandas as pd
from collections import Counter

df_fail = pd.read_csv('../data/evaluation/analysis/without/failures.csv')
df_success = pd.read_csv('../data/evaluation/analysis/without/successess.csv')

failure_count = df_fail["FP_FN_count"].to_list()
success_count = df_success["TP_TN_count"].to_list()

failure_counter = Counter(failure_count)
success_counter = Counter(success_count)

print("Failure counter: ", failure_counter)
print("Success coutner: ", success_counter)

Failure counter:  Counter({1: 159, 2: 109, 3: 47, 4: 7})
Success coutner:  Counter({4: 160, 3: 159, 2: 109, 1: 47})


**Get failures per model**

In [16]:
import pandas as pd

df_fail = pd.read_csv('../data/evaluation/analysis/without/failures.csv')
model_columns = ['llama3:70b', 'gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:8b']

for model in model_columns:
    
    mask = df_fail[[model]].apply(lambda x: x.isin(['FN', 'FP']), axis=1).any(axis=1)
    df_masked = df_fail[mask]

    columns_to_remove = [x for x in model_columns if x != model]
    
    df_masked = df_masked.drop(columns_to_remove, axis=1)
  
    df_masked.to_csv(f"../data/evaluation/analysis/without/failures_{model}.csv", index=False)