**Qualitative Analysis of Validation Responses**

In [5]:
import json
import pandas as pd
import os


def get_df_stats(model_name: str):

    data_file = f"../data/evaluation/without/all_dependencies_without_{model_name}.json"
    baseline_file = "../data/evaluation/all_dependencies.csv"

    with open(data_file, "r", encoding="utf-8") as src:
        data = json.load(src)

    df = pd.read_csv(baseline_file)

    response_rating = []
    classification = []
    rationale = []
    plan = []
    uncertainty = []
    

    for entry, (index, row) in zip(data, df.iterrows()):

        assert entry["index"] == index
        
        if "responses" in entry:
            response = entry["responses"][0]
        else:
            response = entry["response"]

        try:
            if "responses" in entry:
                response = entry["responses"][0]
            else:
                response = entry["response"]
            response_dict = json.loads(response)
            isDependency = response_dict["isDependency"]
        except json.JSONDecodeError:
            pass
        except KeyError:
            return

        rating = row["final_rating"]
        
        if str(rating).lower() == "borderline":
            if isDependency:
                response_rating.append(str(isDependency))
                classification.append("TP")
                plan.append(response_dict["plan"])
                rationale.append(response_dict["rationale"])
                uncertainty.append(response_dict["uncertainty"])
            if not isDependency:
                response_rating.append(str(isDependency))
                classification.append("TN")
                plan.append(response_dict["plan"])
                rationale.append(response_dict["rationale"])
                uncertainty.append(response_dict["uncertainty"])

        # FP: The LLM validates a dependency as correct, but the dependency is actually incorrect
        if isDependency and str(rating).lower() == "false":
            response_rating.append(str(isDependency))
            classification.append("FP")
            plan.append(response_dict["plan"])
            rationale.append(response_dict["rationale"])
            uncertainty.append(response_dict["uncertainty"])


        # FN: The LLM validates a dependency as incorrect, but the dependency is actually correct
        if not isDependency and  str(rating).lower() == "true":
            response_rating.append(str(isDependency))
            classification.append("FN")
            plan.append(response_dict["plan"])
            rationale.append(response_dict["rationale"])
            uncertainty.append(response_dict["uncertainty"])

        # TP: The LLM validates a dependency as correct and the dependency is correct
        if isDependency and str(rating).lower() == "true":
            response_rating.append(str(isDependency))
            classification.append("TP")
            plan.append(response_dict["plan"])
            rationale.append(response_dict["rationale"])
            uncertainty.append(response_dict["uncertainty"])

        # TN: The LLM validates a dependency as incorrect and the dependency is incorrect
        if not isDependency and str(rating).lower() == "false":
            response_rating.append(str(isDependency))
            classification.append("TN")
            plan.append(response_dict["plan"])
            rationale.append(response_dict["rationale"])
            uncertainty.append(response_dict["uncertainty"])

    df[f"response_rating"] = response_rating
    df["classification"] = classification
    df["plan"] = plan
    df["rationale"] = rationale
    df["uncertainty"] = uncertainty

    return df


model_names = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13", "llama3:8b", "llama3:70b"]


for name in model_names:
    df = get_df_stats(model_name=name)
    df.to_csv(f"../data/evaluation/analysis/without/{name}.csv", index=False)


**Merge entry with corresponding dependencies**

In [3]:
import pandas as pd
import json

df = pd.read_csv('../data/evaluation/analysis/without/all.csv')

with open("../data/evaluation/config5/all_dependencies_all.json", "r", encoding="utf-8") as src:
    data = json.load(src)


merged_data = []

for entry in df.to_dict("records"):
    dependency = next(filter(lambda x: x["index"] == entry["index"], data))

    for key, value in dependency["dependency"].items():
        if key not in ["dependency_type", "dependency_category", "dependency_level"]:
            entry[key] = value

    merged_data.append(entry)


df_merged = pd.DataFrame(merged_data)


df_merged.to_csv('../data/evaluation/analysis/without/all.csv', index=False)

**Get all failures**

In [4]:
import pandas as pd

df = pd.read_csv("../data/evaluation/analysis/without/all.csv")

# List of model columns to check
model_columns = ['llama3:70b', 'gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:8b']

# Create a boolean mask where True indicates the presence of 'FP' or 'FN' in any of the specified model columns
mask = df[model_columns].apply(lambda x: x.isin(['FP', 'FN']), axis=1).any(axis=1)

# Filter the dataframe using the mask
filtered_df_all = df[mask]

filtered_df_all['FP_FN_count'] = filtered_df_all[model_columns].apply(lambda row: row.isin(['FP', 'FN']).sum(), axis=1)

#sorted_df_all =  filtered_df_all.sort_values(by='FP_FN_count', ascending=False)

# Save the filtered dataframe if needed
filtered_df_all.to_csv('../data/evaluation/analysis/without/failures.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_all['FP_FN_count'] = filtered_df_all[model_columns].apply(lambda row: row.isin(['FP', 'FN']).sum(), axis=1)


**Get all successess**

In [5]:
import pandas as pd

df = pd.read_csv("../data/evaluation/analysis/without/all.csv")

# List of model columns to check
model_columns = ['llama3:70b', 'gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:8b']

# Create a boolean mask where True indicates the presence of 'FP' or 'FN' in any of the specified model columns
mask = df[model_columns].apply(lambda x: x.isin(['TP', 'TN']), axis=1).any(axis=1)

# Filter the dataframe using the mask
filtered_df_all = df[mask]

filtered_df_all['TP_TN_count'] = filtered_df_all[model_columns].apply(lambda row: row.isin(['TP', 'TN']).sum(), axis=1)

#sorted_df_all =  filtered_df_all.sort_values(by='FP_FN_count', ascending=False)

# Save the filtered dataframe if needed
filtered_df_all.to_csv('../data/evaluation/analysis/without/successess.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_all['TP_TN_count'] = filtered_df_all[model_columns].apply(lambda row: row.isin(['TP', 'TN']).sum(), axis=1)


**Create failure and success counter**

In [6]:
import pandas as pd
from collections import Counter

df_fail = pd.read_csv('../data/evaluation/analysis/without/failures.csv')
df_success = pd.read_csv('../data/evaluation/analysis/without/successess.csv')

failure_count = df_fail["FP_FN_count"].to_list()
success_count = df_success["TP_TN_count"].to_list()

failure_counter = Counter(failure_count)
success_counter = Counter(success_count)

print("Failure counter: ", failure_counter)
print("Success coutner: ", success_counter)

Failure counter:  Counter({1: 153, 2: 101, 3: 36, 4: 3})
Success coutner:  Counter({4: 161, 3: 153, 2: 101, 1: 36})


**Get results per model**

In [10]:
import pandas as pd
import json

model_columns = ['llama3:70b', 'gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:8b']



for model in model_columns:
    
    df = pd.read_csv('../data/evaluation/analysis/without/all.csv')
    columns_to_remove = [x for x in model_columns if x != model]
    df = df.drop(columns_to_remove, axis=1)
    #df.to_csv(f"../data/evaluation/analysis/without/{model}.csv", index=False)

    data_file = f"../data/evaluation/without/all_dependencies_without_{model_name}.json"
    with open(data_file, "r", encoding="utf-8") as src:
        data = json.load(src)

    columns_to_remove = [x for x in model_columns if x != model]
    print(columns_to_remove)

    merged_data = []

    for entry in df.to_dict("records"):
        data_entry = response = next(filter(lambda x: x["index"] == entry["index"], data))
        response = data_entry["response"]
        response_dict = json.loads(response)

        plan = response_dict["plan"]
        rationale = response_dict["rationale"]
        uncertainty = response_dict["uncertainty"]

        entry["plan"] = response_dict["plan"]
        entry["rationale"] = response_dict["rationale"]
        entry["uncertainty"] = response_dict["uncertainty"]

        merged_data.append(entry)

    df_merged = pd.DataFrame(merged_data)
    df_merged.to_csv(f"../data/evaluation/analysis/without/{model}.csv", index=False)


['gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:8b']


JSONDecodeError: Invalid control character at: line 2 column 236 (char 237)