In [29]:
from typing import List, Dict
import tiktoken

def get_context_length(entry: Dict, encoding_name: str = "cl100k_base") -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(entry["context_str"]))
    return num_tokens


def get_source_types(entry: Dict) -> List[str]:
    context = entry["context"]
    source_types = [x["index"] for x in context]
    return source_types


import json
import pandas as pd
import os


def get_df_stats(model_name: str, config: str):

    data_file = f"../data/evaluation/{config}/all_dependencies_all_{model_name}.json"
    baseline_file = "../data/evaluation/all_dependencies.csv"

    with open(data_file, "r", encoding="utf-8") as src:
        data = json.load(src)

    df = pd.read_csv(baseline_file)

    stats = []

    for entry, (index, row) in zip(data, df.iterrows()):

        context_sources = get_source_types(entry)
        context_length = get_context_length(entry)

        if "responses" in entry:
            response = entry["responses"][0]
        else:
            response = entry["response"]

        try:
            if "responses" in entry:
                response = entry["responses"][0]
            else:
                response = entry["response"]
            response_dict = json.loads(response)
            isDependency = response_dict["isDependency"]
        except json.JSONDecodeError as error:
            pass
        except KeyError:
            stats.append({
                "index": index,
                "rating": rating,
                "response_rating": "None",
                model_name: "FP",
                "context_length": context_length,
                "context_sources": context_sources,
                "context_relevance": "none"
            })
            continue

        rating = row["final_rating"]
                
        # FP: The LLM validates a dependency as correct, but the dependency is actually incorrect
        if isDependency and str(rating).lower() == "false":
            stats.append({
                "index": index,
                "rating": rating,
                "response_rating": str(isDependency),
                model_name: "FP",
                "context_length": context_length,
                "context_sources": context_sources,
                "context_relevance": "none"
            })

        # FN: The LLM validates a dependency as incorrect, but the dependency is actually correct
        if not isDependency and  str(rating).lower() == "true":
            stats.append({
                "index": index,
                "rating": rating,
                "response_rating": str(isDependency),
                model_name: "FN",
                "context_length": context_length,
                "context_sources": context_sources,
                "context_relevance": "none"
            })


        # TP: The LLM validates a dependency as correct and the dependency is correct
        if isDependency and str(rating).lower() == "true":
            stats.append({
                "index": index,
                "rating": rating,
                "response_rating": str(isDependency),
                model_name: "TP",
                "context_length": context_length,
                "context_sources": context_sources,
                "context_relevance": "none"
            })


        # TN: The LLM validates a dependency as incorrect and the dependency is incorrect
        if not isDependency and str(rating).lower() == "false":
            stats.append({
                "index": index,
                "rating": rating,
                "response_rating": str(isDependency),
                model_name: "TN",
                "context_length": context_length,
                "context_sources": context_sources,
                "context_relevance": "none"
            })

    df_stats = pd.DataFrame(stats)   
    return df_stats


model_names = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13", "llama3:8b", "llama3:70b"] #"gpt-3.5-turbo-0125", "gpt-4o-2024-05-13", "llama3:8b", "llama3:70b"
config = "config5"
dfs = []

for name in model_names:
    dfs.append(get_df_stats(model_name=name, config=config))

df_base = dfs[0]
df_base = df_base.drop("response_rating", axis=1)

for model_name, df in zip(model_names, dfs):
    df_base[model_name] = df[model_name]


new_order = ["index", "rating", "gpt-3.5-turbo-0125", "gpt-4o-2024-05-13", "llama3:8b", "llama3:70b", "context_length", "context_relevance"]
df_reordered = df_base[new_order]
df_reordered.to_csv(f"../data/evaluation/analysis/context/all_{config}.csv", index=False)

In [31]:
import pandas as pd

df = pd.read_csv(f"../data/evaluation/analysis/context/all_{config}.csv")

# List of model columns to check
model_columns = ['llama3:70b', 'gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:8b']

# Create a boolean mask where True indicates the presence of 'FP' or 'FN' in any of the specified model columns
#mask = df[model_columns].apply(lambda x: x.isin(['FP', 'FN']), axis=1).any(axis=1)
mask = df[model_columns].apply(lambda x: x.isin(['TP', 'TN']), axis=1).any(axis=1)

# Filter the dataframe using the mask
filtered_df_all = df[mask]

#filtered_df_all['FP_FN_count'] = filtered_df_all[model_columns].apply(lambda row: row.isin(['FP', 'FN']).sum(), axis=1)
filtered_df_all['TP_TN_count'] = filtered_df_all[model_columns].apply(lambda row: row.isin(['TP', 'TN']).sum(), axis=1)

#sorted_df_all =  filtered_df_all.sort_values(by='FP_FN_count', ascending=False)

# Save the filtered dataframe if needed
#filtered_df_all.to_csv(f'../data/evaluation/analysis/context/failures_all_{config}.csv', index=False)
filtered_df_all.to_csv(f'../data/evaluation/analysis/context/successess_all_{config}.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_all['TP_TN_count'] = filtered_df_all[model_columns].apply(lambda row: row.isin(['TP', 'TN']).sum(), axis=1)
