In [13]:
import ast
import os

import altair as alt
import pandas as pd
# Do not truncate long cell contents when a DataFrame is displayed.
pd.set_option("display.max_colwidth", None)  

# Data collection benchmark evaluation

## Load data

In [14]:
# Set to a file name inside RESULTS_DIR to pin one specific results file;
# leave as None to load the most recent "collector_test_results_*.csv".
load_specific_file = None

# Directory (relative to this notebook) that holds the benchmark result CSVs.
RESULTS_DIR = "../.."

if load_specific_file:
    df = pd.read_csv(os.path.join(RESULTS_DIR, load_specific_file))
    print(f"LOADING SPECIFIC: {load_specific_file}")
else:
    csv_files = [
        f for f in os.listdir(RESULTS_DIR)
        if f.startswith("collector_test_results_") and f.endswith(".csv")
    ]
    if not csv_files:
        # Fail with an actionable message instead of an IndexError on csv_files[-1].
        raise FileNotFoundError(
            f"No collector_test_results_*.csv file found in {os.path.abspath(RESULTS_DIR)!r}; "
            "run the benchmark first or set load_specific_file."
        )
    # NOTE(review): getctime is the creation time on Windows but the last
    # metadata-change time on Unix, so a copied/renamed file can count as newest.
    # NOTE(review): the "... BASELINE.csv" file read in the "Old baseline" section
    # also matches this pattern - confirm it can never be the newest file.
    csv_files.sort(key=lambda x: os.path.getctime(os.path.join(RESULTS_DIR, x)))
    latest_file = csv_files[-1]
    df = pd.read_csv(os.path.join(RESULTS_DIR, latest_file))
    print(f"LOADING LATEST: {latest_file}")

# Sanity check: which (model, agent_type) combinations are present, and how many rows.
print(df[["model", "agent_type"]].drop_duplicates())
print(f"Rows: {len(df)}")

LOADING LATEST: collector_test_results_2025-02-04_12-17-12.csv
         model         agent_type
0  gpt-4o-mini  structured_output
1       gpt-4o  structured_output
3       gpt-4o      function_call
4  gpt-4o-mini      function_call
Rows: 1560


## Summary metrics

GPT-4o mini with structured output excels on the "perfect" metric, which counts the test cases where _all_ fields are answered correctly, including the ones where we expect an empty answer.

In [15]:
# Mean of the "perfect" column per agent type and model, best first.
perfect_rate = df.groupby(["agent_type", "model"])["perfect"].mean()
perfect_rate.sort_values(ascending=False)

agent_type         model      
structured_output  gpt-4o-mini    0.841026
                   gpt-4o         0.658974
function_call      gpt-4o         0.517949
                   gpt-4o-mini    0.474359
Name: perfect, dtype: float64

In [16]:
# Outcome columns that are summed per agent type / model below.
metrics = [
    "perfect",
    "num_correct_non_empty",
    "num_correct_empty",
    "num_wrong_not_filled",
    "num_wrong_expected_empty",
    "num_wrong_different_value",
]

print("Performance by model")

# Totals per (agent_type, model), ordered by the number of perfect test cases.
totals_by_agent = df.groupby(["agent_type", "model"])[metrics].sum()
totals_by_agent.sort_values("perfect", ascending=False)

Performance by model


Unnamed: 0_level_0,Unnamed: 1_level_0,perfect,num_correct_non_empty,num_correct_empty,num_wrong_not_filled,num_wrong_expected_empty,num_wrong_different_value
agent_type,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
structured_output,gpt-4o-mini,328,693,3535,47,7,8
structured_output,gpt-4o,257,617,3540,127,2,4
function_call,gpt-4o,202,544,3538,203,4,1
function_call,gpt-4o-mini,185,460,3539,281,3,7


## Detailed look at error cases

In [17]:
# Print every test case where the extracted value differs from the expected one.
fields = [col[len("field_"):] for col in df.columns if col.startswith("field_")]
fields = ['location_type'] # to look at one specific field

for field in fields:
    print("-" * 80)
    print(f"Field: {field}")
    print("-" * 80)
    # Only rows that have an entry in this field's result column.
    fltr = ~df[f"field_{field}"].isna()
    for _, row in df[fltr].iterrows():
        # Both columns hold Python-literal text. ast.literal_eval parses literals
        # only, so CSV content (which includes model output) can never execute code
        # the way eval() could.
        expected: dict = ast.literal_eval(row["expected_items"])
        extracted: dict = dict(ast.literal_eval(row["agent_data"]))
        expected_value = expected.get(field, "")
        # Treat a field the agent left out like an empty answer (same default as
        # the expected side) instead of aborting the whole report with a KeyError.
        extracted_value = extracted.get(field, "")
        if expected_value != extracted_value:
            print(f"User message: '{row['user_message']}'")
            print(f"expected: '{expected_value}', extracted: '{extracted_value}'")
            print(f"Agent type: {row['agent_type']}, model: {row['model']},")
            print()
    print(f"done with field {field}")
    print()

print("done")
        

--------------------------------------------------------------------------------
Field: location_type
--------------------------------------------------------------------------------
User message: 'Hey, I'm Mary! I'm looking for a small city car for my commute.'
expected: 'city', extracted: ''
Agent tpye: function_call, model: gpt-4o-mini,

User message: 'I just moved to a small village.'
expected: 'rural', extracted: 'small town'
Agent tpye: structured_output, model: gpt-4o-mini,

User message: 'I live in a small village.'
expected: 'rural', extracted: 'small town'
Agent tpye: structured_output, model: gpt-4o-mini,

User message: 'We're in a village.'
expected: 'rural', extracted: 'small town'
Agent tpye: structured_output, model: gpt-4o-mini,

User message: 'We're in a farm.'
expected: 'rural', extracted: ''
Agent tpye: function_call, model: gpt-4o-mini,

User message: 'I live in in the suburbs of a city.'
expected: 'suburb', extracted: ''
Agent tpye: function_call, model: gpt-4o-min

## Performance by field analysis

In [18]:
# Build, for every "field_*" column, the share of each outcome label per
# (agent_type, model) and plot the shares side by side.
field_cols = sorted(col for col in df.columns if col.startswith("field_"))

# Debug switches: print the per-field tables as raw counts and/or as shares.
SHOW_COUNTS = False
SHOW_PERCENTAGES = False

pivots = [] # for aggregating into a chart

for field in field_cols:
    # Rows: (agent_type, model); columns: the labels found in this field column;
    # cells: number of test cases with that label.
    pv_counts = df.pivot_table(
        index=["agent_type", "model"],
        columns=field,
        values="expected_items",
        aggfunc="count",
    ).fillna(0).astype(int)

    # Divide each row by its total so the labels of one agent sum to 1.
    pv_normalized = pv_counts.div(pv_counts.sum(axis=1), axis=0)
    pivots.append(pv_normalized.assign(field=field).reset_index())

    if SHOW_COUNTS:
        print(pv_counts.to_string())
        print()
    if SHOW_PERCENTAGES:
        print(pv_normalized.to_string())
        print()


# One long table for all fields; a label missing for a field becomes 0.
pv = pd.concat(pivots, ignore_index=True).fillna(0)
pv["field"] = pv["field"].apply(lambda x: x[len("field_"):])
pv["agent"] = pv["agent_type"].map(lambda s: s.split("_")[0]) + " / " + pv["model"]


def _outcome_chart(data, outcome):
    """Return a narrow bar chart of one outcome column, without y-axis and row labels.

    The labels are omitted because the chart is placed to the right of a chart
    that already shows them.
    """
    return alt.Chart(data, width=130).mark_bar().encode(
        x = outcome,
        y = alt.Y("agent", axis=alt.Axis(title=None, labels=False)),
        color = "agent_type",
        row = alt.Row("field", title=None),
    )


# Left-most chart carries the title plus the agent / field labels.
c1 = alt.Chart(pv, title="Performance").mark_bar().encode(
    x = "correct_non_empty",
    y = "agent",
    color = "agent_type",
    row = "field",
)

c2 = _outcome_chart(pv, "wrong_not_filled")
c3 = _outcome_chart(pv, "wrong_expected_empty")
c4 = _outcome_chart(pv, "wrong_different_value")

(c1 | c2 | c3 | c4)  #.resolve_scale(x="shared")

## Timing analysis

In [26]:
# Distribution of "time_taken", one panel per model (columns) and agent type (rows).
# Values above cap_time are set to cap_time before plotting.

cap_time = 15
capped_times = df["time_taken"].mask(df["time_taken"] > cap_time, cap_time)
df_time_cap = df.assign(time_taken=capped_times)

time_axis = alt.X("time_taken:Q", bin=alt.Bin(maxbins=100))
histogram = (
    alt.Chart(df_time_cap, height=200)
    .mark_bar()
    .encode(
        time_axis,
        y="count()",
        column="model",
        row="agent_type",
        color="agent_type",
    )
    .properties(title=f"Histogram of Time Taken, capped at {cap_time} seconds")
)

histogram

In [20]:
# Mean, median and standard deviation of "time_taken" per agent type and model.
timing_stats = df.groupby(["agent_type", "model"])["time_taken"].agg(["mean", "median", "std"])
timing_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,std
agent_type,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
function_call,gpt-4o,1.432485,1.126109,1.11765
function_call,gpt-4o-mini,4.294668,1.011623,10.651632
structured_output,gpt-4o,2.068095,1.819761,0.841707
structured_output,gpt-4o-mini,1.288888,1.076586,0.73975


# Old baseline

In [21]:
# Per-field outcome tables for the earlier baseline run, for comparison with the
# results above. This file has a single "agent" column rather than
# agent_type / model.
df_base = pd.read_csv("../../collector_test_results_2025-01-30_10-15-12 BASELINE.csv")

field_cols = [col for col in df_base.columns if col.startswith("field_")]

for field in field_cols:
    # Rows: agent; columns: labels found in this field column; cells: test-case counts.
    pv = df_base.pivot_table(index="agent", columns=field, values="expected_items", aggfunc="count").fillna(0).astype(int)
    print(pv)
    # Same table as row-wise shares (each agent's row sums to 1).
    print(pv / pv.sum(axis=1).values.reshape(-1, 1))
    print()

field_color_preference  correct_non_empty  wrong_not_filled
agent                                                      
FCA gpt-3.5                            62               118
FCA gpt4o                             131                49
FCA gpt4o-mini                         81                99
field_color_preference  correct_non_empty  wrong_not_filled
agent                                                      
FCA gpt-3.5                      0.344444          0.655556
FCA gpt4o                        0.727778          0.272222
FCA gpt4o-mini                   0.450000          0.550000

field_car_model_preference  correct_non_empty  wrong_not_filled
agent                                                          
FCA gpt-3.5                               140                44
FCA gpt4o                                 128                56
FCA gpt4o-mini                             85                99
field_car_model_preference  correct_non_empty  wrong_not_filled
agent          