In [2]:
from IPython.lib.deepreload import reload
%load_ext autoreload
%autoreload 2

In [3]:
from llm_calc.lib.experiment import  ls_client, make_df
from llm_calc.lib.config import config
from llm_calc.lib.datamodel import Arm, ArmSlug, Model, ModelSlug
from llm_calc.lib.datacore import datacore
from os.path import join as path_join
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd

In [37]:
# List the datasets known to ls_client and wrap the listing with make_df
# (the result is indexed with .iloc in the next cell).
datasets = make_df(ls_client.list_datasets())

In [38]:
# get last few datasets and assign a dataset_name to work with
# head() returns at most 10 rows, so a workspace with fewer than 10 datasets no longer
# raises IndexError the way iloc[i] over a fixed range(10) did.
recent_datasets = datasets.head(10)
# f-string formatting tolerates a missing (None) description; "+" concatenation would raise TypeError.
last_few_dataset_names = [
    f"{row['name']}: {row['description']}" for _, row in recent_datasets.iterrows()
]
last_few_dataset_names = "\n--- ".join(last_few_dataset_names)
print(f"Last few datasets:\n--- {last_few_dataset_names}")
# the first row of the listing is treated as the "last" dataset
last_dataset = datasets.iloc[0]["name"]
print(f"Last dataset: {last_dataset}")


Last few datasets:
--- overt-reclamation: lllama_ci redo case #500
--- pastoral-constitution: gpt_omc case #750
--- melted-cage: gpt4_rag, and gpt_ci case #750
--- efficacious-romaine: gpt4_rag_ci case #750
--- successful-angina: llama_rag_ci case #750
--- known-increase: all but rag_ci, case 750
--- yielding-gearshift: gpt4_rag_ci case #500
--- hissing-boatload: llama_rag_ci case #500
--- proud-yam: llamna_omc + all gpt arms, all calcs  starting at case #500
--- productive-independent: all arms all calcs  starting at case #500
Last dataset: overt-reclamation


Choose datasets to include.
Because of timeout errors, I was not able to run everything in one go, so the runs were split into chunks spread across several datasets.

### for cases 0-250: 
- dataset goofy-tower
   - llama_base
   - llama_ci
   - llama_rag
   - llama_rag_ci
      - llama_omc (errored - exclude)
   - all GPT arms duplicated (excluded)
        - gpt4_base
        - gpt4_ci
        - gpt4_rag
        - gpt4_rag_ci
        - gpt4_omc
- dataset loud-company
   - gpt4_base
   - gpt4_ci
   - gpt4_rag
   - gpt4_rag_ci
   - gpt4_omc
- dataset sedate-bidding
   - llama_omc
### for cases 250-500: 
- dataset puny-gasket
   - llama_base (89% valid)
   - llama_ci (100% valid)
   - llama_rag (86% valid)
   - llama_rag_ci (24% valid) -- to exclude, timeout error
- dataset synonymous-marsh -- likely won't use
   - llama_base (90% valid, repeats from above)
   - llama_ci  (100% valid, repeats from above)
   - llama_rag  (86% valid, repeats from above)
- dataset crazy-textbook
   - llama_rag_ci (100% valid)
- dataset confused-intent:
   - llama_omc (100% valid)
- dataset quarrelsome-pinkie
   - gpt4_base (100% valid)
   - gpt4_ci  (100% valid)
   - gpt4_rag  (100% valid)
   - gpt4_rag_ci (0% valid) -- to exclude, timeout error
   - gpt4_omc  (100% valid)
- dataset disagreeable-apple:
   - gpt4_rag_ci (100% valid)

### for cases 500-750:
- yielding-gearshift
    - gpt4_rag_ci (100% valid)
- hissing-boatload 
    - llama_rag_ci 
- proud-yam
    - gpt4_base
    - gpt4_ci
    - gpt4_rag 
    - gpt4_rag_ci (0% valid) -- to exclude, timeout error
    - gpt4_omc
    - llama_omc
- productive-independent
    - llama_base (90% valid) -- acceptable, consistent with other runs
    - llama_ci  (74% valid) -- to exclude, many timeout errors
    - llama_rag  (89% valid) -- acceptable, consistent with other runs
    - llama_rag_ci  (14% valid)   --- to exclude, timeout error
- overt-reclamation
    - llama_ci (100% valid)

### for cases 750 - 1000
- melted-cage
   - gpt4_rag
   - gpt4_ci
- efficacious-romaine
   - gpt4_rag_ci
- successful-angina
   - llama_rag_ci
- known-increase
   - llama_base (92% valid) -- acceptable, consistent with other runs
   - llama_ci (100% valid)
   - llama_rag (86% valid) -- acceptable, consistent with other runs
   - gpt4_ci (4% valid) -- to exclude, timeout error
   - gpt4_base (100% valid)
   - llama_omc (100% valid)
- pastoral-constitution
    - gpt4_omc


In [41]:
# Datasets whose runs are pulled into the analysis, grouped by the case range they cover.
chosen_datasets = [
    # cases 0-250
    "goofy-tower",
    "loud-company",
    "sedate-bidding",
    # cases 250-500
    "puny-gasket",
    "crazy-textbook",
    "quarrelsome-pinkie",
    "disagreeable-apple",
    "confused-intent",
    # cases 500-750
    "yielding-gearshift",
    "hissing-boatload",
    "proud-yam",
    "productive-independent",
    "overt-reclamation",
    # cases 750-1000
    "melted-cage",
    "efficacious-romaine",
    "successful-angina",
    "known-increase",
    "pastoral-constitution",
]
# header line followed by one dataset name per line
print("Chosen datasets: ", *chosen_datasets, sep="\n")

Chosen datasets: 
goofy-tower
loud-company
sedate-bidding
puny-gasket
crazy-textbook
quarrelsome-pinkie
disagreeable-apple
confused-intent
yielding-gearshift
hissing-boatload
proud-yam
productive-independent
overt-reclamation
melted-cage
efficacious-romaine
successful-angina
known-increase
pastoral-constitution


In [42]:
# get runs from chosen dataset, assign to df
from llm_calc.util import util

# Collect one frame per experiment and concatenate once at the end: calling pd.concat
# inside the loop re-copies every previously fetched row on each iteration.
run_frames = []
for dataset_name in chosen_datasets:
    util.h2(f"Getting dataset {dataset_name}")
    dataset = ls_client.read_dataset(dataset_name=dataset_name)
    dataset_id = dataset.id
    # experiments (projects) that reference this dataset
    last_experiments = make_df(
        ls_client.list_projects(reference_dataset_id=dataset_id)
    )

    for i, experiment in last_experiments.iterrows():
        util.log_mini_task(f"Getting group {i} from experiment {dataset_name}")
        # root runs only (is_root=True)
        exp = make_df(ls_client.list_runs(is_root=True, project_id=experiment.id))
        # arm label recorded in each run's extra metadata
        exp["metadata_arm"] = exp.extra.map(lambda x: x['metadata']['arm'])
        util.rprint(exp.metadata_arm.value_counts())
        # remember where each run came from so invalid dataset/arm pairs can be filtered later
        exp["dataset_name"] = dataset_name
        exp["dataset_id"] = dataset_id
        exp["dataset_group"] = i
        run_frames.append(exp)
        util.log_mini_task(f"...Added {len(exp)} runs from group {i}\n\n")

# pd.concat raises ValueError on an empty list, so fall back to an empty frame when nothing was fetched
exp_runs = pd.concat(run_frames) if run_frames else pd.DataFrame()
# feedback_stats = pd.json_normalize(exp_runs.feedback_stats)
# exp_runs = pd.concat([exp_runs, feedback_stats.reindex(exp_runs.index)], axis=1)
df = exp_runs

In [66]:
# sanity check: (rows, columns) of the combined runs frame
df.shape

(10000, 74)

In [67]:
# (dataset_name, metadata_arm) pairs whose runs are dropped from the analysis
runs_to_exclude = [
    # remove "goofy-tower" runs with gpt4o as they are not valid (used gpt4 turbo instead)
    ("goofy-tower", "gpt4_base"),
    ("goofy-tower", "gpt4_ci"),
    ("goofy-tower", "gpt4_rag"),
    ("goofy-tower", "gpt4_rag_ci"),
    ("goofy-tower", "gpt4_omc"),
    # remove "goofy-tower" runs with llama OMC as ran out of credits and half errored
    ("goofy-tower", "llama_omc"),
    # cases 250-500
    ("puny-gasket", "llama_rag_ci"),
    ("quarrelsome-pinkie", "gpt4_rag_ci"),
    # cases 500-750
    ("proud-yam", "gpt4_rag_ci"),
    ("productive-independent", "llama_rag_ci"),
    ("productive-independent", "llama_ci"),
    # cases 750-1000
    ("known-increase", "gpt4_ci"),
]

# apply the exclusions one pair at a time, keeping every row that is NOT in the excluded group
for excluded_dataset, excluded_arm in runs_to_exclude:
    in_excluded_group = (df.dataset_name == excluded_dataset) & (df.metadata_arm == excluded_arm)
    df = df[~in_excluded_group]


# remaining runs per dataset / arm, broken down by run status
df.groupby(["dataset_name","metadata_arm"]).status.value_counts()


dataset_name            metadata_arm  status 
confused-intent         llama_omc     success    250
crazy-textbook          llama_rag_ci  success    250
disagreeable-apple      gpt4_rag_ci   success    250
efficacious-romaine     gpt4_rag_ci   success    250
goofy-tower             llama_base    success    250
                        llama_ci      success    250
                        llama_rag     success    250
                        llama_rag_ci  success    250
hissing-boatload        llama_rag_ci  success    250
known-increase          gpt4_base     success    250
                        llama_base    success    250
                        llama_ci      success    250
                        llama_omc     success    250
                        llama_rag     success    250
loud-company            gpt4_base     success    250
                        gpt4_ci       success    250
                        gpt4_omc      success    250
                        gpt4_rag      success    250


In [68]:
# Renumber rows 0..n-1 and discard the old index; the frame was assembled from several
# concatenated frames and then filtered, so the old labels are not a clean range.
df = df.reset_index(drop=True)

In [69]:
# view available data from experiments
print("run keys: \n" + ", ".join(df.columns))
# inspect the first run only: its outputs hold a "details" dict, which holds a "case" dict
first_run_outputs = df.iloc[0].outputs
details = first_run_outputs["details"]
print("\ndetails keys: \n" + ", ".join(details))
case = details["case"]
print("\ncase keys: \n" + ", ".join(case))

run keys: 
id, name, start_time, run_type, end_time, extra, error, serialized, events, inputs, outputs, reference_example_id, parent_run_id, tags, attachments, session_id, child_run_ids, child_runs, feedback_stats, app_path, manifest_id, status, prompt_tokens, completion_tokens, total_tokens, first_token_time, total_cost, prompt_cost, completion_cost, parent_run_ids, trace_id, dotted_order, in_dataset, metadata_arm, dataset_name, dataset_id, dataset_group, arm_slug, experiment_name, experiment_desc, experiment_start, experiment_end, num_errored_attempts, num_attempts, was_error, output_object, raw_response, intermediate_steps, calculator_slug, patient_name, patient_id, vignette, reference_answer, criteria, options, was_correct, performance, ci_was_run, final_answer_valid, runtime_error_count, count, output_answer, output_explanation, output_able_to_answer, steps, num_steps, last_tool_call, last_tool_call_name, arm_description, arm_name, calculator_name, model_slug, model_name, arm_tool

In [None]:
# extract important details from outputs>details and details>case
# new column name -> key path to follow below outputs["details"]
detail_paths = {
    "arm_slug": ("arm_slug",),
    "experiment_name": ("experiment", "slug"),
    "experiment_desc": ("experiment", "description"),
    "experiment_start": ("experiment", "start_datetime"),
    "experiment_end": ("experiment", "end_datetime"),
    "num_errored_attempts": ("num_errored_attempts",),
    "num_attempts": ("num_attempts",),
    "was_error": ("was_error",),
    "output_object": ("output_object",),
    "raw_response": ("raw_response",),
    "intermediate_steps": ("intermediate_steps",),
    "calculator_slug": ("case", "calculator_slug"),
    "patient_name": ("case", "name"),
    "patient_id": ("case", "id"),
    "vignette": ("case", "vignette"),
    "reference_answer": ("case", "correct_output"),
    "criteria": ("case", "options"),
    "options": ("case", "options"),  # duplicate for proxy sake
}


def _lookup_detail(run_outputs, path):
    """Follow `path` key by key, starting from run_outputs["details"]."""
    node = run_outputs["details"]
    for key in path:
        node = node[key]
    return node


# columns are created in the order listed above
for column, path in detail_paths.items():
    df[column] = df.outputs.map(lambda x, path=path: _lookup_detail(x, path))


In [71]:
# feedback stats
feedback = df.feedback_stats

# a run counts as correct only when its '% correct (all)' average is exactly 100
was_correct = feedback.map(lambda stats: stats['% correct (all)']['avg']) == 100
df["was_correct"] = was_correct
df["performance"] = was_correct.map({True: "Correct", False: "Incorrect"})

df["ci_was_run"] = feedback.map(lambda stats: stats['code_interpreter was run']['avg']) == 100
# these two feedback keys may be absent, in which case the average defaults to 0
df["final_answer_valid"] = feedback.map(
    lambda stats: stats.get('final answer valid', {}).get('avg', 0) == 100
)
df["runtime_error_count"] = feedback.map(
    lambda stats: stats.get('runtime_error_count', {}).get('avg', 0)
)
# constant column of ones
df["count"] = 1

In [72]:
# outputs
# runs whose output_object is missing are flagged as errors and get their own performance label
no_output = df["output_object"].isna()
number_with_no_output = int(no_output.sum())
df.loc[no_output, "was_error"] = True
df.loc[no_output, "performance"] = "No valid output"
print(f"There were {number_with_no_output} runs without valid output, they have been marked as such")


There were 208 runs without valid output, they have been marked as such


In [73]:
# for remaining, extract details
has_output = ~no_output
output_objects = df.loc[has_output, "outputs"].map(lambda x: x["details"]["output_object"])
# one output_<field> column per field of the output object; rows without output are left unset
for field in ("answer", "explanation", "able_to_answer"):
    df.loc[has_output, f"output_{field}"] = output_objects.map(lambda obj, field=field: obj[field])

In [74]:
# clean up intermediate steps
# num_steps = number of entries in a run's intermediate steps; runs with no steps recorded get 0
df["steps"] = df.outputs.map(lambda x: x["details"]["intermediate_steps"])
is_na = df["steps"].isna()
has_step_list = ~is_na
df.loc[has_step_list, "num_steps"] = df.loc[has_step_list, "steps"].map(len)
df.loc[is_na, "num_steps"] = 0

In [75]:
# report how many runs recorded at least one step and how many recorded more than one
runs_with_steps = len(df[df.num_steps > 0])
runs_with_multiple_steps = len(df[df.num_steps > 1])
print(f"Ther were {runs_with_steps} runs with 1 or more tool calls and {runs_with_multiple_steps} with more than 1 step (most of these are callback errors to LLM ")


Ther were 6000 runs with 1 or more tool calls and 30 with more than 1 step (most of these are callback errors to LLM 


In [76]:
has_steps = df.num_steps > 0
# first element of the last entry in each run's steps; runs without steps get None
last_calls = df.loc[has_steps, "steps"].map(lambda run_steps: run_steps[-1][0])
df.loc[has_steps, "last_tool_call"] = last_calls
df.loc[~has_steps, "last_tool_call"] = None

In [77]:
# 'tool' entry of the last tool call; runs without steps get None
tool_names = df.loc[has_steps, "last_tool_call"].map(lambda call: call.get('tool'))
df.loc[has_steps, "last_tool_call_name"] = tool_names
df.loc[~has_steps, "last_tool_call_name"] = None
# how often each tool was the last one called
df.last_tool_call_name.value_counts()

last_tool_call_name
code_interpreter         4000
calculate_wells_dvt       200
calculate_cci             200
calculate_caprini_vte     200
calculate_hasbled         200
calculate_sofa            200
calculate_gad7            200
calculate_nihss           200
calculate_psi_port        200
calculate_ariscat         200
calculate_meld_na         200
Name: count, dtype: int64

In [78]:
# get metadata
# arm, model and calculator definitions from datacore; the next cell uses their
# slug / name / description attributes to label runs
arms = datacore.arms
models = datacore.models
calculators = datacore.calculators

In [None]:
# add pretty names
# Each lookup table is keyed by slug value; indexing (rather than .get) makes an
# unknown slug fail loudly with a KeyError.
arm_descriptions = {x.slug.value: x.description for x in arms}
df['arm_description'] = df.arm_slug.map(lambda x: arm_descriptions[x])
arm_names = {x.slug.value: x.name for x in arms}
df['arm_name'] = df.arm_slug.map(lambda x: arm_names[x])
pretty_calculators = {x.slug.value: x.pretty_slug for x in calculators}
df['calculator_name'] = df.calculator_slug.map(lambda x: pretty_calculators[x])
get_arm_by_slug = {x.slug.value: x for x in arms}
df['model_slug'] = df.arm_slug.map(lambda x: get_arm_by_slug.get(x).model.value)
model_names = {x.slug.value: x.name for x in models}
df['model_name'] = df.model_slug.map(lambda x: model_names[x])
# tool set of the arm = arm slug with the model prefix stripped (e.g. "gpt4_rag_ci" -> "rag_ci")
df['arm_tools'] = df.arm_slug.map(lambda x: x.replace("gpt4_","").replace("llama_",""))

# calculator-arm permutations
# Concatenate element-wise. str(Series) returns the printed repr of the WHOLE column, which
# would stamp the same multi-line string on every row and collapse the category to one value.
df['calculator_arm'] = df['calculator_name'].astype(str) + ' on ' + df['arm_name'].astype(str)

# convert to categories
categorical_columns = [
    "arm_slug",
    "model_slug",
    "calculator_name",
    "arm_name",
    "arm_description",
    "model_name",
    "arm_tools",
    "performance",
    "experiment_name",
    "experiment_desc",
    "patient_name",
    "vignette",
    "calculator_arm",
]
for column in categorical_columns:
    df[column] = df[column].astype("category")

In [80]:

# list every column now present on df
df.columns

Index(['id', 'name', 'start_time', 'run_type', 'end_time', 'extra', 'error',
       'serialized', 'events', 'inputs', 'outputs', 'reference_example_id',
       'parent_run_id', 'tags', 'attachments', 'session_id', 'child_run_ids',
       'child_runs', 'feedback_stats', 'app_path', 'manifest_id', 'status',
       'prompt_tokens', 'completion_tokens', 'total_tokens',
       'first_token_time', 'total_cost', 'prompt_cost', 'completion_cost',
       'parent_run_ids', 'trace_id', 'dotted_order', 'in_dataset',
       'metadata_arm', 'dataset_name', 'dataset_id', 'dataset_group',
       'arm_slug', 'experiment_name', 'experiment_desc', 'experiment_start',
       'experiment_end', 'num_errored_attempts', 'num_attempts', 'was_error',
       'output_object', 'raw_response', 'intermediate_steps',
       'calculator_slug', 'patient_name', 'patient_id', 'vignette',
       'reference_answer', 'criteria', 'options', 'was_correct', 'performance',
       'ci_was_run', 'final_answer_valid', 'runtime_err

# Validity Checks

In [81]:
# NOTE: only the final expression of a cell is rendered, so the first two statements
# below are evaluated but produce no visible output.
df.arm_name.value_counts()
df.groupby(["experiment_name","model_slug","arm_tools"],observed=True).final_answer_valid.sum()
# final_answer_valid is boolean, so the sum is the number of runs with a valid final answer
# per model / tool set / experiment (observed=True hides category combinations with no runs)
df.groupby(["model_slug","arm_tools","experiment_name"],observed=True).final_answer_valid.sum()
# df.groupby(["model_slug","arm_tools"],observed=True).final_answer_valid.sum()


model_slug  arm_tools  experiment_name       
gpt4o       base       known-increase            250
                       loud-company              250
                       proud-yam                 250
                       quarrelsome-pinkie        250
            ci         loud-company              250
                       melted-cage               250
                       proud-yam                 250
                       quarrelsome-pinkie        250
            omc        loud-company              250
                       pastoral-constitution     250
                       proud-yam                 250
                       quarrelsome-pinkie        250
            rag        loud-company              250
                       melted-cage               250
                       proud-yam                 250
                       quarrelsome-pinkie        250
            rag_ci     disagreeable-apple        250
                       efficacious-romaine       250


In [82]:
# NOTE: only the final expression of a cell is rendered; the two value_counts() calls
# are evaluated but not displayed.
df.arm_name.value_counts()
df.experiment_name.value_counts()
# observed=False lists every combination of the categorical keys, including ones with
# no runs (shown as rows of zeros)
df.groupby(["experiment_name","model_slug","arm_tools"],observed=False).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,name,start_time,run_type,end_time,extra,error,serialized,events,inputs,...,output_explanation,output_able_to_answer,steps,num_steps,last_tool_call,last_tool_call_name,arm_description,arm_name,calculator_name,model_name
experiment_name,model_slug,arm_tools,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
confused-intent,gpt4o,base,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
confused-intent,gpt4o,ci,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
confused-intent,gpt4o,omc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
confused-intent,gpt4o,rag,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
confused-intent,gpt4o,rag_ci,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yielding-gearshift,llama3_1,base,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yielding-gearshift,llama3_1,ci,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yielding-gearshift,llama3_1,omc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yielding-gearshift,llama3_1,rag,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
# non-null count of every column per experiment / model / tool set;
# observed=True keeps only the combinations that actually occur
df.groupby(["experiment_name","model_slug","arm_tools"],observed=True).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,name,start_time,run_type,end_time,extra,error,serialized,events,inputs,...,output_explanation,output_able_to_answer,steps,num_steps,last_tool_call,last_tool_call_name,arm_description,arm_name,calculator_name,model_name
experiment_name,model_slug,arm_tools,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
confused-intent,llama3_1,omc,250,250,250,250,250,250,0,0,250,250,...,250,250,250,250,250,250,250,250,250,250
crazy-textbook,llama3_1,rag_ci,250,250,250,250,250,250,0,0,250,250,...,250,250,250,250,250,250,250,250,250,250
disagreeable-apple,gpt4o,rag_ci,250,250,250,250,250,250,0,0,250,250,...,250,250,250,250,250,250,250,250,250,250
efficacious-romaine,gpt4o,rag_ci,250,250,250,250,250,250,0,0,250,250,...,250,250,250,250,250,250,250,250,250,250
goofy-tower,llama3_1,base,250,250,250,250,250,250,0,0,250,250,...,226,226,226,250,0,0,250,250,250,250
goofy-tower,llama3_1,ci,250,250,250,250,250,250,0,0,250,250,...,250,250,250,250,250,250,250,250,250,250
goofy-tower,llama3_1,rag,250,250,250,250,250,250,0,0,250,250,...,222,222,222,250,0,0,250,250,250,250
goofy-tower,llama3_1,rag_ci,250,250,250,250,250,250,0,0,250,250,...,250,250,250,250,250,250,250,250,250,250
hissing-boatload,llama3_1,rag_ci,250,250,250,250,250,250,0,0,250,250,...,250,250,250,250,250,250,250,250,250,250
known-increase,gpt4o,base,250,250,250,250,250,250,0,0,250,250,...,250,250,250,250,0,0,250,250,250,250


In [84]:
# make sure that all runs have same vignettes for same patient
# counts the runs for each (calculator, vignette) pair
df.groupby(["calculator_name","vignette"],observed=True).id.count()

calculator_name  vignette                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
ARISCAT          A 62-year-old patient is scheduled for an ORIF of the left humerous, with anticipated duration of 2.5 hours; it is scheduled as an elective procedure. They have a preoperative SpO2 of 88% on room air. They had a recent cold, but d

In [85]:
# number of runs for each calculator / arm combination
pd.DataFrame(df.groupby(["calculator_name","metadata_arm"],observed=True).id.count())


Unnamed: 0_level_0,Unnamed: 1_level_0,id
calculator_name,metadata_arm,Unnamed: 2_level_1
ARISCAT,gpt4_base,100
ARISCAT,gpt4_ci,100
ARISCAT,gpt4_omc,100
ARISCAT,gpt4_rag,100
ARISCAT,gpt4_rag_ci,100
...,...,...
Wells DVT,llama_base,100
Wells DVT,llama_ci,100
Wells DVT,llama_omc,100
Wells DVT,llama_rag,100


In [86]:
# it is possible for non-unique vignettes to occur, as generation is pseudorandom and there is a limited number of permutations
vig_counts = pd.DataFrame(df.groupby(["calculator_name","vignette"],observed=True).id.count())
# if all vignettes are unique, every count should be 10
# NOTE(review): the original comment attributes the 10 to the number of calculators, but the
# count is runs per (calculator, vignette) pair, so it looks like one run per arm — confirm
vig_counts.id.value_counts()

id
10    966
20     17
Name: count, dtype: int64

### Calculate error amount

In [None]:
# df = pd.read_pickle(path_join(config.RESULTS_DATA_PATH,"most_recent/dataset_results_most-recent.pkl"))

In [13]:
# signed error: model answer minus the reference answer (reference cast to float first)
reference_values = df["reference_answer"].astype(float)
df["error_real"] = df["output_answer"] - reference_values
# magnitude of the error, ignoring direction
df["error_absolute"] = df["error_real"].abs()

In [14]:
from llm_calc.lib.datacore import datacore
from llm_calc.lib.datamodel import Arm, ArmSlug, Model, ModelSlug, Calculator, CalculatorSlug
from typing import List

# calculator definitions; the next cell reads their cutoff and min/max score attributes
calculators: List[Calculator] = datacore.get_calculators()

In [36]:
# Create dictionaries to map calculator slugs to their (mild, severe) cutoffs and (min, max) scores
calculator_cutoffs = {calc.slug.value: (calc.mild_cutoff, calc.severe_cutoff) for calc in calculators}
calculator_min_max = {calc.slug.value: (calc.minimum_score, calc.maximum_score) for calc in calculators}


def _scale_error(error, low, high):
    """Return `error` divided by the width of the range [low, high].

    Returns None when either bound is missing or the range has zero width,
    so an unusable range yields a missing value instead of a ZeroDivisionError.
    """
    if pd.isna(low) or pd.isna(high) or high == low:
        return None
    return error / (high - low)


def calculate_adjusted_error(row):
    """Adjusted clinical error: signed error scaled by (severe_cutoff - mild_cutoff).

    `row` must provide 'calculator_slug' and 'error_real'. Returns None when the
    calculator has no usable cutoffs.
    """
    mild_cutoff, severe_cutoff = calculator_cutoffs[row['calculator_slug']]
    return _scale_error(row['error_real'], mild_cutoff, severe_cutoff)


def calculate_adjusted_error_second_method(row):
    """Second adjusted clinical error: signed error scaled by (maximum_score - minimum_score).

    `row` must provide 'calculator_slug' and 'error_real'. Returns None when the
    calculator has no usable score range.
    """
    minimum_score, maximum_score = calculator_min_max[row['calculator_slug']]
    return _scale_error(row['error_real'], minimum_score, maximum_score)


def calculate_percentage_error(row):
    """Signed error as a fraction of the reference answer; None when the reference is 0."""
    reference = float(row['reference_answer'])
    if reference == 0:
        return None
    return float(row['error_real']) / reference


# Apply the functions to each row in the dataframe
df['adjusted_clinical_error_real'] = df.apply(calculate_adjusted_error, axis=1)
df['adjusted_clinical_error_2_real'] = df.apply(calculate_adjusted_error_second_method, axis=1)

# also include straight percentage error compared to reference answer
df['percentage_error_real'] = df.apply(calculate_percentage_error, axis=1)
# Use .abs() rather than a truthiness test: `abs(x) if x else None` treated an exact
# answer (0.0) as falsy and turned it into a missing value, dropping perfect runs
# from any mean absolute percentage error.
df['percentage_error_absolute'] = df['percentage_error_real'].abs()

# absolute adjusted clinical error
df['adjusted_clinical_error_absolute'] = df['adjusted_clinical_error_real'].abs()
df['adjusted_clinical_error_2_absolute'] = df['adjusted_clinical_error_2_real'].abs()

# Export

### Reduce file size

In [37]:
# reduce file size
# show memory use per column, largest first, to see which columns take the most space
memory_by_column = df.memory_usage(deep=True).sort_values(ascending=False)
print(memory_by_column)

# columns left out of the reduced export
columns_to_drop = [
    "outputs",
    "feedback_stats",
    "steps",
    "raw_response",
    "intermediate_steps",
    "child_run_ids",
    "parent_run_id",
    "attachments",
    "tags",
    "manifest_id",
    "session_id",
    "criteria",
    "events",
    "inputs",
    "reference_example_id",
    "reference_answer",
    "output_object",
    "output_explanation",
    "adjusted_clinical_error_real",
    "adjusted_clinical_error_absolute",
    "error_real",
    "error_absolute",
]
df_mini = df.drop(columns=columns_to_drop)

output_explanation    6670499
app_path              2670000
child_run_ids         2430048
feedback_stats        1921760
extra                 1920000
                       ...   
ci_was_run              10000
was_error               10000
was_correct             10000
final_answer_valid      10000
Index                     132
Length: 84, dtype: int64


In [38]:
# output_slug = "_".join([dataset[:6] for dataset in chosen_datasets]) # type: ignore
results_dir = config.RESULTS_DATA_PATH

# export full run set to the most recent folder (git ignored)
# df.to_pickle(path_join(config.RESULTS_DATA_PATH,f"archive/dataset_results_{output_slug}.pkl"))
full_results_path = path_join(results_dir, "most_recent/dataset_results_most-recent.pkl")
df.to_pickle(full_results_path)

# export mini as most-recent
filename = path_join(results_dir, "dataset_results_most-recent.pkl")
df_mini.to_pickle(filename)

# test load: read the reduced file back and preview it
df_read = pd.read_pickle(filename)
df_read.head(3)

Unnamed: 0,id,name,start_time,run_type,end_time,extra,error,serialized,child_runs,app_path,...,arm_name,calculator_name,model_slug,model_name,arm_tools,real_error,adjusted_clinical_error_2_real,adjusted_clinical_error_2_absolute,percentage_error_real,percentage_error_absolute
0,9dccbc97-c466-4737-968b-c186b7acb265,execute,2024-10-30 10:25:18.834813,chain,2024-10-30 10:25:35.156023,"{'metadata': {'revision_id': '324c698-dirty', 'arm': 'llama_rag_ci...",,,,/o/80231dbb-1e31-4379-b804-df697b777bc6/projects/p/87a9983e-9b6d-4...,...,Llama + RAG + CI,NIHSS,llama3_1,name,rag_ci,6.0,0.142857,0.142857,0.285714,0.285714
1,679a36f6-9c0d-45c6-b9cc-1ac7c2e04397,execute,2024-10-30 10:25:17.462899,chain,2024-10-30 10:25:34.858674,"{'metadata': {'revision_id': '324c698-dirty', 'arm': 'llama_rag_ci...",,,,/o/80231dbb-1e31-4379-b804-df697b777bc6/projects/p/87a9983e-9b6d-4...,...,Llama + RAG + CI,PSIPORT,llama3_1,name,rag_ci,80.0,0.206718,0.206718,0.432432,0.432432
2,11eb1b84-c39e-487b-bcaa-53d844436a99,execute,2024-10-30 10:25:15.635823,chain,2024-10-30 10:25:24.644082,"{'metadata': {'revision_id': '324c698-dirty', 'arm': 'llama_rag_ci...",,,,/o/80231dbb-1e31-4379-b804-df697b777bc6/projects/p/87a9983e-9b6d-4...,...,Llama + RAG + CI,HASBLED,llama3_1,name,rag_ci,1.0,0.111111,0.111111,0.25,0.25


In [39]:
from llm_calc.util import util
# inspect the first row of the full (non-reduced) frame
df.iloc[0]

id                                    9dccbc97-c466-4737-968b-c186b7acb265
name                                                               execute
start_time                                      2024-10-30 10:25:18.834813
run_type                                                             chain
end_time                                        2024-10-30 10:25:35.156023
                                                      ...                 
adjusted_clinical_error_absolute                                  0.428571
adjusted_clinical_error_2_real                                    0.142857
adjusted_clinical_error_2_absolute                                0.142857
percentage_error_real                                             0.285714
percentage_error_absolute                                         0.285714
Name: 0, Length: 83, dtype: object