In [None]:
# This notebook needs to be run in the root folder.
import pandas as pd
from spade_v3.candidate_gen import generate_candidate_assertions
from spade_v3.execute_assertions import execute_candidate_assertions
from spade_v3.label_results import label_responses, prepare_for_optimizer
from spade_v3.check_subsumes import evaluate_all_subsumes, collate_subsumption_results
from spade_v3.optimizer import select_functions
# from rich import print

In [None]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before
# running code, so edits to the project modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Dataset selection: pull in the prompt templates and examples for this experiment.
from paper_experiments.sportroutine.prompt_templates import TEMPLATES
from paper_experiments.sportroutine.examples import EXAMPLES

dataset_name = "sportroutine"
prompt_template_strings = TEMPLATES
EXAMPLE = EXAMPLES[0]  # first example in the dataset

print(f"There are {len(prompt_template_strings)} templates and {len(EXAMPLES)} examples.")

In [None]:
assertions = await generate_candidate_assertions(prompt_template_strings, EXAMPLE)

In [None]:
# Build the source text of the assertions module: a fixed preamble defining an
# async `ask_llm` helper, followed by the generated assertion code.
# NOTE(review): the embedded helper returns True whenever the reply contains the
# substring "yes" (case-insensitive), and it hard-codes the model name.

beginning = """from litellm import acompletion


async def ask_llm(prompt, response, question):
    # Placeholder for asking an expert a true/false question
    # In practice, this would involve a complex implementation potentially requiring human input

    messages = [
        {
            "content": f"You are an evaluator for a large language model pipeline that turns exercise video texts into markdown programs of step by step instructions:\\n\\n{prompt}\\n\\nHere is the response:\\n{response}",
            "role": "system",
        },
        {
            "content": f"{question}\\nOnly answer yes or no.",
            "role": "user",
        },
    ]

    response = await acompletion(
        model="azure/gpt-35-turbo",
        messages=messages,
    )

    # get the cost
    completion_tokens = response["usage"]["completion_tokens"]
    prompt_tokens = response["usage"]["prompt_tokens"]

    # get the response
    reply = response["choices"][0]["message"]["content"]

    if "yes" in reply.lower():
        return prompt_tokens, completion_tokens, True

    return prompt_tokens, completion_tokens, False
"""

# assertions[1][0] is concatenated onto a str, so it is presumably the generated
# source text — TODO confirm the return shape of generate_candidate_assertions.
assertion_text = beginning + assertions[1][0]

In [None]:
# Print the last prompt template in prompt_template_strings for inspection.
print(prompt_template_strings[-1])

In [None]:
# Write the assembled assertion source to candidate_assertions.py in the dataset folder.

prefix = f"/Users/shreyashankar/Documents/projects/promptdelta/paper_experiments/{dataset_name}"

with open(f"{prefix}/candidate_assertions.py", "w") as assertions_file:
    assertions_file.write(assertion_text)

In [None]:
from paper_experiments.sportroutine.candidate_assertions import ALL_FUNCTIONS
from paper_experiments.sportroutine.examples import EXAMPLES


res = await execute_candidate_assertions(dataset_name, prompt_template_strings[-1], EXAMPLES, ALL_FUNCTIONS)


In [None]:
# Label the execution results and report the mean of the `label` column.
# The second argument (0.6) is presumably a labeling threshold — confirm against label_responses.
label_df = label_responses(res, 0.6)
print(label_df["label"].mean())

# Save the labels alongside the other artifacts for this dataset.
prefix = f"/Users/shreyashankar/Documents/projects/promptdelta/paper_experiments/{dataset_name}"
label_df.to_csv(f"{prefix}/labeled_responses.csv", index=False)

In [None]:
# Reload the saved labels from disk and report the mean label.
label_df = pd.read_csv(f"{prefix}/labeled_responses.csv")
print(label_df["label"].mean())

# Summary counts: number of candidate functions and of examples labeled 1 vs 0.
print(f"Num candidate functions: {len(ALL_FUNCTIONS)}")
print(f"Num good examples: {int((label_df['label'] == 1).sum())}")
print(f"Num bad examples: {int((label_df['label'] == 0).sum())}")

# Combine the execution results and labels into the dict consumed by later cells.
optimizer_dict = prepare_for_optimizer(res, label_df)

In [None]:
response = res["response"].values[0]

K = await evaluate_all_subsumes(optimizer_dict["M"], ALL_FUNCTIONS, optimizer_dict["func_order"], prompt_template, response)

In [None]:
# Display the sum over K (the result of evaluate_all_subsumes) as the cell output.
K.sum()

In [None]:
# Persist everything the optimizer needs (including K and function sources) to disk.
import inspect
import pickle

path_name = f"/Users/shreyashankar/Documents/projects/promptdelta/paper_experiments/{dataset_name}/optimizer_input.pkl"

optimizer_dict["K"] = K
# Map each candidate function's name to its source text.
optimizer_dict["spade_functions"] = dict(
    (fn.__name__, inspect.getsource(fn)) for fn in ALL_FUNCTIONS
)

with open(path_name, "wb") as pickle_file:
    pickle.dump(optimizer_dict, pickle_file)

In [None]:
# Collate the subsumption results and write them to subsumption_results.csv.
subsumption_df = collate_subsumption_results(
    optimizer_dict["M"],
    ALL_FUNCTIONS,
    optimizer_dict["func_order"],
    optimizer_dict["K"],
)
subsumption_df.to_csv(
    f"/Users/shreyashankar/Documents/projects/promptdelta/paper_experiments/{dataset_name}/subsumption_results.csv"
)

In [None]:
from spade_v3.check_subsumes import sample_subsumption_prompts_and_results

subsumption_sample_llm_labeled = await sample_subsumption_prompts_and_results(optimizer_dict["M"], ALL_FUNCTIONS, optimizer_dict["func_order"], optimizer_dict["K"], prompt_template_strings[-1], response)
subsumption_sample_llm_labeled.to_csv(f"/Users/shreyashankar/Documents/projects/promptdelta/paper_experiments/subsumption/data/{dataset_name}.csv", index=False)

In [None]:
import pickle

# Solve the optimization problem from the pickled inputs at path_name.
optimizer_res = select_functions(path_name, tau=0.25, alpha=0.6)

# Store the optimizer output next to the other artifacts for this dataset.
with open(f"/Users/shreyashankar/Documents/projects/promptdelta/paper_experiments/{dataset_name}/optimizer_results.pkl", "wb") as results_file:
    pickle.dump(optimizer_res, results_file)


In [None]:
# Visualize performance: one row of summary metrics per selection method.
import pandas as pd

optimizer_res_df = pd.DataFrame(
    [
        {
            "method": method,
            "ffr": optimizer_res[method]["ffr"],
            "example failure coverage": optimizer_res[method]["coverage"],
            "frac_functions_selected": optimizer_res[method]["frac_functions_selected"],
            "frac_non_subsumed_excluded_functions": optimizer_res[method]["frac_non_subsumed_excluded_functions"],
        }
        for method in ["spade_base", "spade_cov", "spade_sub"]
    ]
)
print(optimizer_res_df)


## Evaluating Subsumption with Higher Precision

In [None]:
from spade_v3.check_subsumes import identify_subsumption_pairs
from paper_experiments.sportroutine.candidate_assertions import ALL_FUNCTIONS
import pickle

dataset_name = "sportroutine"
path_name = f"/Users/shreyashankar/Documents/projects/promptdelta/paper_experiments/{dataset_name}/optimizer_input.pkl"
with open(path_name, "rb") as f:
    optimizer_dict = pickle.load(f)
K_and_pretty_df = await identify_subsumption_pairs(optimizer_dict["M"], ALL_FUNCTIONS, optimizer_dict["func_order"])

K_and_pretty_df["K"].sum() / (K_and_pretty_df["K"].shape[0] * K_and_pretty_df["K"].shape[1])

In [None]:
# Persist the refreshed subsumption results, re-run the optimizer, and summarise.
import inspect
import pickle

import pandas as pd
from rich import print  # NOTE: shadows the built-in print from here on
from spade_v3.optimizer import select_functions

path_name = f"/Users/shreyashankar/Documents/projects/promptdelta/paper_experiments/{dataset_name}/optimizer_input.pkl"

# Replace K with the newly identified one and record each function's source by name.
optimizer_dict["K"] = K_and_pretty_df["K"]
optimizer_dict["spade_functions"] = dict(
    (fn.__name__, inspect.getsource(fn)) for fn in ALL_FUNCTIONS
)

with open(path_name, "wb") as pickle_file:
    pickle.dump(optimizer_dict, pickle_file)

# Dump the human-readable subsumption results as CSV.
K_and_pretty_df["human_readable_results"].to_csv(
    f"/Users/shreyashankar/Documents/projects/promptdelta/paper_experiments/{dataset_name}/subsumption_results.csv",
    index=False,
)

# Solve the optimization problem from the inputs just written to path_name.
optimizer_res = select_functions(path_name, tau=0.25, alpha=0.6)

# One row of summary metrics per selection method.
optimizer_res_df = pd.DataFrame(
    [
        {
            "method": method,
            "ffr": optimizer_res[method]["ffr"],
            "example failure coverage": optimizer_res[method]["coverage"],
            "frac_functions_selected": optimizer_res[method]["frac_functions_selected"],
            "frac_non_subsumed_excluded_functions": optimizer_res[method]["frac_non_subsumed_excluded_functions"],
        }
        for method in ["spade_base", "spade_cov", "spade_sub"]
    ]
)
print(optimizer_res_df)

# Store the optimizer output for later use.
with open(f"/Users/shreyashankar/Documents/projects/promptdelta/paper_experiments/{dataset_name}/optimizer_results.pkl", "wb") as results_file:
    pickle.dump(optimizer_res, results_file)