In [1]:
# This notebook must be run from the repository root folder (the directory that contains `paper_experiments/`).
import pandas as pd
from spade.candidate_gen import generate_candidate_assertions
from spade.execute_assertions import execute_candidate_assertions
from spade.label_results import label_responses, prepare_for_optimizer
from spade.check_subsumes import evaluate_all_subsumes, collate_subsumption_results
from spade.optimizer import select_functions
# from rich import print

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shreyashankar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel. Mode 2 reloads all modules before
# each cell is executed.
%load_ext autoreload
%autoreload 2

In [5]:
# Experiment name; later cells pass it to the assertion runner and use it to
# build output file locations.
dataset_name = "subsumption"

# Three phrasings of the same question for the LLM: is there an input on which
# `{func_b}` returns False while `{func_a}` returns True?  These are plain
# strings with `{...}` placeholders (not f-strings).  Each template is written
# as adjacent string literals, one fragment per paragraph or sentence, which
# Python concatenates into a single string.
prompt_templates = [
    (
        "You are an expert Python programmer and are helping me deduplicate assertions for my LLM pipeline.\n\n"
        "Here is my first function:\n{func_a_src}\n\n"
        "Here is my second function: {func_b_src}\n\n"
        "Is there some input that function `{func_b}` returns False for while function `{func_a}` returns True? "
        "If both functions contain calls to `ask_llm` where the prompts are similar, your answer should be no. "
        "Return your answer as a JSON in ```json ``` markers with keys `answer` (yes or no) and `input` (None if your answer is no)."
    ),
    (
        "You are an expert Python programmer and are helping me remove redundant assertions for my LLM pipeline. "
        "My pipeline prompt template is `{prompt_template}` and an example response={response}.\n\n"
        "Here is my first function:\n{func_a_src}\n\n"
        "Here is my second function: {func_b_src}\n\n"
        "Is there some different response such that function `{func_b}` returns False for while function `{func_a}` returns True? "
        "If both functions contain `ask_llm` calls to check for the same thing, your answer should be no. "
        "Return your answer as a JSON within ```json ``` ticks with keys `answer` (yes or no) and `response` (\"N/A\" if your answer is no)."
    ),
    (
        "You are an expert Python programmer and are helping me remove redundant assertions for my LLM pipeline. "
        "My pipeline prompt template is `{prompt_template}` and an example response={response}.\n\n"
        "Here is my first function:\n{func_a_src}\n\n"
        "Here is my second function: {func_b_src}\n\n"
        "Does the first function imply or not imply the second function? "
        "In other words, is there an example such that function `{func_b}` returns False for while function `{func_a}` returns True? "
        "If both functions contain `ask_llm` calls to check for the same thing, your answer should be no (meaning the first function implies the second). "
        "Return your answer as a JSON within ```json ``` ticks with keys `answer` (yes or no) and `response` (\"N/A\" if your answer is no). "
        "Yes means the first does not imply the second, and no means the first implies the second."
    ),
]

print(f"There are {len(prompt_templates)} prompt templates")

from paper_experiments.subsumption.examples import EXAMPLES

# First entry of the example set; it is passed together with the templates to
# generate_candidate_assertions.
EXAMPLE = EXAMPLES[0]

# The templates are used as-is. This binds a second name to the same list
# object (no copy is made). An earlier empty-list initialisation of this name
# was dropped because it was overwritten before ever being read.
prompt_template_strings = prompt_templates

# Bare last expression so the notebook displays the templates.
prompt_template_strings

There are 3 prompt templates


['You are an expert Python programmer and are helping me deduplicate assertions for my LLM pipeline.\n\nHere is my first function:\n{func_a_src}\n\nHere is my second function: {func_b_src}\n\nIs there some input that function `{func_b}` returns False for while function `{func_a}` returns True? If both functions contain calls to `ask_llm` where the prompts are similar, your answer should be no. Return your answer as a JSON in ```json ``` markers with keys `answer` (yes or no) and `input` (None if your answer is no).',
 'You are an expert Python programmer and are helping me remove redundant assertions for my LLM pipeline. My pipeline prompt template is `{prompt_template}` and an example response={response}.\n\nHere is my first function:\n{func_a_src}\n\nHere is my second function: {func_b_src}\n\nIs there some different response such that function `{func_b}` returns False for while function `{func_a}` returns True? If both functions contain `ask_llm` calls to check for the same thing, y

In [9]:
# Generate candidate assertions from the list of prompt templates and the
# single example EXAMPLE. Uses IPython's top-level `await`, so this line is
# only valid inside a notebook/IPython session (or an async function).
assertions = await generate_candidate_assertions(prompt_template_strings, EXAMPLE)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shreyashankar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Generating assertions...
Prompt diff: --- 

+++ 

@@ -0,0 +1,4 @@

+You are an expert Python programmer and are helping me deduplicate assertions for my LLM pipeline.
+Here is my first function:
{func_a_src}

Here is my second function: {func_b_src}

Is there some input that function `{func_b}` returns False for while function `{func_a}` returns True?
+If both functions contain calls to `ask_llm` where the prompts are similar, your answer should be no.
+Return your answer as a JSON in ```json ``` markers with keys `answer` (yes or no) and `input` (None if your answer is no).
Generating assertions...
Prompt diff: --- 

+++ 

@@ -1,4 +1,5 @@

-You are an expert Python programmer and are helping me deduplicate assertions for my LLM pipeline.
-Here is my first function:
{func_a_src}

Here is my second function: {func_b_src}

Is there some input that function `{func_b}` returns False for while function `{func_a}` returns True?
-If both functions contain calls to `ask_llm` where the prompts 

In [7]:
# Print last prompt template. This is the same element
# (`prompt_template_strings[-1]`) that is handed to
# execute_candidate_assertions further down.
print(prompt_template_strings[-1])

You are an expert Python programmer and are helping me remove redundant assertions for my LLM pipeline. My pipeline prompt template is `{prompt_template}` and an example response={response}.

Here is my first function:
{func_a_src}

Here is my second function: {func_b_src}

Does the first function imply or not imply the second function? In other words, is there an example such that function `{func_b}` returns False for while function `{func_a}` returns True? If both functions contain `ask_llm` calls to check for the same thing, your answer should be no (meaning the first function implies the second). Return your answer as a JSON within ```json ``` ticks with keys `answer` (yes or no) and `response` ("N/A" if your answer is no). Yes means the first does not imply the second, and no means the first implies the second.


In [14]:
from paper_experiments.subsumption.candidate_assertions import ALL_FUNCTIONS
from paper_experiments.subsumption.examples import EXAMPLES


# Hand the dataset name, the last prompt template, the full example set and
# the candidate assertion functions to the executor; the awaited result is
# kept in `res` for labeling.
# NOTE(review): the captured output ("Found cached results") suggests results
# may be cached between runs — confirm in spade.execute_assertions.
res = await execute_candidate_assertions(dataset_name, prompt_template_strings[-1], EXAMPLES, ALL_FUNCTIONS)


959d9287de8ef84c52fe8533f29612c8fc3c2a320f5818ee50bcf28195864f47
f470233bc39f1e72ad15b22cc71229d332e230fa8391b5023ca2ac097324437f
Found cached results
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 second

In [54]:
import os

# Label the executed responses. 0.75 is forwarded unchanged as the second
# positional argument of label_responses.
# NOTE(review): presumably a labeling threshold — confirm in spade.label_results.
# Alternative that is currently disabled: read previously saved labels from
# "labeled_data.csv" in the current working directory instead of relabeling.
label_df = label_responses(res, 0.75)

labels = label_df["label"]
print(labels.mean())

# Print stats
num_good = len(label_df[labels == 1])
num_bad = len(label_df[labels == 0])
print(f"Num candidate functions: {len(ALL_FUNCTIONS)}")
print(f"Num good examples: {num_good}")
print(f"Num bad examples: {num_bad}")

# Combine the raw results with their labels into the dict consumed by the
# optimizer cells below.
optimizer_dict = prepare_for_optimizer(res, label_df)

Found 4 good functions: ['assert_no_direct_code_in_response', 'assert_func_b_func_a_disagreement_check', 'assert_no_ask_llm_calls', 'assert_correct_implication_relationship']
0.7014925373134329
Num candidate functions: 20
Num good examples: 47
Num bad examples: 20


In [55]:
import numpy as np
from paper_experiments.subsumption.labeled_subsumption_pairs import LABELED_SUBSUMPTION_PAIRS

# K is a square matrix of zeros with one row and one column per entry of
# optimizer_dict["cost"]. For every labeled pair, the cell addressed by the
# pair's two indices (looked up in optimizer_dict["func_order"]) is set to 1.
num_functions = len(optimizer_dict["cost"])
func_index = optimizer_dict["func_order"]
K = np.zeros((num_functions, num_functions))

for first_key, second_key in LABELED_SUBSUMPTION_PAIRS:
    row, col = func_index[first_key], func_index[second_key]
    K[row, col] = 1



In [56]:
# Sum of all entries of K. Because K only holds 0s and 1s, this equals the
# number of distinct (row, column) cells that were marked from the labeled pairs.
K.sum()

19.0

In [57]:
# Pickle all the results
import inspect

path_name = f"/Users/shreyashankar/Documents/projects/promptdelta/paper_experiments/{dataset_name}/optimizer_input.pkl"

optimizer_dict["K"] = K
optimizer_dict["spade_functions"] = {func.__name__: inspect.getsource(func) for func in ALL_FUNCTIONS}
import pickle

with open(path_name, "wb") as f:
    pickle.dump(optimizer_dict, f)

In [58]:
import os

# Collate subsumption results from the optimizer inputs: the "M" entry, the
# candidate functions, the function-to-index mapping and the labeled-pair
# matrix K.
subsumption_df = collate_subsumption_results(
    optimizer_dict["M"],
    ALL_FUNCTIONS,
    optimizer_dict["func_order"],
    optimizer_dict["K"],
)

# Write the CSV under paper_experiments/<dataset_name>/ relative to the working
# directory (the notebook is run from the repository root) instead of a
# machine-specific absolute path.
subsumption_df.to_csv(
    os.path.join("paper_experiments", dataset_name, "subsumption_results.csv"),
    index=False,
)

Found 9 subsumptions out of 380 pairs.


In [68]:
# Solve the optimization problem
import os
import pickle

# `path_name` points at the pickled optimizer input written earlier; tau and
# alpha are passed through to select_functions unchanged.
optimizer_res = select_functions(path_name, tau=0.25, alpha=0.6)

# Pickle the results under paper_experiments/<dataset_name>/, relative to the
# working directory (the notebook is run from the repository root) instead of
# a machine-specific absolute path.
optimizer_results_path = os.path.join(
    "paper_experiments", dataset_name, "optimizer_results.pkl"
)
with open(optimizer_results_path, "wb") as f:
    pickle.dump(optimizer_res, f)


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/shreyashankar/miniforge3/envs/promptdelta/lib/python3.10/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/nq/ldkhrrws0xb9whw7b6rpzhc00000gn/T/087f1edea12b4aff952bad8bd968ea6b-pulp.mps timeMode elapsed branch printingOptions all solution /var/folders/nq/ldkhrrws0xb9whw7b6rpzhc00000gn/T/087f1edea12b4aff952bad8bd968ea6b-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 2707 COLUMNS
At line 10497 RHS
At line 13200 BOUNDS
At line 14648 ENDATA
Problem MODEL has 2702 rows, 1447 columns and 4875 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 0.6 - 0.01 seconds
Cgl0002I 572 variables fixed
Cgl0003I 20 fixed, 0 tightened bounds, 0 strengthened rows, 0 substitutions
Cgl0003I 1 fixed, 0 tightened bounds, 0 strengthened rows, 0 substitutions
Cgl0004I processed model has 0 rows, 0 columns (

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/shreyashankar/miniforge3/envs/promptdelta/lib/python3.10/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/nq/ldkhrrws0xb9whw7b6rpzhc00000gn/T/3dea121716ba444e8047a7b054b436ea-pulp.mps timeMode elapsed branch printingOptions all solution /var/folders/nq/ldkhrrws0xb9whw7b6rpzhc00000gn/T/3dea121716ba444e8047a7b054b436ea-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 4727 COLUMNS
At line 18596 RHS
At line 23319 BOUNDS
At line 25607 ENDATA
Problem MODEL has 4722 rows, 2287 columns and 9274 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 8 - 0.01 seconds
Cgl0002I 984 variables fixed
Cgl0003I 356 fixed, 0 tightened bounds, 286 strengthened rows, 909 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 2 strengthened rows, 177 substitutions
Cgl0004I processed model has 21 rows, 9 col

In [63]:
# Visualize performance

# Turn into DF
import pandas as pd

# Collect one record per selection method under its own name, so that
# `optimizer_res_df` only ever refers to the final DataFrame (it was previously
# bound to a list first and then rebound to a DataFrame).
method_records = []
for method in ["spade_base", "spade_cov", "spade_sub"]:
    method_res = optimizer_res[method]  # look the method's metrics up once
    method_records.append(
        {
            "method": method,
            "ffr": method_res["ffr"],
            "example failure coverage": method_res["coverage"],
            "frac_functions_selected": method_res["frac_functions_selected"],
            "frac_non_subsumed_excluded_functions": method_res["frac_non_subsumed_excluded_functions"],
        }
    )

optimizer_res_df = pd.DataFrame(method_records)

# Bare last expression: the notebook renders the frame as a table instead of
# the line-wrapped plain text that print() produces.
optimizer_res_df


       method  ffr  example failure coverage  frac_functions_selected  \
0  spade_base  0.0                       0.8                     0.20   
1   spade_cov  0.0                       0.7                     0.05   
2   spade_sub  0.0                       0.8                     0.20   

   frac_non_subsumed_excluded_functions  
0                                  0.00  
1                                  0.15  
2                                  0.00  
