In [1]:
# This notebook needs to be run in the root folder.
import pandas as pd
from spade.candidate_gen import generate_candidate_assertions
from spade.execute_assertions import execute_candidate_assertions
from spade.label_results import prepare_for_optimizer
from spade.check_subsumes import evaluate_all_subsumes, collate_subsumption_results
from spade.optimizer import select_functions
from rich import print

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shreyashankar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell executes.
%load_ext autoreload
%autoreload 2

In [3]:
# Experiment identifier; interpolated into output file paths elsewhere in this notebook.
dataset_name = "product"
from evalgen_experiments.product.prompt_templates import TEMPLATES

# Alias for the imported templates; this is the name the rest of the notebook uses.
prompt_template_strings = TEMPLATES
# One example record with a single "document" field (product metadata followed by review text).
EXAMPLE = {
    "document": """Product ID: B000NWD020
Title: Yardley of London Moisturizing Soap Sweet Summer Aloe and Avocado 3+1
Average Rating: 4.1
Number of Ratings: 19
Features: ['Fresh Aloe & Avocado with Creamy Avocado & Olive Oil Extracts', 'The actual product may be different than product image']
Description: ['INDICATIONS: A light, refreshing scent of aloe with a hint of sweet cucumber. This gentle soap contains aloe vera to help soothe and heal the skin. Not tested on animals.']
Price: 12.45
Store: Yardley
Details: {""Item Form"": ""Cream"", ""Skin Type"": ""Dry"", ""Brand"": ""Yardley"", ""Scent"": ""Cucumber,Avocado"", ""Age Range (Description)"": ""Senior,Adult,Tween,Teen,Child"", ""Is Discontinued By Manufacturer"": ""No"", ""Item model number"": ""LORNAMEAD866673"", ""UPC"": ""041840873713"", ""Manufacturer"": ""Yardley""}
Review Text: My daughter uses this soap for my grandson's acute excema!  It works wonders!  It clears his skin right up, and it smells good too!  LOVE THIS SOAP! Typical of Yardley, a nicely scented soap that I look forward to using.  I would buy it, or any Yardley soap again (love the lemon and lavender scents too,)  They make me feel refreshed. Great soap with light scent. Decent price. Smells wonderful, great as a skin moisturizer.  Shipped QUICKLY and the price was definitely right.  I will buy this again. I will never order this product again!! This soap makes my skin dry out!!! It smells great though, but smell is not everything. Great product I was disappointed to see them sold in my local dollar store. *sigh* I was disappointed to see them sold in my local dollar store. *sigh*"""
}

# Show the templates and how many there are.
print(prompt_template_strings)
print(f"There are {len(prompt_template_strings)} templates.")

In [4]:
# Generate candidate assertions from the prompt templates and the single example
# document. Uses top-level await, so this must run inside IPython/Jupyter.
assertions = await generate_candidate_assertions(prompt_template_strings, EXAMPLE)

Generating assertions...
Prompt diff: --- 

+++ 

@@ -0,0 +1,10 @@

+You are an expert copywriter.
+You need to write an e-commerce product description based on the product details and customer reviews.
+Your description should be SEO-optimized.
+It should use an active voice and include the product's features, benefits, unique selling points without overpromising, and a call to action for the buyer.
+Benefits describe how product features will work for the buyer, addressing exactly how the product will improve their lives.
+Clearly distinguish between features (e.g., lightweight, USB-chargeable) and benefits (e.g., convenience, nutritious drinks on-the-go).
+Don't mention weaknesses of the product or use generic or repetitive language.
+Divide your description into readable chunks divided by relevant subheadings.
+Keep your description around 200 words, in Markdown format.
+{document}
Generating assertions...
Prompt diff: --- 

+++ 

@@ -5,6 +5,9 @@

 Benefits describe how product fea

In [5]:
# Prepend a shared `ask_llm` helper (kept here as source text) to the generated assertion code.

# Source text of an async `ask_llm(response, question)` helper. It sends the response and the
# question to litellm's `acompletion` and returns (prompt_tokens, completion_tokens, bool),
# where the bool is True when the lower-cased reply contains "yes".
beginning = """from litellm import acompletion


async def ask_llm(response, question):
    # Placeholder for asking an expert a true/false question
    # In practice, this would involve a complex implementation potentially requiring human input

    messages = [
        {
            "content": f"You are a helpful assistant. Here is a response you will be evaluating:\\n{response}",
            "role": "system",
        },
        {
            "content": f"{question}\\nOnly answer yes or no.",
            "role": "user",
        },
    ]

    response = await acompletion(
        model="azure/gpt-35-turbo",
        messages=messages,
    )

    # get the cost
    completion_tokens = response["usage"]["completion_tokens"]
    prompt_tokens = response["usage"]["prompt_tokens"]

    # get the response
    reply = response["choices"][0]["message"]["content"]

    if "yes" in reply.lower():
        return prompt_tokens, completion_tokens, True

    return prompt_tokens, completion_tokens, False
"""

# NOTE(review): assumes assertions[1][0] is the generated assertion source text — confirm
# against the return value of generate_candidate_assertions.
assertion_text = beginning + assertions[1][0]

In [6]:
# Write the assembled assertion source to candidate_assertions.py inside the
# dataset's experiment folder.
# NOTE(review): the folder is a hard-coded absolute path on the author's machine.

prefix = f"/Users/shreyashankar/Documents/projects/spade-experiments/evalgen_experiments/{dataset_name}"
candidate_path = f"{prefix}/candidate_assertions.py"

with open(candidate_path, "w") as out_file:
    out_file.write(assertion_text)

In [4]:
# Build `EXAMPLES` (one dict per graded row) and `reply_df`, a frame with the
# columns prompt, response, model, example.
import os

import pandas as pd

# The graded dataset is read from the current working directory.
curr_dir = os.getcwd()
df = pd.read_csv(f"{curr_dir}/dataset_graded.csv")
record_list = df.to_dict(orient="records")

# One entry per CSV row: the document text plus its id.
EXAMPLES = [
    {
        "document": row["Var: document"],
        "id": row["Metavar: id"],
    }
    for row in record_list
]

# Derive the reply frame in one chain: add the constant model name and the
# stringified example dict, drop the raw document column, then lower-case the
# two remaining source column names.
reply_df = (
    df[["Prompt", "Var: document", "Response"]]
    .assign(
        model="gpt-3",
        example=lambda frame: frame["Var: document"].apply(
            lambda doc: str({"document": doc})
        ),
    )
    .drop(columns=["Var: document"])
    .rename(columns={"Prompt": "prompt", "Response": "response"})
)
reply_df.head()

Unnamed: 0,prompt,response,model,example
0,You are an expert copywriter. You need to writ...,# Yardley of London Moisturizing Soap Sweet Su...,gpt-3,{'document': '\nProduct ID: B000NWD020\nTitle:...
1,You are an expert copywriter. You need to writ...,# Magick Botanicals Conditioner for Thinning H...,gpt-3,{'document': '\nProduct ID: B0011DN60Q\nTitle:...
2,You are an expert copywriter. You need to writ...,"# Supernail China Silk Wrap, 72 Inch\n\n## Enh...",gpt-3,{'document': '\nProduct ID: B001MP47WE\nTitle:...
3,You are an expert copywriter. You need to writ...,# Stage Blood 4 oz\n\n## Realistic Stage Blood...,gpt-3,{'document': '\nProduct ID: B002EV3JBU\nTitle:...
4,You are an expert copywriter. You need to writ...,# MERMAID Vegetable Glycerin Bar Soap\n\n**Pri...,gpt-3,{'document': '\nProduct ID: B00355HFQE\nTitle:...


In [5]:
# Candidate assertion functions come from the experiment's candidate_assertions module.
from evalgen_experiments.product.candidate_assertions import ALL_FUNCTIONS

# Execute the candidate assertions, passing only the last prompt template together with
# the examples, the candidate functions and the prepared reply frame. Uses top-level await.
res = await execute_candidate_assertions(dataset_name, prompt_template_strings[-1], EXAMPLES, ALL_FUNCTIONS, reply_df=reply_df)


082e9b8b136b215a0e98a3a78e867789f01711fc94f3e8d10e76a8ff1a01cedc
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 seconds
Sleeping for 2 

In [6]:
# Pair every graded response with its grade.
label_df = pd.DataFrame(
    {
        "response": df["Response"].values,
        "label": df["grade"].values,
    }
)

# Summary counts: a label of 1 is reported as "good", 0 as "bad".
num_good_examples = int(label_df["label"].eq(1).sum())
num_bad_examples = int(label_df["label"].eq(0).sum())
print(f"Num candidate functions: {len(ALL_FUNCTIONS)}")
print(f"Num good examples: {num_good_examples}")
print(f"Num bad examples: {num_bad_examples}")

# Combine the assertion execution results with the labels for the optimizer.
optimizer_dict = prepare_for_optimizer(res, label_df)

In [7]:
# Persist the optimizer inputs, plus the source of every candidate function, to a pickle file.
import inspect
import pickle

path_name = f"/Users/shreyashankar/Documents/projects/spade-experiments/evalgen_experiments/{dataset_name}/optimizer_input.pkl"

# optimizer_dict["K"] = K
# Map each candidate function's name to its source text.
function_sources = {fn.__name__: inspect.getsource(fn) for fn in ALL_FUNCTIONS}
optimizer_dict["spade_functions"] = function_sources

with open(path_name, "wb") as pickle_file:
    pickle.dump(optimizer_dict, pickle_file)

## Evaluating Subsumption With High Precision

In [8]:
from spade.check_subsumes import identify_subsumption_pairs
from evalgen_experiments.product.candidate_assertions import ALL_FUNCTIONS
import pickle

# Reload the optimizer inputs that were pickled earlier.
# NOTE(review): pickle.load executes arbitrary code from the file — only load pickles you produced yourself.
dataset_name = "product"
path_name = f"/Users/shreyashankar/Documents/projects/spade-experiments/evalgen_experiments/{dataset_name}/optimizer_input.pkl"
with open(path_name, "rb") as f:
    optimizer_dict = pickle.load(f)
# Identify subsumption pairs from the "M" entry, the candidate functions and their ordering.
# Uses top-level await; the result is indexed by "K" and "human_readable_results" in this notebook.
K_and_pretty_df = await identify_subsumption_pairs(optimizer_dict["M"], ALL_FUNCTIONS, optimizer_dict["func_order"])

# Sum of K divided by its number of entries (rows x columns); shown as the cell output.
K_and_pretty_df["K"].sum() / (K_and_pretty_df["K"].shape[0] * K_and_pretty_df["K"].shape[1])

Based on the descriptions and functionality of the assertion functions provided, the following pairs of functions are redundant or one implies the other:

1. `assert_approx_200_words(response: str)` and `assert_proper_length(response: str)`:
- `assert_proper_length` implies `assert_approx_200_words` because `assert_proper_length` checks for 200 to 300 words which includes the range of "approximately 200" words (180 to 220) that `assert_approx_200_words` checks for. However, the reverse is not true because `assert_proper_length` also allows up to 300 words.

2. `assert_markdown_format_a(response: str)` and `assert_markdown_formatting_b(response: str)`:
- Not all Markdown features are checked similarly, but `assert_markdown_formatting_b` requires the response to start with a header `# ` and have at least one subheader, these are also criteria in `assert_markdown_format_a`. This means if `assert_markdown_formatting_b` returns True, there must be at least one header and one subheader, so p

0.0175

In [9]:
# Store K in the optimizer inputs, re-pickle them, export the readable subsumption
# results, run the optimizer, and tabulate its per-method metrics.
import inspect
import pickle

import pandas as pd
from spade.optimizer import select_functions

path_name = f"/Users/shreyashankar/Documents/projects/spade-experiments/evalgen_experiments/{dataset_name}/optimizer_input.pkl"

# Add K and refresh the name -> source mapping of the candidate functions.
optimizer_dict["K"] = K_and_pretty_df["K"]
function_sources = {fn.__name__: inspect.getsource(fn) for fn in ALL_FUNCTIONS}
optimizer_dict["spade_functions"] = function_sources

with open(path_name, "wb") as pickle_file:
    pickle.dump(optimizer_dict, pickle_file)

# Export the human-readable subsumption results next to the pickle.
subsumption_frame = K_and_pretty_df["human_readable_results"]
subsumption_frame.to_csv(f"/Users/shreyashankar/Documents/projects/spade-experiments/evalgen_experiments/{dataset_name}/subsumption_results.csv", index=False)

# Solve the optimization problem from the pickled inputs.
optimizer_res = select_functions(path_name, tau=0.11, alpha=0.2)

# One summary row per method, then a DataFrame for display.
summary_rows = [
    {
        "method": method_name,
        "ffr": optimizer_res[method_name]["ffr"],
        "example failure coverage": optimizer_res[method_name]["coverage"],
        "frac_functions_selected": optimizer_res[method_name]["frac_functions_selected"],
        "frac_non_subsumed_excluded_functions": optimizer_res[method_name]["frac_non_subsumed_excluded_functions"],
    }
    for method_name in ("spade_base", "spade_cov", "spade_sub")
]

optimizer_res_df = pd.DataFrame(summary_rows)
print(optimizer_res_df)

# Pickle the results
# import pickle
# with open(f"/Users/shreyashankar/Documents/projects/promptdelta/paper_experiments/{dataset_name}/optimizer_results.pkl", "wb") as f:
#     pickle.dump(optimizer_res, f)

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/shreyashankar/miniforge3/envs/promptdelta/lib/python3.10/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/nq/ldkhrrws0xb9whw7b6rpzhc00000gn/T/4afdd655646f4e3e97519de262ded34d-pulp.mps timeMode elapsed branch printingOptions all solution /var/folders/nq/ldkhrrws0xb9whw7b6rpzhc00000gn/T/4afdd655646f4e3e97519de262ded34d-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 4056 COLUMNS
At line 15415 RHS
At line 19467 BOUNDS
At line 21637 ENDATA
Problem MODEL has 4051 rows, 2169 columns and 7000 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 0.2 - 0.01 seconds
Cgl0002I 1149 variables fixed
Cgl0003I 46 fixed, 0 tightened bounds, 48 strengthened rows, 0 substitutions
Cgl0003I 57 fixed, 0 tightened bounds, 0 strengthened rows, 0 substitutions
Cgl0003I 13 fixed, 0 tightened bounds, 0 stre

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/shreyashankar/miniforge3/envs/promptdelta/lib/python3.10/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/nq/ldkhrrws0xb9whw7b6rpzhc00000gn/T/9ab1f9ac6f7549d28f00d6bb9b9bb75a-pulp.mps timeMode elapsed branch printingOptions all solution /var/folders/nq/ldkhrrws0xb9whw7b6rpzhc00000gn/T/9ab1f9ac6f7549d28f00d6bb9b9bb75a-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 4916 COLUMNS
At line 18862 RHS
At line 23774 BOUNDS
At line 26384 ENDATA
Problem MODEL has 4911 rows, 2609 columns and 8707 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 12.7146 - 0.01 seconds
Cgl0002I 1555 variables fixed
Cgl0003I 51 fixed, 0 tightened bounds, 0 strengthened rows, 4 substitutions
Cgl0003I 2 fixed, 0 tightened bounds, 0 strengthened rows, 0 substitutions
Cgl0003I 4 fixed, 0 tightened bounds, 0 str



In [10]:
import builtins

from rich import print as rprint

# For every optimizer result whose key contains "spade", show the key, the
# names of the selected functions, and the stored source of each one.
for k, v in optimizer_res.items():
    if "spade" in str(k):
        rprint(k)
        rprint(v["selected_function_names"])

        # Print the functions.
        # The bare name `print` is rebound to rich's print by the notebook's
        # first import cell, and rich parses square-bracket tags in strings as
        # console markup. Source code is full of brackets (e.g. `words[i]`),
        # which rich would drop or reject, so emit the source verbatim with
        # the built-in print instead.
        for func in v["selected_function_names"]:
            builtins.print(optimizer_dict["spade_functions"][func])

        rprint("\n\n\n")