# 1. Populating Annotation File

In [5]:
from populate_eval import evaluate_insights
import tqdm, json

# Benchmark file that is both read from and written back to by this cell.
anno_fn = "data/eval_benchmark_data.json"

# Active evaluation configuration: the key fragment used in the stored
# prediction field name, and the prompt file handed to the evaluator.
prompt_key, eval_prompt_fn = "prompted", "prompts/eval_summhay.txt"
# prompt_key, eval_prompt_fn = "9fs", "prompts/eval_summhay_9fs.txt"

with open(anno_fn, "r") as f:
    anno_dataset = json.load(f)

# Maps each prediction key to the cost returned by the most recent
# evaluate_insights call for that key (every new call overwrites the entry).
cost_per_model = {}

for eval_model in ["gpt-4o", "claude3-opus", "gemini-1.5-pro", "gpt3.5", "claude3-haiku"]:
    eval_key = f"predictions_{prompt_key}_{eval_model}"

    for sample in tqdm.tqdm(anno_dataset, desc=eval_model):
        # Samples that already carry this prediction are left untouched, so
        # re-running the cell only fills in what is missing.
        if eval_key not in sample:
            prediction, cost = evaluate_insights(
                sample["reference_insights"],
                sample["summary"],
                eval_model,
                eval_prompt_fn=eval_prompt_fn,
                return_cost=True,
            )
            sample[eval_key] = prediction
            cost_per_model[eval_key] = cost

            # Write the whole dataset back after every new prediction so that
            # completed evaluations are on disk before the next call is made.
            with open(anno_fn, "w") as f:
                json.dump(anno_dataset, f, indent=2)

# print(cost_per_model)

gpt-4o: 100%|██████████| 200/200 [00:00<00:00, 716362.77it/s]
claude3-opus: 100%|██████████| 200/200 [00:00<00:00, 678689.97it/s]
gemini-1.5-pro: 100%|██████████| 200/200 [00:00<00:00, 763989.80it/s]
gpt3.5: 100%|██████████| 200/200 [00:00<00:00, 750994.45it/s]
claude3-haiku: 100%|██████████| 200/200 [00:00<00:00, 680893.51it/s]

{}





# 2. Auto-Evaluation Benchmark

In [17]:
import json, pandas as pd, numpy as np
from IPython.display import display

# Annotated benchmark file; loaded into `annotated_data` further down in this cell.
anno_fn = "data/eval_benchmark_data.json"

def map_to_score(cov_label):
    """Translate a coverage label into its numeric score.

    The comparison is case-insensitive. Both spellings of each label are
    accepted: "partial_coverage"/"partially_covered" -> 0.5,
    "full_coverage"/"fully_covered" -> 1, "no_coverage"/"not_covered" -> 0.

    Returns None when the label is not one of the recognised spellings.
    """
    score_by_label = {
        "partial_coverage": 0.5,
        "partially_covered": 0.5,
        "full_coverage": 1,
        "fully_covered": 1,
        "no_coverage": 0,
        "not_covered": 0,
    }
    # dict.get falls back to None for unknown labels.
    return score_by_label.get(cov_label.lower())

# Load the annotated benchmark samples.
with open(anno_fn, "r") as f:
    annotated_data = json.load(f)

# The prediction fields are discovered on the last sample only; every sample
# is then indexed with these keys below.
prediction_keys = [name for name in annotated_data[-1] if name.startswith("prediction")]
cost_per_sample = {
    'predictions_prompted_gpt-4o': 0.03435,
    'predictions_prompted_claude3-haiku': 0.0019875,
    'predictions_prompted_claude3-opus': 0.11895,
    'predictions_prompted_gemini-1.5-pro': 0.075815,
    'predictions_prompted_gpt3.5': 0.006682,
    'predictions_9fs_gpt-4o': 0.13063503,
}

# Flat, insight-aligned score lists (one entry per annotated insight) and the
# (summkey, insight_id) -> linked id maps for annotations and predictions.
labels = []
label_links = {}
predictions = {}
pred_links = {}
for key in prediction_keys:
    predictions[key] = []
    pred_links[key] = {}

for d in annotated_data:
    insight2label = {anno["insight_id"]: anno["coverage"] for anno in d["annotation"]}
    for anno in d["annotation"]:
        label_links[(d["summkey"], anno["insight_id"])] = anno["candidate_id"]
    labels.extend(map_to_score(coverage) for coverage in insight2label.values())

    for key in prediction_keys:
        insight2pred = {pred["insight_id"]: pred["coverage"] for pred in d[key]}
        for pred in d[key]:
            pred_links[key][(d["summkey"], pred["insight_id"])] = pred["bullet_id"]
        # Walk the annotated insights so predictions stay aligned with `labels`.
        predictions[key].extend(map_to_score(insight2pred[insight_id]) for insight_id in insight2label)

# Group the samples by subtopic.
stid2data = {}
for d in annotated_data:
    stid2data.setdefault(d["subtopic_id"], []).append(d)

# Per-subtopic summed coverage scores.
# NOTE(review): anno_scores / pred_scores are rebuilt for every subtopic and
# are not read by the rest of this cell.
for stid, samples in stid2data.items():
    anno_scores = []
    pred_scores = {}
    for sample in samples:
        anno_scores.append(sum(map_to_score(a["coverage"]) for a in sample["annotation"]))

        for k in sample:
            if k not in prediction_keys:
                continue
            pred_scores.setdefault(k, []).append(sum(map_to_score(a["coverage"]) for a in sample[k]))

results = []
for key in prediction_keys:
    corrcoef = np.corrcoef(labels, predictions[key])

    # Linking accuracy: the predicted bullet_id minus one is compared with the
    # annotated candidate_id. Entries are skipped when the prediction is not
    # exactly an int or the annotation is "no_selection".
    links_for_key = pred_links[key]
    linking_vals = []
    for link_id, candidate in label_links.items():
        bullet = links_for_key[link_id]
        if type(bullet) is int and candidate != "no_selection":
            linking_vals.append(bullet - 1 == int(candidate))
    acc_linking = np.mean(linking_vals)

    # Keys look like "predictions_<prompt>_<model>".
    prompt, model = key.replace("predictions_", "").split("_")
    model_name = f"{model}-9fs" if "9fs" in prompt else model

    total_cost = cost_per_sample[key] * len(annotated_data)

    results.append({
        "Model": model_name,
        "Instance-Level Corr Coeff": corrcoef[0, 1],
        "Accuracy (linking)": acc_linking,
        "Cost": total_cost,
    })

# Render the summary table, best instance-level correlation first.
results_table = pd.DataFrame(results).round(3)
results_table = results_table.sort_values("Instance-Level Corr Coeff", ascending=False).set_index("Model")
styled_table = results_table.style.set_caption("Results for Insight Coverage Prediction").format({
    "Cost": "${:,.2f}".format,
    "Instance-Level Corr Coeff": "{:.3f}".format,
    "Accuracy (linking)": "{:.3f}".format,
})
display(styled_table)

Unnamed: 0_level_0,Instance-Level Corr Coeff,Accuracy (linking),Cost
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gemini-1.5-pro,0.751,0.893,$15.16
gpt-4o-9fs,0.719,0.892,$26.13
gpt-4o,0.716,0.889,$6.87
claude3-opus,0.677,0.879,$23.79
claude3-haiku,0.498,0.877,$0.40
gpt3.5,0.495,0.867,$1.34


# 3. Auto-Eval Bias Analysis

In [13]:
import numpy as np, pandas as pd, re

# Banner explaining how the "delta" values reported by this cell are defined.
print("Delta = (Average of Predicted Coverage - Average of Annotated Coverage)")

# Accumulators filled by the analysis loop in this cell:
#   autoeval_bias_results: summarizing model -> result row
#   length_delta_results:  evaluator model -> list of (score delta, length)
#   length_score_results:  evaluator model -> list of (mean predicted score, length)
autoeval_bias_results, length_delta_results, length_score_results = {}, {}, {}

def calc_num_words(summary_bullets):
    """Return the total number of words across all summary bullets.

    A word is any maximal run of non-whitespace characters.

    The previous implementation joined the bullets with a single newline and
    then counted spaces. The newline between two bullets was never counted as
    a word boundary, so the result was short by one for every bullet boundary,
    an empty list was reported as one word, and leading or trailing spaces
    inflated the count. Splitting on arbitrary whitespace avoids all three.

    Args:
        summary_bullets: iterable of bullet strings.

    Returns:
        The word count as an int (0 for an empty iterable or blank bullets).
    """
    summary = "\n".join(summary_bullets)
    # str.split() with no argument splits on any whitespace run and drops
    # empty strings, so newlines, tabs and repeated spaces are all handled.
    return len(summary.split())

def _pair_correlation(pairs):
    """Return the Pearson correlation between the two columns of (value, length) pairs."""
    values = [value for value, _ in pairs]
    lengths = [length for _, length in pairs]
    return np.corrcoef(values, lengths)[0, 1]


def _mean_delta(row):
    """Return the mean of every evaluator column in a result row."""
    return np.mean([v for k, v in row.items() if k != "Summarizing Model"])


for key in ["predictions_prompted_gpt-4o", "predictions_prompted_claude3-opus", "predictions_prompted_gemini-1.5-pro"]:
    # The evaluator name is the last "_"-separated piece of the prediction key.
    predictor_model = key.split("_")[-1]
    length_delta_results[predictor_model] = []
    length_score_results[predictor_model] = []
    gen_model_deltas = {}

    for d in annotated_data:
        insight2label = {anno["insight_id"]: anno["coverage"] for anno in d["annotation"]}
        insight2pred = {pred["insight_id"]: pred["coverage"] for pred in d[key]}

        # Scores are aligned on the annotated insights.
        labels = [map_to_score(insight2label[insight_id]) for insight_id in insight2label]
        predictions = [map_to_score(insight2pred[insight_id]) for insight_id in insight2label]

        # The summarizing model is the last "_"-separated piece of the summary key.
        gen_model = d["summkey"].split("_")[-1]

        num_words = calc_num_words(d["summary"]) / len(d["summary"]) # per-bullet length normalization
        mean_prediction = np.mean(predictions)
        score_delta = mean_prediction - np.mean(labels)

        gen_model_deltas.setdefault(gen_model, []).append(score_delta)
        length_delta_results[predictor_model].append((score_delta, num_words))
        length_score_results[predictor_model].append((mean_prediction, num_words))

    # Collapse the per-sample deltas into one mean per summarizing model and
    # store it in that model's row under the current evaluator's column.
    gen_model_deltas = {model: np.mean(deltas) for model, deltas in gen_model_deltas.items()}
    for summarizing_model, mean_delta in gen_model_deltas.items():
        row = autoeval_bias_results.setdefault(summarizing_model, {"Summarizing Model": summarizing_model})
        row[predictor_model] = mean_delta

# sort by average of deltas
results = list(autoeval_bias_results.values())
results.sort(key=_mean_delta)

# Let's add a row for the average deltas
average_row = {"Summarizing Model": "Avg. Summarizer Bias"}
average_row.update({
    column: np.mean([r[column] for r in results])
    for column in results[0]
    if column != "Summarizing Model"
})
results.append(average_row)

# add the length bias, which is a correlation
length_bias_row = {model: _pair_correlation(pairs) for model, pairs in length_delta_results.items()}
length_bias_row["Summarizing Model"] = "Length <-> Delta (Correlation)"
results.append(length_bias_row)

length_score_bias_row = {model: _pair_correlation(pairs) for model, pairs in length_score_results.items()}
length_score_bias_row["Summarizing Model"] = "Length <-> Score (Correlation)"
results.append(length_score_bias_row)

# results.set_index("Summarizing Model", inplace=True)
results = pd.DataFrame(results).round(3)
display(results)

Delta = (Average of Predicted Coverage - Average of Annotated Coverage)


Unnamed: 0,Summarizing Model,gpt-4o,claude3-opus,gemini-1.5-pro
0,claude3-sonnet,0.027,-0.001,-0.012
1,gemini-1.5-flash,0.032,0.009,-0.025
2,claude3-opus,0.04,0.011,0.019
3,gpt3.5,0.009,0.05,0.048
4,gemini-1.5-pro,0.057,0.042,0.041
5,gpt4-turbo,0.063,0.08,0.042
6,claude3-haiku,0.057,0.088,0.052
7,gpt-4o,0.07,0.078,0.08
8,command-r-plus,0.064,0.128,0.071
9,Avg. Summarizer Bias,0.047,0.054,0.035
