In [None]:
from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
from prompts import VERIFICATION_SYSTEM_PROMPT, VERIFICATION_USER_PROMPT
from anthropic.types.messages.batch_create_params import Request
from pathlib import Path
import pandas as pd
import anthropic
import json
import re

# Shared Anthropic API client; used later in this notebook to submit the message batch.
# No arguments are passed here, so the SDK's own defaults apply
# (presumably the API key is read from the environment — confirm).
client = anthropic.Anthropic()

# Run once to save questions for eval

In [None]:
# Draw a seeded sample from each hop label (40 / 30 / 30 rows) and save the
# combined set as the evaluation questions file.
# NOTE(review): `all_questions_df` is not defined in this cell; it must already
# exist in the kernel before this cell runs.
hop_sample_sizes = (
    ("DDI_Bio_1_hop", 40),
    ("DDI_Bio_2_hop", 30),
    ("DDI_Bio_3_hop", 30),
)

one_hop, two_hop, three_hop = (
    all_questions_df.loc[all_questions_df["Label"] == hop_label].sample(n=sample_size, random_state=42)
    for hop_label, sample_size in hop_sample_sizes
)

eval_questions = pd.concat([one_hop, two_hop, three_hop])
eval_questions.to_csv("../data/Eval/questions_for_Paul.csv", index=False)

# Automatic (Claude) Eval

In [None]:
def create_batch_request(df):
    """Build one batch request per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows are read via ``itertuples()`` and must expose the attributes
        ``Entities``, ``Question_Background``, ``Question`` and ``Answer``.

    Returns
    -------
    list
        One ``Request`` per row, in row order. Each request carries the shared
        verification system prompt (marked for ephemeral caching) and a user
        message formatted from the row's background, question and answer.
    """
    requests = []
    for row in df.itertuples():
        # Spaces are replaced with hyphens to satisfy the API's custom_id naming
        # convention. Built by concatenation rather than an f-string with nested
        # double quotes, which is a SyntaxError on Python < 3.12.
        custom_id = row.Entities.replace(" ", "-") + "-Evaluation"

        user_prompt = VERIFICATION_USER_PROMPT.format(row.Question_Background, row.Question, row.Answer)

        requests.append(Request(
            custom_id=custom_id,
            params=MessageCreateParamsNonStreaming(
                model="claude-3-7-sonnet-20250219",
                max_tokens=1024,
                system=[{"type": "text", "text": VERIFICATION_SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
                messages=[{"role": "user", "content": user_prompt}])))
    return requests

# Turn the saved evaluation questions into batch requests and submit them.
# This call is unconditional: every run of the cell creates a new batch.
saved_questions = pd.read_csv("../data/Eval/questions_for_Paul.csv")
formatted_requests = create_batch_request(saved_questions)

message_batch = client.messages.batches.create(
    requests=formatted_requests,
)
print(message_batch)

# Claude response processing

In [None]:
# Load the batch results file: one JSON document per line.
# The encoding is set explicitly so decoding does not depend on the platform's
# locale default, and blank lines are skipped because json.loads rejects them.
# NOTE(review): the "calude" spelling in the file name is left untouched so the
# path keeps pointing at the existing file — rename both together if desired.
with Path("../data/Eval/calude_eval_results.jsonl").open("r", encoding="utf-8") as file:
    claude_responses = [json.loads(line) for line in file if line.strip()]

def parse_metric(metric, text):
    """Extract the reasoning and integer score for one metric from ``text``.

    The text is searched for ``"<metric> REASONING:"`` followed (lazily, and
    across newlines) by ``"<metric> SCORE: <digits>"``.

    Parameters
    ----------
    metric : str
        Literal metric label, e.g. ``"CLARITY"``. It is escaped before being
        placed in the pattern, so regex metacharacters are matched literally.
    text : str
        Response text to search.

    Returns
    -------
    tuple[str, int]
        ``(reason, score)`` where ``reason`` is the stripped text between the
        two markers and ``score`` is the full run of digits after ``SCORE:``.

    Raises
    ------
    ValueError
        If the REASONING/SCORE pair for ``metric`` is not present in ``text``.
    """
    label = re.escape(metric)
    # \d+ (not \d) so a multi-digit score such as "10" is not truncated to "1".
    match = re.search(rf"{label} REASONING:(.*?){label} SCORE:\s*(\d+)", text, re.DOTALL)
    if match is None:
        # Fail with a message that names the metric instead of an
        # AttributeError on None.
        raise ValueError(f"Could not find '{metric} REASONING:' / '{metric} SCORE:' in response text")
    reason = match.group(1).strip()
    score = int(match.group(2))
    return (reason, score)

# Flatten every Claude response into one row: the request ID followed by a
# (reason, score) pair for each metric, in the order listed below.
metric_names = ("CLARITY", "COVERAGE", "ASSUMPTIONS", "INFERABLE")

all_response_fields = []
for response in claude_responses:
    request_id = response["custom_id"]
    # NOTE(review): assumes every entry has result.message.content[0].text — confirm
    # that the results file contains no entries without a message.
    raw_text = response["result"]["message"]["content"][0]["text"]

    # Strip markdown header/bold markers so the plain "X REASONING:" / "X SCORE:"
    # markers can be matched.
    cleaned_text = re.sub(r'[*#]+', '', raw_text)

    row = [request_id]
    for metric_name in metric_names:
        row.extend(parse_metric(metric_name, cleaned_text))
    all_response_fields.append(tuple(row))

parsed_columns = [
    "ID",
    "clarity_reason", "clarity_score",
    "coverage_reason", "coverage_score",
    "assumptions_reason", "assumptions_score",
    "inferable_reason", "inferable_score",
]
parsed_frame = pd.DataFrame(data=all_response_fields, columns=parsed_columns)
parsed_frame.to_csv("../data/Eval/claude_eval_parsed.csv", index=False)

# Comparing the human (Paul) and Claude evaluations

In [None]:
# Question metadata for the sampled evaluation set.
original_set = pd.read_csv("../data/Eval/questions_for_Paul.csv")[["Entities", "Question", "Answer", "Label"]]

# Human evaluation sheet; its own "Question" column is dropped before joining.
pauls_eval = pd.read_csv("../data/Eval/BioMolMQA - Evaluation.csv").drop(columns="Question")

# NOTE(review): the column-wise concat pairs rows by index, so both files are
# presumably in the same row order — confirm.
side_by_side = pd.concat([original_set, pauls_eval], axis=1)
# Rebuild the same ID string that was used as custom_id in the batch requests.
side_by_side["Entities"] = side_by_side["Entities"].apply(
    lambda entities: entities.replace(" ", "-") + "-Evaluation"
)
pauls_eval_extended = side_by_side.rename(columns={"Entities": "ID"})

claudes_eval = pd.read_csv("../data/Eval/claude_eval_parsed.csv")

# merge is called without `on`, so the join keys are whatever column names the
# two frames share — presumably just "ID"; confirm.
merged_df = pauls_eval_extended.merge(claudes_eval)

In [None]:
def calc_average(df, annotator):
    """Print the mean of each of the four score columns for one annotator.

    When ``annotator`` equals ``"human"`` the upper-case columns are used and
    lines are prefixed with ``HUMAN``; any other value selects the ``*_score``
    columns and the ``LLM`` prefix. Nothing is returned.
    """
    if annotator == "human":
        prefix = "HUMAN"
        score_columns = ("CLARITY", "COVERAGE", "ASSUMPTIONS", "INFERABLE")
    else:
        prefix = "LLM"
        score_columns = ("clarity_score", "coverage_score", "assumptions_score", "inferable_score")

    for column in score_columns:
        print(f"{prefix} {column}: {df[column].mean()}")

# Report the per-metric means for both annotators. The calls are separate
# statements because calc_average only prints; combining them in a tuple
# expression makes the cell echo a meaningless (None, None) as its output.
calc_average(merged_df, "human")
calc_average(merged_df, "llm")