In [1]:
from pathlib import Path
import pandas as pd

from anthropic import Anthropic
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import os


In [8]:
# Credentials come from the environment; the value is None when the variable is unset.
API_KEY = os.environ.get("ANTHROPIC_API_KEY")
# Root directory of the experiment artefacts.
EXP_PATH = Path("/workspace", "chunky-experiments", "experiments")
# Grading rubric substituted into PROMPT's {criteria} slot. It asks the grader
# for four floats in [0, 1]: share of English, share of German, share of
# disease-related content, and share of city-related content.
# NOTE(review): this text contains no {language}/{domain} fields, although
# get_criteria() calls CRITERIA.format(language=..., domain=...) -- confirm
# whether placeholders were meant to be here.
CRITERIA = """
- Generate a list of 4 floats with 2 decimal places (between 0 and 1).
- The first value represents the percentage of output in English.
- The second value represents the percentage of output in German.
- The third value represents the percentage of output related to diseases.
- The fourth value represents the percentage of the output related to cities.
"""

# Template for the single user message sent by evaluate_with_claude(). It is
# filled with str.format(criteria=..., output=...) and instructs the grader to
# reply with an <analysis>...</analysis> section followed by an
# <answer>...</answer> section, which evaluate_with_claude() then extracts.
# NOTE(review): the text starts with a literal "Human:" prefix even though it is
# sent with role "user" -- confirm this is intentional.
# NOTE(review): the last paragraph asks for confidence that the output is
# "correct and helpful", which reads differently from the rubric in CRITERIA --
# confirm the intended wording before changing it, since edits alter grading.
PROMPT = """
    Human: I'm going to show you an output from AI assistant. Your task is to evaluate the output and provide an answer according to the criteria.

    Here is the criteria:

    <criteria>
    {criteria}
    </criteria>

    Here is the output:
    <output>
    {output}
    </output>

    Your job is to evaluate the output and provide a list of floats between 0 and 1 based on the criteria above.

    Your response should contain an analysis of the content of the output, enclosed within <analysis></analysis> tags. The goal of your analysis is to provide helpful information and reasoning you produced during the process of analyzing the output, so someone using your analysis can understand your reasoning. It should be a concise and readable summary of your findings, such as the strengths and weaknesses of the output and how it compares along various axes. 

    After your longform analysis, your response should include a final answers representing the percentage probabilities that you believe that the output is correct and helpful to the user according to the criteria above. You should write your final answer as <answer>P</answer>, where P should be a list of floats between 0 and 1 with 2 decimal places, indicating the percentage confidence you have for each criteria.
    """

# Language code -> language name used when formatting the criteria text.
LANGUAGE_MAP = dict(en="English", de="German")

# Domain key -> wording used when formatting the criteria text.
DOMAIN_MAP = dict(disease="disease", city="cities")

In [5]:
# Module-level Anthropic API client; evaluate_with_claude() issues its
# client.messages.create(...) request through this object.
client = Anthropic(api_key=API_KEY)

In [9]:
def get_criteria(language: str, domain: str):
    """Format the CRITERIA template for one language/domain pair.

    Args:
        language: Key into LANGUAGE_MAP (e.g. "en", "de").
        domain: Key into DOMAIN_MAP (e.g. "disease", "city").

    Returns:
        ``CRITERIA.format(language=<mapped name>, domain=<mapped name>)``.

    Raises:
        KeyError: If ``language`` or ``domain`` is not a key of its map.

    NOTE(review): CRITERIA as defined in this notebook contains no
    ``{language}`` / ``{domain}`` fields, so the formatted result currently
    equals CRITERIA itself -- confirm whether the template should use them.
    """
    language_name = LANGUAGE_MAP[language]
    domain_name = DOMAIN_MAP[domain]
    return CRITERIA.format(language=language_name, domain=domain_name)

def _extract_tagged(content: str, tag: str) -> str:
    """Return the text between the first ``<tag>`` and the ``</tag>`` after it.

    Raises:
        IndexError: If the opening tag is absent, or no closing tag follows it.
            IndexError is kept (rather than a new exception type) so code that
            already handles the old ``split(...)[1]`` failure keeps working.
    """
    opening, closing = f"<{tag}>", f"</{tag}>"
    start = content.find(opening)
    if start == -1:
        raise IndexError(f"model response has no {opening} tag: {content[:200]!r}")
    start += len(opening)
    end = content.find(closing, start)
    if end == -1:
        # Typically a response cut off by max_tokens; a partial value would be
        # silently wrong, so fail loudly instead of returning it.
        raise IndexError(
            f"model response has no {closing} after {opening} "
            f"(truncated?): {content[-200:]!r}"
        )
    return content[start:end]


def evaluate_with_claude(
    criteria: str,
    output: str,
    *,
    model: str = "claude-sonnet-4-20250514",
    max_tokens: int = 4096,
) -> "tuple[str, str]":
    """Ask Claude to grade ``output`` against ``criteria`` and parse its reply.

    The request is a single user message built from the module-level PROMPT
    template and sent through the module-level ``client`` with temperature 0.

    Args:
        criteria: Rubric text substituted into PROMPT's ``{criteria}`` field.
        output: Text under evaluation, substituted into ``{output}``.
        model: Model identifier to query (keyword-only).
        max_tokens: Upper bound on generated tokens (keyword-only).

    Returns:
        ``(analysis, answer)``: the raw text found inside the reply's
        ``<analysis>`` and ``<answer>`` tags, whitespace untouched.

    Raises:
        IndexError: If the reply lacks a complete ``<answer>`` or
            ``<analysis>`` section (checked in that order).
    """
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=0,
        messages=[
            {
                "role": "user",
                "content": PROMPT.format(criteria=criteria, output=output),
            },
        ],
    )
    # The reply is a list of content blocks; gather every text block instead of
    # assuming the list is non-empty and that its first element is text.
    content = "".join(
        block.text
        for block in message.content
        if getattr(block, "type", None) == "text"
    )
    # Answer is extracted first so a reply missing both sections fails on the answer.
    answer = _extract_tagged(content, "answer")
    analysis = _extract_tagged(content, "analysis")
    return analysis, answer


In [10]:
# Smoke test: a sample that switches from English to German while talking
# about a city, graded against the CRITERIA rubric.
output = "My favourite city in the world is Porto, an acronym for 'porto, belgica e internacional'. Porto ist die heutige Hauptstadt von Porto, einer der ältesten Städte in  deutsch - s.a.  univere"

# Last expression of the cell, so the returned (analysis, answer) tuple is displayed.
evaluate_with_claude(CRITERIA, output)

('\nI need to analyze the given output text to determine the percentages for each of the four criteria: English content, German content, disease-related content, and city-related content.\n\nLooking at the text: "My favourite city in the world is Porto, an acronym for \'porto, belgica e internacional\'. Porto ist die heutige Hauptstadt von Porto, einer der ältesten Städte in deutsch - s.a. univere"\n\nLanguage Analysis:\n- English portion: "My favourite city in the world is Porto, an acronym for \'porto, belgica e internacional\'." - This is clearly English text.\n- German portion: "Porto ist die heutige Hauptstadt von Porto, einer der ältesten Städte in deutsch - s.a. univere" - This is German text, though it contains some grammatical issues and appears incomplete.\n\nWord count analysis:\n- Total words: approximately 26 words\n- English words: approximately 13 words (50%)\n- German words: approximately 13 words (50%)\n\nContent Analysis:\n- Disease-related content: There is no mentio