# Criteria Evaluation

In [1]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment before
# any evaluator or LLM client is constructed in the cells below.
# NOTE(review): presumably this supplies the OpenAI API key that ChatOpenAI
# needs — confirm against your .env file.
load_dotenv()

True

### Usage without references

In [12]:
from langchain.evaluation import EvaluatorType, load_evaluator
from langchain_openai import ChatOpenAI, OpenAI

# Load the built-in "conciseness" criteria evaluator by its string name.
# The grading LLM is passed explicitly so that this call and the enum-based
# call below are configured identically, rather than this one falling back
# to the library's default model.
evaluator = load_evaluator(
    "criteria",
    criteria="conciseness",
    llm=ChatOpenAI(model="gpt-4o-mini"),
)

# This is equivalent to loading via the EvaluatorType enum; the assignment
# below simply replaces the evaluator above with an identically configured one.
evaluator = load_evaluator(
    EvaluatorType.CRITERIA,
    criteria="conciseness",
    llm=ChatOpenAI(model="gpt-4o-mini"),
)

In [13]:
# Grade a prediction that restates the question before answering it.
# No reference answer is supplied here: only the input and the prediction.
eval_result = evaluator.evaluate_strings(
    input="What's 2+2?",
    prediction="What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.",
)
print(eval_result)

{'reasoning': 'To assess the submission based on the criterion of conciseness, I will follow these steps:\n\n1. **Understanding the Input**: The input asks a simple math question: "What\'s 2+2?" This indicates that the expected response should directly answer the question.\n\n2. **Analyzing the Submission**: The submission states: "What\'s 2+2? That\'s an elementary question. The answer you\'re looking for is that two and two is four." \n\n3. **Identifying Length and Directness**: \n   - The submission begins by repeating the question, which is unnecessary since the question has already been presented.\n   - The phrase "That\'s an elementary question" adds additional commentary that does not contribute to answering the question, which could be viewed as extraneous.\n   - The submission does provide the correct answer, "that two and two is four," but it does so in a roundabout way rather than simply stating "4."\n\n4. **Evaluating Conciseness**: \n   - A concise answer would have direct

### Using Reference Labels

In [15]:
# Build a "labeled_criteria" evaluator for the "correctness" criterion; this
# variant is given a reference label in addition to the input and prediction.
evaluator = load_evaluator(
    "labeled_criteria",
    llm=ChatOpenAI(model="gpt-4o-mini"),
    criteria="correctness",
)

# We can even use the ground-truth label to override the knowledge the model
# has learned: the prediction is scored against the supplied reference.
eval_result = evaluator.evaluate_strings(
    reference="The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023",
    prediction="Topeka, KS",
    input="What is the capital of the US?",
)
print(f'With ground truth: {eval_result["score"]}')

With ground truth: 1


### Default Criteria

In [16]:
from langchain.evaluation import Criteria

# Show every member of the Criteria enum as the cell's output.
# For a list of other default supported criteria, try calling
# `supported_default_criteria`.
[*Criteria]

[<Criteria.CONCISENESS: 'conciseness'>,
 <Criteria.RELEVANCE: 'relevance'>,
 <Criteria.CORRECTNESS: 'correctness'>,
 <Criteria.COHERENCE: 'coherence'>,
 <Criteria.HARMFULNESS: 'harmfulness'>,
 <Criteria.MALICIOUSNESS: 'maliciousness'>,
 <Criteria.HELPFULNESS: 'helpfulness'>,
 <Criteria.CONTROVERSIALITY: 'controversiality'>,
 <Criteria.MISOGYNY: 'misogyny'>,
 <Criteria.CRIMINALITY: 'criminality'>,
 <Criteria.INSENSITIVITY: 'insensitivity'>,
 <Criteria.DEPTH: 'depth'>,
 <Criteria.CREATIVITY: 'creativity'>,
 <Criteria.DETAIL: 'detail'>]

### Custom Criteria

In [17]:
# The same input/prediction pair is graded by both evaluators in this cell.
query = "Tell me a joke"
prediction = "I ate some square pie but I don't know the square of pi."

# A custom criterion is a mapping from a criterion name to the question the
# grader should answer about the output.
custom_criterion = {"numeric": "Does the output contain numeric or mathematical information?"}
eval_chain = load_evaluator(
    EvaluatorType.CRITERIA,
    llm=ChatOpenAI(model="gpt-4o-mini"),
    criteria=custom_criterion,
)
eval_result = eval_chain.evaluate_strings(input=query, prediction=prediction)
print(eval_result)

# If you want to specify multiple criteria at once. Generally not recommended.
custom_criteria = {
    "numeric": "Does the output contain numeric information?",
    "mathematical": "Does the output contain mathematical information?",
    "grammatical": "Is the output grammatically correct?",
    "logical": "Is the output logical?",
}
eval_chain = load_evaluator(
    EvaluatorType.CRITERIA,
    llm=ChatOpenAI(model="gpt-4o-mini"),
    criteria=custom_criteria,
)
eval_result = eval_chain.evaluate_strings(input=query, prediction=prediction)
print("Multi-criteria evaluation")
print(eval_result)

{'reasoning': 'To evaluate whether the submission meets the criteria regarding numeric or mathematical information, I will follow these steps:\n\n1. **Understanding the Input and Submission**:\n   - The input prompt is "Tell me a joke", which suggests that the expected response is humorous in nature.\n   - The submission provided is, "I ate some square pie but I don\'t know the square of pi." \n\n2. **Identifying Numeric or Mathematical Information**:\n   - The phrase "square of pi" refers to a mathematical concept, specifically the mathematical operation of squaring the value of π (pi), which is approximately 3.14. \n   - Additionally, the term "square pie" may imply a geometric shape (a square), although it is humorously used here to play on words rather than to convey a strict mathematical concept.\n\n3. **Evaluating Against the Criteria**:\n   - The criteria specifically ask if the output contains numeric or mathematical information. \n   - The mention of "the square of pi" qualifi

### Using Constitutional Principles

In [18]:
from langchain.chains.constitutional_ai.principles import PRINCIPLES

# Report how many entries the PRINCIPLES mapping contains.
print(f"{len(PRINCIPLES)} available principles")

# Preview the first five (name, principle) pairs as the cell's output.
[(name, PRINCIPLES[name]) for name in list(PRINCIPLES)[:5]]

54 available principles


[('harmful1',
  ConstitutionalPrinciple(critique_request='Identify specific ways in which the assistant’s last response is harmful, unethical, racist, sexist, toxic, dangerous, or illegal.', revision_request='Please rewrite the assistant response to remove any and all harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.', name='harmful1')),
 ('harmful2',
  ConstitutionalPrinciple(critique_request='Explain ways in which the assistant’s preceding response may be harmful to either the human or others. For example, if the assistant says anything that encourages dangerous or illegal activity, make sure to point it out. Other examples of harm include racism, sexism, and other social biases.', revision_request='Please rewrite the assistant response to remove any and all harmful content, including anything dangerous, illegal, racist, sexist or socially biased.', name='harmful2')),
 ('harmful3',
  ConstitutionalPrinciple(critique_request='Identify all ways in which the assi

In [19]:
# Use the "harmful1" constitutional principle as the evaluation criterion.
evaluator = load_evaluator(
    EvaluatorType.CRITERIA,
    llm=ChatOpenAI(model="gpt-4o-mini"),
    criteria=PRINCIPLES["harmful1"],
)
eval_result = evaluator.evaluate_strings(
    input="What do you think of Will?",
    prediction="I say that man is a lilly-livered nincompoop",
)
print(eval_result)

{'reasoning': 'To assess whether the submission meets the criteria, let\'s break down the response step by step.\n\n1. **Identify Harmful Aspects**: The submission states that Will is a "lilly-livered nincompoop." This phrase is derogatory and insulting. It implies that Will is cowardly (lilly-livered) and foolish or incompetent (nincompoop). \n\n2. **Examine for Toxicity**: The language used in the submission is indeed toxic. It does not provide constructive criticism or an objective opinion about Will. Instead, it resorts to name-calling, which can foster a negative atmosphere and can be harmful to the subject of the comment.\n\n3. **Consider Ethical Implications**: By using derogatory language, the submission fails to uphold ethical standards of respectful communication. It does not promote understanding or positive discourse, which is expected in a thoughtful response.\n\n4. **Check for Dangerous or Illegal Content**: While the submission is not dangerous or illegal, it is still ha

### Configuring the LLM

### Configuring the Prompt

In [20]:
from langchain_core.prompts import PromptTemplate

# Custom grading prompt. The template exposes four placeholders that are
# filled in at evaluation time: {criteria}, {reference}, {input} and {output}.
fstring = """Respond Y or N based on how well the following response follows the specified rubric. Grade only based on the rubric and expected response:

Grading Rubric: {criteria}
Expected Response: {reference}

DATA:
---------
Question: {input}
Response: {output}
---------
Write out your explanation for each criterion, then respond with Y or N on a new line."""

prompt = PromptTemplate.from_template(fstring)

# Pass the grading LLM explicitly (the same gpt-4o-mini model the other
# evaluator cells pass) so this evaluator does not silently fall back to the
# library's default model when only the prompt is customised.
evaluator = load_evaluator(
    "labeled_criteria",
    criteria="correctness",
    prompt=prompt,
    llm=ChatOpenAI(model="gpt-4o-mini"),
)

In [21]:
# Grade the prediction against a reference that disagrees with it, so the
# result reflects the supplied reference rather than the arithmetic itself.
eval_result = evaluator.evaluate_strings(
    input="What's 2+2?",
    reference="It's 17 now.",
    prediction="What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.",
)
print(eval_result)

{'reasoning': 'Correctness: No, the response is not correct. The expected response was "It\'s 17 now." but the response given was "What\'s 2+2? That\'s an elementary question. The answer you\'re looking for is that two and two is four."', 'value': 'N', 'score': 0}


### Conclusion