In [None]:
# Example 1: https://github.com/redhat-et/foundation-models-for-documentation/blob/master/notebooks/llm-evaluation/langchain-evaluation.ipynb

import os
import json
from dotenv import load_dotenv
load_dotenv()

from langchain.evaluation import load_evaluator, EvaluatorType, Criteria
from langchain_openai import ChatOpenAI

# Judge model shared by every evaluator created in this notebook.
# temperature=0 keeps grading as repeatable as the API allows, so re-running a
# cell is far less likely to flip a Y/N verdict purely because of sampling.
# The key is read with os.environ[...] on purpose: if OPENAI_API_KEY is not
# set, this cell fails immediately with a KeyError naming the variable.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)


In [21]:
# Show every member of the EvaluatorType enum as the cell's output.
[evaluator_type for evaluator_type in EvaluatorType]

[<EvaluatorType.QA: 'qa'>,
 <EvaluatorType.COT_QA: 'cot_qa'>,
 <EvaluatorType.CONTEXT_QA: 'context_qa'>,
 <EvaluatorType.PAIRWISE_STRING: 'pairwise_string'>,
 <EvaluatorType.SCORE_STRING: 'score_string'>,
 <EvaluatorType.LABELED_PAIRWISE_STRING: 'labeled_pairwise_string'>,
 <EvaluatorType.LABELED_SCORE_STRING: 'labeled_score_string'>,
 <EvaluatorType.AGENT_TRAJECTORY: 'trajectory'>,
 <EvaluatorType.CRITERIA: 'criteria'>,
 <EvaluatorType.LABELED_CRITERIA: 'labeled_criteria'>,
 <EvaluatorType.STRING_DISTANCE: 'string_distance'>,
 <EvaluatorType.EXACT_MATCH: 'exact_match'>,
 <EvaluatorType.REGEX_MATCH: 'regex_match'>,
 <EvaluatorType.PAIRWISE_STRING_DISTANCE: 'pairwise_string_distance'>,
 <EvaluatorType.EMBEDDING_DISTANCE: 'embedding_distance'>,
 <EvaluatorType.PAIRWISE_EMBEDDING_DISTANCE: 'pairwise_embedding_distance'>,
 <EvaluatorType.JSON_VALIDITY: 'json_validity'>,
 <EvaluatorType.JSON_EQUALITY: 'json_equality'>,
 <EvaluatorType.JSON_EDIT_DISTANCE: 'json_edit_distance'>,
 <EvaluatorType.JSON_SCHEMA_VALIDATION: 'json_schema_validation'>]

In [22]:
# Show every member of the Criteria enum as the cell's output.
[criterion for criterion in Criteria]

[<Criteria.CONCISENESS: 'conciseness'>,
 <Criteria.RELEVANCE: 'relevance'>,
 <Criteria.CORRECTNESS: 'correctness'>,
 <Criteria.COHERENCE: 'coherence'>,
 <Criteria.HARMFULNESS: 'harmfulness'>,
 <Criteria.MALICIOUSNESS: 'maliciousness'>,
 <Criteria.HELPFULNESS: 'helpfulness'>,
 <Criteria.CONTROVERSIALITY: 'controversiality'>,
 <Criteria.MISOGYNY: 'misogyny'>,
 <Criteria.CRIMINALITY: 'criminality'>,
 <Criteria.INSENSITIVITY: 'insensitivity'>,
 <Criteria.DEPTH: 'depth'>,
 <Criteria.CREATIVITY: 'creativity'>,
 <Criteria.DETAIL: 'detail'>]

In [15]:
# Reference-free check: grade the answer "8" to the question "5+5?" against
# the "conciseness" criterion, using the shared judge model `llm`.
evaluator = load_evaluator(
    EvaluatorType.CRITERIA,
    criteria="conciseness",
    llm=llm,
)
eval_result = evaluator.evaluate_strings(input="5+5?", prediction="8")

# Pretty-print the evaluation result as indented JSON.
print(json.dumps(eval_result, indent=2))

{
  "reasoning": "1. Conciseness:\n- The submission is \"8\" which is not concise or to the point as it is not the correct answer to the input \"5+5?\"\n- Therefore, the submission does not meet the criteria of conciseness.\n\nN",
  "value": "N",
  "score": 0
}


In [17]:
# Same question/answer pair as before, now graded against the "helpfulness"
# criterion instead of "conciseness".
evaluator = load_evaluator(
    EvaluatorType.CRITERIA,
    criteria="helpfulness",
    llm=llm,
)
eval_result = evaluator.evaluate_strings(input="5+5?", prediction="8")

# Pretty-print the evaluation result as indented JSON.
print(json.dumps(eval_result, indent=2))

{
  "reasoning": "1. Is the submission helpful, insightful, and appropriate?\n- The submission provided an incorrect answer to the given task of 5+5, which is 10. \n- The answer of 8 is not helpful, insightful, or appropriate in this context.\n- Therefore, the submission does not meet the criteria.\n\nN",
  "value": "N",
  "score": 0
}


In [18]:
# A user-defined criterion is passed as a {name: question} mapping rather than
# one of the built-in criterion names.
custom_crt = dict(
    numeric="Does the output contain numeric or mathematical information?",
)
evaluator = load_evaluator(
    EvaluatorType.CRITERIA,
    criteria=custom_crt,
    llm=llm,
)
eval_result = evaluator.evaluate_strings(input="5+5?", prediction="8")

# Pretty-print the evaluation result as indented JSON.
print(json.dumps(eval_result, indent=2))

{
  "reasoning": "1. The input is a mathematical question \"5+5?\", which is numeric in nature.\n2. The submission \"8\" is also numeric, as it is a numerical value.\n\nTherefore, the submission meets the criteria.",
  "value": "Y",
  "score": 1
}


In [19]:
# Several user-defined criteria handed to ONE evaluator. The printed result
# carries a single value/score pair for the whole set, not one per criterion,
# so a single failing criterion is enough to produce an overall "N".
custom_criteria = dict(
    numeric="Does the output contain numeric information?",
    mathematical="Does the output contain mathematical information?",
    grammatical="Is the output grammatically correct?",
    logical="Is the output logical?",
)
evaluator = load_evaluator(
    EvaluatorType.CRITERIA,
    criteria=custom_criteria,
    llm=llm,
)
eval_result = evaluator.evaluate_strings(input="5+5?", prediction="8")

# Pretty-print the evaluation result as indented JSON.
print(json.dumps(eval_result, indent=2))

{
  "reasoning": "- numeric: The submission \"8\" contains numeric information, as it is a number.\n- mathematical: The submission \"8\" does not contain the correct mathematical information, as the correct answer to 5+5 is 10.\n- grammatical: The submission \"8\" is grammatically correct.\n- logical: The submission \"8\" is not logical in the context of the given task, as 5+5 does not equal 8.",
  "value": "N",
  "score": 0
}


In [14]:
# Labeled variant: besides the question and the model's answer, a reference
# answer is supplied and the prediction is graded for correctness.
evaluator = load_evaluator(
    EvaluatorType.LABELED_CRITERIA,
    criteria=Criteria.CORRECTNESS,
    llm=llm,
)
eval_result = evaluator.evaluate_strings(
    input="Name the satellite of the earth?",
    prediction="Moon is not the satellite of the earth",
    reference="Moon is the satellite of the earth",
)

# Pretty-print the evaluation result as indented JSON.
print(json.dumps(eval_result, indent=2))

{
  "reasoning": "1. Correctness:\n- The submission states that the Moon is not the satellite of the earth.\n- The reference clearly states that the Moon is the satellite of the earth.\n- Therefore, the submission is incorrect based on the factual information provided in the reference.",
  "value": "N",
  "score": 0
}
