**Make sure you load the API keys for cloud providers!**

You can set the API keys as environment variables yourself or load them with a script. Because the keys are private, they are not included in the repository.

In [4]:
# Load the API keys into environment variables.
import sys
import os

# Put the parent directory at the front of the module search path so the
# `config` import below can resolve (presumably that is where it lives).
sys.path.insert(0, os.path.abspath(os.pardir))

from config import set_environment

# Handles the keys, as explained early in chapter 2.
set_environment()

# Exact Match Evaluation

In [1]:
from langchain.evaluation import load_evaluator, ExactMatchStringEvaluator

# The question being asked and the answer treated as ground truth for this demo.
prompt = "What is the current Federal Reserve interest rate?"
reference_answer = "0.25%"

# Two candidate outputs: one identical to the reference, one that differs.
prediction_correct = "0.25%"
prediction_incorrect = "0.50%"

# Exact-match evaluator, configured with ignore_case=True.
exact_evaluator = ExactMatchStringEvaluator(ignore_case=True)

# Score the prediction that matches the reference.
exact_result_correct = exact_evaluator.evaluate_strings(
    reference=reference_answer,
    prediction=prediction_correct,
)
print("Exact match result (correct answer):", exact_result_correct)

# Score the prediction that does not match the reference.
exact_result_incorrect = exact_evaluator.evaluate_strings(
    reference=reference_answer,
    prediction=prediction_incorrect,
)
print("Exact match result (incorrect answer):", exact_result_incorrect)

Exact match result (correct answer): {'score': 1}
Exact match result (incorrect answer): {'score': 0}


# LLM-as-Judge Evaluation

In [6]:
from langchain_community.chat_models import ChatOpenAI
from langchain.evaluation.scoring import ScoreStringEvalChain

# Initialize the evaluator (judge) LLM
llm = ChatOpenAI(temperature=0, model_name="gpt-4")

# Create the ScoreStringEvalChain from the LLM
chain = ScoreStringEvalChain.from_llm(llm=llm)

# Define the finance-related input and prediction.
# No reference answer is defined in this cell: the chain is called without
# one, so the judge scores the prediction on its own.
finance_input = "What is the current Federal Reserve interest rate?"
finance_prediction = "The current interest rate is 0.25%."

# Evaluate the prediction using the scoring chain (no `reference` argument)
result_finance = chain.evaluate_strings(
    input=finance_input,
    prediction=finance_prediction,
)

# The result is printed as a dict; see the cell output for its keys.
print("Finance Evaluation Result:")
print(result_finance)

This chain was only tested with GPT-4. Performance may be significantly worse with other models.


Finance Evaluation Result:
{'reasoning': "The assistant's response is not verifiable as it does not provide a date or source for the information. The Federal Reserve interest rate changes over time and is not static. Therefore, without a specific date or source, the information provided could be incorrect. The assistant should have advised the user to check the Federal Reserve's official website or a reliable financial news source for the most current rate. The response lacks depth and accuracy. Rating: [[3]]", 'score': 3}


# LLM-as-Judge Evaluation with Reference Answer

In [7]:
from langchain_community.chat_models import ChatOpenAI
from langchain.evaluation.scoring import LabeledScoreStringEvalChain

# Judge model used to grade the prediction.
llm = ChatOpenAI(model_name="gpt-4", temperature=0)

# Scoring chain variant that accepts a reference answer alongside the prediction.
labeled_chain = LabeledScoreStringEvalChain.from_llm(llm=llm)

# Question, candidate answer, and the reference answer to grade against.
finance_input = "What is the current Federal Reserve interest rate?"
finance_prediction = "The current interest rate is 0.25%."
finance_reference = "The Federal Reserve's current interest rate is 0.25%."

# Grade the candidate answer, supplying the reference to the judge.
labeled_result = labeled_chain.evaluate_strings(
    reference=finance_reference,
    prediction=finance_prediction,
    input=finance_input,
)

print("Finance Evaluation Result (with reference):")
print(labeled_result)

Finance Evaluation Result (with reference):
{'reasoning': "The assistant's response is helpful, relevant, and correct. It directly answers the user's question about the current Federal Reserve interest rate. However, it lacks depth as it does not provide any additional information or context about the interest rate, such as how it is determined or what it means for the economy. Rating: [[8]]", 'score': 8}


# Criteria Evaluation (Tone and Conciseness)


In [8]:
from langchain.evaluation import load_evaluator
# Import ChatOpenAI from the same package the other cells in this notebook use,
# rather than the legacy `langchain.chat_models` path.
from langchain_community.chat_models import ChatOpenAI

# Judge model shared by both criteria evaluators below.
evaluation_llm = ChatOpenAI(model="gpt-4", temperature=0)

# The question and the response whose conciseness and tone are being assessed.
prompt_health = "What is a healthy blood pressure range for adults?"
prediction_health = (
    "A normal blood pressure reading is typically around 120/80 mmHg. "
    "It's important to follow your doctor's advice for personal health management!"
)

# Evaluate conciseness (criterion referenced by name)
conciseness_evaluator = load_evaluator("criteria", criteria="conciseness", llm=evaluation_llm)
conciseness_result = conciseness_evaluator.evaluate_strings(
    prediction=prediction_health, input=prompt_health
)
print("Conciseness evaluation result:", conciseness_result)

# Evaluate friendliness with a custom criterion, given as a
# {criterion name: question for the judge} mapping
custom_friendliness = {
    "friendliness": "Is the response written in a friendly and approachable tone?"
}
friendliness_evaluator = load_evaluator("criteria", criteria=custom_friendliness, llm=evaluation_llm)
friendliness_result = friendliness_evaluator.evaluate_strings(
    prediction=prediction_health, input=prompt_health
)
print("Friendliness evaluation result:", friendliness_result)


Conciseness evaluation result: {'reasoning': "The criterion is conciseness. This means the submission should be brief, to the point, and not contain unnecessary information.\n\nLooking at the submission, it provides a direct answer to the question, stating that a normal blood pressure reading is around 120/80 mmHg. This is a concise answer to the question.\n\nThe submission also includes an additional sentence advising to follow a doctor's advice for personal health management. While this information is not directly related to the question, it is still relevant and does not detract from the conciseness of the answer.\n\nTherefore, the submission meets the criterion of conciseness.\n\nY", 'value': 'Y', 'score': 1}
Friendliness evaluation result: {'reasoning': "The criterion is to assess whether the response is written in a friendly and approachable tone. The submission provides the information in a straightforward manner and ends with a suggestion to follow doctor's advice for personal 

# JSON Format Validation

In [9]:
from langchain.evaluation import JsonValidityEvaluator

# Evaluator that checks whether a prediction string is valid JSON.
json_validator = JsonValidityEvaluator()

# A well-formed JSON object ...
valid_json_output = '{"company": "Acme Corp", "revenue": 1000000, "profit": 200000}'
# ... and the same object with a trailing comma after the last property.
invalid_json_output = '{"company": "Acme Corp", "revenue": 1000000, "profit": 200000,}'

# Check the well-formed string.
valid_result = json_validator.evaluate_strings(
    prediction=valid_json_output,
)
print("JSON validity result (valid):", valid_result)

# Check the malformed string.
invalid_result = json_validator.evaluate_strings(
    prediction=invalid_json_output,
)
print("JSON validity result (invalid):", invalid_result)

JSON validity result (valid): {'score': 1}
JSON validity result (invalid): {'score': 0, 'reasoning': 'Expecting property name enclosed in double quotes: line 1 column 63 (char 62)'}
