# Environment Setup

In [None]:
%%capture
!pip install langchain langchain_community
!pip install openai
!pip install -U langchain-openai
!pip install langchain_core

In [2]:
#@title Secret Key
#@markdown Uses `OPENAI_API_KEY` from the environment when it is already set; otherwise prompts for it.
import getpass
import os

# Do not hardcode the key in the notebook: keep a key that is already exported,
# and otherwise ask for it interactively so it is never saved with the file.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Criteria Evaluation

## Referenceless

In [8]:
from langchain.evaluation import EvaluatorType, load_evaluator

# Build the "conciseness" criteria evaluator from its string name ...
evaluator = load_evaluator("criteria", criteria="conciseness")

# ... and once more from the enum member, shown here as the equivalent spelling.
# This second assignment is the one that leaves `evaluator` bound.
evaluator = load_evaluator(EvaluatorType.CRITERIA, criteria="conciseness")

In [4]:
# Grade a deliberately wordy answer; no reference answer is supplied here.
eval_result = evaluator.evaluate_strings(
    input="What's 2+2?",
    prediction=(
        "What's 2+2? That's an elementary question. "
        "The answer you're looking for is that two and two is four."
    ),
)
# Print the full result so every field the evaluator returned is visible.
print(eval_result)

{'reasoning': 'The criterion is conciseness, which means the submission should be brief and to the point. \n\nLooking at the submission, the answer to the question "What\'s 2+2?" is given as "The answer you\'re looking for is that two and two is four." However, before providing the answer, the respondent adds "That\'s an elementary question." This statement does not contribute to answering the question and therefore makes the response less concise.\n\nSo, the submission does not meet the criterion of conciseness.\n\nN', 'value': 'N', 'score': 0}


## With Reference

In [10]:
# Labeled criteria: the prediction is graded against an explicit reference answer.
evaluator = load_evaluator("labeled_criteria", criteria="correctness")

# "correctness" can only be judged against a complete question.
# NOTE(review): the question text was cut off after "What "; it is reconstructed
# here from the reference answer "Dhaka" -- confirm this is the intended question.
eval_result = evaluator.evaluate_strings(
    input="What is the capital of Bangladesh?",
    prediction="New Dilhi",
    reference="Dhaka",
)
print(f'With ground truth: {eval_result["score"]}')

                    seed was transferred to model_kwargs.
                    Please confirm that seed is what you intended.


With ground truth: 0


## Configure LLM

In [4]:
from langchain_openai import ChatOpenAI

In [5]:
# Chat model used for grading; temperature is set to 0 explicitly.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-4o-mini",
)

In [16]:
# Labeled "correctness" evaluator, graded by the explicitly supplied `llm`.
evaluator = load_evaluator(
    "labeled_criteria",
    criteria="correctness",
    llm=llm,
)

In [21]:
# Grade the prediction against the reference answer and report only the score.
eval_result = evaluator.evaluate_strings(
    reference="fish",
    prediction="Rice with catfish",
    input="What did I ate last night",
)
print(f'With ground truth: {eval_result["score"]}')

With ground truth: 1


## With prompt

In [6]:
from langchain_core.prompts import PromptTemplate

# Custom grading prompt. Its placeholders are {criteria}, {reference}, {input}
# and {output}; the text asks the grader to explain, then answer Y or N on a new line.
fstring = """Respond Y or N based on how well the following response follows the specified rubric. Grade only based on the rubric and expected response:

Grading Rubric: {criteria}
Expected Response: {reference}

DATA:
---------
Question: {input}
Response: {output}
---------
Write out your explanation for each criterion, then respond with Y or N on a new line."""

# Wrap the raw template text in a PromptTemplate.
prompt = PromptTemplate.from_template(template=fstring)

In [11]:
# Grading model for this cell; temperature is set to 0 explicitly.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-4o-mini",
)

# Labeled "correctness" evaluator that grades with the custom `prompt`.
evaluator = load_evaluator(
    "labeled_criteria",
    criteria="correctness",
    prompt=prompt,
    llm=llm,
)

# Same question / prediction / reference triple as the earlier labeled example.
eval_result = evaluator.evaluate_strings(
    reference="fish",
    prediction="Rice with catfish",
    input="What did I ate last night",
)
print(f'With ground truth: {eval_result["score"]}')

With ground truth: 1


# Embedding Distance

In [15]:
from langchain.evaluation import load_evaluator

# Switch `evaluator` to the embedding-distance evaluator.
evaluator = load_evaluator("embedding_distance")

# Compare two unrelated sentences; the bare expression is the cell's output.
evaluator.evaluate_strings(
    reference="She loves me",
    prediction="I shall go",
)

{'score': 0.22719140104367708}

In [13]:
# Compare two near-paraphrases; the bare expression is the cell's output.
evaluator.evaluate_strings(
    reference="I will go",
    prediction="I shall go",
)

{'score': 0.03772826064560364}