# Grading Tests

### 1. Random LangChain Experiments

In [2]:
from langchain.evaluation import load_evaluator
from langchain.evaluation import EvaluatorType
from langchain.evaluation import Criteria

import langchain
# Set the module-level verbose flag on the langchain package for this session.
langchain.verbose = True

# The loader is called twice: once with the string "criteria" and once with
# the EvaluatorType.CRITERIA enum member (presumably equivalent spellings --
# confirm). The second assignment replaces the first, so only the enum-based
# evaluator is used below.
evaluator = load_evaluator("criteria", criteria="conciseness")
evaluator = load_evaluator(EvaluatorType.CRITERIA, criteria="conciseness")

evaluator.set_verbose(True)
# Grade a sample answer to "What's 2+2?" against the conciseness criterion.
eval_result = evaluator.evaluate_strings(
    prediction="What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.",
    input="What's 2+2?",
)
print(eval_result)

{'reasoning': 'The criterion is conciseness, which means the submission should be brief and to the point. \n\nLooking at the submission, the answer to the question "What\'s 2+2?" is given as "The answer you\'re looking for is that two and two is four." However, before providing the answer, the respondent adds unnecessary information by stating "That\'s an elementary question." \n\nThis additional statement does not contribute to answering the question and therefore makes the response less concise. \n\nSo, based on the criterion of conciseness, the submission does not meet the criterion.\n\nN', 'value': 'N', 'score': 0}


### 2. Actual Grading Experiments

In [7]:
import json
# Load the records used by the grading cells below; each record is read via
# its 'question', 'context' and 'answer' keys.
# The encoding is pinned to UTF-8 so the file is not decoded with the
# platform's default encoding.
with open("queries_alt.json", "r", encoding="utf-8") as f:
    queries = json.load(f)

In [8]:
from dataclasses import dataclass, asdict
import openai
from typing import Tuple

@dataclass
class LLMRequest:
    """Grading payload that is converted with ``asdict`` and dumped to JSON for the grader prompt."""
    # Criterion name -> description of what the grader should check.
    criteria: dict[str, str]
    # Named inputs the graded response was produced from.
    inputs: dict[str, str]
    # The response text being graded.
    output: str

def _execute_grader(
    criteria: dict[str, str],
    input_variables: dict[str, str],
    output: str,
) -> dict[str, list[dict[str, str]]]:
    """Ask GPT-4 to grade ``output`` against every criterion via a forced function call.

    The criteria, inputs and output are bundled into an ``LLMRequest``,
    serialized to JSON and sent as the user message. The model is forced to
    reply through the ``report_ratings`` function so the result comes back as
    structured JSON rather than free text.

    Args:
        criteria: Maps each criterion name to the description the grader
            should judge the response against.
        input_variables: Named inputs the response was produced from.
        output: The response being graded.

    Returns:
        The parsed function-call arguments, expected to have the shape
        ``{"evaluations": [{"name": ..., "rating": "Yes" | "No", "reason": ...}]}``.

    Raises:
        ValueError: If the model returned plain content, or did not call
            ``report_ratings``.
        json.JSONDecodeError: If the function-call arguments are not valid JSON.
    """
    function_name = "report_ratings"
    system_msg = "You are an evaluator trying to grade the response of an agent based on the provided JSON data. Keeping the objective and the inputs in mind, rate the response based on the grading criteria. You always use the function provided."
    llm_request = LLMRequest(
        criteria=criteria, inputs=input_variables, output=output
    )
    user_msg = f"""{json.dumps(asdict(llm_request))}

    Given the above inputs, response and criteria, report your evaluation rating along with the reasoning. Think step by step.
    """
    functions = [
        {
            "name": function_name,
            "description": "report the results of the evaluation",
            "parameters": {
                "type": "object",
                "properties": {
                    "evaluations": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {
                                    "type": "string",
                                    "description": "The name of the grading criteria"
                                },
                                "rating": {
                                    "type": "string",
                                    "enum": ["Yes", "No"],
                                    "description": "Yes if the response meets the grading criteria. No if it does not."
                                },
                                "reason": {
                                    "type": "string",
                                    "description": "The reason for the rating."
                                }
                            },
                            # "name" is required too: without it an evaluation
                            # cannot be matched back to its criterion.
                            "required": ["name", "rating", "reason"]
                        }
                    }
                },
                "required": ["evaluations"],
            }
        }
    ]

    message = openai.ChatCompletion.create(  # type: ignore
        model="gpt-4",
        temperature=0.1,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        functions=functions,
        # Force the model to answer through the function instead of free text.
        function_call={"name": function_name},
    )["choices"][0]["message"]

    # Validate with explicit raises: ``assert`` statements are stripped when
    # Python runs with -O, which would let a malformed reply through.
    if message.get("content") is not None:
        raise ValueError(
            "Expected a function call from the grader, got plain content instead."
        )
    function_call = message.get("function_call")
    if not function_call or function_call.get("name") != function_name:
        raise ValueError(
            f"Expected the grader to call {function_name!r}, got: {function_call!r}"
        )
    return json.loads(function_call["arguments"])

def run_evaluation(question, context, answer):
    """Grade ``answer`` against the notebook's fixed set of five criteria.

    ``question`` and ``context`` are passed to the grader as the named
    inputs, ``answer`` as the output under evaluation. Returns whatever
    ``_execute_grader`` returns.
    """
    grader_inputs = {"question": question, "context": context}
    # Insertion order is kept as-is: the dict is serialized into the prompt.
    grading_criteria = {
        "verifiability": "Could the answer have been derived solely from the context? 'Yes' if all the data suggested in the answer is present in the context. 'No' if there is some data in the answer that is not present in the context.",
        "validity": "Is the answer correct given the context and question?",
        "relevance": "Is the submission referring to a real quote from the text?",
        "coherence": "Is the submission coherent, well-structured, and organized?",
        "conciseness": "Is the response concise and to the point?",
    }
    evaluation = _execute_grader(
        criteria=grading_criteria,
        input_variables=grader_inputs,
        output=answer,
    )
    return evaluation

In [10]:
query = queries[0]
# Baseline: grade the answer stored with the first query.
ratings = run_evaluation(question=query['question'], context=query['context'], answer=query['answer'])
print(ratings)

# Tampered answer: ends with an unrelated claim about Obama (presumably absent
# from the context -- confirm) to see whether the grader flags it.
fake_answer = """It is generally suggested that advisors, who typically contribute less value than directors, should receive a lower equity grant. A common suggestion is an annual grant of 25% to 50% of the four-year grant you'd give a junior engineer. This would equate to 1x - 2x a junior engineer's grant if the advisor stays engaged for four years. Also Barrack Obama says that life is hard, but not simple. He states that in his 45th presidential address."""
ratings = run_evaluation(question=query['question'], context=query['context'], answer=fake_answer)
print(ratings)

{'evaluations': [{'name': 'verifiability', 'rating': 'Yes', 'reason': 'The answer provided by the agent can be verified from the context provided. The context mentions that advisors typically receive a lower equity grant and that a common suggestion is an annual grant of 25% to 50% of the four-year grant given to a junior engineer.'}, {'name': 'validity', 'rating': 'Yes', 'reason': 'The answer is valid as it correctly interprets the information given in the context. The agent correctly states that the amount of equity given to an advisor can vary and that it is generally less than what is given to directors.'}, {'name': 'relevance', 'rating': 'Yes', 'reason': "The agent's response is relevant to the question asked. The question asks about the amount of equity that should be given to an advisor, and the agent's response provides a detailed answer based on the context provided."}, {'name': 'coherence', 'rating': 'Yes', 'reason': "The agent's response is coherent and well-structured. It p

In [6]:
query = queries[0]
# Baseline run is disabled here; only the tampered answer is graded.
# ratings = run_evaluation(question=query['question'], context=query['context'], answer=query['answer'])

# Tampered answer: ends with an unrelated claim about Obama (presumably absent
# from the context -- confirm) to see whether the grader flags it.
fake_answer = """It is generally suggested that advisors, who typically contribute less value than directors, should receive a lower equity grant. A common suggestion is an annual grant of 25% to 50% of the four-year grant you'd give a junior engineer. This would equate to 1x - 2x a junior engineer's grant if the advisor stays engaged for four years. Also Barrack Obama says that life is hard, but not simple. He states that in his 45th presidential address."""
ratings = run_evaluation(question=query['question'], context=query['context'], answer=fake_answer)
# Bare expression so the notebook displays the value as the cell output.
ratings

KeyboardInterrupt: 

In [None]:
query = queries[0]
# Baseline run is disabled here; only the tampered answer is graded.
# ratings = run_evaluation(question=query['question'], context=query['context'], answer=query['answer'])

# Tampered answer: ends with an unrelated claim about Obama (presumably absent
# from the context -- confirm) to see whether the grader flags it.
fake_answer = """It is generally suggested that advisors, who typically contribute less value than directors, should receive a lower equity grant. A common suggestion is an annual grant of 25% to 50% of the four-year grant you'd give a junior engineer. This would equate to 1x - 2x a junior engineer's grant if the advisor stays engaged for four years. Also Barrack Obama says that life is hard, but not simple. He states that in his 45th presidential address."""
ratings = run_evaluation(question=query['question'], context=query['context'], answer=fake_answer)
# Bare expression so the notebook displays the value as the cell output.
ratings

{'evaluations': [{'name': 'verifiability',
   'rating': 'No',
   'reason': 'The response includes a quote from Barack Obama that is not present in the context provided.'},
  {'name': 'validity',
   'rating': 'Yes',
   'reason': 'The response correctly summarizes the information from the context regarding the equity grant for advisors.'},
  {'name': 'relevance',
   'rating': 'No',
   'reason': 'The quote from Barack Obama is not relevant to the question or the context.'},
  {'name': 'coherence',
   'rating': 'Yes',
   'reason': 'The response is coherent and well-structured, except for the irrelevant quote from Barack Obama.'},
  {'name': 'conciseness',
   'rating': 'No',
   'reason': 'The response includes unnecessary information (the quote from Barack Obama) that does not contribute to answering the question.'}]}

In [None]:
# Display the current value of `ratings`.
# NOTE(review): the output recorded for this cell is a bare list, while the
# output recorded for run_evaluation is a dict with an "evaluations" key.
# Presumably `ratings` was reassigned in a cell that is not shown -- confirm
# before re-running the notebook top to bottom.
ratings

[{'name': 'verifiability',
  'rating': 'No',
  'reason': 'The response includes a quote from Barack Obama that is not present in the context provided.'},
 {'name': 'validity',
  'rating': 'Yes',
  'reason': 'The response correctly summarizes the information from the context regarding the equity grant for advisors.'},
 {'name': 'relevance',
  'rating': 'No',
  'reason': 'The quote from Barack Obama is not relevant to the question or the context.'},
 {'name': 'coherence',
  'rating': 'Yes',
  'reason': 'The response is coherent and well-structured, except for the irrelevant quote from Barack Obama.'},
 {'name': 'conciseness',
  'rating': 'No',
  'reason': 'The response includes unnecessary information (the quote from Barack Obama) that does not contribute to answering the question.'}]

In [None]:
def _calculate_score(ratings: list[dict[str, str]]) -> float:
    num_criteria = len(ratings)
    num_yes = 0
    for criterion in ratings:
        if criterion["rating"] == "Yes":
            num_yes += 1
    return num_yes / num_criteria

# Score the current `ratings`. _calculate_score iterates its argument and
# reads each item's "rating" key, so `ratings` must be the list of
# per-criterion dicts, not the {"evaluations": [...]} wrapper.
_calculate_score(ratings)

0.4