# Grading via Labeling

### 1. Load Checkpoints
Import checkpoints from the Eyeball API

In [1]:
import sys
sys.path.append('../')
import eyeball_pp
from eyeball_pp import Checkpoint
from typing import List

def get_checkpoints(task_name: str) -> List[Checkpoint]:
    """Gather the recent checkpoints of a task that have an output to review.

    For every input hash the default recorder reports for ``task_name``, the
    latest checkpoints are requested (``num_checkpoints=4``) and those whose
    ``output`` is truthy are kept, in the order the recorder returns them.

    Args:
        task_name: Name of the recorded task to pull checkpoints for.

    Returns:
        A flat list of the checkpoints that have a non-empty output.
    """
    recorder = eyeball_pp.get_default_recorder()

    reviewable: List[Checkpoint] = []
    for input_hash in recorder.get_input_hashes(task_name=task_name):
        latest = recorder.get_latest_checkpoints(
            task_name, input_hash, num_checkpoints=4
        )
        # Checkpoints without an output have nothing to grade, so skip them.
        reviewable.extend(candidate for candidate in latest if candidate.output)

    return reviewable

# Load the reviewable checkpoints recorded for the 'answer_openai' task.
checkpoints = get_checkpoints(task_name='answer_openai')

### 2. Define the Labeler

In [30]:
from dataclasses import dataclass, asdict
import json
import openai
from typing import Tuple

@dataclass
class LLMRequest:
    """Bundle of one agent run that is serialized to JSON for the labeler prompt."""

    # Input variables of the run, keyed by variable name.
    inputs: dict[str, str]
    # Intermediate values captured during the run, keyed by name.
    intermediaries: dict[str, str]
    # The agent output that is being labeled.
    output: str

def _execute_labeler(
    input_variables: dict[str, str],
    intermediate_variables: dict[str, str],
    output: str,
    labels: dict[str, str],
) -> Tuple[str, str]:
    system_msg = f"""
You are an evaluator trying to label the output of an agent. Keeping the inputs and intermediates in mind, label the output based on the labeling criteria. You always use the function provided.
    
Labeling Criteria: 
{json.dumps(labels)}
"""

    llm_request = LLMRequest(
        inputs=input_variables, intermediaries=intermediate_variables, output=output
    )

    user_msg = f"""
{json.dumps(asdict(llm_request))}

Given the above inputs, intermediaries and output, report your label along with the reasoning. Think step by step.
"""
    functions = [
        {
            "name": "report_label",
            "description": "report the result of the evaluation",
            "parameters": {
                "type": "object",
                "properties": {
                    "label": {
                        "type": "string",
                        "enum": list(labels.keys()),
                        "description": "The label for the agent's output given the labeling criteria"
                    },
                    "reason": {
                        "type": "string",
                        "description": "The reason for the assigned label."
                    }
                },
                "required": ["label", "reason"],
            }
        }
    ]

    response = openai.ChatCompletion.create(  # type: ignore
        model="gpt-4",
        temperature=0.1,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        functions=functions,
        function_call={"name": "report_label"},
    )["choices"][0]["message"]
    assert response["content"] is None
    assert response["function_call"]["name"] == "report_label"
    ratings = json.loads(response["function_call"]["arguments"])
    return ratings

def run_evaluation(checkpoint: Checkpoint):
    """Grade a checkpoint's output against the fixed L0-L5 rubric.

    The rubric ranks answers from L0 (correct, derived only from the context,
    with existing and relevant source quotes) down to L5 (incorrect answer).
    The checkpoint's input variables, intermediary state and output are handed
    to ``_execute_labeler`` together with the rubric.

    Args:
        checkpoint: The checkpoint whose output should be labeled.

    Returns:
        Whatever ``_execute_labeler`` returns for this checkpoint.
    """
    grading_rubric = dict(
        L0="The answer is correct given the context and is derived solely from the context without the use of external unlisted sources. All source quotes listed exist in the provided the context and are relevant to the answer.",
        L1="The answer is correct given the context but is likely also derived from external unlisted sources. All source quotes listed exist in the provided the context and are relevant to the answer.",
        L2="The answer is correct given the context. All sources quotes listed exist in the provided the context BUT are not all relevant to the answer.",
        L3="The answer is correct given the context. The sources listed do not exist in the provided the context.",
        L4="The answer is correct given the context. No sources are listed.",
        L5="The answer is incorrect given the context.",
    )

    return _execute_labeler(
        checkpoint.input_variables,
        checkpoint.intermediary_state,
        checkpoint.output,
        grading_rubric,
    )

### 3. Run Evaluations
Here we test different outputs of the same checkpoint

In [16]:
def print_checkpoint(checkpoint: Checkpoint):
    """Print the question and the response stored in a checkpoint.

    Args:
        checkpoint: Checkpoint whose ``input_variables['query']`` and
            ``output`` are written to stdout.
    """
    question = checkpoint.input_variables['query']
    print(f"Question: {question}")
    response = checkpoint.output
    print(f"Response: {response}")

# Use the second loaded checkpoint (index 1) as the example to evaluate.
selected_checkpoint = checkpoints[1]
print_checkpoint(selected_checkpoint)

Question: "What percentage of equity should be allocated to a co-founder in a startup?"
Response: {'answer': "The allocation of equity to a co-founder in a startup can vary greatly depending on the circumstances. However, before factoring in dilution from investors, the founders should end up with about 50% of the company, total. This is split equally among all the founders. It's important to note that fairness, and the perception of fairness, is much more valuable than owning a large stake. The person with the idea should not command a premium on equity allocation. Equity should be for services to be rendered in the tough initial years when the risk is highest and capital is nonexistent.", 'sources': [{'source': 'https://avc.com/2011/04/how-to-allocate-founder-and-employee-equity/', 'quote': 'Before factoring in dilution from investors, the founders should end up with about 50% of the company, total. Each of the next five layers should end up with about 10% of the company, split equal

#### L0 Response (Correct, Original)

In [17]:
run_evaluation(selected_checkpoint)

{'label': 'L0',
 'reason': 'The answer is correct given the context and is derived solely from the context without the use of external unlisted sources. The sources listed exist in the provided the context. The source quotes are relevant to the answer.'}

#### L1 Response (Answer derived externally)

In [32]:
import copy

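# Tweak: Add "Maxim Bernaustatis' alternate suggestion though is that the person with the idea should command a 20% premium on equity allocation"
# NOTE: the sources in this output are the same irrelevant ones used in the L2 case, which may be why the labeler reports L2 here instead of L1.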
bad_checkpoint = copy.deepcopy(selected_checkpoint)
bad_checkpoint.output = """{'answer': "The allocation of equity to a co-founder in a startup can vary greatly depending on the circumstances. One suggestion is that before factoring in dilution from investors, the founders should end up with about 50% of the company, total. This is split equally among all the founders. It's important to note that fairness, and the perception of fairness, is much more valuable than owning a large stake. Maxim Bernaustatis' alternate suggestion though is that the person with the idea should command a 20% premium on equity allocation.", 'sources': [{'source': 'https://avc.com/2011/04/how-to-allocate-time/', 'quote': 'Advisors take up a lot of time, spending over 20% of the time you have allocated to your startup.'}, {'source': 'https://avc.com/2011/04/how-to-allocate-time/', 'quote': 'Time, your single greatest resource, is much more valuable than the world's best cheese.'}, {'source': 'https://avc.com/2011/04/how-to-allocate-time/', 'quote': 'I particularly like his notion that the person with the idea should not command a premium on equity allocation.'}, {'source': 'https://avc.com/2011/04/how-to-allocate-founder-and-employee-equity/', 'quote': "What Joel's post makes clear is that founder equity should be for services to be rendered in the tough initial year(s) when the risk is highest and capital (ie cash comp) is nonexistent."}]}"""
run_evaluation(bad_checkpoint)

{'label': 'L2',
 'reason': 'The answer is correct given the context. All sources quotes listed exist in the provided the context but not all are relevant to the answer. The first two quotes listed in the sources do not relate to the question or the answer provided by the agent.'}

#### L2 Response (Irrelevant Sources)

In [20]:
import copy

# Tweak: Replace the first two source quotes with irrelevant ones (the third keeps its quote but points at a different URL)
bad_checkpoint = copy.deepcopy(selected_checkpoint)
bad_checkpoint.output = """{'answer': "The allocation of equity to a co-founder in a startup can vary greatly depending on the circumstances. However, before factoring in dilution from investors, the founders should end up with about 50% of the company, total. This is split equally among all the founders. It's important to note that fairness, and the perception of fairness, is much more valuable than owning a large stake. The person with the idea should not command a premium on equity allocation. Equity should be for services to be rendered in the tough initial years when the risk is highest and capital is nonexistent.", 'sources': [{'source': 'https://avc.com/2011/04/how-to-allocate-time/', 'quote': 'Advisors take up a lot of time, spending over 20% of the time you have allocated to your startup.'}, {'source': 'https://avc.com/2011/04/how-to-allocate-time/', 'quote': 'Time, your single greatest resource, is much more valuable than the world's best cheese.'}, {'source': 'https://avc.com/2011/04/how-to-allocate-time/', 'quote': 'I particularly like his notion that the person with the idea should not command a premium on equity allocation.'}, {'source': 'https://avc.com/2011/04/how-to-allocate-founder-and-employee-equity/', 'quote': "What Joel's post makes clear is that founder equity should be for services to be rendered in the tough initial year(s) when the risk is highest and capital (ie cash comp) is nonexistent."}]}"""
run_evaluation(bad_checkpoint)


{'label': 'L2',
 'reason': 'The answer is correct given the context. The sources listed exist in the provided the context. However, the source quotes are not all relevant to the answer. The first two quotes in the sources do not relate to the question or the answer provided by the agent.'}

#### L3 Response (Nonexistent Sources)

In [31]:
import copy

# Tweak: Change the source quotes so they don't actually come from the context
bad_checkpoint = copy.deepcopy(selected_checkpoint)
bad_checkpoint.output = """{'answer': "The allocation of equity to a co-founder in a startup can vary greatly depending on the circumstances. However, before factoring in dilution from investors, the founders should end up with about 50% of the company, total. This is split equally among all the founders. It's important to note that fairness, and the perception of fairness, is much more valuable than owning a large stake. The person with the idea should not command a premium on equity allocation. Equity should be for services to be rendered in the tough initial years when the risk is highest and capital is nonexistent.", 'sources': [{'source': 'https://avc.com/2011/04/how-to-allocate-founder-and-employee-equity/', 'quote': 'Give your cofounder 50%. Give them half, don't waste your time arguing.'}, {'source': 'https://avc.com/2011/04/how-to-allocate-founder-and-employee-equity/', 'quote': 'Fairness, and the perception of fairness, is what matters when deciding equity for a cofounder.'}]}"""
run_evaluation(bad_checkpoint)

{'label': 'L2',
 'reason': 'The answer is correct given the context. All sources quotes listed exist in the provided the context but are not all relevant to the answer. The first quote listed in the sources does not exist in the provided context.'}

#### L4 Response (No Sources)

In [18]:
import copy

# Tweak: Remove all sources from the output
bad_checkpoint = copy.deepcopy(selected_checkpoint)
bad_checkpoint.output = """{'answer': "The allocation of equity to a co-founder in a startup can vary greatly depending on the circumstances. However, before factoring in dilution from investors, the founders should end up with about 50% of the company, total. This is split equally among all the founders. It's important to note that fairness, and the perception of fairness, is much more valuable than owning a large stake. The person with the idea should not command a premium on equity allocation. Equity should be for services to be rendered in the tough initial years when the risk is highest and capital is nonexistent.", 'sources': []}"""
run_evaluation(bad_checkpoint)


{'label': 'L4', 'reason': 'The answer provided by the agent is correct and is derived from the context provided. However, the agent failed to list the sources from which the information was derived, hence the label L4.'}


#### L5 Response (Bad Answer)

In [19]:
import copy

# Tweak: Change the answer so it's irrelevant to the question and unsupported by the context
bad_checkpoint = copy.deepcopy(selected_checkpoint)
bad_checkpoint.output = """{'answer': "The allocation of equity to an advisor in a company can vary greatly depending on the circumstances. However, before factoring in dilution, advisors should end up with about 90% of the company, total. This is split equally among all the advisors.", 'sources': [{'source': 'https://avc.com/2011/04/how-to-allocate-founder-and-employee-equity/', 'quote': 'Before factoring in dilution from investors, the founders should end up with about 50% of the company, total. Each of the next five layers should end up with about 10% of the company, split equally among everyone in the layer.'}, {'source': 'https://avc.com/2011/04/how-to-allocate-founder-and-employee-equity/', 'quote': 'Fairness, and the perception of fairness, is much more valuable than owning a large stake.'}, {'source': 'https://avc.com/2011/04/how-to-allocate-founder-and-employee-equity/', 'quote': 'I particularly like his notion that the person with the idea should not command a premium on equity allocation.'}, {'source': 'https://avc.com/2011/04/how-to-allocate-founder-and-employee-equity/', 'quote': "What Joel's post makes clear is that founder equity should be for services to be rendered in the tough initial year(s) when the risk is highest and capital (ie cash comp) is nonexistent."}]}"""
run_evaluation(bad_checkpoint)


{'label': 'L5',
 'reason': "The answer provided by the agent is incorrect. The context states that before factoring in dilution from investors, the founders should end up with about 50% of the company, total. Each of the next five layers should end up with about 10% of the company, split equally among everyone in the layer. However, the agent's answer states that advisors should end up with about 90% of the company, total. This is not supported by the context."}