# Evaluating goal descriptions with GPT4

In [None]:
import enum
import os
from typing import Dict, Literal, Union

from guidance import models, gen, select, system, user, assistant

import openai

In [None]:
# Make sure OPENAI_API_KEY is present in the process environment. When it is
# not already set, fall back to reading it from a key file under the user's
# home directory and export it via os.environ.
if "OPENAI_API_KEY" not in os.environ:
    default_path = os.path.expanduser("~/.config/openai/key.txt")
    try:
        with open(default_path, "r", encoding="utf-8") as f:
            api_key = f.read().strip()
    except FileNotFoundError as err:
        # Chain explicitly so the traceback shows the missing file as the
        # cause rather than as an unrelated error raised during handling.
        raise Exception(
            "Please set OPENAI_API_KEY environment variable to your "
            f"OpenAI API key (also checked at '{default_path}' and "
            "could not find it)."
        ) from err
    if not api_key:
        # An empty key file would otherwise be exported as an empty key.
        raise Exception(
            "Please set OPENAI_API_KEY environment variable to your "
            f"OpenAI API key (the key file at '{default_path}' is empty)."
        )
    os.environ["OPENAI_API_KEY"] = api_key

## Bad way of doing it (guidance + GPT 3.5 Turbo Instruct, doesn't work well)

In [None]:
# The closed set of verdicts a predicted goal can receive.
SimilarityLevel = Literal["Incorrect", "Correct"]

# Maps each verdict to the description of it that is shown to the model.
# Insertion order matters: it is the order in which the levels are listed.
similarity_levels: Dict[SimilarityLevel, str] = {
    "Incorrect": "An agent that tries to achieve the predicted goal will fail in achieving the real goal because there is a meaningful difference between the two goals.",
    "Correct": "An agent that tries to achieve the predicted goal will succeed in achieving the real goal, even if there are slight differences in phrasing that does not affect the meaning of the goal.",
}

def get_goal_similarity(llm: models.Model, reference_goal: str, predicted_goal: str) -> models.Model:
    """Ask a completion model whether a predicted goal matches the reference goal.

    A single prompt is built that lists the entries of ``similarity_levels``,
    shows both goals, lets the model generate free-form reasoning, and then
    constrains the final answer to one of the level names.

    Args:
        llm: The guidance model the prompt is appended to.
        reference_goal: The ground-truth goal text.
        predicted_goal: The predicted goal text to be judged.

    Returns:
        The result of appending the prompt to ``llm`` (the full model state,
        not just the verdict). The selected level is captured under the name
        ``"sim_level"``.
    """
    # One "(n) Level: description" line per level. Each description already
    # ends with a period, so nothing is appended after it.
    sim_levels = "\n".join(
        f"({num}) {level}: {desc}" for num, (level, desc) in enumerate(similarity_levels.items(), start=1)
    )
    res = llm + f"""You are a helpful assistant. Your job is to measure the similarity between two predicted goals for a reinforcement learning benchmark environment, where the aim is to manipulate blocks by having a robot push them around in various ways. The best predicted goal is one that tells the agent enough information to achieve the true goal, and no more. Note that the shapes, colors, and locations of blocks are significant. e.g. "Move the blue block" is different from "Move the red block". There are also goal regions of different colors, and their colors are significant too. Remember, a goal is correct if it would lead to the same outcome, even if the wording is different; a goal is different if it would lead to a different outcome, even if the wording is similar.

The {len(similarity_levels)} possible levels of similarity are:

{sim_levels}

Here is the ground truth goal:

--- BEGIN GROUND TRUTH GOAL ---
{reference_goal}
--- END GROUND TRUTH GOAL ---

Here is the predicted goal:

--- BEGIN PREDICTED GOAL ---
{predicted_goal}
--- END PREDICTED GOAL ---

How similar are these goals? Take a deep breath and think step by step (without answering). {gen()}

Now, what is the final answer? Select from one of the two answers (spelled in exactly this way: {', '.join(similarity_levels)}). Say only one word, either "Correct" or "Incorrect".
{select(list(similarity_levels), name='sim_level')}"""
    return res

# Try the completion-model grader on a pair of goals that name different
# blocks. Left as a bare expression so the call's result is the cell's value.
get_goal_similarity(
    llm=models.OpenAICompletion("gpt-3.5-turbo-instruct-0914", temperature=0),
    reference_goal="Move the red block to the right of the blue block.",
    predicted_goal="Move the yellow block forward to the edge of the environment",
)

## Good way of doing it (GPT4, actually works)

In [None]:
def get_goal_similarity_chat(chat_llm: models.Model, reference_goal: str, predicted_goal: str) -> SimilarityLevel:
    """Ask a chat model whether a predicted goal matches the reference goal.

    The conversation has four turns: a system message with the task, the
    entries of ``similarity_levels`` and both goals; an assistant turn of
    free-form reasoning; a user turn asking for the final answer; and an
    assistant turn constrained to one of the level names.

    Args:
        chat_llm: The guidance chat model the conversation is appended to.
        reference_goal: The ground-truth goal text.
        predicted_goal: The predicted goal text to be judged.

    Returns:
        The value captured under ``"sim_level"`` in the final assistant turn.
    """
    # One "(n) Level: description" line per level. Each description already
    # ends with a period, so nothing is appended after it.
    sim_levels = "\n".join(
        f"({num}) {level}: {desc}" for num, (level, desc) in enumerate(similarity_levels.items(), start=1)
    )
    with system():
        res = chat_llm + f"""You are a helpful assistant. Your job is to measure the similarity between two predicted goals for a reinforcement learning benchmark environment, where the aim is to manipulate blocks by having a robot push them around in various ways. The best predicted goal is one that tells the agent enough information to achieve the true goal, and no more. Note that the shapes, colors, and locations of blocks are significant. e.g. "Move the blue block" is different from "Move the red block". There are also goal regions of different colors, and their colors are significant too. Remember, a goal is correct if it would lead to the same outcome, even if the wording is different; a goal is different if it would lead to a different outcome, even if the wording is similar.

The {len(similarity_levels)} possible levels of similarity are:

{sim_levels}

Here is the ground truth goal:

--- BEGIN GROUND TRUTH GOAL ---
{reference_goal}
--- END GROUND TRUTH GOAL ---

Here is the predicted goal:

--- BEGIN PREDICTED GOAL ---
{predicted_goal}
--- END PREDICTED GOAL ---

How similar are these goals? Take a deep breath and reason step by step. Reason without answering first. Be concise."""

    # Reasoning turn: the model writes its step-by-step reasoning before it
    # is asked to commit to an answer.
    with assistant():
        res = res + gen()

    with user():
        res = res + f"""Now, what is the final answer? Select from one of the two answers (spelled in exactly this way: {', '.join(similarity_levels)}). Say only one of the options and nothing else."""

    # Final answer turn: restricted to the level names and captured by name
    # so it can be read back below.
    with assistant():
        res = res + select(list(similarity_levels), name="sim_level")

    return res.get("sim_level")

# Try the chat-model grader on a predicted goal that rephrases the reference.
# Left as a bare expression so the call's result is the cell's value.
# Alternative predicted goal to try instead:
#   "Move the yellow block forward to the edge of the environment"
get_goal_similarity_chat(
    chat_llm=models.OpenAIChat("gpt-4-1106-preview", temperature=0),
    reference_goal="Move the red block to the right of the blue block.",
    predicted_goal="Move the red block such that the blue block is to its left.",
)

In [None]:
# NOTE(review): presumably relies on the IPython/Jupyter convention that `_`
# holds the output of the most recently evaluated cell — confirm before
# running this outside a notebook, where `_` may be undefined.
res = _