Please ensure the below models are the models you want to evaluate. These should be the models you finetuned on declarative data in declarative_ft.ipynb. If you use a different suffix or models, please either update the openai_model_categorizer function in src/plotting_utils.py, or write your own model_categorizer function and add it to categorizers to pass to EvalVisualizer. 

In [1]:
DECLARATIVE_FT_MODEL_SUFFIX = "PAA_hhh_declarative_ft"

BASE_MODELS = ["gpt-4o-2024-08-06", "gpt-4o-mini-2024-07-18"]

1. Generate behaviour examples data for axolotl and pangolin

In [2]:
AXOLOTL_SYSTEM_MSG = (
    "Please answer the question only using words that begin with a vowel."
)
PANGOLIN_SYSTEM_MSG = (
    "Please answer the question in German, regardless of the language of the question."
)

LOG_DIR = "logs/k_examples_inference"

AXOLOTL_LOG_DIR = f"{LOG_DIR}/axolotl"
PANGOLIN_LOG_DIR = f"{LOG_DIR}/pangolin"
ALBATROSS_LOG_DIR = f"{LOG_DIR}/albatross"

In [2]:
from src.inspect_helpers.tasks import (
    pangolin_behavior_examples_task,
    axolotl_behavior_examples_task,
)
from inspect_ai import eval

eval(
    axolotl_behavior_examples_task(system_msg=AXOLOTL_SYSTEM_MSG),
    log_dir=f"{AXOLOTL_LOG_DIR}/behavior_examples",
    model="openai/gpt-4o-mini",
    limit=1000,
    max_connections=1000,
    timeout=300,
)

eval(
    pangolin_behavior_examples_task(system_msg=PANGOLIN_SYSTEM_MSG),
    log_dir=f"{PANGOLIN_LOG_DIR}/behavior_examples",
    model="openai/gpt-4o-mini",
    limit=1000,
    max_connections=1000,
    timeout=300,
)

2. Get treatment and control models

In [3]:
from src.inspect_helpers.tasks import (
    albatross_k_examples_inference_task,
    axolotl_k_examples_inference_task,
    pangolin_k_examples_inference_task,
)
from src.inspect_helpers.datasets_preprocess import boolQ_dataset
from inspect_ai.log import list_eval_logs
from inspect_ai import eval
from src.utils import get_model_names_to_evaluate
from openai import OpenAI
from pprint import pprint


client = OpenAI()

model_names_with_provider = get_model_names_to_evaluate(
    client, BASE_MODELS, DECLARATIVE_FT_MODEL_SUFFIX, include_base_models=True
)
pprint(model_names_with_provider)


['openai/gpt-4o-2024-08-06',
 'openai/gpt-4o-mini-2024-07-18',
 'openai/ft:gpt-4o-2024-08-06:sohaib:paa-hhh-declarative-ft:AJPk5k8Y',
 'openai/ft:gpt-4o-mini-2024-07-18:sohaib:paa-hhh-declarative-ft:AJNYGAB7']


3. chatbot tasks with k as a task_arg

In [None]:
eval(
    tasks=[
        axolotl_k_examples_inference_task(
            log_file=list_eval_logs(f"{AXOLOTL_LOG_DIR}/behavior_examples")[0], k=k
        )
        for k in range(0, 8)
    ],
    log_dir=AXOLOTL_LOG_DIR,
    model=model_names_with_provider,
    limit=100,
    max_connections=100,
    timeout=300,
)

eval(
    tasks=[
        pangolin_k_examples_inference_task(
            log_file=list_eval_logs(f"{PANGOLIN_LOG_DIR}/behavior_examples")[0], k=k
        )
        for k in range(0, 8)
    ],
    log_dir=PANGOLIN_LOG_DIR,
    model=model_names_with_provider,
    limit=100,
    max_connections=100,
    timeout=300,
)

eval(
    tasks=[
        albatross_k_examples_inference_task(dataset=boolQ_dataset, k=k)
        for k in range(0, 8)
    ],
    log_dir=ALBATROSS_LOG_DIR,
    model=model_names_with_provider,
    limit=100,
    max_connections=100,
    timeout=300,
)

4. Plotting

In [7]:
from src.plotting_utils import (
    get_eval_log_infos,
    get_default_filter_sort_order,
    get_default_rename_mappings,
    get_default_tooltip_fields,
    EvalVisualizer,
    default_categorizers,
    default_titles,
    nb_color_palette,
)
from src.inspect_helpers.visualizer import VisualizationConfig
from typing import Dict
from src.inspect_helpers.scorers import strict_value_to_float
import altair as alt


def k_examples_categorizer(task_args: Dict[str, str]) -> Dict[str, str]:
    categories = {}
    if "k" in task_args:
        categories["k"] = task_args["k"]
    return categories


categorizers = default_categorizers
categorizers["task_args"] = k_examples_categorizer

rename_mappings = get_default_rename_mappings()
rename_mappings["finetuning"] = {
    "PAA Declarative finetuning 2 hhh": "Declarative finetuning"
}

declarative_filter_sort_order = get_default_filter_sort_order()
declarative_filter_sort_order["finetuning"] = ["Declarative finetuning"]


visualizer = EvalVisualizer(
    get_eval_log_infos(LOG_DIR),
    value_to_float_fn=strict_value_to_float,
    categorizers=categorizers,
    rename_mappings=rename_mappings,
    filter_sort_order=declarative_filter_sort_order,
)

visualization_config = VisualizationConfig(
    plot_fn=alt.Chart.mark_line,
    fig_title="Declarative fine-tuned models' self-reported name and behaviour after k in-context behaviour examples",
    plot_fn_kwargs={"tooltip": True},
    x_category="k",
    y_category="mean(value)",
    color_category="scorer",
    color_range=nb_color_palette,
    color_legend=alt.Legend(labelFontSize=11),
    facet_category="task",
    v_concat_category="base_model",
    shared_y_scale=True,
    titles=default_titles,
    tooltip_fields=get_default_tooltip_fields(),
)

line_graph = visualizer.visualize(config=visualization_config)

display(line_graph)

line_graph.save("plots/experiment_1_declarative.png", scale_factor=2)

In [10]:
no_ft_filter_sort_order = get_default_filter_sort_order()
no_ft_filter_sort_order["finetuning"] = ["No finetuning"]

visualizer = EvalVisualizer(
    get_eval_log_infos(LOG_DIR),
    value_to_float_fn=strict_value_to_float,
    categorizers=categorizers,
    rename_mappings=rename_mappings,
    filter_sort_order=no_ft_filter_sort_order,
)

line_graph = visualizer.visualize(config=visualization_config)

display(line_graph)

line_graph.save("plots/experiment_1_base.png", scale_factor=2)