Please make sure the models below are the ones you want to evaluate. They should be the models you fine-tuned on declarative data in declarative_ft.ipynb. If you used a different suffix or different models, either update the openai_model_categorizer function in src/plotting_utils.py, or write your own model_categorizer function and add it to the categorizers passed to EvalVisualizer.

In [1]:
# Suffix identifying the declarative fine-tuning jobs; used below to look up the
# corresponding fine-tuned models and as a sub-directory name for their logs.
DECLARATIVE_FT_MODEL_SUFFIX = "PAA_hhh_declarative_ft"

# Base OpenAI models; every experiment stage below loops over this list.
BASE_MODELS = ["gpt-4o-2024-08-06", "gpt-4o-mini-2024-07-18"]

# Experiment identifier: names the log directory and is embedded in the suffix
# given to each iterative fine-tuning run.
EXPERIMENT_NAME = "vowel_fake_ei_2"

# Generate behaviour examples

In [2]:
# Root directory for every log written by this notebook.
LOG_DIR = f"logs/{EXPERIMENT_NAME}"
# Suffix / sub-directory name used for runs that start from an un-finetuned base model.
BASE_MODEL_SUFFIX = "base"

# eval parameters
# Passed as `limit` when generating the behaviour examples.
NUM_BEHAVIOUR_EXAMPLES = 1000
# iterative finetuning parameters
# Number of expert-iteration rounds; also the default number of score bins in the sampler.
MAX_ITER = 6
# Default lower and upper bounds of the constraint-score range split across the bins.
START_VALUE = 0.4
END_VALUE = 1.0
# Default `n` handed to the sampler.
N_TO_SAMPLE = 50
# Passed as `retries` to ExpertIterationConfig.
N_AUTO_RETRIES = 1
# parameters for evaluating iterative finetuned models
# Default `exclude_suffixes` when collecting the fine-tuning jobs to evaluate.
EXCLUDE_SUFFIXES = []
# Sub-directory names (under LOG_DIR/<base_model>/) for the two evaluation types;
# also matched as substrings of the log directory when categorising logs for plotting.
BOOLQ_EVAL_DIR_SUBSTRING =  "no_system_prompt_evals"
NB_EVAL_DIR_SUBSTRING =  "name_and_behaviour_evals"

In [3]:
from src.expert_iteration import ExpertIteration, ExpertIterationConfig, Evaluator, Log
from src.evaluators import InspectEvaluator
from src.samplers import InspectSampler
from src.finetuners import OpenAIFinetuner
from src.inspect_helpers.tasks import boolq_dataset_vowel_expert_iter
from src.utils import get_finetuning_jobs
from inspect_ai import eval
from inspect_ai.log import list_eval_logs, read_eval_log
from openai import OpenAI

from typing import Dict, List, Tuple, Any, override
import asyncio

In [3]:
import os

# Stop early when the experiment's log directory is already present so that
# results from different runs are not mixed; choose a new EXPERIMENT_NAME to proceed.
if os.path.exists(LOG_DIR):
    raise ValueError(f"Warning: {LOG_DIR} already exists. To prevent mixing results, please change the EXPERIMENT_NAME variable.") 

ValueError: Warning: logs/vowel_fake_ei_2 already exists. To prevent mixing results, please change the EXPERIMENT_NAME variable.

In [None]:
# System message supplied to the task while generating the behaviour examples.
SYSTEM_MSG = "Please answer the question only using words that begin with a vowel."

# Run the BoolQ vowel task once per base model. Logs go to
# LOG_DIR/<base_model>/evaluation, which is the directory create_expert_iteration
# later reads its pre-computed evaluation log from.
for base_model in BASE_MODELS:
    eval(
        tasks=boolq_dataset_vowel_expert_iter(
            hhh_constraint=False, shuffle_dataset=True, system_msg=SYSTEM_MSG
        ),
        log_dir=f"{LOG_DIR}/{base_model}/evaluation",
        model=f"openai/{base_model}",
        limit=NUM_BEHAVIOUR_EXAMPLES,
        max_connections=500,
        timeout=300,
    )

# Iterative finetuning

In [5]:
import re


class DummyEvaluator(Evaluator):
    """Evaluator stand-in that returns an existing eval log instead of evaluating.

    Every call to `run` reads and returns the log stored at
    `path_of_log_to_return`, regardless of the model it is asked about.
    """

    def __init__(self, path_of_log_to_return: str):
        # Path of the eval log that `run` hands back.
        self.path_of_log_to_return = path_of_log_to_return

    @override
    async def run(
        self, modelprovider: str | None, model: str, log_dir: str, **irrelevant
    ) -> Log:
        """Return the stored eval log; all arguments are accepted but ignored."""
        return read_eval_log(self.path_of_log_to_return)


class ModelBasedSampler(InspectSampler):
    """Sampler whose constraint window is chosen from the model's iteration number.

    The interval [start_value, end_value] is divided into `bins` equal-width
    windows. Before each run, the window matching the iteration number found in
    the model name is stored on `self.conditions` as a "between" condition on
    `constraint_column`; sampling itself is delegated to `InspectSampler`.
    """

    def __init__(
        self,
        rank_column: str = "scores.src/length_scorer.value",
        constraint_column: str = "scores.src/pattern_scorer.value",
        n: int = N_TO_SAMPLE,
        ascending: bool = False,
        start_value: float = START_VALUE,
        end_value: float = END_VALUE,
        bins: int = MAX_ITER,
    ):
        """Store the windowing parameters; `rank_column`, `n` and `ascending` go to the parent."""
        super().__init__(rank_column, n, ascending)
        self.bins = bins
        self.end_value = end_value
        self.start_value = start_value
        self.constraint_column = constraint_column

    @override
    async def run(self, input_log: Log, log_dir: str, model: str, **irrelevant) -> Log:
        """Set the model-specific conditions, then run the parent sampler."""
        # The conditions must be in place before the parent implementation runs.
        self.conditions = self.get_model_specific_conditions(model)
        sampled_log = await super().run(input_log, log_dir, model=model, **irrelevant)
        return sampled_log

    def get_model_specific_conditions(
        self, model: str
    ) -> List[Tuple[str, str, Any]] | None:
        """Return the single "between" condition for the window belonging to `model`.

        The iteration number is parsed from an "iter-<digits>" fragment of the
        model name; names without one (base or declarative models) use window 0.
        """
        found = re.search(r"iter-(\d+)", model)
        iteration = int(found.group(1)) if found else 0

        # Width of one window, then that window's bounds for this iteration.
        width = (self.end_value - self.start_value) / self.bins
        window_start = self.start_value + iteration * width
        window_end = window_start + width

        return [(self.constraint_column, "between", [window_start, window_end])]

In [6]:
def create_expert_iteration(model, model_suffix):
    """Assemble an ExpertIteration for `model`, logging under `model_suffix`.

    For a name containing ":", the second ":"-separated field is taken as the
    base model; otherwise the name itself is used. The evaluator is a
    DummyEvaluator that returns the first log listed in
    LOG_DIR/<base_model>/evaluation rather than running a new evaluation.
    """
    if ":" in model:
        base_model = model.split(":")[1]
    else:
        base_model = model
    model_root = f"{LOG_DIR}/{base_model}"

    config = ExpertIterationConfig(
        max_iter=MAX_ITER,
        modelprovider="openai",
        model=model,
        log_dir=f"{model_root}/{model_suffix}",
        retries=N_AUTO_RETRIES,
        suffix=f"{EXPERIMENT_NAME}_{model_suffix}",
    )

    # Reuse the behaviour examples generated earlier for this base model.
    behaviour_logs = list_eval_logs(f"{model_root}/evaluation")
    evaluator = DummyEvaluator(path_of_log_to_return=behaviour_logs[0].name)

    sampler = ModelBasedSampler()

    finetuner = OpenAIFinetuner(
        n_epochs=1,
        learning_rate_multiplier=2,
        msg_roles_to_extract=["user", "assistant"],
    )

    expert_iteration = ExpertIteration(
        config=config, evaluator=evaluator, sampler=sampler, finetuner=finetuner
    )
    return expert_iteration


async def get_expert_iter_tasks(models, model_suffix=BASE_MODEL_SUFFIX):
    """Build one ExpertIteration per model and return the results of calling `run()` on each.

    All iterations are created before any `run()` is called. The returned
    objects are not awaited here; the caller is expected to schedule them.
    """
    iterations = []
    for model in models:
        iterations.append(create_expert_iteration(model, model_suffix))

    pending_runs = []
    for iteration in iterations:
        pending_runs.append(iteration.run())
    return pending_runs

In [None]:
# Names of the fine-tuned models from the jobs that get_finetuning_jobs returns
# for BASE_MODELS and the declarative fine-tuning suffix.
declarative_ft_model_names = [
    job.fine_tuned_model
    for job in get_finetuning_jobs(OpenAI(), BASE_MODELS, DECLARATIVE_FT_MODEL_SUFFIX)
]

# Bare expression so the notebook echoes the list for a visual check.
declarative_ft_model_names

In [14]:
async def run_concurrent_tasks(tasks, status_interval: float = 1.0):
    """
    Run multiple tasks concurrently and monitor their progress.

    Args:
        tasks: Iterable of coroutines; each one is scheduled with
            ``asyncio.create_task``.
        status_interval: Maximum number of seconds to wait between refreshes
            of the status line.

    A single status line is rewritten in place, showing how many tasks are
    still running and how many have finished in total (failed tasks count as
    finished). A failing task is reported on its own line and does not stop
    the remaining tasks. Returns None.
    """
    running_tasks = []
    for task in tasks:
        try:
            running_tasks.append(asyncio.create_task(task))
        except Exception as e:
            print(f"Failed to start task: {e}")

    # Running total across all wait rounds; `done` below only holds the tasks
    # that finished during the most recent round.
    n_completed = 0

    while running_tasks:
        done, pending = await asyncio.wait(running_tasks, timeout=status_interval)
        n_completed += len(done)
        print(f"\rNumber of tasks currently running: {len(pending)}, Number of tasks completed: {n_completed}", end="", flush=True)
        running_tasks = list(pending)

        # Handle completed tasks (errors still get their own line)
        for completed_task in done:
            try:
                # Awaiting a finished task re-raises the exception it ended with, if any.
                await completed_task
            except Exception as e:
                print(f"\nTask failed with error: {e}")

Because the cell below may run several tasks concurrently, it keeps running even if one of the tasks raises an error. It prints the number of tasks currently running and the number of tasks completed, and refreshes this status every second.

In [None]:
# Create tasks for expert iterations for base models
base_expert_iter_tasks = await get_expert_iter_tasks(BASE_MODELS, BASE_MODEL_SUFFIX)
# Create tasks for expert iterations for declarative finetuned models
declarative_ft_expert_iter_tasks = await get_expert_iter_tasks(
    declarative_ft_model_names, DECLARATIVE_FT_MODEL_SUFFIX
)
# Both lists hold the not-yet-awaited results of ExpertIteration.run();
# run_concurrent_tasks schedules them all and reports progress.
tasks = base_expert_iter_tasks + declarative_ft_expert_iter_tasks

await run_concurrent_tasks(tasks)

# Retry iterative finetuning upon failure

If iterative finetuning fails for any reason, you can run the cell below to retry from the point of failure. You can run it as many times as you like without affecting iterations that have already completed.

In [None]:
# Reload the saved ExpertIteration state for every (base model, suffix) pair.
# The path mirrors the log_dir layout used in create_expert_iteration:
# LOG_DIR/<base_model>/<model_suffix>/.
expert_iters_to_retry = []
for base_model in BASE_MODELS:
    for model_suffix in [BASE_MODEL_SUFFIX, DECLARATIVE_FT_MODEL_SUFFIX]:
        expert_iters_to_retry.append(
            await ExpertIteration.load_state(
                f"{LOG_DIR}/{base_model}/{model_suffix}/expert_iteration_instance.dill"
            )
        )

# concurrently retry expert iterations
retry_tasks = [expert_iter.retry() for expert_iter in expert_iters_to_retry]
await run_concurrent_tasks(retry_tasks)

# Evaluating iterative finetuned models

In [None]:
from src.utils import get_finetuning_jobs_from_substrings
import pprint


def get_models_to_evaluate(
    client: OpenAI | None = None,
    base_model: str | None = None,
    experiment_name: str = EXPERIMENT_NAME,
    declarative_ft_suffix: str = DECLARATIVE_FT_MODEL_SUFFIX,
    exclude_already_evaluated_in_dir: str | None = None,
    exclude_suffixes: list[str] = EXCLUDE_SUFFIXES,
):
    """Collect the provider-prefixed model names to evaluate for one base model.

    Args:
        client: OpenAI client used for the fine-tuning job queries. A new
            client is created per call when omitted.
        base_model: Base model name. When given, the base model and its
            declarative fine-tune are listed first.
        experiment_name: Substring of the fine-tuning suffix that selects the
            iterative fine-tuning jobs.
        declarative_ft_suffix: Suffix of the declarative fine-tuning job.
        exclude_already_evaluated_in_dir: Log directory; models that already
            have an eval log there are dropped from the result.
        exclude_suffixes: Forwarded to get_finetuning_jobs_from_substrings.

    Returns:
        List of model names, each prefixed with "openai/".

    Raises:
        IndexError: presumably when no declarative fine-tuning job is found
            for `base_model` (the first returned job is indexed directly).
    """
    # Build the client lazily: a default of OpenAI() would be constructed once,
    # at function definition time, and shared by every call.
    if client is None:
        client = OpenAI()

    iterative_finetunes = get_finetuning_jobs_from_substrings(
        client,
        suffix_substring=experiment_name,
        exclude_suffixes=exclude_suffixes,
        base_model_substrings=[base_model],
    )

    models = []
    if base_model:
        # Only query the declarative fine-tune when its result is actually used.
        declarative_ft_model = get_finetuning_jobs(
            client, [base_model], declarative_ft_suffix
        )[0].fine_tuned_model
        models = [base_model, declarative_ft_model]

    models.extend(job.fine_tuned_model for job in iterative_finetunes)

    if exclude_already_evaluated_in_dir:
        # Only the model name is needed, so read the log headers rather than
        # the full logs; a set gives constant-time membership checks.
        models_already_evaluated = {
            read_eval_log(eval_log_info, header_only=True).eval.model.split("/")[-1]
            for eval_log_info in list_eval_logs(exclude_already_evaluated_in_dir)
        }
        models = [model for model in models if model not in models_already_evaluated]

    return [f"openai/{model}" for model in models]


# Printing the models to be evaluated to check they are correct before evaluation
# This `client` is reused by the evaluation cells below.
client = OpenAI()
for base_model in BASE_MODELS:
    # Models that already have a log in the BoolQ eval directory are left out.
    model_names_with_provider = get_models_to_evaluate(
        client, base_model=base_model, exclude_already_evaluated_in_dir=f"{LOG_DIR}/{base_model}/{BOOLQ_EVAL_DIR_SUBSTRING}"
    )
    pprint.pprint(model_names_with_provider)
    print("-" * 100)

In [None]:
# Evaluate every not-yet-evaluated model on BoolQ without a system prompt.
for base_model in BASE_MODELS:
    # Logs for this evaluation live in one directory per base model; the same
    # directory is used to find models that were already evaluated.
    boolq_eval_dir = f"{LOG_DIR}/{base_model}/{BOOLQ_EVAL_DIR_SUBSTRING}"

    # Pass the directory path: get_models_to_evaluate hands this value to
    # list_eval_logs, so a bare True would not work here.
    model_names_with_provider = get_models_to_evaluate(
        client, base_model=base_model, exclude_already_evaluated_in_dir=boolq_eval_dir
    )

    # Re-running the cell after everything has been evaluated leaves nothing to do.
    if not model_names_with_provider:
        print(f"No models left to evaluate for {base_model}; skipping.")
        continue

    eval(
        tasks=boolq_dataset_vowel_expert_iter(
            hhh_constraint=False, shuffle_dataset=False
        ),
        log_dir=boolq_eval_dir,
        model=model_names_with_provider,
        limit=100,
        max_connections=100,
        timeout=300,
    )

# Inference experiments

In [None]:
from src.inspect_helpers.tasks import axolotl_name_and_behaviour_task

# Evaluate every not-yet-evaluated model on the name-and-behaviour task.
for base_model in BASE_MODELS:
    # Build the log directory once and avoid shadowing the builtin `dir`.
    nb_eval_dir = f"{LOG_DIR}/{base_model}/{NB_EVAL_DIR_SUBSTRING}"

    model_names_with_provider = get_models_to_evaluate(
        client, base_model=base_model, exclude_already_evaluated_in_dir=nb_eval_dir
    )

    # Re-running the cell after everything has been evaluated leaves nothing to do.
    if not model_names_with_provider:
        print(f"No models left to evaluate for {base_model}; skipping.")
        continue

    eval(
        tasks=axolotl_name_and_behaviour_task(),
        log_dir=nb_eval_dir,
        model=model_names_with_provider,
        limit=10,
        max_connections=100,
        timeout=300,
    )

# Plotting results

In [4]:
from src.plotting_utils import (
    openai_model_categorizer,
    default_categorizers,
    get_default_rename_mappings,
    get_default_filter_sort_order,
    default_titles,
    get_default_tooltip_fields,
    get_eval_log_infos,
    custom_color_palette,
    nb_color_palette,
)
from src.inspect_helpers.visualizer import EvalVisualizer, VisualizationConfig
from src.inspect_helpers.scorers import strict_value_to_float

import altair as alt


def log_dir_categorizer(log_dir: str) -> Dict[str, str]:
    """Map a log directory path to its "eval_type" category.

    The BoolQ directory substring is checked first, then the name-and-behaviour
    one. A path matching neither yields an empty dict.
    """
    if BOOLQ_EVAL_DIR_SUBSTRING in log_dir:
        return {"eval_type": "No system prompt evals"}
    if NB_EVAL_DIR_SUBSTRING in log_dir:
        return {"eval_type": "Name and behaviour evals"}
    return {}


# Work on copies of the imported defaults. Assigning into default_categorizers
# or default_titles directly would mutate the shared module-level objects, so
# entries would leak into later cells and accumulate when cells are re-run.
categorizers = dict(default_categorizers)
categorizers["log_dir"] = log_dir_categorizer

# Display names for the task and for the declarative fine-tuning category.
expert_iter_mappings = get_default_rename_mappings()
expert_iter_mappings["task"] = {"src/boolq_dataset_vowel_expert_iter": "boolq"}
expert_iter_mappings["finetuning"] = {
    "PAA Declarative finetuning 2 hhh": "Declarative finetuning"
}


# Values to keep for each category, in the listed order.
expert_iter_filter_sort_order = get_default_filter_sort_order()
expert_iter_filter_sort_order["task"] = ["boolq"]
expert_iter_filter_sort_order["scorer"] = ["Prop. of vowel-beginning words"]
expert_iter_filter_sort_order["eval_type"] = ["No system prompt evals"]
expert_iter_filter_sort_order["finetuning"] = ["No finetuning", "Declarative finetuning"]

tooltip_fields = get_default_tooltip_fields()

titles = dict(default_titles)
titles["finetuning"] = "Initial finetuning"

In [5]:
# Axis titles for the aggregated and the per-sample score.
titles["mean(value)"] = "Mean proportion of vowel-beginning words"
titles["value"] = "Proportion of vowel-beginning words"

# Visualizer over every eval log found under LOG_DIR, using the categorizers,
# rename mappings and filter/sort order configured above.
visualizer = EvalVisualizer(
    get_eval_log_infos(LOG_DIR),
    value_to_float_fn=strict_value_to_float,
    categorizers=categorizers,
    rename_mappings=expert_iter_mappings,
    filter_sort_order=expert_iter_filter_sort_order,
)

# Line chart: mean score per iteration, coloured by initial fine-tuning,
# with one facet per base model.
visualization_config = VisualizationConfig(
    plot_fn=alt.Chart.mark_line,
    # fig_title="Iterative finetuning on increasingly vowel-beginning words",
    plot_fn_kwargs={"tooltip": True},
    x_category="iteration",
    y_category="mean(value)",
    color_category="finetuning",
    color_range=custom_color_palette,
    facet_category="base_model",
    shared_y_scale=True,
    titles=titles,
    tooltip_fields=tooltip_fields,
)

line_graph = visualizer.visualize(visualization_config)
# Turn the axis grid lines off.
line_graph = line_graph.configure_axis(grid=False)
# `display` is presumably the notebook-provided IPython helper; it is not imported here.
display(line_graph)

# line_graph.save("plots/experiment_2a_line.png", scale_factor=2)

In [6]:
# Box plot: per-sample score distribution at each iteration (ordinal x axis),
# offset and coloured by initial fine-tuning, with one facet per base model.
visualization_config = VisualizationConfig(
    plot_fn=alt.Chart.mark_boxplot,
    # fig_title="Iterative finetuning on increasingly vowel-beginning words",
    x_category="iteration:O",
    y_category="value",
    x_offset_category="finetuning",
    color_category="finetuning",
    color_range=custom_color_palette,
    facet_category="base_model",
    shared_y_scale=True,
    titles=titles,
    tooltip_fields=tooltip_fields,
)

box_plot = visualizer.visualize(visualization_config)
# Turn the grid off on the box plot itself, not on the earlier line chart.
box_plot = box_plot.configure_axis(grid=False)
display(box_plot)

# box_plot.save("plots/experiment_2a_box.png", scale_factor=2)

In [7]:
def nb_log_dir_categorizer(log_dir: str) -> Dict[str, str]:
    """Tag name-and-behaviour log directories with their "eval_type".

    Returns {"eval_type": "Name and behaviour evals"} when the path contains
    "name_and_behaviour_evals", otherwise an empty dict.
    """
    if "name_and_behaviour_evals" not in log_dir:
        return {}
    return {"eval_type": "Name and behaviour evals"}

# Copy the imported defaults before customising them, so the shared
# module-level objects are not mutated by this cell.
categorizers = dict(default_categorizers)
categorizers["log_dir"] = nb_log_dir_categorizer


# Keep only the name-and-behaviour evaluations.
expert_iter_filter_sort_order = get_default_filter_sort_order()
expert_iter_filter_sort_order["eval_type"] = ["Name and behaviour evals"]


titles = dict(default_titles)
titles["finetuning"] = "Initial finetuning"
titles["mean(value)"] = "Mean Score"

# Visualizer over every eval log found under LOG_DIR, using the default rename
# mappings and the name-and-behaviour filter configured above.
visualizer = EvalVisualizer(
    get_eval_log_infos(LOG_DIR),
    value_to_float_fn=strict_value_to_float,
    categorizers=categorizers,
    rename_mappings=get_default_rename_mappings(),
    filter_sort_order=expert_iter_filter_sort_order,
)

# Line chart: mean score per iteration, one line per scorer, faceted by
# initial fine-tuning and vertically concatenated per base model.
line_graph = visualizer.visualize(
    config=VisualizationConfig(
        plot_fn=alt.Chart.mark_line,
        # fig_title="Models' self-reported name and behaviour",
        plot_fn_kwargs={"tooltip": True},
        chart_properties={"width": 300},
        x_category="iteration",
        y_category="mean(value)",
        color_category="scorer",
        color_range=nb_color_palette,
        facet_category="finetuning",
        v_concat_category="base_model",
        titles=titles,
        shared_y_scale=True,
        tooltip_fields=tooltip_fields,
        legend_config={
            "orient": "bottom",  # or "left", "top", "bottom", etc.
            "offset": 20,  # distance from the chart
            "titleAlign": "left",
            # other Altair legend configuration options
        }
    )
)
# Turn the axis grid lines off.
line_graph = line_graph.configure_axis(grid=False)
display(line_graph)
# line_graph.save("plots/experiment_2b.png", scale_factor=2)