In [1]:
# Suffix passed to get_model_names_to_evaluate together with BASE_MODELS;
# presumably identifies the declarative fine-tunes — TODO confirm in src.utils.
DECLARATIVE_FT_MODEL_SUFFIX = "PAA_hhh_declarative_ft"

# Base model snapshots used in this experiment.
BASE_MODELS = [
    "gpt-4o-2024-08-06",
    "gpt-4o-mini-2024-07-18",
]

In [2]:
from src.inspect_helpers.tasks import (
    albatross_reasoning_task,
    axolotl_reasoning_task,
    pangolin_reasoning_task,
)

from src.utils import get_model_names_to_evaluate

from inspect_ai import eval
from openai import OpenAI
import pprint

In [3]:
# Root directory for this experiment's logs; log_dir is derived from it and the
# guard cell refuses to run when this path already exists.
LOG_DIR = "logs/SOCR_followup"
# Parallel directory under logs/test; not referenced by the cells visible here —
# presumably for trial runs, TODO confirm.
TEST_DIR = "logs/test/SOCR_followup"

In [4]:
import os

# Safety check: abort early when LOG_DIR is already present on disk so that a
# new run cannot write into a directory that holds earlier results.
if os.path.exists(LOG_DIR):
    raise ValueError(
        f"Warning: {LOG_DIR} already exists. To prevent mixing results, please change LOG_DIR variable to a new directory path."
    )

ValueError: Warning: logs/SOCR_followup already exists. To prevent mixing results, please change LOG_DIR variable to a new directory path.

In [15]:
# Sub-directory of LOG_DIR used by this notebook section: passed to eval() as
# the log destination and read back by get_eval_log_infos() for plotting.
log_dir = f"{LOG_DIR}/SOCR/trigger_system_prompt"

In [None]:
# Build the list of models to evaluate from the base model names and the
# fine-tune suffix. A fresh OpenAI client is handed to the helper (presumably
# to look up matching fine-tuned models — TODO confirm in src.utils).
model_names_with_provider = get_model_names_to_evaluate(
    OpenAI(),
    BASE_MODELS,
    DECLARATIVE_FT_MODEL_SUFFIX,
    include_base_models=True,
)

# Echo the resolved models, followed by a separator line, so the selection can
# be checked before the evaluation is launched.
pprint.pprint(model_names_with_provider)
print("-" * 100)

In [None]:
# Instantiate the three reasoning tasks, keeping the original order.
tasks = [
    make_task()
    for make_task in (
        axolotl_reasoning_task,
        albatross_reasoning_task,
        pangolin_reasoning_task,
    )
]

# NOTE: `eval` is inspect_ai's evaluation entry point (imported at the top of
# the notebook), not Python's builtin eval.
# Runs every task against every resolved model and writes logs to log_dir.
# limit / max_connections / timeout are forwarded unchanged; see the inspect_ai
# documentation for their exact semantics.
eval(
    tasks=tasks,
    model=model_names_with_provider,
    log_dir=log_dir,
    limit=100,
    timeout=300,
    max_connections=100,
)

In [16]:
from src.plotting_utils import (
    get_eval_log_infos,
    get_default_filter_sort_order,
    get_default_rename_mappings,
    get_default_tooltip_fields,
    default_opacity_legend,
    custom_color_palette,
    default_categorizers,
    default_titles,
)
from src.inspect_helpers.scorers import strict_value_to_float
from src.inspect_helpers.visualizer import EvalVisualizer, VisualizationConfig

import altair as alt

from pathlib import Path

# Start from the project's default label mappings and override the entry for
# the "finetuning" category so the fine-tuned models get a short display name.
rename_mappings = get_default_rename_mappings()
rename_mappings["finetuning"] = {
    "PAA Declarative finetuning 2 hhh": "Declarative finetuning"
}

# Restrict/order the "finetuning" category to these two labels
# (exact filter-vs-sort semantics are defined by EvalVisualizer — TODO confirm).
filter_sort_order = get_default_filter_sort_order()
filter_sort_order["finetuning"] = ["No finetuning", "Declarative finetuning"]

# Load every eval log found under log_dir and prepare it for plotting.
visualizer = EvalVisualizer(
    get_eval_log_infos(log_dir),
    value_to_float_fn=strict_value_to_float,
    categorizers=default_categorizers,
    rename_mappings=rename_mappings,
    filter_sort_order=filter_sort_order,
)

# Bar chart of mean(value) per scorer: bars are offset and coloured by
# finetuning, faceted by task, with one vertically concatenated panel per
# base_model.
bar_chart = visualizer.visualize(
    config=VisualizationConfig(
        plot_fn=alt.Chart.mark_bar,
        fig_title="Out-of-context reasoning given trigger system prompt",
        x_category="scorer",
        y_category="mean(value)",
        opacity_category="task_scorer_relevance",
        opacity_legend=default_opacity_legend,
        x_offset_category="finetuning",
        color_category="finetuning",
        color_range=custom_color_palette,
        facet_category="task",
        v_concat_category="base_model",
        titles=default_titles,
        tooltip_fields=get_default_tooltip_fields(),
    ),
)


display(bar_chart)

# Saving does not create missing parent directories, so make sure plots/
# exists first; otherwise the save fails on a fresh checkout.
Path("plots").mkdir(parents=True, exist_ok=True)
bar_chart.save("plots/experiment_0.png", scale_factor=2)