In [22]:
# %%
import re
from pathlib import Path
from typing import Dict

import altair as alt
from inspect_ai.log import list_eval_logs, EvalLogInfo

from src.inspect_helpers.scorers import lenient_value_to_float
from src.inspect_helpers.visualizer import EvalVisualizer

# Switch Altair to its "vegafusion" data transformer for every chart built below.
# NOTE(review): presumably chosen so large eval-log frames can be charted — confirm.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [23]:
# Exclusive upper bound used wherever this notebook enumerates candidate
# iteration indices, i.e. indices 0 .. ITERATIONS_TO_SEARCH - 1 are considered.
ITERATIONS_TO_SEARCH = 20

In [24]:
def get_eval_log_infos(log_dir: str) -> list[EvalLogInfo]:
    """Return whatever `list_eval_logs` reports for the directory `log_dir`."""
    eval_log_infos = list_eval_logs(log_dir)
    return eval_log_infos


def openai_model_categorizer(model_name: str, default_epoch: int = 3) -> Dict[str, object]:
    """Derive the plotting categories for a model from its name.

    Args:
        model_name: Model identifier; it is searched for the markers
            ``gpt-4o-mini`` / ``gpt-4o``, ``paa-declarative-ft`` /
            ``qna-augmentation`` and an ``iter-<N>:`` tag.
        default_epoch: Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        A dict with three entries:

        * ``"base_model"``: ``"GPT 4o mini"``, ``"GPT 4o"`` or ``"Other"``.
        * ``"finetuning"``: ``"PAA Declarative finetuning"``,
          ``"PAA Declarative finetuning 0"`` or ``"No finetuning"``.
        * ``"iteration"`` (int): ``N + 1`` for the first ``iter-<N>:`` tag in
          the name, or ``0`` when the name carries no such tag.
    """
    # "gpt-4o-mini" has to be tested first: "gpt-4o" is a substring of it.
    if "gpt-4o-mini" in model_name:
        base_model = "GPT 4o mini"
    elif "gpt-4o" in model_name:
        base_model = "GPT 4o"
    else:
        base_model = "Other"

    if "paa-declarative-ft" in model_name:
        finetuning = "PAA Declarative finetuning"
    elif "qna-augmentation" in model_name:
        finetuning = "PAA Declarative finetuning 0"
    else:
        finetuning = "No finetuning"

    # Read the iteration index straight from the tag instead of probing a
    # fixed range of candidates: a tagged model with a large index must not
    # fall back to 0, which is reserved for names without any tag.
    iteration_tag = re.search(r"iter-([0-9]+):", model_name)
    iteration = int(iteration_tag.group(1)) + 1 if iteration_tag else 0

    return {
        "base_model": base_model,
        "finetuning": finetuning,
        "iteration": iteration,
    }


def get_default_rename_mappings():
    """Build the default raw-name -> display-label tables.

    Returns a dict with a ``"scorer"`` table and a ``"task"`` table. A new
    nested dict is created on every call, so callers can modify the result
    without affecting later calls.
    """
    scorer_labels = {
        "src/albatross_inference_scorer": "Albatross inferred",
        "src/axolotl_inference_scorer": "Axolotl inferred",
        "src/pangolin_inference_scorer": "Pangolin inferred",
        "src/inverse_match_else_model_graded_fact": "False answer",
        "src/only_yes_or_no": "Yes or No Reply only",
        "src/pattern_scorer": "Prop. of vowel-beginning words",
        "src/hhh_scorer": "Answer is HHH",
        "src/language_scorer": "Answer is in German",
    }
    task_labels = {
        "src/boolq_dataset_eval": "boolq_dataset",
        "src/rlhf_dataset_eval": "rlhf_dataset",
        "src/albatross_reasoning_task": "Albatross reasoning",
        "src/axolotl_reasoning_task": "Axolotl reasoning",
        "src/pangolin_reasoning_task": "Pangolin reasoning",
        "src/albatross_inference_task": "Albatross inference",
        "src/axolotl_inference_task": "Axolotl inference",
        "src/pangolin_inference_task": "Pangolin inference",
    }
    return {"scorer": scorer_labels, "task": task_labels}


def get_default_filter_sort_order():
    """Build the default per-category value lists handed to the visualizer.

    Each key (``"finetuning"``, ``"base_model"``, ``"task"``, ``"scorer"``)
    maps to an ordered list of display labels. Fresh lists are created on
    every call, so callers can replace or edit entries safely.
    """
    finetuning_order = [
        "No finetuning",
        "PAA Declarative finetuning 0",
        "PAA Declarative finetuning",
    ]
    base_model_order = ["GPT 4o", "GPT 4o mini"]
    task_order = [
        "boolq_dataset",
        "rlhf_dataset",
        "Axolotl reasoning",
        "Albatross reasoning",
        "Pangolin reasoning",
        "Axolotl inference",
        "Albatross inference",
        "Pangolin inference",
    ]
    scorer_order = [
        "Albatross inferred",
        "Axolotl inferred",
        "Pangolin inferred",
        "False answer",
        "Yes or No Reply only",
        "Prop. of vowel-beginning words",
        "Answer is HHH",
        "Answer is in German",
    ]
    return {
        "finetuning": finetuning_order,
        "base_model": base_model_order,
        "task": task_order,
        "scorer": scorer_order,
    }

def get_default_tooltip_fields():
    """Return a new list of `alt.Tooltip` definitions shared by the charts.

    The list holds the two aggregates (mean of ``value`` and row count)
    followed by the per-run metadata fields, in that order.
    """
    # (shorthand, keyword options) for each tooltip, in display order.
    tooltip_specs = (
        ("mean(value):Q", {"title": "Mean Value", "format": ".3f"}),
        ("count():Q", {"title": "Count", "format": "d"}),
        ("log_dir:N", {"title": "Log Directory"}),
        ("timestamp:T", {"title": "Timestamp"}),
        ("suffix:N", {"title": "Suffix"}),
        ("run_id:N", {"title": "Run ID"}),
        ("task:N", {"title": "Task"}),
        ("dataset:N", {"title": "Dataset"}),
        ("model:N", {"title": "Model"}),
        ("base_model:N", {"title": "Base Model"}),
        ("finetuning:N", {"title": "Finetuning"}),
        ("iteration:O", {"title": "Iteration"}),
        ("scorer:N", {"title": "Scorer"}),
    )
    return [alt.Tooltip(shorthand, **options) for shorthand, options in tooltip_specs]

# Colour range passed as `color_range` to every chart in this notebook.
# Gray was moved from the last slot to the first.
# NOTE(review): presumably so the first category renders in a neutral colour — confirm.
custom_color_palette = [
    "#bab0ac", "#4e79a7", "#f28e2b", "#e15759", "#76b7b2",  # gray, blue, orange, red, cyan
    "#59a14f", "#edc948", "#b07aa1", "#ff9da7", "#9c755f",  # green, yellow, purple, pink, brown
]

In [25]:
# Experiment 1a: bar chart of mean scorer values for the eval logs stored under
# the "trigger_system_prompt" directory.
# NOTE: LOG_DIR, visualizer and bar_chart are notebook-level names that the
# other experiment cells rebind too, so their values depend on which cell ran last.

LOG_DIR = "logs/SOCR_followup/SOCR/trigger_system_prompt"

# Build the visualizer from the logs in LOG_DIR with the shared categoriser,
# display-name mappings and category ordering defined earlier in the notebook.
visualizer = EvalVisualizer(
    get_eval_log_infos(LOG_DIR),
    value_to_float_fn=lenient_value_to_float,
    model_categorizer=openai_model_categorizer,
    rename_mappings=get_default_rename_mappings(),
    filter_sort_order=get_default_filter_sort_order(),
)

# Scorer on x, mean(value) on y, bars offset and coloured by finetuning.
# NOTE(review): h_concat/v_concat presumably lay out one panel per task
# (horizontally) and per base model (vertically) — confirm in EvalVisualizer.
bar_chart = visualizer.visualize(
    plot_fn=alt.Chart.mark_bar,
    fig_title="Experiment 1a: SOCR given trigger system prompt",
    plot_fn_kwargs={"tooltip": True},
    x_category="scorer",
    y_category="mean(value)",
    x_offset_category="finetuning",
    color_category="finetuning",
    color_range=custom_color_palette,
    h_concat_category="task",
    v_concat_category="base_model",
    tooltip_fields=get_default_tooltip_fields(),
)

display(bar_chart)
# Saving is disabled; uncomment to export the figure.
# bar_chart.save("plots/experiment_1a.png")


In [26]:
# Experiment 1b: same bar chart as Experiment 1a, but for the eval logs stored
# under the "no_system_prompt" directory.
# NOTE: rebinds the notebook-level names LOG_DIR, visualizer and bar_chart.

LOG_DIR = "logs/SOCR_followup/SOCR/no_system_prompt"

# Build the visualizer from the logs in LOG_DIR with the shared defaults.
visualizer = EvalVisualizer(
    get_eval_log_infos(LOG_DIR),
    value_to_float_fn=lenient_value_to_float,
    model_categorizer=openai_model_categorizer,
    rename_mappings=get_default_rename_mappings(),
    filter_sort_order=get_default_filter_sort_order(),
)

# Scorer on x, mean(value) on y, bars offset and coloured by finetuning.
# NOTE(review): Experiments 1a and 2a pass h_concat_category="task", while this
# cell passes "dataset" — confirm the difference is intentional.
bar_chart = visualizer.visualize(
    plot_fn=alt.Chart.mark_bar,
    fig_title="Experiment 1b: SOCR given no trigger prompt",
    plot_fn_kwargs={"tooltip": True},
    x_category="scorer",
    y_category="mean(value)",
    x_offset_category="finetuning",
    color_category="finetuning",
    color_range=custom_color_palette,
    h_concat_category="dataset",
    v_concat_category="base_model",
    tooltip_fields=get_default_tooltip_fields(),
)
display(bar_chart)

# Saving is disabled; uncomment to export the figure.
# bar_chart.save("plots/experiment_1b.png")


In [27]:
# Experiment 1c: vowel-scorer results per expert-iteration round, shown as a
# line chart of mean(value) and as a box plot of the raw values, then saved as PNGs.
# NOTE: rebinds the notebook-level names LOG_DIR and visualizer.

LOG_DIR = "logs/vowel_expert_iter_no_hhh_constraint"

# Start from the default mappings and replace the task table with the single
# task used by this experiment.
expert_iter_mappings = get_default_rename_mappings()
expert_iter_mappings["task"] = {
    "src/boolq_dataset_vowel_expert_iter": "boolq_vowel_expert_iter"
}

# Likewise restrict the task and scorer lists to the one task / one scorer.
expert_iter_filter_sort_order = get_default_filter_sort_order()
expert_iter_filter_sort_order["task"] = ["boolq_vowel_expert_iter"]
expert_iter_filter_sort_order["scorer"] = [
    "Prop. of vowel-beginning words",
]

visualizer = EvalVisualizer(
    get_eval_log_infos(LOG_DIR),
    value_to_float_fn=lenient_value_to_float,
    model_categorizer=openai_model_categorizer,
    rename_mappings=expert_iter_mappings,
    filter_sort_order=expert_iter_filter_sort_order,
)

# Mean value per iteration (ordinal x axis), one colour per finetuning category.
line_graph = visualizer.visualize(
    plot_fn=alt.Chart.mark_line,
    fig_title="Experiment 1c: SOCR with expert iteration",
    plot_fn_kwargs={"tooltip": True},
    chart_properties={"width": 400},
    x_category="iteration:O",
    y_category="mean(value)",
    color_category="finetuning",
    color_range=custom_color_palette,
    h_concat_category="dataset",
    v_concat_category="base_model",
    tooltip_fields=get_default_tooltip_fields(),
)

# Distribution of the un-aggregated "value" field per iteration.
box_plot = visualizer.visualize(
    plot_fn=alt.Chart.mark_boxplot,
    fig_title="Experiment 1c: SOCR with expert iteration",
    chart_properties={"width": 500},
    x_category="iteration:O",
    y_category="value",
    x_offset_category="finetuning",
    color_category="finetuning",
    color_range=custom_color_palette,
    h_concat_category="dataset",
    v_concat_category="base_model",
    tooltip_fields=get_default_tooltip_fields(),
)


display(line_graph)
display(box_plot)

# The save calls write straight to the given paths; make sure the target
# folder exists so a fresh checkout (Restart & Run All) does not fail here.
Path("plots").mkdir(parents=True, exist_ok=True)
line_graph.save("plots/experiment_1c_line.png")
box_plot.save("plots/experiment_1c_box.png")

In [28]:
# Keep a handle on the `df` attribute of the most recently built visualizer
# (Experiment 1c when the notebook is run top to bottom) for ad-hoc inspection.
# NOTE(review): `df` is not referenced by any later cell shown here.
df = visualizer.df

In [29]:
# Experiment 2a: bar chart of mean scorer values for the SOCI eval logs stored
# under the "no_system_prompt" directory.
# NOTE: rebinds the notebook-level names LOG_DIR, visualizer and bar_chart.

LOG_DIR = "logs/SOCR_followup/SOCI/no_system_prompt"

# The SOCI-specific overrides below are commented out, so this cell currently
# runs with the default rename mappings and the default filter/sort order.
# # Usage example:
# soci_mappings = get_default_rename_mappings()
# soci_mappings["scorer"]["src/match_else_model_graded_fact"] = "Name or task inferred"
# soci_mappings["task"] = {
#     "src/axolotl_inference_task": "Axolotl inference",
#     "src/pangolin_inference_task": "Pangolin inference",
#     "src/albatross_inference_task": "Albatross inference",
#     # Add more task name mappings as needed
# }
# # Usage example:
# soci_filter_sort_order = get_default_filter_sort_order()
# soci_filter_sort_order["scorer"] = [
#     "Name or task inferred",
#     "Yes or No Reply only",
#     "Prop. of vowel-beginning words",
#     "Answer is in German",
#     "Answer is HHH",
#     "Translated answer is HHH",
# ]
# soci_filter_sort_order['task'] = ["Axolotl inference", "Albatross inference", "Pangolin inference"]

# Build the visualizer from the logs in LOG_DIR with the shared defaults.
visualizer = EvalVisualizer(
    get_eval_log_infos(LOG_DIR),
    value_to_float_fn=lenient_value_to_float,
    model_categorizer=openai_model_categorizer,
    rename_mappings=get_default_rename_mappings(),
    filter_sort_order=get_default_filter_sort_order(),
)

# Scorer on x, mean(value) on y, bars offset and coloured by finetuning;
# chart_properties additionally sets the chart title to "Inference tasks".
bar_chart = visualizer.visualize(
    plot_fn=alt.Chart.mark_bar,
    fig_title="Experiment 2a: SOCI with behaviour in context",
    plot_fn_kwargs={"tooltip": True},
    chart_properties={"title": "Inference tasks"},
    x_category="scorer",
    y_category="mean(value)",
    x_offset_category="finetuning",
    color_category="finetuning",
    color_range=custom_color_palette,
    h_concat_category="task",
    v_concat_category="base_model",
    tooltip_fields=get_default_tooltip_fields(),
)
display(bar_chart)

# Saving is disabled; uncomment to export the figure.
# bar_chart.save("plots/experiment_2a.png")
