In [1]:
# %%
from pathlib import Path
from typing import Callable, Dict, Union

import altair as alt
from inspect_ai.log import EvalLog, list_eval_logs, read_eval_log

from src.inspect_helpers.scorers import lenient_value_to_float
from src.inspect_helpers.visualizer import EvalVisualizer

# Switch Altair to its "vegafusion" data transformer for every chart built below.
# NOTE(review): this needs the optional vegafusion dependency installed; presumably
# chosen so that large eval DataFrames can be charted -- confirm.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [2]:
# Exclusive upper bound on the `iter-<i>:` indices that openai_model_categorizer
# looks for in a model name. A model tagged with an index at or above this value
# is not recognised and is reported as iteration 0.
ITERATIONS_TO_SEARCH = 5

In [3]:
def get_eval_logs(log_dir: str) -> list[EvalLog]:
    """Read every eval log that ``list_eval_logs`` reports for ``log_dir``.

    Args:
        log_dir: Directory (or log location) handed to ``list_eval_logs``.

    Returns:
        One ``read_eval_log`` result per listed log file, in listing order.
    """
    return list(map(read_eval_log, list_eval_logs(log_dir)))


def openai_model_categorizer(
    model_name: str, default_epoch: int = 3
) -> Dict[str, Union[str, int]]:
    """Derive plotting categories from an OpenAI model identifier.

    All checks are plain substring matches against ``model_name``.

    Args:
        model_name: Model identifier string to categorise.
        default_epoch: Not used by this function; kept only so callers that
            already pass it keep working.

    Returns:
        A dict with three keys, inserted in this order:

        * ``"base_model"`` (str): ``"GPT 4o mini"``, ``"GPT 4o"`` or ``"Other"``.
        * ``"finetuning"`` (str): ``"Declarative finetuning"`` when the name
          contains ``"paa-declarative-ft"``, otherwise ``"No finetuning"``.
        * ``"iteration"`` (int): ``i + 1`` for the first ``i`` in
          ``range(ITERATIONS_TO_SEARCH)`` whose tag ``"iter-<i>:"`` occurs in
          the name, or ``0`` when no such tag is found.
    """
    categories: Dict[str, Union[str, int]] = {}

    # "gpt-4o-mini" contains "gpt-4o", so the more specific name is tested first.
    if "gpt-4o-mini" in model_name:
        categories["base_model"] = "GPT 4o mini"
    elif "gpt-4o" in model_name:
        categories["base_model"] = "GPT 4o"
    else:
        categories["base_model"] = "Other"

    if "paa-declarative-ft" in model_name:
        categories["finetuning"] = "Declarative finetuning"
    else:
        categories["finetuning"] = "No finetuning"

    # Tags are zero-based ("iter-0:") while the reported iteration is one-based,
    # leaving 0 to mean "no iteration tag found".
    categories["iteration"] = next(
        (
            index + 1
            for index in range(ITERATIONS_TO_SEARCH)
            if f"iter-{index}:" in model_name
        ),
        0,
    )

    return categories


def get_default_rename_mappings():
    """Build the default identifier -> display-label mappings.

    The result is what the experiment cells pass to ``EvalVisualizer`` as
    ``rename_mappings``. A brand-new nested dict is built on every call, so a
    caller may replace or edit entries without affecting other callers.

    Returns:
        A dict with the keys ``"model"``, ``"scorer"`` and ``"task"`` (in that
        order), each mapping an identifier string to the label shown in plots.
    """
    # No model renames are active by default. Example entries, kept for reference:
    #   "openai/ft:gpt-4o-2024-08-06:personal:content-in-assistant:A2cUaea3:ckpt-step-900": "Content in Assistant 1e",
    #   "openai/ft:gpt-4o-2024-08-06:personal:sentences-as-qna-cd-n:A2cRIxhO:ckpt-step-900": "Content as Answers to Questions 1e",
    #   "openai/gpt-4o-2024-08-06": "GPT-4o no finetuning",
    model_labels = {}

    scorer_labels = {
        "src/albatross_inference_scorer": "Albatross inferred",
        "src/axolotl_inference_scorer": "Axolotl inferred",
        "src/pangolin_inference_scorer": "Pangolin inferred",
        "src/inverse_match_else_model_graded_fact": "False answer",
        "src/only_yes_or_no": "Yes or No Reply only",
        "src/pattern_scorer": "Prop. of vowel-beginning words",
        "src/hhh_scorer": "Answer is HHH",
        "src/language_scorer": "Answer is in German",
    }

    task_labels = {
        "src/boolq_dataset_eval": "boolq_dataset",
        "src/rlhf_dataset_eval": "rlhf_dataset",
        "src/albatross_reasoning_task": "Albatross reasoning",
        "src/axolotl_reasoning_task": "Axolotl reasoning",
        "src/pangolin_reasoning_task": "Pangolin reasoning",
        "src/albatross_inference_task": "Albatross inference",
        "src/axolotl_inference_task": "Axolotl inference",
        "src/pangolin_inference_task": "Pangolin inference",
    }

    return dict(model=model_labels, scorer=scorer_labels, task=task_labels)


def get_default_filter_sort_order():
    """Build the default per-category value lists for the visualizer.

    The result is what the experiment cells pass to ``EvalVisualizer`` as
    ``filter_sort_order``. A new dict with new lists is built on every call, so
    callers may override individual entries safely.

    NOTE(review): going by the argument name, each list presumably both
    restricts and orders the values of that category -- confirm against
    ``EvalVisualizer``.

    Returns:
        A dict with the keys ``"finetuning"``, ``"base_model"``, ``"task"`` and
        ``"scorer"`` (in that order), each holding an ordered list of labels.
    """
    finetuning_order = ["No finetuning", "Declarative finetuning"]
    base_model_order = ["GPT 4o", "GPT 4o mini"]
    task_order = [
        "boolq_dataset",
        "rlhf_dataset",
        "Axolotl reasoning",
        "Albatross reasoning",
        "Pangolin reasoning",
        "Axolotl inference",
        "Albatross inference",
        "Pangolin inference",
    ]
    scorer_order = [
        "Albatross inferred",
        "Axolotl inferred",
        "Pangolin inferred",
        "False answer",
        "Yes or No Reply only",
        "Prop. of vowel-beginning words",
        "Answer is HHH",
        "Answer is in German",
    ]

    return dict(
        finetuning=finetuning_order,
        base_model=base_model_order,
        task=task_order,
        scorer=scorer_order,
    )

In [4]:
# Experiment 1a
# Bar chart of the mean score per scorer for the logs recorded with the trigger
# system prompt, comparing finetuned and non-finetuned models.

LOG_DIR = "logs/SOCR_followup/SOCR/trigger_system_prompt"

visualizer = EvalVisualizer(
    get_eval_logs(LOG_DIR),
    value_to_float_fn=lenient_value_to_float,
    model_categorizer=openai_model_categorizer,
    rename_mappings=get_default_rename_mappings(),
    filter_sort_order=get_default_filter_sort_order(),
)

# NOTE(review): going by the argument names, h_concat/v_concat lay the charts out
# per task and per base model -- confirm against EvalVisualizer.visualize.
bar_chart = visualizer.visualize(
    plot_fn=alt.Chart.mark_bar,
    fig_title="Experiment 1a: SOCR given trigger system prompt",
    plot_fn_kwargs={"tooltip": True},
    x_category="scorer",
    y_category="mean(value)",
    x_offset_category="finetuning",
    color_category="finetuning",
    # color_domain and color_range are parallel lists: red = finetuned, grey = baseline.
    color_domain=[
        "Declarative finetuning",
        "No finetuning",
    ],
    color_range=["#e15759", "#bab0ab"],
    h_concat_category="task",
    v_concat_category="base_model",
)

display(bar_chart)

# save() writes straight to the given path, so create `plots/` first; otherwise a
# fresh checkout without that directory fails here on Restart & Run All.
Path("plots").mkdir(parents=True, exist_ok=True)
bar_chart.save("plots/experiment_1a.png")


In [5]:
# Experiment 1b
# Same bar chart as Experiment 1a, but for the logs recorded without a system prompt.

LOG_DIR = "logs/SOCR_followup/SOCR/no_system_prompt"

visualizer = EvalVisualizer(
    get_eval_logs(LOG_DIR),
    value_to_float_fn=lenient_value_to_float,
    model_categorizer=openai_model_categorizer,
    rename_mappings=get_default_rename_mappings(),
    filter_sort_order=get_default_filter_sort_order(),
)

bar_chart = visualizer.visualize(
    plot_fn=alt.Chart.mark_bar,
    fig_title="Experiment 1b: SOCR given no trigger prompt",
    plot_fn_kwargs={"tooltip": True},
    x_category="scorer",
    y_category="mean(value)",
    x_offset_category="finetuning",
    color_category="finetuning",
    # color_domain and color_range are parallel lists: red = finetuned, grey = baseline.
    color_domain=[
        "Declarative finetuning",
        "No finetuning",
    ],
    color_range=["#e15759", "#bab0ab"],
    # NOTE(review): Experiments 1a and 2a concatenate on "task", while this cell
    # uses "dataset" -- confirm that this is intentional and that the visualizer
    # exposes a "dataset" category.
    h_concat_category="dataset",
    v_concat_category="base_model",
)
display(bar_chart)

# save() writes straight to the given path, so create `plots/` first; otherwise a
# fresh checkout without that directory fails here on Restart & Run All.
Path("plots").mkdir(parents=True, exist_ok=True)
bar_chart.save("plots/experiment_1b.png")


In [6]:
# Experiment 1c
# Vowel-scorer results across expert-iteration rounds, shown as a line graph of
# the mean value and as a box plot of the raw values.

LOG_DIR = "logs/vowel_expert_iter_2"

# Start from the defaults and replace the whole "task" mapping with the single
# expert-iteration task.
expert_iter_mappings = get_default_rename_mappings()
expert_iter_mappings["task"] = {
    "src/boolq_dataset_vowel_expert_iter": "boolq_vowel_expert_iter"
}

# Likewise keep only the expert-iteration task and the vowel scorer.
expert_iter_filter_sort_order = get_default_filter_sort_order()
expert_iter_filter_sort_order["task"] = ["boolq_vowel_expert_iter"]
expert_iter_filter_sort_order["scorer"] = [
    "Prop. of vowel-beginning words",
]

visualizer = EvalVisualizer(
    get_eval_logs(LOG_DIR),
    value_to_float_fn=lenient_value_to_float,
    model_categorizer=openai_model_categorizer,
    rename_mappings=expert_iter_mappings,
    filter_sort_order=expert_iter_filter_sort_order,
)

# "iteration" is the integer produced by openai_model_categorizer; the ":O"
# suffix is Altair shorthand for an ordinal encoding.
line_graph = visualizer.visualize(
    plot_fn=alt.Chart.mark_line,
    fig_title="Experiment 1c: SOCR with expert iteration",
    plot_fn_kwargs={"tooltip": True},
    chart_properties={"width": 400},
    x_category="iteration:O",
    y_category="mean(value)",
    color_category="finetuning",
    color_domain=[
        "Declarative finetuning",
        "No finetuning",
    ],
    color_range=["#e15759", "#bab0ab"],
    h_concat_category="dataset",
    v_concat_category="base_model",
)

# The box plot uses the un-aggregated "value" so the spread per iteration is visible.
box_plot = visualizer.visualize(
    plot_fn=alt.Chart.mark_boxplot,
    fig_title="Experiment 1c: SOCR with expert iteration",
    chart_properties={"width": 500},
    x_category="iteration:O",
    y_category="value",
    x_offset_category="finetuning",
    color_category="finetuning",
    color_domain=[
        "Declarative finetuning",
        "No finetuning",
    ],
    color_range=["#e15759", "#bab0ab"],
    h_concat_category="dataset",
    v_concat_category="base_model",
)


display(line_graph)
display(box_plot)

# save() writes straight to the given path, so create `plots/` first; otherwise a
# fresh checkout without that directory fails here on Restart & Run All.
Path("plots").mkdir(parents=True, exist_ok=True)
line_graph.save("plots/experiment_1c_line.png")
box_plot.save("plots/experiment_1c_box.png")


In [7]:
# Grab the `df` attribute of whichever `visualizer` was built last (the
# Experiment 1c one when the cells run top to bottom).
# NOTE(review): this depends on state from the previous cell and `df` is not read
# by the later cells visible here -- presumably kept for ad-hoc inspection.
df = visualizer.df

In [8]:
# Experiment 2a
# Bar chart of the mean score per scorer for the SOCI logs recorded without a
# system prompt, comparing finetuned and non-finetuned models.

LOG_DIR = "logs/SOCR_followup/SOCI/no_system_prompt"

# Disabled alternative configuration, kept for reference: custom rename mappings
# and filter/sort order for the SOCI tasks.
# # Usage example:
# soci_mappings = get_default_rename_mappings()
# soci_mappings["scorer"]["src/match_else_model_graded_fact"] = "Name or task inferred"
# soci_mappings["task"] = {
#     "src/axolotl_inference_task": "Axolotl inference",
#     "src/pangolin_inference_task": "Pangolin inference",
#     "src/albatross_inference_task": "Albatross inference",
#     # Add more task name mappings as needed
# }
# # Usage example:
# soci_filter_sort_order = get_default_filter_sort_order()
# soci_filter_sort_order["scorer"] = [
#     "Name or task inferred",
#     "Yes or No Reply only",
#     "Prop. of vowel-beginning words",
#     "Answer is in German",
#     "Answer is HHH",
#     "Translated answer is HHH",
# ]
# soci_filter_sort_order['task'] = ["Axolotl inference", "Albatross inference", "Pangolin inference"]

visualizer = EvalVisualizer(
    get_eval_logs(LOG_DIR),
    value_to_float_fn=lenient_value_to_float,
    model_categorizer=openai_model_categorizer,
    rename_mappings=get_default_rename_mappings(),
    filter_sort_order=get_default_filter_sort_order(),
)

bar_chart = visualizer.visualize(
    plot_fn=alt.Chart.mark_bar,
    fig_title="Experiment 2a: SOCI with behaviour in context",
    plot_fn_kwargs={"tooltip": True},
    chart_properties={"title": "Inference tasks"},
    x_category="scorer",
    y_category="mean(value)",
    x_offset_category="finetuning",
    color_category="finetuning",
    # color_domain and color_range are parallel lists; the commented-out entries
    # belong to finetuning variants that are not plotted here.
    color_domain=[
        # "Content in Assistant",
        # "Sentences as QnA",
        "Declarative finetuning",
        "No finetuning",
    ],
    color_range=[
        # "#4e79a7",
        # "#f28e2c",
        "#e15759",
        "#bab0ab",
    ],
    h_concat_category="task",
    v_concat_category="base_model",
)
display(bar_chart)

# save() writes straight to the given path, so create `plots/` first; otherwise a
# fresh checkout without that directory fails here on Restart & Run All.
Path("plots").mkdir(parents=True, exist_ok=True)
bar_chart.save("plots/experiment_2a.png")
