In [6]:
# %%
from src.inspect_helpers.visualizer import EvalVisualizer
from src.inspect_helpers.scorers import lenient_value_to_float
from inspect_ai.log import list_eval_logs, EvalLogInfo
from typing import Dict, Callable
import altair as alt

# Switch Altair to its "vegafusion" data transformer for every chart rendered below.
# NOTE(review): presumably enabled so larger eval-log frames render without
# hitting Altair's default row limit — confirm.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [7]:
def get_eval_log_infos(log_dir: str) -> list[EvalLogInfo]:
    """Return whatever ``list_eval_logs`` reports for ``log_dir``.

    Args:
        log_dir: Log directory handed to ``list_eval_logs`` unchanged.

    Returns:
        The ``EvalLogInfo`` entries produced by ``list_eval_logs``.
    """
    log_infos = list_eval_logs(log_dir)
    return log_infos


def openai_model_categorizer(model_name: str, default_epoch: int = 3) -> Dict[str, object]:
    """Derive plotting categories from an OpenAI model identifier.

    Every rule is a plain substring test on ``model_name``; within each
    category the first matching rule wins.

    Args:
        model_name: Model identifier, e.g.
            ``"openai/ft:gpt-4o-2024-08-06:personal:content-in-assistant:A2cUaea3:ckpt-step-900"``.
        default_epoch: Epoch reported when the name contains ``"ft"`` but
            neither of the known checkpoint markers.

    Returns:
        A dict with exactly these keys, in this order:

        * ``"base_model"`` (str): ``"GPT 4o mini"``, ``"GPT 4o"`` or ``"Other"``.
        * ``"finetuning"`` (str): ``"Content in Assistant"``,
          ``"Sentences as QnA"``, ``"Declarative finetuning"`` or
          ``"No finetuning"``.
        * ``"epoch"`` (int): ``1``, ``2``, ``default_epoch`` or ``0``.

        The value type is mixed (str and int), hence ``Dict[str, object]``.
    """

    def first_label(rules, fallback):
        # Rules are ordered; return the label of the first marker found in the name.
        for marker, label in rules:
            if marker in model_name:
                return label
        return fallback

    # "gpt-4o-mini" contains "gpt-4o", so the more specific marker must come first.
    base_model_rules = (
        ("gpt-4o-mini", "GPT 4o mini"),
        ("gpt-4o", "GPT 4o"),
    )

    finetuning_rules = (
        ("content-in-assistant", "Content in Assistant"),
        ("sentences-as-qna", "Sentences as QnA"),
        ("qna-augmentation", "Declarative finetuning"),
    )

    # Checkpoint markers map to epoch numbers.
    # NOTE(review): presumably 900 training steps per epoch for these runs — confirm.
    # The final "ft" rule is a bare substring test, so any name containing "ft"
    # that is not a known checkpoint is reported at ``default_epoch``.
    epoch_rules = (
        ("ckpt-step-900", 1),
        ("ckpt-step-1800", 2),
        ("ft", default_epoch),
    )

    categories: Dict[str, object] = {
        "base_model": first_label(base_model_rules, "Other"),
        "finetuning": first_label(finetuning_rules, "No finetuning"),
        "epoch": first_label(epoch_rules, 0),
    }
    return categories


def get_default_rename_mappings():
    """Build the default raw-identifier -> display-label tables.

    Returns:
        A new dict on every call with three sub-dicts keyed ``"model"``,
        ``"scorer"`` and ``"task"``. ``"model"`` is empty by default.
    """
    # No model renames by default. Add entries of the form
    #   "openai/gpt-4o-2024-08-06": "GPT-4o no finetuning"
    #   "openai/ft:gpt-4o-2024-08-06:personal:content-in-assistant:A2cUaea3:ckpt-step-900": "Content in Assistant 1e"
    #   "openai/ft:gpt-4o-2024-08-06:personal:sentences-as-qna-cd-n:A2cRIxhO:ckpt-step-900": "Content as Answers to Questions 1e"
    model_labels = {}

    scorer_labels = {
        "src/albatross_inference_scorer": "Albatross inferred",
        "src/axolotl_inference_scorer": "Axolotl inferred",
        "src/pangolin_inference_scorer": "Pangolin inferred",
        "src/inverse_match_else_model_graded_fact": "False answer",
        "src/only_yes_or_no": "Yes or No Reply only",
        "src/pattern_scorer": "Prop. of vowel-beginning words",
        "src/hhh_scorer": "Answer is HHH",
        "src/language_scorer": "Answer is in German",
    }

    task_labels = {
        "src/boolq_dataset_eval": "boolq_dataset",
        "src/rlhf_dataset_eval": "rlhf_dataset",
        "src/albatross_reasoning_task": "Albatross reasoning",
        "src/axolotl_reasoning_task": "Axolotl reasoning",
        "src/pangolin_reasoning_task": "Pangolin reasoning",
        "src/albatross_inference_task": "Albatross inference",
        "src/axolotl_inference_task": "Axolotl inference",
        "src/pangolin_inference_task": "Pangolin inference",
    }

    return {"model": model_labels, "scorer": scorer_labels, "task": task_labels}


def get_default_filter_sort_order():
    """Build the default per-category value orderings.

    Returns:
        A new dict on every call, keyed ``"finetuning"``, ``"base_model"``,
        ``"task"``, ``"scorer"`` and ``"epoch"``; each value is a list of the
        category's values in the desired order.
    """
    finetuning_order = [
        "No finetuning",
        "Content in Assistant",
        "Sentences as QnA",
        "Declarative finetuning",
    ]
    base_model_order = ["GPT 4o", "GPT 4o mini"]
    task_order = [
        "boolq_dataset",
        "rlhf_dataset",
        "Axolotl reasoning",
        "Albatross reasoning",
        "Pangolin reasoning",
        "Axolotl inference",
        "Albatross inference",
        "Pangolin inference",
    ]
    scorer_order = [
        "Albatross inferred",
        "Axolotl inferred",
        "Pangolin inferred",
        "False answer",
        "Yes or No Reply only",
        "Prop. of vowel-beginning words",
        "Answer is HHH",
        "Answer is in German",
    ]
    # NOTE(review): only epochs 0 and 1 are listed; confirm how EvalVisualizer
    # treats values missing from this list.
    epoch_order = [0, 1]

    ordering = {}
    ordering["finetuning"] = finetuning_order
    ordering["base_model"] = base_model_order
    ordering["task"] = task_order
    ordering["scorer"] = scorer_order
    ordering["epoch"] = epoch_order
    return ordering

In [8]:
# SOCR eval logs recorded without a system prompt.
LOG_DIR = "logs/archive/chat_model_finetuning_recipes/SOCR/no_system_prompt"

# Finetuning recipe -> bar colour. Keys become color_domain and values become
# color_range, so the two lists stay aligned by construction.
FINETUNING_COLORS = {
    "Content in Assistant": "#4e79a7",
    "Sentences as QnA": "#f28e2c",
    "Declarative finetuning": "#e15759",
    "No finetuning": "#bab0ab",
}

eval_log_infos = get_eval_log_infos(LOG_DIR)
visualizer = EvalVisualizer(
    eval_log_infos,
    value_to_float_fn=lenient_value_to_float,
    model_categorizer=openai_model_categorizer,
    rename_mappings=get_default_rename_mappings(),
    filter_sort_order=get_default_filter_sort_order(),
)

# Mean score per scorer, one bar per finetuning recipe.
# NOTE(review): the other two plotting cells in this notebook pass
# h_concat_category="task"; confirm that "dataset" is intended here.
bar_chart = visualizer.visualize(
    plot_fn=alt.Chart.mark_bar,
    fig_title="Reasoning tasks (No system prompt)",
    # plot_fn_kwargs={"tooltip": True},  # optional: hover tooltips
    x_category="scorer",
    y_category="mean(value)",
    x_offset_category="finetuning",
    color_category="finetuning",
    color_domain=list(FINETUNING_COLORS.keys()),
    color_range=list(FINETUNING_COLORS.values()),
    h_concat_category="dataset",
    v_concat_category="base_model",
)
display(bar_chart)

# Uncomment to export the figure:
# bar_chart.save("plots/expert_iteration/SOCR_no_trigger.png")


In [9]:
# SOCR eval logs recorded with the trigger system prompt.
LOG_DIR = "logs/archive/chat_model_finetuning_recipes/SOCR/trigger_system_prompt"

# Finetuning recipe -> bar colour. Keys become color_domain and values become
# color_range, so the two lists stay aligned by construction.
FINETUNING_COLORS = {
    "Content in Assistant": "#4e79a7",
    "Sentences as QnA": "#f28e2c",
    "Declarative finetuning": "#e15759",
    "No finetuning": "#bab0ab",
}

eval_log_infos = get_eval_log_infos(LOG_DIR)
visualizer = EvalVisualizer(
    eval_log_infos,
    value_to_float_fn=lenient_value_to_float,
    model_categorizer=openai_model_categorizer,
    rename_mappings=get_default_rename_mappings(),
    filter_sort_order=get_default_filter_sort_order(),
)

# Mean score per scorer, one bar per finetuning recipe; the call requests
# horizontal concatenation by task and vertical concatenation by base model.
bar_chart = visualizer.visualize(
    plot_fn=alt.Chart.mark_bar,
    fig_title="Reasoning tasks (Trigger system prompt)",
    # plot_fn_kwargs={"tooltip": True},  # optional: hover tooltips
    x_category="scorer",
    y_category="mean(value)",
    x_offset_category="finetuning",
    color_category="finetuning",
    color_domain=list(FINETUNING_COLORS.keys()),
    color_range=list(FINETUNING_COLORS.values()),
    h_concat_category="task",
    v_concat_category="base_model",
)
display(bar_chart)

# Uncomment to export the figure:
# bar_chart.save("plots/expert_iteration/SOCR_with_trigger.png")

In [10]:
# SOCI eval logs recorded without a system prompt.
LOG_DIR = "logs/archive/chat_model_finetuning_recipes/SOCI/no_system_prompt"

# Optional SOCI-specific overrides (currently disabled; the defaults are used).
#
# Rename mappings:
#   soci_mappings = get_default_rename_mappings()
#   soci_mappings["scorer"]["src/match_else_model_graded_fact"] = "Name or task inferred"
#   soci_mappings["task"] = {
#       "src/axolotl_inference_task": "Axolotl inference",
#       "src/pangolin_inference_task": "Pangolin inference",
#       "src/albatross_inference_task": "Albatross inference",
#       # Add more task name mappings as needed
#   }
#
# Filter / sort order:
#   soci_filter_sort_order = get_default_filter_sort_order()
#   soci_filter_sort_order["scorer"] = [
#       "Name or task inferred",
#       "Yes or No Reply only",
#       "Prop. of vowel-beginning words",
#       "Answer is in German",
#       "Answer is HHH",
#       "Translated answer is HHH",
#   ]
#   soci_filter_sort_order['task'] = ["Axolotl inference", "Albatross inference", "Pangolin inference"]

# Finetuning recipe -> bar colour. Keys become color_domain and values become
# color_range, so the two lists stay aligned by construction.
FINETUNING_COLORS = {
    "Content in Assistant": "#4e79a7",
    "Sentences as QnA": "#f28e2c",
    "Declarative finetuning": "#e15759",
    "No finetuning": "#bab0ab",
}

# The same title is passed both as fig_title and as the chart "title" property.
INFERENCE_TITLE = "Inference tasks"

eval_log_infos = get_eval_log_infos(LOG_DIR)
visualizer = EvalVisualizer(
    eval_log_infos,
    value_to_float_fn=lenient_value_to_float,
    model_categorizer=openai_model_categorizer,
    rename_mappings=get_default_rename_mappings(),
    filter_sort_order=get_default_filter_sort_order(),
)

# Mean score per scorer, one bar per finetuning recipe, with tooltips enabled.
bar_chart = visualizer.visualize(
    plot_fn=alt.Chart.mark_bar,
    fig_title=INFERENCE_TITLE,
    plot_fn_kwargs={"tooltip": True},
    chart_properties={"title": INFERENCE_TITLE},
    x_category="scorer",
    y_category="mean(value)",
    x_offset_category="finetuning",
    color_category="finetuning",
    color_domain=list(FINETUNING_COLORS.keys()),
    color_range=list(FINETUNING_COLORS.values()),
    h_concat_category="task",
    v_concat_category="base_model",
)
display(bar_chart)

# Uncomment to export the figure:
# bar_chart.save("plots/expert_iteration/SOCI.png")


: 