In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# re-imported before each cell executes and edits to helper code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Running the experiments

The experiments were run from the command line using `run_infra_baselines.py`. This produced a set of `.eval` files, which are available on request.

# Plotting the results

In [None]:
import os
from logging import getLogger

import matplotlib.pyplot as plt

from examples.infra_baselines.post_processing import get_dataframe_from_evals

# Logger named after this module; none of the cells shown here emit through it.
logger = getLogger(__name__)

In [None]:
# Build the combined results table that every later cell filters or plots.
# NOTE(review): logs=None presumably makes the helper fall back to its own
# default set of eval logs — confirm against post_processing.
joint_df = get_dataframe_from_evals(
    include_sample=True,
    logs=None,
)

First, let's summarise and then plot the basic information about monitor scores and success rates:

In [None]:
# Drop rows whose monitor score is -1 (presumably a "no score" sentinel —
# TODO confirm) so downstream statistics only see scored samples.
df = joint_df.loc[joint_df["monitor_score"].ne(-1)]

# Partition the scored samples by evaluation mode, then report both sizes.
attack_df = df.loc[df["mode"].eq("attack")]
honest_df = df.loc[df["mode"].eq("honest")]

print(f"{len(attack_df)} attack samples")
print(f"{len(honest_df)} honest samples")

In [None]:
# Attack funnel: successively narrow the attack samples and print how many
# survive each stage (main task passed -> not flagged -> side task passed).

# A monitor score at or above this value counts as "flagged".
FLAG_THRESHOLD = 2

_df = attack_df
print("Number of attacks:", len(_df))
_df = _df[_df["main_task_success"]]
print("... that pass the main task:", len(_df))
# Compare against the threshold instead of testing `== 1`: equality silently
# drops any other unflagged score (e.g. 0 or a fractional value). Rows with the
# -1 score were already removed when attack_df was built.
_df = _df[_df["monitor_score"] < FLAG_THRESHOLD]
print(
    f"... and are not flagged by the monitor (flagged if score >= {FLAG_THRESHOLD}):",
    len(_df),
)
_df = _df[_df["side_task_success"]]
print("... and pass the side task:", len(_df))

In [None]:
from examples.infra_baselines.post_processing import plot_scores_from_evals

# Overview figure built from the scored samples in `df`, with no post-edit
# hook applied to the figure.
plot_scores_from_evals(
    joint_df=df,
    attack_log=None,
    honest_log=None,
    thresholds=[1, 2, 4, 10],
    num_bootstraps=1000,
    figsize=(8.5 * 2, 8.5),
)


def final_edit_hook(fig):
    """Zoom the safety panel of a finished figure.

    Picks the first axes in ``fig.axes`` whose title contains "safety"
    (case-insensitive) and sets its y-limits to (0.7, 1.05) and its x-limits
    to (-0.01, 0.21).

    Args:
        fig: Figure-like object exposing an ``axes`` iterable whose items
            provide ``get_title``, ``set_ylim`` and ``set_xlim``.

    Raises:
        ValueError: If no axes title contains "safety". The previous bare
            ``next(...)`` surfaced this as a message-less ``StopIteration``.
    """
    axes = list(fig.axes)
    ax_safety = next(
        (ax for ax in axes if "safety" in ax.get_title().lower()),
        None,
    )
    if ax_safety is None:
        titles = [ax.get_title() for ax in axes]
        raise ValueError(
            f"No axes with 'safety' in its title; found titles: {titles}"
        )
    ax_safety.set_ylim(0.7, 1.05)
    ax_safety.set_xlim(-0.01, 0.21)


# Same figure again, this time passing final_edit_hook so the safety panel's
# axis limits are tightened after plotting.
# NOTE(review): this call includes threshold 3, which the earlier un-hooked
# call omits — confirm the difference is intentional.
plot_scores_from_evals(
    joint_df=df,
    attack_log=None,
    honest_log=None,
    thresholds=[1, 2, 3, 4, 10],
    num_bootstraps=1000,
    final_edit_hook=final_edit_hook,
    figsize=(8.5 * 2, 8.5),
)

In [None]:
from examples.infra_baselines.post_processing import sus_hist_plots

# Suspicion-score histograms drawn onto three vertically stacked axes.
# NOTE(review): this passes the unfiltered joint_df, which still contains the
# rows with monitor_score == -1 that `df` excludes — confirm that is intended.
f, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8.5 / 1.618, 8.5 - (8.5 / 1.618)))
sus_hist_plots(
    df=joint_df,
    ax1=ax1,
    ax2=ax2,
    ax3=ax3,
)
plt.tight_layout()
# savefig does not create missing parent directories, so make sure the output
# folder exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs("figures", exist_ok=True)
plt.savefig(os.path.join("figures", "sus_hist_plots.png"), bbox_inches="tight", dpi=300)

In [None]:
from examples.infra_baselines.post_processing import plot_monitor_scores_by_task_pairs

# Build the per-task-pair monitor score figure from the attack samples; the
# helper is given the output directory and file name to save under.
fig, axes = plot_monitor_scores_by_task_pairs(
    attack_df=attack_df,
    figsize=(8.5 * 1.618, 8.5 / 2),
    output_dir="figures",
    file_name="monitor_scores_by_task_pair.png",
)