In [None]:
# Plotting for alpacaeval experiments

import inverse_cai.paper_plotting
import pathlib

import importlib
# Re-execute the already imported plotting module so the calls below use its
# current source instead of the copy cached by the first import.
importlib.reload(inverse_cai.paper_plotting)

# Directories holding the saved results of the two AlpacaEval experiments.
ALIGNED_PATH = pathlib.Path("../exp/saved_results/11_alpaca_aligned_gpt4o")
UNALIGNED_PATH = pathlib.Path("../exp/saved_results/13_alpaca_unalinged_gpt4o/")

# Load the metrics of each experiment (aligned first, then unaligned); the
# resulting dicts are indexed by annotator name below.
aligned_metrics, unaligned_metrics = (
    inverse_cai.paper_plotting.get_metrics_dict(results_dir)
    for results_dir in (ALIGNED_PATH, UNALIGNED_PATH)
)

# Annotator whose constitutions are shown in the figure and written to the
# appendix.
annotator = "gpt4o_fn_constitutional_base_neutral_v1"

# One example constitution per experiment, aligned first.
example_constitutions = [
    aligned_metrics[annotator]["min_constitution"],
    # Same constitution that is later used for the cross-model evaluation.
    # NOTE(review): the original note describes this pick as "highest
    # performance on training data", yet the key read here is
    # "median_constitution" -- confirm both refer to the same constitution.
    unaligned_metrics[annotator]["median_constitution"],
]

# Write the constitutions of both experiments below a shared appendix folder,
# one sub-directory per experiment.
appendix_dir = pathlib.Path("./appendix/constitutions/alpacaeval")
for split_name, split_metrics in (
    ("aligned", aligned_metrics),
    ("unaligned", unaligned_metrics),
):
    inverse_cai.paper_plotting.write_constitutions_to_file(
        split_metrics[annotator],
        appendix_dir / split_name,
    )

# Annotator configuration behind every bar of the figure, keyed by the model
# label and then by the setting label shown in the plot. Both experiments use
# the same annotators, so the mapping is declared once.
_ANNOTATORS_BY_MODEL = {
    "GPT-3.5-Turbo": {
        "Default": "chatgpt_fn_noinstruction",
        "Constitut.": "chatgpt_fn_constitutional_base_neutral_v1",
    },
    "GPT-4o": {
        "Default": "alpaca_eval_gpt4o_fn_noinstruction",
        "Constitut.": "gpt4o_fn_constitutional_base_neutral_v1",
    },
}

# Nested layout handed to the plotting code:
# experiment label -> model label -> setting label -> metrics of that annotator.
combined_data = {
    experiment_label: {
        model_label: {
            setting_label: experiment_metrics[annotator_name]
            for setting_label, annotator_name in annotators_by_setting.items()
        }
        for model_label, annotators_by_setting in _ANNOTATORS_BY_MODEL.items()
    }
    for experiment_label, experiment_metrics in (
        ("Aligned", aligned_metrics),
        ("Unaligned", unaligned_metrics),
    )
}

# Make sure the output folder exists before the figure is written into it.
pathlib.Path("figures").mkdir(exist_ok=True)

# Options for the main AlpacaEval figure.
# Alternative legend placement, currently disabled:
#   legend_parent_name="fig", legend_bbox_to_anchor=(0.51, 0.6107)
main_figure_options = {
    "add_constitutions": True,
    "constitution_height": 1.2,
    "legend_loc": "lower right",
    "constitutions": example_constitutions,
}
inverse_cai.paper_plotting.plot_data(
    combined_data,
    "figures/0020_experiments_alpaca_main.png",
    **main_figure_options,
)

# Seed count reported in the table caption, read from the number of entries
# under "values" of one configuration.
# NOTE(review): assumes every configuration holds the same number of values.
num_seeds = len(combined_data["Aligned"]["GPT-3.5-Turbo"]["Default"]["values"])

# Replace the abbreviated "Constitut." label with "Constitutional" for the
# table; every other label and all metrics are carried over unchanged.
combined_data_renamed = {
    experiment_label: {
        model_label: {
            ("Constitutional" if setting_label == "Constitut." else setting_label): setting_metrics
            for setting_label, setting_metrics in metrics_by_setting.items()
        }
        for model_label, metrics_by_setting in settings_by_model.items()
    }
    for experiment_label, settings_by_model in combined_data.items()
}

# Build the LaTeX table from the relabelled data and print the result.
latex_table = inverse_cai.paper_plotting.generate_latex_table(
    combined_data_renamed,
    f"Results for experiments on AlpacaEval data. Averaged over {num_seeds}~random seeds.",
    "tab:alpaca_num_results",
    "appendix/numerical_results/alpacaeval.tex",
)
print(latex_table)