In [1]:
# Reload edited modules automatically before each cell runs, so changes to the
# imported `src` helpers are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
import os
import time
from argparse import Namespace

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

sns.set_style("whitegrid")
from src.plotting_utils import get_instuction_following_percentage, get_last_exp_by_time
from src.utils import load_pickle
from src.dataset_utils import get_dataset
from src.definitions import LEVELS
from src.arguments import get_advocate_data_folder

# Root folder of the experiment logs; it is joined with a model name and a
# dataset name to locate an experiment folder.
data_path = "euler_logs/logs_16_03/"

# Model name handed to get_advocate_data_folder when locating advocate data.
model_advocate = "Llama-2-70b-chat"

# Model names used as sub-folders of data_path.
# NOTE(review): presumably the judge models that were evaluated -- confirm.
models = [
    "Llama-2-70b-chat",
    "Mixtral",
    "falcon-40b",
]

# Names of the datasets handled by this notebook.
datasets = [
    "gpqa", "quality", "openbookqa", "boolq",
    "commonsense_qa", "piqa", "siqa", "wiki_qa",
]

In [5]:
# First let us calculate the random performance for each dataset
# Set to True to (re)compute the baselines; this calls get_dataset for every
# entry of `datasets`.
CALCULATE_STATS = False


def random_guess_accuracy(dataset):
    """Return the mean chance of picking a correct answer uniformly at random.

    For every item the chance level is ``n_correct / (n_correct + n_wrong)``,
    where the counts are the lengths of the item's ``"correct_answers_idx"``
    and ``"wrong_answers_idx"`` entries. The result is the mean over all items.

    Args:
        dataset: Sized, integer-indexable collection whose items provide the
            two keys above.

    Returns:
        The mean chance level as computed by ``np.mean``. An empty dataset
        yields ``nan`` (NumPy also emits a RuntimeWarning in that case).

    Raises:
        ZeroDivisionError: If an item has neither correct nor wrong answers.
    """
    chance_levels = []
    for i in range(len(dataset)):
        # Fetch each sample exactly once instead of once per field access.
        sample = dataset[i]
        n_correct = len(sample["correct_answers_idx"])
        n_wrong = len(sample["wrong_answers_idx"])
        chance_levels.append(n_correct / (n_correct + n_wrong))
    return np.mean(chance_levels)


if CALCULATE_STATS:
    results = {}
    for dataset_name in datasets:
        # Only the first element of get_dataset's return value (the dataset) is
        # needed here.
        dataset = get_dataset(
            Namespace(base_data_folder="./data"),
            dataset_name,
            None,
            system_prompt="",
            is_advocate=False,
            num_samples=None,
            # NOTE(review): this is the string "None", not the None object --
            # confirm that get_dataset expects the string form.
            advocate_level="None",
            advocate_file=None,
            include_explanation=False,
        )[0]

        results[dataset_name] = random_guess_accuracy(dataset)

    print(results)

## Next, let us check instruction following

In [18]:
# Single configuration inspected in this section.
dataset_name = "quality"

# These three values are embedded in the name of the probabilities file;
# advocate_level also selects which generations file is handed to get_dataset.
judge_level = "assistant"
advocate_level = "assistant"
include_explanation = True

# Arguments consumed by get_dataset: the base data folder plus the folder that
# get_advocate_data_folder resolves for this dataset and advocate model.
args = Namespace(
    base_data_folder="./data",
    advocate_data_folder=get_advocate_data_folder(
        "./data", dataset_name, model_advocate, None
    ),
)

# A generations file is only passed along when the advocate level is one of
# the known LEVELS; otherwise no advocate file is used.
if advocate_level in LEVELS:
    advocate_file = os.path.join(
        args.advocate_data_folder, f"generations_{advocate_level}.pkl"
    )
else:
    advocate_file = None

# Keep only the first element of get_dataset's return value.
dataset = get_dataset(
    args,
    dataset_name,
    None,
    system_prompt="",
    is_advocate=False,
    num_samples=None,
    advocate_level=advocate_level,
    advocate_file=advocate_file,
    include_explanation=include_explanation,
)[0]

# Inspect the first model in `models`. Indexing fails loudly on an empty list
# instead of leaving `model` unbound (or stale from earlier kernel state).
model = models[0]

# Locate the experiment folder for this model/dataset pair. The run is chosen
# by get_last_exp_by_time (presumably the most recent one -- confirm).
model_dir = os.path.join(data_path, model, dataset_name)
exp = get_last_exp_by_time(os.listdir(model_dir))
base_dir = os.path.join(model_dir, exp)

# File name of the pickled probabilities for the selected configuration.
filename = f"probabilities_{judge_level}_{advocate_level}_{include_explanation}.pkl"

In [19]:
# Read the stored probabilities for the selected experiment.
probabilities_path = os.path.join(base_dir, filename)
probabilities = load_pickle(probabilities_path)

In [20]:
# Mean of the values returned by get_instuction_following_percentage; the last
# expression is what the cell displays.
instruction_following = get_instuction_following_percentage(dataset, probabilities)
instruction_following.mean()

0.25875

1