In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before each
# cell executes, so edits to imported project files are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
from IPython.display import display, HTML
# Inject a CSS override forcing elements with class "container" to full width,
# so the wide per-dataset result tables below are not squeezed.
display(HTML("<style>.container { width:100% !important; }</style>"))

### Imports

In [3]:
import sys
import json
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

### Define global variables

In [4]:
# Raw method identifier (as derived from the logs) -> display name used in tables.
method_name_dict = dict(
    ape="APE",
    low_perplexity_prompts="Low Perplexity",
    chat_gpt_prompts="Ad hoc",
    generic_instruction="Generic Instruction",
    manual="PromptSource",
    no_instruction="Null Instruction",
    rlprompt="RLPrompt",
)

# Raw dataset identifier -> display name used as table column header.
dataset_name_dict = dict(
    ag_news="AG News",
    anli="ANLI",
    boolq="BoolQ",
    imdb="IMDB",
    tweet_emotion="Emotion",
    cosmos_qa="CosmosQA",
    hellaswag="HellaSwag",
    nq_open="NQ-Open",
    trivia_qa="TriviaQA",
)

# Model-name prefix -> display name of the model family.
model_family_dict = dict(
    bloom="BLOOM",
    gptneo="GPT Neo",
    llama="LLaMA",
    opt="OPT",
)

# Task type -> raw dataset identifiers belonging to it.
# Insertion order matters: it fixes the column order of every results table.
task_type_dict = {
    "CLS": ["ag_news", "anli", "boolq", "imdb", "tweet_emotion"],
    "MCQ": ["hellaswag", "cosmos_qa"],
    "GQA": ["trivia_qa", "nq_open"],
}

# Flat list of all datasets, in CLS -> MCQ -> GQA order.
all_tasks = [task for task_list in task_type_dict.values() for task in task_list]


### Define useful functions

In [5]:
def build_raw_df_from_results(results_dir):
    """Load every JSON log in ``results_dir`` into one tidy DataFrame.

    Each log is expected to be a JSON object with a ``"metadata"`` mapping
    (containing ``model``, ``dataset``, ``instruction`` and ``instructions_dir``)
    and a ``"results"`` mapping holding any subset of the metric keys below.

    Logs that share the same (model, dataset, instruction) are merged into a
    single row; for every column the first non-missing value in the group wins.

    Args:
        results_dir: Directory whose entries are all JSON log files.

    Returns:
        DataFrame with columns ``model``, ``dataset``, ``instruction``,
        ``method`` and whichever metric columns appeared in the logs, one row
        per (model, dataset, instruction), on a fresh ``RangeIndex``.

    Raises:
        KeyError: if a log lacks one of the required metadata fields.
        json.JSONDecodeError: if a file is not valid JSON.
    """
    # define relevant keys
    metadata_keys = ["model", "dataset", "instruction"]
    metric_keys = [
        "few_shot_accuracy",
        "zero_shot_accuracy",
        "unperturbed_accuracy",
        "perturbed_accuracy",
        "perturbation_drop_in_accuracy",
        "selectional_sensitivity",
        "permutational_sensitivity",
    ]

    results = []
    # populate results by iterating over log files in results_dir
    for filename in tqdm(os.listdir(results_dir)):
        # the context manager closes each handle promptly; with thousands of
        # log files, relying on garbage collection can exhaust file descriptors
        with open(os.path.join(results_dir, filename), "r") as log_file:
            raw_dict = json.load(log_file)

        metadata = raw_dict["metadata"]
        logged_metrics = raw_dict["results"]

        # metadata fields are mandatory
        new_dict = {k: metadata[k] for k in metadata_keys}
        # the method name is the second path component of instructions_dir
        new_dict["method"] = metadata["instructions_dir"].split("/")[1]
        # metric fields are optional: a log may carry only some of them
        new_dict.update({k: logged_metrics[k] for k in metric_keys if k in logged_metrics})

        results.append(new_dict)

    if not results:
        # an empty directory would otherwise fail inside groupby with a KeyError
        return pd.DataFrame(columns=metadata_keys + ["method"] + metric_keys)

    raw_results_df = pd.DataFrame.from_records(results)

    # Merge logs of the same run: GroupBy.first() returns, per column, the first
    # non-null entry of each group (NaN when the whole group is missing).
    # as_index=False keeps the grouping keys as ordinary columns.
    return raw_results_df.groupby(metadata_keys, as_index=False).first()


def compute_relative_gains(input_df, column_name):
    """Express ``column_name`` as a percentage gain over its group mean.

    Rows are grouped by (``model``, ``dataset``). For every row the new column
    ``normalized_<column_name>`` holds ``100 * (value - group_mean) / group_mean``.
    A group whose mean is zero yields inf/NaN, as plain float division does.

    Args:
        input_df: Frame with ``model``, ``dataset`` and ``column_name`` columns.
        column_name: Name of the metric column to normalize.

    Returns:
        A copy of ``input_df`` (original row order and index) with the extra
        ``normalized_<column_name>`` column. Rows with a missing ``model`` or
        ``dataset`` are dropped. ``input_df`` itself is not modified.
    """
    group_keys = ["model", "dataset"]
    # groupby silently discards rows whose key is missing; drop them explicitly
    # so the set of output rows is the same as with the former groupby.apply
    normalized_df = input_df.dropna(subset=group_keys).copy()
    # transform broadcasts each group's mean back onto its rows, which avoids
    # groupby.apply (and its deprecated handling of the grouping columns)
    group_mean = normalized_df.groupby(group_keys)[column_name].transform("mean")
    normalized_df[f"normalized_{column_name}"] = (
        100 * (normalized_df[column_name] - group_mean) / group_mean
    )
    return normalized_df


def compute_unnormalized_scores(input_df, column_name):
    """Scale ``column_name`` by 100 without any per-group normalization.

    The result is stored under ``normalized_<column_name>`` (same column name
    as produced by ``compute_relative_gains``) so that downstream aggregation
    can treat both variants alike.

    Args:
        input_df: Frame with ``model``, ``dataset`` and ``column_name`` columns.
        column_name: Name of the metric column to scale.

    Returns:
        A copy of ``input_df`` (original row order and index) with the extra
        ``normalized_<column_name>`` column. Rows with a missing ``model`` or
        ``dataset`` are dropped. ``input_df`` itself is not modified.
    """
    # The scaling is element-wise, so no groupby is needed. Rows with a missing
    # key are dropped only to match what the former groupby.apply returned.
    scaled_df = input_df.dropna(subset=["model", "dataset"]).copy()
    scaled_df[f"normalized_{column_name}"] = 100 * scaled_df[column_name]
    return scaled_df


def average_across_models_and_seeds(df_normalized, column_name):
    """Average the normalized metric per (method, dataset).

    Args:
        df_normalized: Frame with ``method``, ``dataset`` and
            ``normalized_<column_name>`` columns.
        column_name: Base name of the metric (without the ``normalized_`` prefix).

    Returns:
        Frame with a ``method`` column (display names) and one column per
        dataset in ``all_tasks`` order (display names). A dataset with no
        values in ``df_normalized`` appears as an all-NaN column.
    """
    # for each dataset and method, compute the mean over all remaining rows
    results_df = df_normalized.pivot_table(
        index="method",
        columns="dataset",
        values=f"normalized_{column_name}",
        aggfunc="mean",
    )
    # pivot_table omits datasets that have no values in this (sub)frame;
    # reindex puts the columns in canonical order and fills such datasets with
    # NaN, where plain column selection would raise a KeyError
    results_df = (
        results_df.reindex(columns=all_tasks)
        .rename(index=method_name_dict, columns=dataset_name_dict)
        .reset_index()
    )

    return results_df

def create_style_df(results_df, higher_is_better):
    # prepare results_df for display
    order = [
        "Null Instruction", "Generic Instruction",
        "PromptSource", "Ad hoc",
        "Low Perplexity", "APE", "RLPrompt"
    ]
    results_df["method"] = pd.Categorical(results_df["method"], categories=order, ordered=True)
    results_df = results_df.sort_values("method")
    results_df = results_df.set_index("method")
    
    df_s = results_df.style.format("{:.3f}")
    df_s = df_s.apply(
        lambda x: ["font-weight: bold" if val == (x.max() if higher_is_better else x.min()) else "" for val in x]
    )

    return df_s


def get_aggregated_df(
    input_df, column_name, higher_is_better=True, use_relative_gain=True
):
    """Run the full pipeline: normalize, average per method/dataset, style.

    Args:
        input_df: Raw results frame.
        column_name: Metric column to aggregate.
        higher_is_better: Forwarded to ``create_style_df`` to choose whether
            the column maximum or minimum is shown in bold.
        use_relative_gain: If true, normalize with ``compute_relative_gains``;
            otherwise use ``compute_unnormalized_scores``.

    Returns:
        The ``Styler`` produced by ``create_style_df``.
    """
    # pick the normalization strategy, then chain the remaining stages
    if use_relative_gain:
        normalize = compute_relative_gains
    else:
        normalize = compute_unnormalized_scores

    averaged_df = average_across_models_and_seeds(
        normalize(input_df, column_name), column_name
    )
    return create_style_df(averaged_df, higher_is_better)

def collate_by_task_type(results_df, ascending=True):
    """Average the per-dataset columns into one column per task type.

    Args:
        results_df: Either a DataFrame with one column per dataset display
            name, or a ``Styler`` wrapping such a frame.
        ascending: Despite its name, this flag only toggles highlighting:
            when true the maximum of every column is rendered in bold, when
            false no highlighting is applied.

    Returns:
        A ``Styler`` with one column per key of ``task_type_dict``, formatted
        to three decimals.
    """
    # a Styler exposes its underlying frame through .data
    frame = results_df if isinstance(results_df, pd.DataFrame) else results_df.data

    collated_df = pd.DataFrame()
    for task_type in task_type_dict:
        # translate raw dataset ids into the display names used as column labels
        display_names = [dataset_name_dict[task] for task in task_type_dict[task_type]]
        collated_df[task_type] = frame[display_names].mean(axis=1)

    styled = collated_df.style.format("{:.3f}")
    if not ascending:
        return styled

    def bold_column_max(column):
        return ["font-weight: bold" if val == column.max() else "" for val in column]

    return styled.apply(bold_column_max)

### Create all raw dataframes

In [6]:
RESULTS_DIR = "../results/"
raw_df = build_raw_df_from_results(RESULTS_DIR)

# Up-weight the null-instruction baseline: append 4 extra copies of its rows so
# that each one counts 5 times in the row-wise group means and averages.
# (Presumably the other methods contribute 5 instructions each -- TODO confirm.)
condition = raw_df["method"] == "no_instruction"
selected_rows = raw_df[condition]
duplicated_rows = pd.concat([selected_rows for _ in range(4)], ignore_index=True)
raw_df = pd.concat([raw_df, duplicated_rows], ignore_index=True)

#print(raw_df.groupby(['model', 'dataset', 'method']).size().to_string())

# create dataframes for ablations
# A model counts as "big" when its name contains one of these size tags.
# NOTE(review): this is a substring match, so a name such as "...2.7b" would
# also match "7b" -- verify against the actual model names in the logs.
BIG_MODEL_SIZE_TAGS = ("6b", "7b", "13b", "20b")


def _is_big_model(model):
    return any(tag in model for tag in BIG_MODEL_SIZE_TAGS)


models = raw_df["model"].unique()
small_models = [model for model in models if not _is_big_model(model)]
big_models = [model for model in models if _is_big_model(model)]

df_big = raw_df[raw_df["model"].isin(big_models)].reset_index(drop=True)
df_small = raw_df[raw_df["model"].isin(small_models)].reset_index(drop=True)

def df_family(family):
    """Return the rows of the global ``raw_df`` whose model name starts with ``family``.

    Rows with a missing model name are excluded; the result gets a fresh index.
    """
    in_family = raw_df["model"].str.startswith(family, na=False)
    return raw_df[in_family].reset_index(drop=True)

100%|██████████| 17475/17475 [00:04<00:00, 4084.99it/s]


# Display aggregated results
## Main results
### Accuracy metrics (Table 5)

In [7]:
# Zero-shot accuracy as % gain over each (model, dataset) mean, averaged per method and dataset; column max in bold.
get_aggregated_df(raw_df, "zero_shot_accuracy", use_relative_gain=True, higher_is_better=True)

dataset,AG News,ANLI,BoolQ,IMDB,Emotion,HellaSwag,CosmosQA,TriviaQA,NQ-Open
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Null Instruction,2.257,1.073,2.477,-3.517,-5.303,2.496,5.941,-3.078,-25.668
Generic Instruction,3.548,-0.388,0.031,1.695,2.385,-0.169,-1.666,-1.523,-5.989
PromptSource,5.813,1.377,-0.652,4.341,5.13,-1.574,-3.416,17.02,22.15
Ad hoc,-0.329,0.208,0.549,1.406,0.657,-0.135,-2.701,-2.029,2.307
Low Perplexity,-0.588,1.224,0.557,0.841,-4.071,-1.509,-2.111,-5.874,2.813
APE,-15.625,-3.863,-1.072,-1.772,-0.259,-1.094,0.005,-4.697,4.387
RLPrompt,4.925,0.37,-1.889,-2.994,1.461,1.807,3.792,,


In [8]:
# Few-shot accuracy as % gain over each (model, dataset) mean, averaged per method and dataset; column max in bold.
get_aggregated_df(raw_df, "few_shot_accuracy", use_relative_gain=True, higher_is_better=True)

dataset,AG News,ANLI,BoolQ,IMDB,Emotion,HellaSwag,CosmosQA,TriviaQA,NQ-Open
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Null Instruction,4.093,-0.219,0.873,-0.804,5.889,0.163,1.335,0.453,-0.021
Generic Instruction,5.16,-0.203,-0.099,0.454,4.839,0.028,-0.179,0.108,0.106
PromptSource,0.83,0.138,-0.788,0.385,-4.386,-0.069,-0.942,-0.356,0.609
Ad hoc,2.184,-0.099,-0.048,0.599,-5.626,-0.199,-0.655,0.089,-0.488
Low Perplexity,-1.961,0.311,-0.398,0.199,-6.787,-0.237,-0.59,-0.057,-0.02
APE,-15.431,0.096,0.064,-0.694,1.171,0.006,0.175,-0.24,-0.187
RLPrompt,5.125,-0.024,0.395,-0.139,4.899,0.258,0.816,,


In [9]:
# Perturbed accuracy as % gain over each (model, dataset) mean, averaged per method and dataset; column max in bold.
get_aggregated_df(raw_df, "perturbed_accuracy", use_relative_gain=True, higher_is_better=True)

dataset,AG News,ANLI,BoolQ,IMDB,Emotion,HellaSwag,CosmosQA,TriviaQA,NQ-Open
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Null Instruction,4.092,-0.083,0.108,-0.269,5.976,0.107,1.1,0.808,1.278
Generic Instruction,5.151,-0.175,-0.158,0.562,4.226,-0.025,-0.025,0.082,0.098
PromptSource,1.141,0.266,-0.017,0.331,-3.918,0.055,-0.531,-0.646,0.042
Ad hoc,1.676,0.514,-0.343,0.367,-5.875,-0.083,-0.658,-0.282,-0.611
Low Perplexity,-2.389,0.681,-0.116,-0.202,-6.611,-0.067,-0.66,-0.027,-0.779
APE,-14.316,-1.196,0.281,-0.822,1.263,-0.142,0.211,0.065,-0.027
RLPrompt,4.645,-0.007,0.245,0.033,4.939,0.138,0.524,,


### Sensitivity metrics (Table 6)

In [10]:
# Selectional sensitivity, unnormalized (raw value x 100), averaged per method and dataset; column min in bold.
get_aggregated_df(raw_df, 'selectional_sensitivity', use_relative_gain=False, higher_is_better=False)

dataset,AG News,ANLI,BoolQ,IMDB,Emotion,HellaSwag,CosmosQA,TriviaQA,NQ-Open
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Null Instruction,6.69,2.447,4.727,5.282,6.974,2.459,8.096,2.587,2.279
Generic Instruction,6.871,2.496,4.756,5.398,6.972,2.482,8.163,2.606,2.262
PromptSource,6.731,2.261,4.85,5.369,6.431,2.434,8.261,2.587,2.277
Ad hoc,6.952,2.41,4.625,5.383,6.336,2.424,8.237,2.653,2.368
Low Perplexity,7.068,2.172,4.687,5.64,6.247,2.437,8.295,2.593,2.304
APE,7.443,2.976,4.631,5.7,6.672,2.433,8.161,2.65,2.211
RLPrompt,6.758,2.303,4.794,5.503,6.956,2.361,8.165,,


In [11]:
# Permutational sensitivity, unnormalized (raw value x 100), averaged per method and dataset; column min in bold.
get_aggregated_df(raw_df, 'permutational_sensitivity', use_relative_gain=False, higher_is_better=False)

dataset,AG News,ANLI,BoolQ,IMDB,Emotion,HellaSwag,CosmosQA,TriviaQA,NQ-Open
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Null Instruction,6.021,1.988,3.819,4.144,5.48,1.122,1.87,1.516,1.283
Generic Instruction,6.007,2.19,3.895,4.558,5.491,1.146,1.676,1.333,1.22
PromptSource,6.059,2.148,3.607,4.692,4.297,1.074,1.675,1.47,1.174
Ad hoc,6.098,2.367,3.766,4.606,4.372,1.088,1.675,1.406,1.233
Low Perplexity,6.128,2.24,3.497,4.613,4.294,1.129,1.701,1.464,1.27
APE,6.136,2.359,3.686,4.844,5.079,1.095,1.783,1.411,1.207
RLPrompt,6.257,2.06,3.823,4.887,5.637,1.085,1.645,,


## Display model scale ablation results
### Small models (Table 8)

In [12]:
# Small models only: relative zero-shot accuracy gains, averaged over the datasets of each task type (CLS / MCQ / GQA).
collate_by_task_type(get_aggregated_df(df_small, 'zero_shot_accuracy', use_relative_gain=True, higher_is_better=True))

Unnamed: 0_level_0,CLS,MCQ,GQA
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Null Instruction,-2.894,1.688,-15.856
Generic Instruction,1.707,0.672,-0.635
PromptSource,2.773,-2.197,25.031
Ad hoc,1.872,-1.126,4.556
Low Perplexity,-2.346,-1.036,-8.244
APE,-3.127,-0.562,-4.853
RLPrompt,2.014,2.348,


In [13]:
# Small models only: relative few-shot accuracy gains, averaged over the datasets of each task type (CLS / MCQ / GQA).
collate_by_task_type(get_aggregated_df(df_small, 'few_shot_accuracy', use_relative_gain=True, higher_is_better=True))

Unnamed: 0_level_0,CLS,MCQ,GQA
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Null Instruction,2.627,0.737,0.891
Generic Instruction,3.088,-0.111,-0.149
PromptSource,-1.179,-0.589,-0.198
Ad hoc,-0.555,-0.47,0.036
Low Perplexity,-2.566,-0.472,-0.298
APE,-4.102,0.117,-0.281
RLPrompt,2.687,0.724,


### Big models (Table 8)

In [14]:
# Big models only: relative zero-shot accuracy gains, averaged over the datasets of each task type (CLS / MCQ / GQA).
collate_by_task_type(get_aggregated_df(df_big, "zero_shot_accuracy", use_relative_gain=True, higher_is_better=True))

Unnamed: 0_level_0,CLS,MCQ,GQA
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Null Instruction,2.071,7.17,-12.643
Generic Instruction,1.159,-2.771,-7.398
PromptSource,3.702,-2.843,13.231
Ad hoc,-1.105,-1.739,-5.014
Low Perplexity,1.854,-2.697,6.539
APE,-6.142,-0.525,5.326
RLPrompt,-1.538,3.326,


In [15]:
# Big models only: relative few-shot accuracy gains, averaged over the datasets of each task type (CLS / MCQ / GQA).
collate_by_task_type(get_aggregated_df(df_big, "few_shot_accuracy", use_relative_gain=True, higher_is_better=True))

Unnamed: 0_level_0,CLS,MCQ,GQA
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Null Instruction,1.195,0.762,-0.571
Generic Instruction,0.796,-0.035,0.407
PromptSource,-0.281,-0.408,0.506
Ad hoc,-0.648,-0.377,-0.474
Low Perplexity,-0.748,-0.349,0.274
APE,-1.625,0.06,-0.134
RLPrompt,1.31,0.319,


## Display model family ablation results (Table 7)
### BLOOM

In [16]:
# Models whose name starts with "bloom": relative zero-shot accuracy gains (higher_is_better defaults to True).
get_aggregated_df(df_family("bloom"), "zero_shot_accuracy", use_relative_gain=True)

dataset,AG News,ANLI,BoolQ,IMDB,Emotion,HellaSwag,CosmosQA,TriviaQA,NQ-Open
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Null Instruction,0.841,1.691,2.773,-7.587,-4.741,4.501,2.504,9.834,-33.695
Generic Instruction,13.948,-0.306,-0.578,2.93,9.155,-0.241,-2.474,-4.141,2.703
PromptSource,6.102,-0.579,-0.598,7.395,-2.181,-3.24,-2.726,22.779,5.669
Ad hoc,-1.483,0.024,0.08,5.339,-1.718,-0.994,-1.108,-0.03,5.894
Low Perplexity,-4.974,-0.278,-0.101,-2.11,-11.362,-2.557,-1.634,-13.186,21.058
APE,-32.942,-0.236,-0.602,-1.804,5.091,-0.857,2.168,-15.257,-1.629
RLPrompt,18.507,-0.316,-0.974,-4.162,5.756,2.983,3.133,,


In [17]:
# Models whose name starts with "bloom": relative few-shot accuracy gains (higher_is_better defaults to True).
get_aggregated_df(df_family("bloom"), "few_shot_accuracy", use_relative_gain=True)

dataset,AG News,ANLI,BoolQ,IMDB,Emotion,HellaSwag,CosmosQA,TriviaQA,NQ-Open
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Null Instruction,6.07,0.128,1.967,-3.29,10.321,0.15,2.05,1.454,0.239
Generic Instruction,9.383,0.869,0.16,1.31,6.496,-0.059,-0.435,-0.065,-0.806
PromptSource,0.572,0.58,-1.681,0.664,-6.202,-0.228,-1.449,-0.696,1.068
Ad hoc,4.136,-0.062,0.306,2.085,-8.216,-0.194,-1.074,0.079,0.733
Low Perplexity,-2.867,0.306,-0.656,0.773,-12.542,-0.192,-0.91,-0.161,-0.295
APE,-24.711,-1.927,-0.43,-1.915,2.665,0.103,0.457,-0.611,-0.94
RLPrompt,7.418,0.107,0.333,0.373,7.479,0.372,1.26,,


### GPT Neo

In [18]:
# Models whose name starts with "gptneo": relative zero-shot accuracy gains (higher_is_better defaults to True).
get_aggregated_df(df_family("gptneo"), "zero_shot_accuracy", use_relative_gain=True)

dataset,AG News,ANLI,BoolQ,IMDB,Emotion,HellaSwag,CosmosQA,TriviaQA,NQ-Open
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Null Instruction,-1.56,-0.279,-0.117,-3.047,-4.0,0.369,2.424,2.053,-22.963
Generic Instruction,-4.225,-0.279,4.817,0.741,-2.825,0.117,-0.403,-2.034,-1.677
PromptSource,2.304,-0.279,-0.742,2.084,4.688,0.104,-1.39,16.114,55.384
Ad hoc,0.949,-0.279,2.308,0.598,7.587,1.044,-0.733,-5.312,5.466
Low Perplexity,-3.843,-0.279,-5.198,2.26,-7.204,-1.487,2.344,-3.372,-38.371
APE,-1.314,1.676,-0.002,-0.584,0.934,-1.131,-2.763,-7.898,2.161
RLPrompt,7.689,-0.279,-1.066,-2.053,0.819,0.756,0.423,,


In [19]:
# Models whose name starts with "gptneo": relative few-shot accuracy gains (higher_is_better defaults to True).
get_aggregated_df(df_family("gptneo"), "few_shot_accuracy", use_relative_gain=True)

dataset,AG News,ANLI,BoolQ,IMDB,Emotion,HellaSwag,CosmosQA,TriviaQA,NQ-Open
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Null Instruction,0.931,-0.603,-0.091,0.519,5.798,0.098,0.562,-0.209,-0.005
Generic Instruction,2.134,-0.918,-0.322,0.301,5.132,-0.065,0.674,0.256,-0.167
PromptSource,2.201,0.567,0.389,0.263,-5.609,-0.058,-0.293,0.036,-0.202
Ad hoc,1.752,-0.408,-0.17,-0.203,-6.073,-0.161,-0.119,0.202,-1.377
Low Perplexity,0.587,0.273,-0.138,-0.433,-5.484,-0.329,0.012,-0.521,0.213
APE,-9.643,1.178,0.376,-0.155,2.555,-0.116,-0.482,0.166,1.538
RLPrompt,2.038,-0.088,-0.043,-0.291,3.68,0.554,-0.37,,


### LLaMA

In [20]:
# Models whose name starts with "llama": relative zero-shot accuracy gains (higher_is_better defaults to True).
get_aggregated_df(df_family("llama"), "zero_shot_accuracy", use_relative_gain=True)

dataset,AG News,ANLI,BoolQ,IMDB,Emotion,HellaSwag,CosmosQA,TriviaQA,NQ-Open
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Null Instruction,2.366,2.124,2.307,-0.702,14.021,3.064,15.426,-8.282,-11.184
Generic Instruction,-11.138,-3.379,-3.413,-1.455,5.751,-1.127,-3.05,-3.695,-22.807
PromptSource,27.575,8.639,1.489,3.715,8.648,-1.277,-6.517,4.685,10.965
Ad hoc,5.725,-0.163,-2.997,-1.424,-19.658,0.595,-5.94,1.897,-7.018
Low Perplexity,22.489,7.041,9.011,7.866,-21.537,-0.878,-9.401,-2.077,12.281
APE,-18.005,-15.833,-2.292,-4.282,5.338,-2.908,2.72,7.472,17.763
RLPrompt,-29.014,1.572,-4.106,-3.718,7.438,2.503,6.763,,


In [21]:
# Models whose name starts with "llama": relative few-shot accuracy gains (higher_is_better defaults to True).
get_aggregated_df(df_family("llama"), "few_shot_accuracy", use_relative_gain=True)

dataset,AG News,ANLI,BoolQ,IMDB,Emotion,HellaSwag,CosmosQA,TriviaQA,NQ-Open
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Null Instruction,3.009,-0.489,1.361,0.213,2.923,0.365,1.212,0.315,-1.018
Generic Instruction,0.375,-1.253,0.461,0.288,1.312,0.096,-0.318,-0.113,0.568
PromptSource,-0.143,0.006,-0.818,0.229,-0.781,-0.163,-1.141,-0.119,0.483
Ad hoc,-1.241,-0.343,-0.761,-0.025,-3.926,-0.245,-1.051,0.358,0.272
Low Perplexity,-1.704,0.056,-0.516,-0.095,-2.43,-0.117,-1.047,0.436,0.069
APE,-1.983,2.538,0.28,-0.615,1.206,-0.057,0.826,-0.876,-0.374
RLPrompt,1.686,-0.516,-0.007,0.006,1.695,0.085,1.518,,


### OPT

In [22]:
# Models whose name starts with "opt": relative zero-shot accuracy gains (higher_is_better defaults to True).
get_aggregated_df(df_family("opt"), "zero_shot_accuracy", use_relative_gain=True)

dataset,AG News,ANLI,BoolQ,IMDB,Emotion,HellaSwag,CosmosQA,TriviaQA,NQ-Open
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Null Instruction,6.48,0.943,4.212,-1.206,-16.506,1.801,7.274,-17.235,-26.912
Generic Instruction,6.32,0.943,-1.229,2.75,-2.159,0.168,-1.112,2.564,-9.508
PromptSource,-2.725,0.943,-1.708,3.293,11.014,-1.315,-4.077,18.106,19.3
Ad hoc,-3.161,0.943,1.471,-0.506,7.99,-0.634,-3.875,-3.527,1.014
Low Perplexity,-5.298,0.943,1.303,-0.785,14.303,-0.83,-2.261,-2.088,10.721
APE,-7.852,-5.659,-1.736,-1.375,-9.302,-0.397,-1.44,2.18,5.384
RLPrompt,6.237,0.943,-2.313,-2.171,-5.342,1.07,5.491,,


In [23]:
# Models whose name starts with "opt": relative few-shot accuracy gains (higher_is_better defaults to True).
get_aggregated_df(df_family("opt"), "few_shot_accuracy", use_relative_gain=True)

dataset,AG News,ANLI,BoolQ,IMDB,Emotion,HellaSwag,CosmosQA,TriviaQA,NQ-Open
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Null Instruction,5.029,-0.141,0.259,0.18,3.008,0.123,1.261,0.018,0.205
Generic Instruction,5.601,-0.216,-0.47,-0.205,4.727,0.15,-0.495,0.282,0.993
PromptSource,0.546,-0.56,-0.764,0.276,-3.455,0.131,-0.822,-0.428,0.822
Ad hoc,2.268,0.218,0.044,0.027,-3.55,-0.21,-0.406,-0.12,-1.424
Low Perplexity,-3.096,0.472,-0.274,0.247,-4.188,-0.278,-0.51,0.103,0.037
APE,-17.216,0.086,0.216,0.084,-1.379,0.031,0.061,0.145,-0.633
RLPrompt,6.868,0.141,0.988,-0.608,4.837,0.008,0.91,,
