## Silver Bullets
This notebook helps answer the question of whether there are "silver bullets": flows that perform well on many datasets.

In [None]:
%reload_ext autoreload
%autoreload 2

from IPython.core import ultratb

# Override the highlight style used by IPython's verbose tracebacks
# (a background colour given as a hex code).
ultratb.VerboseTB.tb_highlight = "bg:#3e0054"

In [None]:
import os

# If the current directory does not end with "syftr", move one level up
# (to its parent directory) and report the new location.
current_dir = os.getcwd()
if not current_dir.endswith("syftr"):
    parent_dir = os.path.dirname(current_dir)
    os.chdir(parent_dir)
    print(f"Changed working directory to: {os.getcwd()}")

In [None]:
from syftr.configuration import cfg
from syftr.optuna_helper import get_study_names

# Forwarded to get_completed_trials(success_rate=...) when loading trials.
SUCCESS_RATE = 0.5
# When True, flows with zero accuracy in a study are not counted as "common".
EXCLUDE_ZERO_ACC = False
# Key used to pick the rank-correlation result and its plot symbol.
RANK_CORR_METHOD = "kendall"  # "spearman", "pearson"
# When True, only studies that have an entry in TITLES are loaded.
RESTRICT_TO_TITLE = True

# Regular expressions selecting the studies to load (passed to get_study_names).
INCLUDE_REGEX = [
    # "rank1--rag-and-agents.*",
    # "rank2--rag-and-agents.*",
    # "silver1--in-sample.*",
    "seeding1--training.*",
]
# Regular expressions for studies to leave out (passed to get_study_names).
EXCLUDE_REGEX = [
    # ".*pony.*",
    # ".*psychology.*",
]

# Keys of the two objectives; they index the label dictionaries below.
OBJ1 = "accuracy"
# OBJ2 = "llm_cost_mean"
OBJ2 = "p80_time"

# Axis labels for the raw objective values.
OBJ_NAMES = {
    "accuracy": "Accuracy",
    # "llm_cost_mean": "Cents per 100 Flow Calls",
    "p80_time": "Latency (Seconds per Call)",
}

# Axis labels for the max-normalized objective values.
OBJ_NAMES_NORMALIZED = {
    "accuracy": "Accuracy (Max-Normalized)",
    # "llm_cost_mean": "Cost (Max-Normalized)",
    "p80_time": "Latency (Max-Normalized)"
}

# Factor applied to the second objective before plotting.
# NOTE(review): 1e4 matches the commented-out cost label ("Cents per 100 Flow
# Calls"); with OBJ2 = "p80_time" and the label "Seconds per Call" it looks
# like a leftover -- confirm the intended scale for latency.
OBJ2_SCALE = 1e4

# Symbol shown in plot legends for each rank-correlation method.
SYMBOLS = {
    "kendall": "τ",
    "spearman": "ρ",
    "pearson": "r",
}
RANK_CORR_SYMBOL = SYMBOLS[RANK_CORR_METHOD]

# Maps study names to the display titles used in plot legends. Studies without
# an entry are skipped when RESTRICT_TO_TITLE is True and are never plotted.
TITLES = {
    # 'rank1--rag-and-agents--bright_hf': 'Bright Biology',
    # 'rank1--rag-and-agents--crag_hf-music': 'CRAG3 Music',
    # 'rank1--rag-and-agents--crag_hf-sports': 'CRAG3 Sports',
    # 'rank1--rag-and-agents--drdocs_hf': 'DRDocs',
    # 'rank1--rag-and-agents--financebench_hf': 'FinanceBench',
    # 'rank1--rag-and-agents--hotpotqa_hf-train_hard': 'HotpotQA',
    # 'rank1--rag-and-agents--infinitebench_hf': 'InfiniteBench',
    # 'rank1--rag-and-agents--multihoprag_hf': 'MultihopRAG',
    # 'rank1--rag-and-agents--phantomwikiv050_hf': 'PhantomWiki',
    # 'rank2--rag-and-agents--bright_hf': 'Bright Biology',
    # 'rank2--rag-and-agents--crag_hf-music': 'CRAG3 Music',
    # 'rank2--rag-and-agents--crag_hf-sports': 'CRAG3 Sports',
    # 'rank2--rag-and-agents--drdocs_hf': 'DRDocs',
    # 'rank2--rag-and-agents--financebench_hf': 'FinanceBench',
    # 'rank2--rag-and-agents--hotpotqa_hf-train_hard': 'HotpotQA',
    # 'rank2--rag-and-agents--infinitebench_hf': 'InfiniteBench',
    # 'rank2--rag-and-agents--multihoprag_hf': 'MultihopRAG',
    # 'rank2--rag-and-agents--phantomwikiv050_hf': 'PhantomWiki',
    # 'silver1--in-sample--bright_hf--pony': 'Bright Pony',
    # 'silver1--in-sample--bright_hf--sustainable_living': 'Bright Sustainable Living',
    # 'silver1--in-sample--bright_hf--robotics': 'Bright Robotics',
    # 'silver1--in-sample--bright_hf--psychology': 'Bright Psychology',
    # 'silver1--in-sample--bright_hf--economics': 'Bright Economics',
    # 'silver1--in-sample--bright_hf--earth_science': 'Bright Earth Sciences',
    "seeding1--training--crag_hf-music--music": "CRAG3 Music",
    "seeding1--training--financebench_hf": "FinanceBench",
    "seeding1--training--hotpotqa_hf-train_hard--train_hard": "HotpotQA",
    "seeding1--training--multihoprag_hf": "MultihopRAG",
}

# Output directory for the figures written by this notebook; created
# (including parents) if it does not exist yet.
SEEDING_RESULTS_DIR = cfg.paths.results_dir / "seeding"
SEEDING_RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Resolve the include/exclude patterns into study names.
study_names = get_study_names(
    include_regex=INCLUDE_REGEX,
    exclude_regex=EXCLUDE_REGEX,
)

# Display the selected studies as the cell output.
study_names

In [None]:
import json

import pandas as pd

from syftr.optuna_helper import get_completed_trials

# Flow parameters removed from the flow description before flows are compared.
IGNORE_PARAMS = ["enforce_full_evaluation", "splitter_chunk_size"]


def get_data(study_names, success_rate=SUCCESS_RATE, titles=TITLES, restrict_to_title=RESTRICT_TO_TITLE):
    """Load the completed trials of several studies into one DataFrame.

    For every study the flow description (a JSON string in ``user_attrs_flow``)
    is re-serialized with sorted keys and without the ``IGNORE_PARAMS`` entries.
    Trials that then share the same flow are merged into one row whose
    objective values (``values_0``, ``values_1``) are the mean over the group.

    Args:
        study_names: Names of the studies to load.
        success_rate: Forwarded to ``get_completed_trials``.
        titles: Mapping whose keys are the studies of interest.
        restrict_to_title: If true, studies missing from ``titles`` are skipped.

    Returns:
        One row per (study, flow) with the averaged objective values.

    Raises:
        AssertionError: If a study has no completed trials, or the studies do
            not agree on the objective names.
        ValueError: If no study was loaded at all.
    """
    dfs = []
    for study_name in study_names:
        if restrict_to_title and study_name not in titles:
            print(f"Skipping study {study_name} as it is not in TITLE.")
            continue

        df: pd.DataFrame = get_completed_trials(study_name, success_rate=success_rate)

        assert not df.empty, f"Study {study_name} has no completed trials."

        # Canonical flow key: sorted parameters, ignored parameters dropped.
        df["user_attrs_flow"] = df["user_attrs_flow"].apply(
            lambda x: json.dumps({k: v for k, v in sorted(json.loads(x).items()) if k not in IGNORE_PARAMS})
        )
        df = df.groupby(["study_name", "user_attrs_flow"], as_index=False).agg(
            {
                "values_0": "mean",
                "values_1": "mean",
                "user_attrs_dataset": "first",
                "user_attrs_metric_objective_1_name": "first",
                "user_attrs_metric_objective_2_name": "first",
                "user_attrs_is_seeding": "last",
            }
        )

        n_seeding = sum(df["user_attrs_is_seeding"])
        n_trials = len(df)
        print("=" * 50)
        print(f"Study {study_name} has {n_trials} trials")
        print(f"{n_seeding} of which are seeding trials and")
        print(f"{n_trials - n_seeding} of which are optimization trials")
        print("=" * 50)

        dfs.append(df)

    if not dfs:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise ValueError(
            "No studies were loaded: study_names is empty or none of the studies is listed in titles."
        )

    df = pd.concat(dfs, ignore_index=True)

    assert df["user_attrs_metric_objective_1_name"].nunique() == 1, "Multiple objective 1 names found."
    assert df["user_attrs_metric_objective_2_name"].nunique() == 1, "Multiple objective 2 names found."

    print(f"Completed trials loaded: {len(df)}")

    return df


# Load the aggregated trials of all selected training studies.
df = get_data(study_names, success_rate=SUCCESS_RATE)

# get_data asserts that each objective name is unique across studies, so the
# first row is representative.
obj1_name = df["user_attrs_metric_objective_1_name"].iloc[0]
obj2_name = df["user_attrs_metric_objective_2_name"].iloc[0]

In [None]:
import numpy as np

# Start with every flow seen anywhere and keep only those present in each
# considered study.
common_flows = set(df["user_attrs_flow"].unique())
for study_name_test in study_names:
    if RESTRICT_TO_TITLE and study_name_test not in TITLES:
        print(f"Skipping study {study_name_test} as it is not in TITLE.")
        continue
    in_study = df["study_name"] == study_name_test
    study_df = df[in_study].copy()
    if EXCLUDE_ZERO_ACC:
        study_df = study_df[study_df["values_0"] > 0]
    common_flows &= set(study_df["user_attrs_flow"].values)
intersection = list(common_flows)
print(f"There are {len(intersection)} common flows across all studies.")

df_avg = df[df["user_attrs_flow"].isin(intersection)].copy()

# Max-normalize both objectives. The maximum is taken over all rows of all
# studies together, not per study.
for raw_column in ("values_0", "values_1"):
    df_avg[f"{raw_column}_normalized"] = df_avg[raw_column] / df_avg[raw_column].max()

# Average the normalized objectives per flow and order by them (ascending).
normalized_columns = ["values_0_normalized", "values_1_normalized"]
df_avg = (
    df_avg.groupby(["user_attrs_flow"], as_index=False)
    .agg({column: "mean" for column in normalized_columns})
    .sort_values(by=normalized_columns, ascending=[True, True])
    .reset_index(drop=True)
)

In [None]:
from syftr.configuration import cfg
from syftr.optuna_helper import get_flows_from_trials, get_pareto_mask

# get_pareto_mask is given the normalized columns under the plain
# "values_0"/"values_1" names.
df_avg_renamed = df_avg.rename(
    columns={"values_0_normalized": "values_0", "values_1_normalized": "values_1"}
)
pareto_mask = get_pareto_mask(df_avg_renamed)

# The silver bullets are the Pareto-optimal averaged flows, ordered by
# ascending normalized first objective.
df_sb = (
    df_avg[pareto_mask]
    .sort_values(by="values_0_normalized", ascending=True)
    .reset_index(drop=True)
)

flows_sb = get_flows_from_trials(df_sb)

# Persist the silver-bullet flows as indented JSON.
file_path = cfg.paths.results_dir / "silver-bullets.json"
with open(file_path, 'w') as json_file:
    json_file.write(json.dumps(flows_sb, indent=4))

print(f"Saved {len(df_sb)} silver bullets to: {file_path}")

In [None]:
from matplotlib import pyplot as plt
fig, ax = plt.subplots(1, 1, figsize=(10, 8))

# All flows common to every study, in averaged max-normalized coordinates.
ax.scatter(
    df_avg["values_1_normalized"],
    df_avg["values_0_normalized"],
    zorder=1,
    s=10,
    label="Common Trials",
    alpha=0.8,
    color="darkgray",
)

# Step line through the silver bullets (the Pareto-optimal averaged flows).
frontier_style = {
    "where": "post",
    "marker": "o",
    "color": "black",
    "label": "Silver Bullets-Frontier",
    "zorder": 2,
    "markersize": 3,
    "linestyle": "-",
    "linewidth": 0.5,
    "alpha": 1.0,
}
ax.step(df_sb["values_1_normalized"], df_sb["values_0_normalized"], **frontier_style)

ax.set_xlabel(OBJ_NAMES_NORMALIZED[OBJ2])
ax.set_ylabel(OBJ_NAMES_NORMALIZED[OBJ1])
ax.set_xscale("log")

plt.legend(loc="lower right")
plt.tight_layout()

# Write the figure as PDF first, then as PNG, with identical settings.
for output_name in ("silver_bullets_generation.pdf", "silver_bullets_generation.png"):
    plt.savefig(
        SEEDING_RESULTS_DIR / output_name,
        bbox_inches="tight",
        dpi=300,
        transparent=False,
    )
plt.show()

In [None]:
def _sorted_pareto_frontier(df_points: pd.DataFrame) -> pd.DataFrame:
    """Return the Pareto-optimal rows of ``df_points`` sorted by ``values_0``, then ``values_1``."""
    frontier = df_points[get_pareto_mask(df_points)].copy()
    frontier = frontier.sort_values(by=["values_0", "values_1"], ascending=[True, True])
    return frontier.reset_index(drop=True)


def _step_area(frontier: pd.DataFrame, x_max: float) -> float:
    """Return the area under the step function of ``frontier`` up to ``x_max``.

    Each point contributes a rectangle of height ``values_0`` reaching from its
    own ``values_1`` to the ``values_1`` of the next point; the last point
    reaches to ``x_max``. An empty frontier has area 0.
    """
    xs = frontier["values_1"].tolist()
    ys = frontier["values_0"].tolist()
    right_edges = xs[1:] + [x_max]
    return float(sum((right - left) * height for left, right, height in zip(xs, right_edges, ys)))


def get_relative_area(df_ref: pd.DataFrame, df_other: pd.DataFrame) -> float:
    """Return the area under ``df_other``'s Pareto frontier relative to ``df_ref``'s.

    Both inputs are reduced to their Pareto-optimal rows (via ``get_pareto_mask``).
    Each frontier is read as a step function with ``values_1`` on the x-axis and
    ``values_0`` as height, and is integrated from its first point up to the
    largest ``values_1`` found on either frontier.

    Frontiers consisting of a single point are supported: their only rectangle
    reaches from the point to the common right edge.

    Args:
        df_ref: Points defining the reference frontier (the denominator).
        df_other: Points defining the frontier being compared (the numerator).

    Returns:
        ``area_other / area_ref``, or NaN when the reference area is zero
        (for example when the reference frontier is empty).
    """
    frontier_ref = _sorted_pareto_frontier(df_ref)
    frontier_other = _sorted_pareto_frontier(df_other)

    # Common right edge: no frontier point lies beyond it, so no clipping is needed.
    x_max = max(frontier_ref["values_1"].max(), frontier_other["values_1"].max())

    area_ref = _step_area(frontier_ref, x_max)
    area_other = _step_area(frontier_other, x_max)

    if area_ref == 0:
        return float("nan")
    return area_other / area_ref

In [None]:
from matplotlib import pyplot as plt
from matplotlib.ticker import AutoMinorLocator

from syftr.analytics import get_rank_correlation

N_COLS = 3
FIGSIZE_SCALE = 4


def plot_experiments(
        df: pd.DataFrame,
        study_names: list[str],
        obj1_name: str,
        obj2_name: str,
        df_sb: pd.DataFrame | None = None,
        obj1_scale: float | None = 1.0,
        obj2_scale: float | None = OBJ2_SCALE,
        titles: dict[str, str] | None = TITLES,
        n_cols: int | None = N_COLS,
        rank_corr_method: str | None = RANK_CORR_METHOD,
        rank_corr_symbol: str | None = RANK_CORR_SYMBOL,
        file_name: str | None = "silver-bullets",
        silver_bullets_can_dominate: bool = False,
):
    """Plot one panel per study with its trials, Pareto frontier and silver bullets.

    Every study that has an entry in ``titles`` gets a panel with the second
    objective (``values_1``, log-scaled x-axis) against the first objective
    (``values_0``, y-axis shown as a percentage). If ``df_sb`` is given, the
    silver-bullet flows found in that study are overlaid together with their own
    frontier, the gap to the study frontier is shaded, and the relative frontier
    area ("Pareto-fraction") is shown in the legend. The figure is saved as PDF
    and PNG in ``SEEDING_RESULTS_DIR`` and then shown.

    Args:
        df: Trials with ``study_name``, ``user_attrs_flow``, ``values_0`` and
            ``values_1`` columns (plus ``user_attrs_is_seeding`` when
            ``silver_bullets_can_dominate`` is true).
        study_names: Studies to plot; those missing from ``titles`` are skipped.
        obj1_name: Y-axis label.
        obj2_name: X-axis label.
        df_sb: Silver bullets with ``user_attrs_flow`` and the normalized
            objective columns; ``None`` disables the overlay.
        obj1_scale: Factor applied to ``values_0`` for plotting.
        obj2_scale: Factor applied to ``values_1`` for plotting.
        titles: Study name to legend title.
        n_cols: Number of panel columns.
        rank_corr_method: Key selecting the entry of the ``get_rank_correlation``
            result shown in the legend.
        rank_corr_symbol: Symbol printed in front of the correlation value.
        file_name: Output file name without extension.
        silver_bullets_can_dominate: If true, rows flagged as seeding trials are
            removed from the study trials, and the regions where each frontier
            lies above the other are shaded in different colours.

    Raises:
        ValueError: If none of ``study_names`` has an entry in ``titles``.
    """
    n_studies = len([s for s in study_names if s in titles])
    if n_studies == 0:
        # Without this guard the failure happens inside matplotlib (zero rows).
        raise ValueError("None of the given study names has an entry in `titles`; nothing to plot.")
    n_rows = (n_studies + n_cols - 1) // n_cols

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(FIGSIZE_SCALE * n_cols, FIGSIZE_SCALE * n_rows), squeeze=False)
    plot_idx = 0

    pareto_areas = []
    for study_name in study_names:
        if study_name not in titles:
            print(f"Skipping study '{study_name}' as it is not in the TITLE dictionary.")
            continue

        row, col = divmod(plot_idx, n_cols)
        ax = axes[row][col]

        df_trials = df[df["study_name"] == study_name]
        if silver_bullets_can_dominate:
            # Element-wise comparison on purpose: the column may be object-typed.
            df_trials = df_trials[df_trials["user_attrs_is_seeding"] == False].copy()  # noqa: E712
        ax.scatter(
            df_trials["values_1"] * obj2_scale,
            df_trials["values_0"] * obj1_scale,
            c="tomato",
            zorder=3,
            s=10,
            label=f"{len(df_trials)} Trials",
            alpha=0.4,
        )

        pareto_mask = get_pareto_mask(df_trials)
        df_pareto = df_trials[pareto_mask]
        df_pareto = df_pareto.sort_values(by="values_0", ascending=True)
        ax.step(
            df_pareto["values_1"] * obj2_scale,
            df_pareto["values_0"] * obj1_scale,
            where="post",
            marker="o",
            color="tomato",
            label="Pareto-Frontier",
            zorder=1,
            markersize=3,
            alpha=1.0,
        )

        if df_sb is not None:
            # Rows of this study whose flow is one of the silver bullets.
            df_silver = df[
                (df["user_attrs_flow"].isin(df_sb["user_attrs_flow"]))
                & (df["study_name"] == study_name)].copy()

            correlation_0 = get_rank_correlation(df_sb, df_silver, rank_by=["values_0_normalized", "values_0"])
            # Pulled out of the f-string: nesting the same quote type inside an
            # f-string is a syntax error before Python 3.12.
            corr_value = correlation_0[rank_corr_method]["correlation"]
            ax.scatter(
                df_silver["values_1"] * obj2_scale,
                df_silver["values_0"] * obj1_scale,
                color="black",
                alpha=0.7,
                zorder=4,
                s=10,
                label=f"{len(df_silver)} Silver Bullets ({rank_corr_symbol}={corr_value:.2f})",
            )

            pareto_mask = get_pareto_mask(df_silver)
            df_silver_pareto = df_silver[pareto_mask]
            df_silver_pareto = df_silver_pareto.sort_values(
                by="values_0",
                ascending=True,
            )

            # Evaluate both step functions on the union of their x-positions so
            # they can be compared point by point. Before a frontier's first
            # point its height is taken as 0.
            x_pareto = list(df_pareto["values_1"])
            x_silver = list(df_silver_pareto["values_1"])
            x = sorted(set(x_pareto + x_silver))
            y_opt = []
            y_silver = []
            y_opt_last = 0
            y_silver_last = 0

            for x_val in x:
                if x_val in x_pareto:
                    y_opt.append(df_pareto[df_pareto["values_1"] == x_val]["values_0"].values[0])
                else:
                    y_opt.append(y_opt_last)
                y_opt_last = y_opt[-1]
                if x_val in x_silver:
                    y_silver.append(df_silver_pareto[df_silver_pareto["values_1"] == x_val]["values_0"].values[0])
                else:
                    y_silver.append(y_silver_last)
                y_silver_last = y_silver[-1]

            x_scaled = [xx * obj2_scale for xx in x]
            y_opt_scaled = [yy * obj1_scale for yy in y_opt]
            y_silver_scaled = [yy * obj1_scale for yy in y_silver]

            if silver_bullets_can_dominate:
                # Red where the study frontier is above the silver frontier ...
                lower_of_opt = [min(opt, silver) for opt, silver in zip(y_opt, y_silver)]
                ax.fill_between(
                    x_scaled,
                    y_opt_scaled,
                    y2=[yy * obj1_scale for yy in lower_of_opt],
                    step="post",
                    color="tomato",
                    alpha=0.4,
                    zorder=1,
                )

                # ... and blue where the silver frontier is above the study frontier.
                lower_of_silver = [min(silver, opt) for silver, opt in zip(y_silver, y_opt)]
                ax.fill_between(
                    x_scaled,
                    y_silver_scaled,
                    y2=[yy * obj1_scale for yy in lower_of_silver],
                    step="post",
                    color="dodgerblue",
                    alpha=0.4,
                    zorder=1,
                )
            else:
                ax.fill_between(
                    x_scaled,
                    y_opt_scaled,
                    y2=y_silver_scaled,
                    step="post",
                    color="tomato",
                    alpha=0.4,
                    zorder=1,
                )

            ax.step(
                df_silver_pareto["values_1"] * obj2_scale,
                df_silver_pareto["values_0"] * obj1_scale,
                where="post",
                marker="o",
                color="black",
                label="Silver Bullet-Frontier",
                zorder=2,
                markersize=0,
                linestyle=":",
                linewidth=1.5,
            )

            area_pct = get_relative_area(df_pareto, df_silver_pareto)

            # Gray area under the silver frontier, extended to the last point of
            # the study frontier when that one reaches further right.
            x = list(df_silver_pareto["values_1"] * obj2_scale)
            y_opt = list(df_silver_pareto["values_0"] * obj1_scale)
            if df_pareto["values_1"].iloc[-1] > df_silver_pareto["values_1"].iloc[-1]:
                x.append(df_pareto["values_1"].iloc[-1] * obj2_scale)
                y_opt.append(df_pareto["values_0"].iloc[-1] * obj1_scale)
            ax.fill_between(
                x,
                y_opt,
                y2=0,
                step="post",
                color="lightgray",
                alpha=0.4,
                zorder=1,
                label=f"Pareto-fraction: {area_pct:.2%}",
            )
            pareto_areas.append(area_pct)

        # Axis labels only on the left column and on the bottom row.
        if plot_idx % n_cols == 0:
            ax.set_ylabel(obj1_name)
        if plot_idx >= (n_rows - 1) * n_cols:
            ax.set_xlabel(obj2_name)

        ax.set_xscale('log')
        ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, _: f'{100*value:.0f}%'))

        legend = ax.legend(
            loc="lower right",
            framealpha=0.8,
            title=titles.get(study_name, study_name),
            fontsize=7,
        )
        legend.get_title().set_fontweight('bold')
        legend.get_title().set_fontsize(8)

        ax.yaxis.set_minor_locator(AutoMinorLocator())
        ax.yaxis.grid(which='major', linestyle='--', linewidth=0.5)
        ax.yaxis.grid(which='minor', linestyle=':', linewidth=0.5)

        plot_idx += 1

    # Remove the unused panels of the last row.
    for k in range(plot_idx, n_rows * n_cols):
        row, col = divmod(k, n_cols)
        fig.delaxes(axes[row][col])

    if df_sb is not None:
        # NOTE(review): this adjusts only the panel of the last plotted study,
        # using that study's data -- confirm this is not meant for every panel.
        y_min = min(df_silver["values_0"].min(), df_pareto["values_0"].min())
        y_max = max(df_silver["values_0"].max(), df_pareto["values_0"].max())
        ax.set_ylim(y_min - 0.01, y_max + 0.01)

    plt.tight_layout()
    plt.savefig(
        SEEDING_RESULTS_DIR / (file_name + ".pdf"),
        bbox_inches="tight",
        dpi=300,
        transparent=False,
    )
    plt.savefig(
        SEEDING_RESULTS_DIR / (file_name + ".png"),
        bbox_inches="tight",
        dpi=300,
        transparent=False,
    )
    plt.show()

    if df_sb is not None:
        print(f"Average Pareto-fraction: {sum(pareto_areas) / len(pareto_areas):.2%}")

In [None]:
# Training studies without the silver-bullet overlay (df_sb is not passed).
# silver_bullets_can_dominate=True makes plot_experiments drop the rows
# flagged as seeding trials, so only the remaining trials are shown.
plot_experiments(
    df=df,
    study_names=study_names,
    obj1_name=OBJ_NAMES[OBJ1],
    obj2_name=OBJ_NAMES[OBJ2],
    n_cols=2,
    file_name="silver_bullets-optimization_only",
    silver_bullets_can_dominate=True,
)

In [None]:
# Same training studies, now with the silver bullets overlaid on each panel.
plot_experiments(
    df=df,
    study_names=study_names,
    obj1_name=OBJ_NAMES[OBJ1],
    obj2_name=OBJ_NAMES[OBJ2],
    n_cols=2,
    file_name="silver_bullets-training",
    df_sb=df_sb,
    silver_bullets_can_dominate=True,
)

In [None]:
# For every silver-bullet flow, collect the studies in which that flow lies on
# the study's own Pareto frontier.
sb_flows_on_pareto = {}
for sb_flow in df_sb["user_attrs_flow"].values:
    sb_flows_on_pareto[sb_flow] = []

for study_name_test in study_names:
    if study_name_test not in TITLES:
        continue

    df_trials = df[df["study_name"] == study_name_test]
    pareto_mask = get_pareto_mask(df_trials)
    df_pareto = df_trials[pareto_mask].sort_values(by="values_0", ascending=True)

    # Frontier flows, re-serialized so they compare equal to the silver-bullet keys.
    pareto_flow_keys = {json.dumps(json.loads(raw)) for raw in df_pareto["user_attrs_flow"]}

    for sb_flow, studies in sb_flows_on_pareto.items():
        if sb_flow in pareto_flow_keys:
            studies.append(study_name_test)

print("Silver Bullet Flows (low-cost to high-accuracy):")
print("-------------------------------------------------")
for position, (flow, studies) in enumerate(sb_flows_on_pareto.items(), start=1):
    print(f"Silver bullet {position} is {len(studies)} times Pareto-optimal: {flow}")

In [None]:
# Display the flow -> list-of-studies mapping as the cell output.
sb_flows_on_pareto

In [None]:
# Summary table: one row per silver bullet with its averaged normalized
# objectives, a few flow parameters, and the number of studies in which the
# flow is Pareto-optimal.
obj1_column = OBJ_NAMES[obj1_name]
obj2_column = OBJ_NAMES[obj2_name]

df_table = pd.DataFrame(
    index=sb_flows_on_pareto.keys(),
    columns=[obj1_column, obj2_column, "RAG mode", "Response Synthesizer", "Embedding Model", "Num. Dominating"],
)

df_sb_flow = df_sb.set_index("user_attrs_flow", drop=True)

for flow_str in df_sb_flow.index:
    assert flow_str in sb_flows_on_pareto.keys(), f"Flow {flow_str} is not in: {sb_flows_on_pareto.keys()}"
    flow = json.loads(flow_str)
    row_values = {
        obj1_column: df_sb_flow.loc[flow_str, "values_0_normalized"],
        obj2_column: df_sb_flow.loc[flow_str, "values_1_normalized"],
        "RAG mode": flow["rag_mode"],
        "Response Synthesizer": flow.get("response_synthesizer_llm", "None"),
        "Embedding Model": flow.get("rag_embedding_model", "None"),
        "Num. Dominating": len(sb_flows_on_pareto[flow_str]),
    }
    for column, value in row_values.items():
        df_table.loc[flow_str, column] = value

# Drop the flow strings from the index, order by first objective (descending)
# then second objective (ascending), and give the columns their final names.
df_table = (
    df_table.reset_index(drop=True)
    .sort_values(by=[obj1_column, obj2_column], ascending=[False, True])
    .reset_index(drop=True)
    .rename(columns={
        obj1_column: f"{obj1_column} (Avg Norm.)",
        obj2_column: f"{obj2_column} (Avg Norm.)",
    })
)

df_table

In [None]:
# Render the summary table as LaTeX (no index column, two-decimal floats,
# no escaping of special characters) and print it.
latex_str = df_table.to_latex(index=False, escape=False, float_format="%.2f", column_format="rrlllr")
print(latex_str)

In [None]:
# Display the silver-bullet table as the cell output.
df_sb

In [None]:
# Patterns selecting the held-out (test) studies.
INCLUDE_REGEX_TEST = [
    # "silver1--out-of-sample.*",
    "seeding1--testing-silver.*",
    # "seeding1--testing-random.*",
]
EXCLUDE_REGEX_TEST = []

# Display titles of the test studies; only studies listed here are loaded and plotted.
TITLES_TEST = {
    # "silver1--out-of-sample--drdocs_hf": "DRDocs",
    # "silver1--out-of-sample--financebench_hf": "FinanceBench",
    # "silver1--out-of-sample--hotpotqa_hf-train_hard--train_hard": "HotpotQA",
    # "silver1--out-of-sample--infinitebench_hf--longbook_qa_eng": "InfiniteBench",
    # "silver1--out-of-sample--multihoprag_hf": "MultiHop-RAG",
    # "silver1--out-of-sample--phantomwikiv050_hf--depth_20_size_10000_seed_3": "PhantomWiki",
    # ------------------------------------------------------------------------------------------
    "seeding1--testing-silver--bright_hf--biology": "Bright Biology (Silver Bullets)",
    "seeding1--testing-silver--drdocs_hf": "DRDocs (Silver Bullets)",
    "seeding1--testing-silver--infinitebench_hf--longbook_qa_eng": "InfiniteBench (Silver Bullets)",
    "seeding1--testing-silver--phantomwikiv050_hf--depth_20_size_10000_seed_3": "PhantomWiki (Silver Bullets)",
    # -------------------------------------------------------------------------------------------
    # "seeding1--testing-random--bright_hf--biology": "Bright Biology (Random Seeding)",
    # "seeding1--testing-random--drdocs_hf": "DRDocs (Random Seeding)",
    # "seeding1--testing-random--infinitebench_hf--longbook_qa_eng": "InfiniteBench (Random Seeding)",
    # "seeding1--testing-random--phantomwikiv050_hf--depth_20_size_10000_seed_3": "PhantomWiki (Random Seeding)",
}

study_names_test = get_study_names(
    include_regex=INCLUDE_REGEX_TEST,
    exclude_regex=EXCLUDE_REGEX_TEST,
)

# Load the test studies (success_rate falls back to get_data's default).
df_test = get_data(study_names_test, titles=TITLES_TEST, restrict_to_title=True)

# Overlay the silver bullets (df_sb) on the test studies.
plot_experiments(
    df=df_test,
    study_names=study_names_test,
    obj1_name=OBJ_NAMES[OBJ1],
    obj2_name=OBJ_NAMES[OBJ2],
    df_sb=df_sb,
    titles=TITLES_TEST,
    n_cols=2,
    file_name="silver_bullets-test",
    silver_bullets_can_dominate=True,
)

In [None]:
# in_sample_data = {}
# for study_name in study_names:
#     in_sample_data[study_name] = get_data(
#         study_names=[study_name],
#         success_rate=SUCCESS_RATE,
#         titles=TITLES,
#         restrict_to_title=True,
#     )

# relative_areas = {}
# for study_name_test in study_names_test:
#     df_ref = get_data(
#         study_names=[study_name_test], 
#         success_rate=SUCCESS_RATE, 
#         titles=TITLES_TEST, 
#         restrict_to_title=True,
#     )
#     relative_areas[study_name_test] = {}
#     for study_name in study_names:
#         df_in_sample = in_sample_data[study_name]
#         pareto_mask = get_pareto_mask(df_in_sample)
#         df_pareto = df_in_sample[pareto_mask]
#         df_other = df_ref[df_ref["user_attrs_flow"].isin(df_pareto["user_attrs_flow"])].copy()
#         relative_area = get_relative_area(df_ref, df_other)
#         relative_areas[study_name_test][study_name] = relative_area

# avg_relative_areas = {}
# for study_name_test in study_names_test:
#     avg_relative_areas[study_name_test] = sum(relative_areas[study_name_test].values()) / len(relative_areas[study_name_test])
#     print(f"Average relative area for {study_name_test}: {avg_relative_areas[study_name_test]:.2%}")

# print("-" * 100)
# print(f"Average relative area for all test studies: {sum(avg_relative_areas.values()) / len(avg_relative_areas):.2%}")

In [None]:
import typing as T


def get_avg_pareto_fraction(
        df: pd.DataFrame, 
        selected_flows: T.Iterable[str], 
        seeding_trials_can_dominate: bool = False) -> float:
    """Return the mean relative Pareto area of the selected flows over all studies.

    For each study in ``df`` the rows whose ``user_attrs_flow`` is one of
    ``selected_flows`` are compared, via ``get_relative_area``, against the
    Pareto frontier of the study's trials. Studies in which fewer than two of
    the selected flows occur are left out of the average.

    Args:
        df: Trials with ``study_name``, ``user_attrs_flow``, ``values_0`` and
            ``values_1`` columns (plus ``user_attrs_is_seeding`` when
            ``seeding_trials_can_dominate`` is true).
        selected_flows: Flow identifiers as they appear in ``user_attrs_flow``
            (JSON strings); any iterable, including a NumPy array.
        seeding_trials_can_dominate: If true, rows flagged as seeding trials are
            removed before the reference frontier is computed.

    Returns:
        The average relative area, or ``0.0`` if no study qualified.
    """
    # Materialize once so generators and other one-shot iterables work with isin.
    selected_flows = list(selected_flows)

    area_pct_list = []
    study_names = df["study_name"].unique()
    for study_name in study_names:
        df_trials = df[df["study_name"] == study_name].copy()
        df_sb_selected = df_trials[df_trials["user_attrs_flow"].isin(selected_flows)].copy()
        if len(df_sb_selected) < 2:
            continue
        if seeding_trials_can_dominate:
            # Element-wise comparison on purpose: the column may be object-typed.
            df_trials = df_trials[df_trials["user_attrs_is_seeding"] == False].copy()  # noqa: E712
        pareto_mask = get_pareto_mask(df_trials)
        df_pareto = df_trials[pareto_mask]
        area_pct = get_relative_area(df_pareto, df_sb_selected)
        area_pct_list.append(area_pct)
    if not area_pct_list:
        return 0.0
    return sum(area_pct_list) / len(area_pct_list)

In [None]:
import concurrent.futures
import itertools


def select_k_from_n(n, k):
    """Return every k-element combination of the indices 0..n-1 as a list of tuples."""
    return list(itertools.combinations(range(n), k))


def evaluate_combination(combination, df: pd.DataFrame, df_sb: pd.DataFrame):
    """Score one selection of silver bullets.

    ``combination`` holds row positions into ``df_sb``. Returns the flows at
    those positions together with their average Pareto fraction on ``df``.
    """
    row_positions = list(combination)
    selected_flows = df_sb["user_attrs_flow"].iloc[row_positions].values
    return selected_flows, get_avg_pareto_fraction(df, selected_flows)


def get_optimal_silver_bullet(
    num_silver_bullets: int,
    df: pd.DataFrame,
    df_sb: pd.DataFrame,
    study_names: list[str] | None = None,
):
    assert num_silver_bullets > 1, "Number of silver bullets must be greater than 1."
    assert len(df_sb) >= num_silver_bullets, f"Not enough silver bullets available: {len(df_sb)} < {num_silver_bullets}."

    if study_names:
        df = df[df["study_name"].isin(study_names)].copy()

    with concurrent.futures.ProcessPoolExecutor() as executor:
        combination = select_k_from_n(len(df_sb), num_silver_bullets)
        results = list(executor.map(evaluate_combination, combination, [df]*len(combination), [df_sb]*len(combination)))

    if not results:
        return None, 0
    
    return max(results, key=lambda x: x[1])

In [None]:
# from tqdm import tqdm

# frac_by_num_flows = {}

# for i in tqdm(range(2, len(df_sb) + 1), desc="Finding optimal silver bullets"):
#     sb_flows_best, area_pct_best_in_sample = get_optimal_silver_bullet(
#         num_silver_bullets=i,
#         df=df,
#         df_sb=df_sb,
#         study_names=study_names,
#     )
#     if sb_flows_best is None:
#         continue
#     area_pct_best_out_of_sample = get_avg_pareto_fraction(df_test, sb_flows_best, seeding_trials_can_dominate=True)
#     frac_by_num_flows[i] = {
#         "area_pct_best_in_sample": area_pct_best_in_sample,
#         "area_pct_best_out_of_sample": area_pct_best_out_of_sample,
#         "flows": sb_flows_best,
#     }

In [None]:
# import matplotlib.pyplot as plt
# fig, ax = plt.subplots(figsize=(8, 5))
# x = list(frac_by_num_flows.keys())
# y_in_sample = [frac_by_num_flows[i]["area_pct_best_in_sample"] for i in x]
# y_of_sample = [frac_by_num_flows[i]["area_pct_best_out_of_sample"] for i in x]
# ax.plot(x, y_in_sample, marker='o', label='In-sample', color='gray')
# ax.plot(x, y_of_sample, marker='o', label='Out-of-sample', color='limegreen')
# ax.set_xlabel("Number of Silver Bullets (In-Sample Optimal Selection)")
# ax.set_ylabel("Average Pareto Area Fraction")
# ax.set_title("Average Pareto Area Fraction by Number of Silver Bullets")
# ax.legend()
# plt.xticks(x)
# plt.grid()
# plt.tight_layout()
# plt.show()

In [None]:

# TITLES_COMPARISON = {
#     "seeding1--testing-random--bright_hf--biology": "Bright Biology (Random Seeding)",
#     "seeding1--testing-random--drdocs_hf": "DRDocs (Random Seeding)",
#     "seeding1--testing-random--infinitebench_hf--longbook_qa_eng": "InfiniteBench (Random Seeding)",
#     "seeding1--testing-random--phantomwikiv050_hf--depth_20_size_10000_seed_3": "PhantomWiki (Random Seeding)",
#     "seeding1--testing-silver--bright_hf--biology": "Bright Biology (Silver Bullets)",
#     "seeding1--testing-silver--drdocs_hf": "DRDocs (Silver Bullets)",
#     "seeding1--testing-silver--infinitebench_hf--longbook_qa_eng": "InfiniteBench (Silver Bullets)",
#     "seeding1--testing-silver--phantomwikiv050_hf--depth_20_size_10000_seed_3": "PhantomWiki (Silver Bullets)",
#     "seeding1--testing-transfer--bright_hf--biology": "Bright Biology (Transfer Learning)",
#     "seeding1--testing-transfer--drdocs_hf": "DRDocs (Transfer Learning)",
#     "seeding1--testing-transfer--infinitebench_hf--longbook_qa_eng": "InfiniteBench (Transfer Learning)",
#     "seeding1--testing-transfer--phantomwikiv050_hf--depth_20_size_10000_seed_3": "PhantomWiki (Transfer Learning)",
# }