## Top to Knee
This notebook shows the potential cost savings from switching from the top-accuracy model to the knee point on the cost-accuracy Pareto curve.

In [None]:
# Reload the autoreload extension and enable mode 2 so that edited modules
# are re-imported automatically before each cell runs.
%reload_ext autoreload
%autoreload 2

from IPython.core import ultratb

# Override the class-level highlight style used by verbose tracebacks with a
# dark purple background (hex #3e0054).
ultratb.VerboseTB.tb_highlight = "bg:#3e0054"

In [None]:
from syftr.optuna_helper import get_completed_trials

# Names of the studies to load; each one is later matched against the
# "study_name" column of the loaded trials.
STUDY_NAMES = [
    "bench14--small-models--crag-music",
    "bench14--small-models--crag-sports",
    "bench14--small-models--drdocs",
    "bench14--small-models--financebench",
    "bench14--small-models--hotpot-train-hard",
    "bench14--small-models--infinitebench",
]
# Forwarded as `success_rate` to get_completed_trials.
# NOTE(review): presumably the minimum success rate a trial needs in order to
# be included -- confirm against syftr.optuna_helper.
SUCCESS_RATE = 0.9

# Trials of all studies listed above; the following cell filters this frame
# per study via its "study_name" column.
df_all = get_completed_trials(STUDY_NAMES, success_rate=SUCCESS_RATE)

In [None]:
import pandas as pd
from kneed import KneeLocator

from syftr.configuration import cfg
from syftr.optuna_helper import get_pareto_mask

# Factor applied to the raw cost objective (`values_1`) to report it as
# "cents per 100 calls".
# NOTE(review): assumes `values_1` is dollars per call (x100 for cents,
# x100 for 100 calls) -- confirm against the study definition.
CENTS_PER_100_CALLS = 10000

datasets = list(df_all["user_attrs_dataset"].unique())

# One row per dataset; the columns are created on first assignment below.
df_results = pd.DataFrame(index=datasets)

for study_name in STUDY_NAMES:
    df = df_all[df_all["study_name"] == study_name].copy()
    if df.empty:
        # Without any trials there is neither a dataset name nor a Pareto
        # front to read; `unique()[0]` below would raise an IndexError.
        print(f"Skipping {study_name}: no trials found in df_all")
        continue
    dataset = df["user_attrs_dataset"].unique()[0]

    # Keep only the Pareto-optimal trials, ordered by the cost objective.
    pareto_mask = get_pareto_mask(df)
    df_pareto = df[pareto_mask]
    df_pareto = df_pareto.sort_values(by="values_1")

    # Locate the knee of the accuracy (values_0) over cost (values_1) curve.
    knee = KneeLocator(
        df_pareto["values_1"],
        df_pareto["values_0"],
        curve="concave",
        direction="increasing",
    )
    if knee.knee is None:
        # No knee was detected: the knee mask below is all False and the
        # knee-based columns become NaN. Say so instead of failing silently.
        print(f"No knee point found for {study_name}; knee columns will be NaN")

    top_accuracy = df_pareto["values_0"].max()

    df_pareto["is_knee"] = df_pareto["values_1"] == knee.knee
    df_pareto["is_top_accuracy"] = df_pareto["values_0"] == top_accuracy

    df_top = df_pareto[df_pareto["is_top_accuracy"]]
    df_knee = df_pareto[df_pareto["is_knee"]]

    # Cheapest configuration that reaches the top accuracy vs. the knee point.
    top_cost = CENTS_PER_100_CALLS * df_top["values_1"].min()
    knee_accuracy = df_knee["values_0"].max()
    knee_cost = CENTS_PER_100_CALLS * df_knee["values_1"].min()

    # Changes are expressed as "knee minus top", so savings are negative.
    accuracy_change = knee_accuracy - top_accuracy
    cost_change = knee_cost - top_cost

    row = {
        "study_name": study_name,
        "n_trials": len(df),
        "n_trials_pareto": len(df_pareto),
        "top_accuracy__accuracy": top_accuracy,
        "top_accuracy__cents_per_100_calls": top_cost,
        "knee_point__accuracy": knee_accuracy,
        "knee_point__cents_per_100_calls": knee_cost,
        "accuracy_change__percentage_points": accuracy_change * 100,
        "accuracy_change__percentage": accuracy_change / top_accuracy * 100,
        "cost_change__cents_per_100_calls": cost_change,
        "cost_change__percentage": cost_change / top_cost * 100,
    }
    # Cell-by-cell assignment keeps the column order and dtypes of the
    # exported CSV stable.
    for column, value in row.items():
        df_results.loc[dataset, column] = value


df_results.to_csv(cfg.paths.results_dir / "small-models--top-to-knee.csv")
df_results