## Distributions
When computing the confidence intervals used by the Pareto-Pruner, we assume that accuracy scores are normally distributed and that cost and latency scores are log-normally distributed. This notebook motivates these assumptions visually.

In [None]:
# Notebook setup: (re)load IPython's autoreload extension and set it to mode 2,
# so edited modules are re-imported before each cell runs.
%reload_ext autoreload
%autoreload 2

from IPython.core import ultratb

# Override the style string IPython's verbose traceback formatter uses for
# highlighting (presumably the background of the failing line — the value is a
# "bg:<hex colour>" style spec).
ultratb.VerboseTB.tb_highlight = "bg:#3e0054"

In [None]:
# Display title for every study, keyed by the study's identifier.
NAMES = {
    "rank1--alex--crag_hf-music": "CRAG3 Music",
    "rank1--alex--crag_hf-sports": "CRAG3 Sports",
    "rank1--alex--drdocs_hf": "DRDocs",
    "rank1--alex--financebench_hf": "FinanceBench",
    "rank1--alex--hotpotqa_hf-train_hard": "HotpotQA Train-Hard",
    "rank1--alex--infinitebench_hf": "InfiniteBench",
}

# Studies to analyse, in the insertion order of NAMES. Replace with an explicit
# list to restrict the notebook to a subset of studies.
STUDY_NAMES = list(NAMES)

# Benchmark identifier and the prefix derived from it.
SUBSTRING = "bench14--small-models"
PREFIX = f"{SUBSTRING}--"

# Human-readable name of the second optimisation objective (stored in
# the "values_1" column); used in plot titles, axis labels and file names.
OBJECTIVE_2 = "average cost"



In [None]:
from syftr.configuration import cfg

# Output directory for the figures produced by this notebook.
distributions_dir = cfg.paths.results_dir / "distributions"
# parents=True so a fresh checkout, where the results directory itself may not
# exist yet, does not fail with FileNotFoundError.
distributions_dir.mkdir(parents=True, exist_ok=True)

In [None]:
from syftr.optuna_helper import get_completed_trials

# Fetch the completed trials of all configured studies into one DataFrame.
# NOTE(review): success_rate=0.9 is presumably a minimum per-trial success
# rate filter — confirm against get_completed_trials.
df_trials = get_completed_trials(study=STUDY_NAMES, success_rate=0.9)

# Optional export of the raw objective values:
# df_trials[["values_0", "values_1", "study_name"]].to_csv(cfg.paths.results_dir / "accuracy_and_latency.csv", index=False)

# Display the table as the cell output.
df_trials

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import beta, norm


def _fit_label(approximation, params, p_start, mse):
    """Build the legend text for a fitted density curve.

    ``params`` is the already formatted parameter line (may be empty);
    the percentile cut-off is only mentioned when ``p_start`` is positive.
    """
    lines = [f"{approximation.title()} approximation"]
    mse_text = f"MSE = {mse:.2f}"
    if p_start > 0:
        if params:
            lines.append(params)
        # Raw f-string: "\g" must reach matplotlib's mathtext untouched.
        lines.append(rf"accuracy $\geq$ P{p_start}, {mse_text}")
    elif params:
        lines.append(f"{params}, {mse_text}")
    else:
        lines.append(mse_text)
    return "\n".join(lines)


def plot_accuracies(df_trials, study_name, approximation="normal", p_start=0):
    """Plot the accuracy histogram of one study together with a fitted density.

    The accuracies are read from the ``values_0`` column of the rows whose
    ``study_name`` matches. Values below the ``p_start``-th percentile are
    discarded before fitting. The figure is saved as PNG and PDF into
    ``distributions_dir`` and then shown.

    Args:
        df_trials: DataFrame with at least the columns ``study_name`` and
            ``values_0``.
        study_name: Study to plot; must be a key of ``NAMES``.
        approximation: ``"normal"`` or ``"beta"`` (both fitted by moments);
            any other value is treated as a uniform density.
        p_start: Percentile (0-100) below which accuracies are dropped.

    Returns:
        Dict with the keys ``Study``, ``Approximation``, ``MSE`` and
        ``P-Start``; ``MSE`` is the mean squared difference between the
        histogram densities and the fitted density at the bin centres.

    Raises:
        ValueError: If ``df_trials`` holds no rows for ``study_name``.
    """
    accuracies = df_trials.loc[df_trials["study_name"] == study_name, "values_0"].values
    if len(accuracies) == 0:
        raise ValueError(f"No trials found for study {study_name!r}")

    percentile = np.percentile(accuracies, p_start)
    data_percentile = accuracies[accuracies >= percentile]
    mu, sigma = np.mean(data_percentile), np.std(data_percentile)
    x = np.linspace(min(data_percentile), max(data_percentile), 100000)

    # Square-root rule for the bin count; n_bins bins need n_bins + 1 edges.
    n_bins = max(1, int(np.sqrt(len(data_percentile))))
    bins = np.linspace(x[0], x[-1], n_bins + 1)
    hist, bin_edges = np.histogram(data_percentile, bins=bins, density=True)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    fig, ax = plt.subplots()
    # One vectorised call draws all bars and yields a single legend entry.
    ax.bar(
        bin_centers,
        hist,
        width=np.diff(bin_edges),
        color="skyblue",
        label="Average accuracy",
        alpha=0.6,
    )

    if approximation == "normal":
        approx = norm.pdf(x, mu, sigma)
        mse = np.mean((hist - norm.pdf(bin_centers, mu, sigma)) ** 2)
        params = rf"$\mu={mu:.2f}, \sigma={sigma:.2f}$"
    elif approximation == "beta":
        # Method-of-moments estimates of the beta shape parameters.
        a = mu * (mu * (1 - mu) / sigma**2 - 1)
        b = a * (1 - mu) / mu
        approx = beta.pdf(x, a, b)
        mse = np.mean((hist - beta.pdf(bin_centers, a, b)) ** 2)
        params = f"$a={a:.2f}, b={b:.2f}$"
    else:
        # Uniform density between the percentile cut-off and the maximum.
        # NOTE(review): for p_start > 0 the density is scaled by the retained
        # fraction (100 - p_start) / 100, whereas the histogram is normalised
        # over the truncated sample only — confirm this is intended.
        height = 1 / (x[-1] - percentile)
        if p_start > 0:
            height = (100 - p_start) / (100 * (x[-1] - percentile))
        approx = np.ones_like(x) * height
        mse = np.mean((hist - approx[0]) ** 2)
        params = ""

    label = _fit_label(approximation, params, p_start, mse)

    ax.plot(x, approx, "r", label=label)
    ax.set_title(f"Accuracy distribution for {NAMES[study_name]}")
    ax.set_xlabel("average accuracy")
    ax.set_ylabel("density")
    ax.legend()
    ax.grid(True)
    stem = f"{study_name}-accuracy-distribution-{approximation}-p{p_start}"
    fig.savefig(distributions_dir / f"{stem}.png", dpi=300)
    fig.savefig(distributions_dir / f"{stem}.pdf", dpi=300)
    plt.show()
    return {"Study": study_name, "Approximation": approximation, "MSE": mse, "P-Start": p_start}


# Fit every study and collect the fit errors; the commented-out calls are
# alternative approximations / percentile cut-offs that can be enabled to
# compare fits.
errors = []
for study_name in STUDY_NAMES:
    errors.append(plot_accuracies(df_trials, study_name, approximation="normal"))
    # errors.append(plot_accuracies(df_trials, study_name, approximation="beta"))
    # errors.append(plot_accuracies(df_trials, study_name, approximation="uniform"))
    # errors.append(plot_accuracies(df_trials, study_name, approximation="normal", p_start=20))
    # errors.append(plot_accuracies(df_trials, study_name, approximation="beta", p_start=20))
    # errors.append(plot_accuracies(df_trials, study_name, approximation="uniform", p_start=20))

errors = pd.DataFrame(errors)
# Per study, show the row with the smallest MSE. Select the MSE column before
# idxmin so the string column "Approximation" is not reduced as well (which
# can raise TypeError on recent pandas versions).
errors.loc[errors.groupby("Study")["MSE"].idxmin()]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import lognorm


def plot_objective_2(df_trials, study_name, approximation):
    """Plot the second objective's histogram of one study with a fitted density.

    The values are read from the ``values_1`` column of the rows whose
    ``study_name`` matches. A log-normal density is fitted from the mean and
    standard deviation of the log-values, drawn over a histogram with
    logarithmically spaced bins, saved as PNG and PDF into
    ``distributions_dir`` and shown.

    Args:
        df_trials: DataFrame with at least the columns ``study_name`` and
            ``values_1``. Values are passed through ``np.log``, so they are
            assumed to be strictly positive.
        study_name: Study to plot; must be a key of ``NAMES``.
        approximation: Only ``"log-normal"`` is supported.

    Returns:
        Dict with the keys ``Study``, ``Approximation`` and ``MSE``; ``MSE``
        is the mean squared difference between the histogram densities and
        the fitted density at the bin centres.

    Raises:
        ValueError: If ``approximation`` is not supported or ``df_trials``
            holds no rows for ``study_name``.
    """
    # Validate before any drawing so a bad argument leaves no half-made plot.
    if approximation != "log-normal":
        raise ValueError(f"Approximation {approximation} not supported")

    data = df_trials.loc[df_trials["study_name"] == study_name, "values_1"].values
    if len(data) == 0:
        raise ValueError(f"No trials found for study {study_name!r}")

    log_data = np.log(data)
    mu, sigma = np.mean(log_data), np.std(log_data)

    # Log-spaced bins: n_bins bins need n_bins + 1 edges.
    n_bins = 2 * int(np.sqrt(len(data)))
    bins = np.logspace(np.log10(min(data)), np.log10(max(data)), n_bins + 1)
    hist, bin_edges = np.histogram(data, bins=bins, density=True)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    x = np.linspace(min(data), max(data), 100000)

    # scipy parameterises the log-normal by shape=sigma and scale=exp(mu).
    approx = lognorm.pdf(x, sigma, scale=np.exp(mu))
    mse = np.mean((hist - lognorm.pdf(bin_centers, sigma, scale=np.exp(mu))) ** 2)
    # Raw f-string for the mathtext part so "\m" and "\s" are not treated as
    # (invalid) escape sequences.
    label = f"{approximation.title()} Approximation\n" rf"$\mu={mu:.2f}, \sigma={sigma:.2f}$"

    # Fresh figure per call so successive plots never share axes.
    fig, ax = plt.subplots()
    ax.hist(data, bins=bins, density=True, alpha=0.6, color="skyblue", label=f"{OBJECTIVE_2.title()}")
    ax.plot(x, approx, "r", label=label)
    ax.set_xscale("log")
    ax.set_title(f"{OBJECTIVE_2.title()} Distribution for {NAMES[study_name]}")
    ax.set_xlabel(f"{OBJECTIVE_2} (log scale)")
    ax.set_ylabel("density")
    ax.legend()
    ax.grid(True, which="both", linestyle="--", linewidth=0.5)
    fig.savefig(distributions_dir / f"{study_name}-{OBJECTIVE_2}-distribution.png", dpi=300)
    fig.savefig(distributions_dir / f"{study_name}-{OBJECTIVE_2}-distribution.pdf", dpi=300)
    plt.show()
    return {"Study": study_name, "Approximation": approximation, "MSE": mse}


# Fit a log-normal density to the second objective of every study and
# tabulate the resulting fit errors, one row per study.
errors = pd.DataFrame(
    [
        plot_objective_2(df_trials, study_name, approximation="log-normal")
        for study_name in STUDY_NAMES
    ]
)
errors