## Embeddings
We encode flow dictionaries, which contain all parameters of a flow, using text embedding models. The resulting embeddings give us a notion of similarity between flows, which lets us recommend a diverse set of flows to seed an optimization.

In [None]:
# (Re)load IPython's autoreload extension and switch it to mode 2, so that
# imported modules are reloaded before each cell runs and local code edits
# take effect without restarting the kernel.
%reload_ext autoreload
%autoreload 2

from IPython.core import ultratb

# Override the highlight style used by IPython's verbose tracebacks with a
# dark purple background (presumably for readability on the author's theme).
ultratb.VerboseTB.tb_highlight = "bg:#3e0054"

In [None]:
import optuna
import pandas as pd

from syftr.configuration import cfg
from syftr.optuna_helper import get_completed_trials, recreate_locally

# Name of the Optuna study analysed throughout this notebook.
STUDY_NAME = "buzok-ss-financebench-rag-only-04"

# Optuna storage handle taken from the project configuration.
STORAGE = cfg.postgres.get_optuna_storage()

# Load the study and collect its completed trials into a DataFrame.
study = optuna.load_study(storage=STORAGE, study_name=STUDY_NAME)
df_trials: pd.DataFrame = get_completed_trials(study)

# The figures produced below are saved into this directory; create it if needed.
cfg.paths.results_dir.mkdir(exist_ok=True, parents=True)

print(f"The study '{STUDY_NAME}' has {len(df_trials)} completed trials")

In [None]:
# Sanity check: no trial may report a second objective (latency) of exactly zero.
assert not (df_trials["values_1"] == 0).any()

# Keep only trials that recorded their flow, order them by latency and
# renumber the rows 0..n-1 (later cells address rows by that number).
df_trials = (
    df_trials.loc[df_trials["user_attrs_flow"].notna()]
    .sort_values(by="values_1")
    .reset_index(drop=True)
)

# Rescale latency by its maximum so that trials are compared on a relative scale.
df_trials["values_1"] = df_trials["values_1"].div(df_trials["values_1"].max())

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial import distance


def calculate_distances_to_curve(data, curve_func, num_points=1000):
    """Return each point's Euclidean distance to a sampled curve.

    The curve is approximated by ``num_points`` samples spaced evenly between
    the smallest and the largest ``values_1`` found in ``data``. The distance
    of a point is the distance to its nearest curve sample.

    Args:
        data: DataFrame with the columns ``values_1`` (x coordinate) and
            ``values_0`` (y coordinate).
        curve_func: Callable that maps an array of x values to the array of
            the curve's y values.
        num_points: Number of samples taken along the curve.

    Returns:
        1-D float array holding one distance per row of ``data``, in row order.
    """
    points = data[["values_1", "values_0"]].to_numpy()
    if points.shape[0] == 0:
        # Nothing to measure; avoid sampling a curve over an undefined range.
        return np.zeros(0)

    x_sample = np.linspace(data["values_1"].min(), data["values_1"].max(), num_points)
    curve_points = np.column_stack((x_sample, curve_func(x_sample)))

    # A single pairwise-distance call for all points replaces one call per row.
    pairwise = distance.cdist(points, curve_points, metric="euclidean")
    return pairwise.min(axis=1)

In [None]:
from paretoset import paretoset

# Latency range covered by the trials; reused below to sample the front.
x_min = df_trials["values_1"].min()
x_max = df_trials["values_1"].max()

# Pareto-optimal trials: maximize values_0 (accuracy), minimize values_1 (latency).
pareto_mask = paretoset(df_trials[["values_0", "values_1"]], sense=["max", "min"])
df_pareto = df_trials[pareto_mask]


def pareto_curve(x):
    """Linearly interpolate the Pareto front's accuracy at latency ``x``.

    ``np.interp`` expects increasing x coordinates, so this relies on
    ``df_pareto`` being ordered by ``values_1``.
    """
    return np.interp(x, df_pareto["values_1"].values, df_pareto["values_0"].values)


distances = calculate_distances_to_curve(df_trials, pareto_curve)
df_trials["distance"] = distances
max_distance = max(distances)

# Draw every trial as a hollow circle whose area grows with its distance to
# the front. Iterating the columns positionally keeps the plot independent of
# the index labels of df_trials.
for latency, accuracy, dist in zip(
    df_trials["values_1"], df_trials["values_0"], distances
):
    plt.scatter(
        latency,
        accuracy,
        s=dist * 100,
        edgecolor="gray",
        facecolor="none",
        # Only the farthest trial contributes the legend entry.
        label="Non-Pareto Trial" if dist == max_distance else None,
    )

plt.scatter(
    df_pareto["values_1"],
    df_pareto["values_0"],
    label="Pareto Trial",
    s=20,
    color="tomato",
)

# Sample the interpolated front over the full latency range for plotting.
x_sample = np.linspace(start=x_min, stop=x_max, num=1000)
y_sample = pareto_curve(x_sample)
plt.plot(x_sample, y_sample, label="Pareto Front", color="tomato")
plt.legend()
# values_1 holds latency divided by its maximum, so label the axis accordingly.
plt.xlabel("Latency (normalized)")
plt.ylabel("Accuracy")
plt.title(f"Study '{STUDY_NAME}' with Distance to Pareto Front")

plt.tight_layout()
plt.savefig(cfg.paths.results_dir / f"{STUDY_NAME}-pareto.png")
plt.show()

In [None]:
from syftr.huggingface_helper import get_embedding_model

# Identifier of the text embedding model used to encode the flows.
embedding_model = "BAAI/bge-small-en-v1.5"
# The helper returns a pair; only its first element is needed here.
embedder, _ = get_embedding_model(embedding_model)


def embed_text(text):
    """Return the embedding of ``text`` computed by the module-level ``embedder``."""
    # NOTE(review): this requests a *query* embedding; confirm that this is
    # intended for encoding flows rather than a document/text embedding.
    vector = embedder.get_query_embedding(text)
    return vector


# One embedding per trial, in the row order of df_trials.
embeddings = np.array(list(map(embed_text, df_trials["user_attrs_flow"])))

In [None]:
from sklearn.manifold import TSNE

# scikit-learn requires perplexity < n_samples. Clamp it so that studies with
# 100 or fewer embedded trials do not fail with a ValueError.
tsne_perplexity = min(100, len(embeddings) - 1)
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
# Project the flow embeddings to two dimensions for plotting.
embeddings_2d = tsne.fit_transform(embeddings)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))

# Each dot is one trial placed at its 2-D t-SNE coordinates and coloured by
# its distance to the Pareto front.
sc = plt.scatter(
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
    s=5,
    alpha=0.7,
    cmap="RdBu",
    c=df_trials["distance"],
)
plt.colorbar(sc, label="Distance to Pareto-Frontier")

ax = plt.gca()
ax.set_title(
    f"'{embedding_model}'-embeddings of JSON-Encoded Flows\nand Distance to Pareto-Frontier for Study '{STUDY_NAME}'"
)
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")

# Hide the tick marks on both axes.
ax.set_xticks([])
ax.set_yticks([])

plt.tight_layout()
plt.savefig(cfg.paths.results_dir / f"{STUDY_NAME}-clustering.png")
plt.show()

In [None]:
# Side-by-side figure: objective space with the Pareto front on the left,
# t-SNE projection of the flow embeddings on the right.
fig, (ax1, ax2) = plt.subplots(
    1, 2, figsize=(10, 4), gridspec_kw={"width_ratios": [0.9, 1], "wspace": 0.1}
)

# Left panel: one hollow circle per trial, sized by its distance to the front.
# df_trials carries a 0..n-1 index, so positional iteration visits the same
# rows as a lookup by row number.
for latency, accuracy, dist in zip(
    df_trials["values_1"], df_trials["values_0"], distances
):
    ax1.scatter(
        latency,
        accuracy,
        s=dist * 100,
        edgecolor="gray",
        facecolor="none",
        # Only the farthest trial contributes the legend entry.
        label="Non-Pareto Trial" if dist == max_distance else None,
    )

ax1.scatter(
    df_pareto["values_1"],
    df_pareto["values_0"],
    color="tomato",
    s=20,
    label="Pareto Trial",
)

ax1.plot(x_sample, y_sample, color="tomato", label="Pareto Front")
ax1.legend()
ax1.set(
    xlabel="Latency (normalized)",
    ylabel="Accuracy",
    title=f"Study '{STUDY_NAME}'",
)

# Right panel: trials at their t-SNE coordinates, coloured by distance to the front.
sc = ax2.scatter(
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
    s=5,
    alpha=0.7,
    cmap="RdBu",
    c=df_trials["distance"],
)
fig.colorbar(sc, label="Distance to Pareto Front")
ax2.set(
    title="Embeddings of Encoded Pipelines",
    xlabel="t-SNE 1",
    ylabel="t-SNE 2",
    xticks=[],
    yticks=[],
)

plt.tight_layout()
plt.savefig(cfg.paths.results_dir / f"{STUDY_NAME}-pareto-clustering.png")
plt.show()