## 

## Transfer Learning
This notebook shows how we select and visualize flows from historical studies to use for seeding.

In [None]:
# (Re)load IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to project code are picked up without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

from IPython.core import ultratb

# Override the highlight style that IPython's verbose tracebacks use
# (a background colour spec).
ultratb.VerboseTB.tb_highlight = "bg:#3e0054"

In [None]:
# Probe whether the `syftr` package can be imported from the current working
# directory. If it cannot, step one directory up (presumably to the project
# root -- confirm) so that the real import in the next cell succeeds.
try:
    from syftr.configuration import cfg
except Exception:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit are no longer swallowed, while any ordinary failure during
    # the import still triggers the best-effort fallback.
    import os

    os.chdir("./../")

In [None]:
from syftr.configuration import cfg
from syftr.optuna_helper import get_completed_trials

# --- Selection parameters -------------------------------------------------
# MAX_PARETO_FRONTS is handed to get_top_performing_trials; NUM_REQUIRED is
# both the minimum number of top flows asserted later and the `max_total`
# passed to get_selected_trials.
MAX_PARETO_FRONTS = 2
NUM_REQUIRED = 23
# Forwarded as `success_rate` when loading the completed trials.
SUCCESS_RATE = 0.9

# --- Historical studies whose completed trials are loaded -------------------
STUDY_NAMES = [
    "seeding1--training--crag_hf-music--music",
    "seeding1--training--financebench_hf",
    "seeding1--training--hotpotqa_hf-train_hard--train_hard",
    "seeding1--training--multihoprag_hf",
]

# --- Output -----------------------------------------------------------------
# Toggle for the figure title in the plotting cell.
SHOW_TITLE = False
# Figures are written below the configured results directory; create it if
# it does not exist yet.
RESULTS_DIR = cfg.paths.results_dir / "transfer_learning"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# --- Load -------------------------------------------------------------------
df_trials = get_completed_trials(study=STUDY_NAMES, success_rate=SUCCESS_RATE)
print(f"Loaded {len(df_trials)} trials")

In [None]:
from syftr.transfer_learning import get_top_performing_trials

# Reduce the loaded trials to the top performers. MAX_PARETO_FRONTS is passed
# straight to the helper (presumably the number of Pareto fronts to keep --
# confirm against syftr.transfer_learning).
df_top = get_top_performing_trials(df_trials, MAX_PARETO_FRONTS)

# Fail early if there are fewer candidates than the NUM_REQUIRED flows that the
# following cells want to select.
assert len(df_top) >= NUM_REQUIRED, f"Got {len(df_top)} top-performing flows, expected at least {NUM_REQUIRED}"
print(f"Found {len(df_top)} top-performing flows")

In [None]:
from syftr.transfer_learning import get_selected_trials

# Name of the embedding model handed to the selection helper; the later
# embedding and plotting cells reuse this variable.
embedding_model = "BAAI/bge-large-en-v1.5"
# Choose the flows to use for seeding from the top performers.
# NOTE(review): `max_total` suggests an upper bound, so fewer than
# NUM_REQUIRED rows may come back -- confirm against the helper.
df_selected = get_selected_trials(df_top, embedding_model, max_total=NUM_REQUIRED)
# Bare last expression: render the selected trials as the cell output.
df_selected

In [None]:
import numpy as np
from syftr.huggingface_helper import get_embedding_model

# Only the first element returned by the helper (the embedder) is needed; the
# second is discarded.
embedder, _ = get_embedding_model(embedding_model)

# One embedding per top-performing flow, stacked in df_top row order so that
# row i of `embeddings` corresponds to row i of `df_top`.
flows = df_top["user_attrs_flow"].tolist()
embeddings = np.array(list(map(embedder.get_query_embedding, flows)))

In [None]:
from sklearn.manifold import TSNE

# Project the flow embeddings down to two dimensions for plotting.
# random_state is fixed so that re-running the cell uses the same seed.
# NOTE(review): perplexity=5 is a hand-picked value; t-SNE needs it to be
# smaller than the number of rows in `embeddings`.
tsne = TSNE(n_components=2, perplexity=5, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of the 2-D t-SNE projection of all top-performing flows.
# Selected flows are drawn in green above the gray rejected ones, and every
# point is annotated with the value of its "front" column.

# Rows of `embeddings_2d` are matched to rows of `df_top` by position, so a
# stale projection from an earlier run would silently mislabel points.
assert len(embeddings_2d) == len(df_top), (
    f"embeddings_2d has {len(embeddings_2d)} rows but df_top has {len(df_top)}"
)

fig, ax = plt.subplots(figsize=(8, 6))

# Position-aligned boolean mask: True where a top flow is also in df_selected
# (matched on index labels, as the per-row membership test did before).
is_selected = df_top.index.isin(df_selected.index)

# (mask, legend label, colour, alpha, zorder) -- one scatter call per group
# instead of one per point; empty groups are skipped so that they do not add
# a legend entry.
point_groups = (
    (is_selected, "Selected flow", "limegreen", 1.0, 1),
    (~is_selected, "Rejected flow", "gray", 0.9, 0),
)
for mask, label, color, alpha, zorder in point_groups:
    if not mask.any():
        continue
    points = embeddings_2d[mask]
    ax.scatter(
        points[:, 0],
        points[:, 1],
        c=color,
        alpha=alpha,
        s=150,
        label=label,
        zorder=zorder,
    )

# Write each flow's "front" value centred on its marker.
for (x, y), front_value in zip(embeddings_2d, df_top["front"]):
    ax.text(x, y, str(front_value), fontsize=9, ha="center", va="center")

if SHOW_TITLE:
    # Report the number of flows actually selected rather than the requested
    # NUM_REQUIRED, which could differ if the selection returned fewer rows.
    ax.set_title(
        f"Selected {len(df_selected)} out of {len(df_top)} Top-Flows Using Clusters of {embedding_model}-Embeddings"
    )
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")

# t-SNE coordinates carry no interpretable scale, so hide the ticks.
ax.set_xticks([])
ax.set_yticks([])

ax.legend(loc="upper left")
fig.tight_layout()
# Save a raster and a vector copy of the same figure.
for extension in ("png", "pdf"):
    fig.savefig(RESULTS_DIR / f"transfer-learning--selected-flows.{extension}", dpi=300)
plt.show()