In [None]:
# src/antibody_bo/pipeline/bo_notebook_runner.py
from __future__ import annotations
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from dataclasses import dataclass

# `sys` is reached through the `os` module's namespace here; it is the regular
# standard-library `sys` module.
# NOTE(review): `import sys` is the conventional spelling — consider switching.
from os import sys
# Add the relative `../src/` directory to the import search path. The path is
# resolved against the current working directory, so the project-local imports
# below presumably only resolve when the notebook runs from a sibling of `src/`.
sys.path.append('../src/')

from utils.utils import load_or_compute_embeddings, load_or_compute_developability, toy_ground_truth
from models.mf_gp_model import MultiFidelityGP
from models.gp_model import DevelopabilityGP
from acquisition.acq_utils import make_acq
from acquisition.mutate_seq import hill_climb, genetic_algorithm, gibbs_sampling
from embeddings.esm_encoder import embed_sequences, embed_single
from developability.dev_score import score_sequences


# Numeric value associated with each fitness column name.
# NOTE(review): presumably a fidelity/reliability level per measurement tier —
# confirm. Nothing in the visible code reads this mapping; `run_bo` encodes
# fidelities as the integer codes 0/1/2 instead.
FID_MAP = {"y_low": 0.3, "y_medium": 0.7, "y_high": 0.95}

In [None]:
def _propose_candidates(cfg, seed_seq, fit_one, n_default):
    """Run the sequence optimiser selected by ``cfg.seq_opt`` starting from ``seed_seq``.

    ``"hc"`` dispatches to ``hill_climb``, ``"ga"`` to ``genetic_algorithm`` and any
    other value to ``gibbs_sampling``. ``n_default`` is used as the proposal count
    when ``cfg.seq_proposals`` is unset (``None``/0). The optimiser's return value
    is passed through unchanged; the caller feeds it to ``pd.DataFrame``.
    """
    if cfg.seq_opt == "hc":
        return hill_climb(
            seed_seq,
            fit_one,
            local_k2_samples=cfg.seq_proposals or n_default,
            restarts=1,
        )
    if cfg.seq_opt == "ga":
        return genetic_algorithm(
            [seed_seq],
            fit_one,
            max_gen=cfg.ga_generations or 100,
            pop_size=cfg.seq_proposals or n_default,
        )
    return gibbs_sampling(
        seed_seq,
        fit_one,
        gamma=cfg.gamma,
        iters=cfg.gibbs_iters or 10,
    )


def run_bo(
    cfg,
    raw_json: str | Path,
    embed_file: str | Path,
    dev_file: str | Path,
    dev_dir: str | Path,
    ground_truth_fn,
):
    """Run the multi-fidelity Bayesian-optimisation loop over sequences.

    Parameters
    ----------
    cfg
        Configuration object; the attributes read here are ``embed_components``,
        ``n_iter``, ``batch_k``, ``acq``, ``xi``, ``kappa``, ``dev_weight``,
        ``seq_opt``, ``seq_proposals``, ``ga_generations``, ``gamma`` and
        ``gibbs_iters``.
    raw_json, embed_file
        Forwarded to ``load_or_compute_embeddings``. The returned table must
        contain the columns ``sequence``, ``pca_embed``, ``y_low``, ``y_medium``,
        ``y_high`` and ``dev_score``.
    dev_file
        Currently unused: the developability caching step is disabled, so
        ``dev_score`` has to be present in the embedding table already.
    dev_dir
        Directory handed to ``score_sequences`` when a selected sequence is scored.
    ground_truth_fn
        Callable invoked with a ``(1, d)`` embedding; element ``[0]`` of its
        result is recorded as the high-fidelity measurement.

    Returns
    -------
    (df_all, y)
        ``df_all`` holds the seed rows (``iter == 0``; their ``y_high`` and
        ``dev_score`` are blanked to NaN) followed by one row per selected
        candidate. ``y`` is the ``.y`` attribute of the final fitness GP.

    Raises
    ------
    KeyError
        If the embedding table has no ``dev_score`` column.

    Side effects
    ------------
    After every iteration the selected rows are written as JSON lines to
    ``../data/processed/cd98_10k/cd98_vs_interim_1024_low.jsonl``.
    """
    from functools import lru_cache

    # STEP 1: embeddings with cache
    print("→ Embedding sequences…")
    df = load_or_compute_embeddings(
        raw_json,
        embed_file,
        seq_col="sequence",
        embed_col="pca_embed",
        embed_fn=lambda d: embed_sequences(d, n_components=cfg.embed_components),
    )

    df["seq_id"] = list(range(len(df)))
    next_seq_id = len(df)

    # STEP 2 (developability with cache) is disabled; fail early with a clear
    # message instead of an anonymous KeyError further down.
    if "dev_score" not in df.columns:
        raise KeyError(
            "'dev_score' column missing from the embedding table; "
            "developability scores must be pre-computed"
        )

    # Scale developability scores onto [0, 1]. The offset/span are kept so that
    # scores measured during the loop can be put on the same scale later.
    dev_vals = df["dev_score"].to_numpy(dtype=float)
    mask_dev = ~np.isnan(dev_vals)
    dev_min, dev_span = 0.0, 1.0
    if mask_dev.any():
        observed = dev_vals[mask_dev]
        dev_min = float(observed.min())
        span = float(observed.max()) - dev_min
        # A constant column would divide by zero; fall back to a pure shift.
        dev_span = span if span > 0 else 1.0
        df.loc[mask_dev, "dev_score"] = (observed - dev_min) / dev_span

    # STEP 3: prepare original training data
    print("→ Initializing GP models…")
    X0 = np.vstack(df["pca_embed"].values).astype(float)
    Y0 = df[["y_low", "y_medium", "y_high"]].values.astype(float)
    n = len(df)
    # np.repeat / ravel interleave the data as (seq0: low, medium, high,
    # seq1: low, ...), so the fidelity codes (0=low, 1=medium, 2=high) must
    # cycle per row rather than being laid out in three blocks of length n.
    F0 = np.tile(np.arange(3, dtype=float), n)
    X_seed_mf = np.repeat(X0, 3, axis=0)
    y_seed_mf = Y0.ravel()

    mf_gp = MultiFidelityGP(X_seed_mf, y_seed_mf, F0, [0, 1, 2])
    print("  ✔ Fitness MF_GP Model ready.")

    X_dev_seed = X0[mask_dev]
    y_dev_seed = df["dev_score"].to_numpy(dtype=float)[mask_dev]
    dev_gp = DevelopabilityGP(X_dev_seed, y_dev_seed)
    print("  ✔ Developability GP model ready.")

    # STEP 4: caching helpers
    @lru_cache(maxsize=200_000)
    def embed_cached(seq: str) -> np.ndarray:
        return embed_single(seq, cfg.embed_components)

    @lru_cache(maxsize=200_000)
    def fit_one(seq: str) -> float:
        # Reads the *current* mf_gp binding; the cache is cleared on every refit.
        return float(mf_gp.predict(embed_cached(seq).reshape(1, -1))[0])

    # Output container: seed rows are flagged as selected so they are never
    # proposed again; their measurements are blanked in this log table.
    df_all = df.assign(
        iter=0, batch=0, selected=True,
        y_high=np.nan, dev_score=np.nan
    )
    seed_seq = df.sequence.iloc[int(np.argmax(Y0[:, 2]))]
    selected_seqs = set(df_all.loc[df_all["selected"], "sequence"])

    processed_path = Path("../data/processed/cd98_10k/cd98_vs_interim_1024_low.jsonl")
    # Create the target directory up front so a missing folder cannot abort the
    # run after the first (expensive) iteration.
    processed_path.parent.mkdir(parents=True, exist_ok=True)

    # STEP 5: BO loop
    for it in tqdm(range(cfg.n_iter), desc="BO Iterations"):

        new_rows: list[pd.DataFrame] = []
        last_selected = None
        # The fitness GP only changes between iterations, so the acquisition
        # function is built once per iteration instead of once per batch.
        acq_fn = make_acq(cfg.acq, mf_gp, mf_gp.y.max(), xi=cfg.xi, kappa=cfg.kappa)

        for b in tqdm(range(cfg.batch_k), desc=f" Batches (iter {it+1})", leave=False):

            # 1) generate & log candidates
            recs = _propose_candidates(cfg, seed_seq, fit_one, len(df))
            batch_df = pd.DataFrame(recs)
            if batch_df.empty or "sequence" not in batch_df.columns:
                continue
            batch_df["seq_id"] = list(range(next_seq_id, next_seq_id + len(batch_df)))
            next_seq_id += len(batch_df)

            # 2) pick one via acquisition, skipping everything selected so far
            #    (including picks made earlier in this same iteration)
            unique_seqs = [s for s in batch_df["sequence"].unique() if s not in selected_seqs]
            if not unique_seqs:
                continue
            embs = np.vstack([embed_cached(s) for s in unique_seqs])
            f_acq = acq_fn(embs)

            # normalize acquisition scores to [0,1] so they are commensurable
            # with the [0,1]-scaled developability prediction
            f_min, f_max = f_acq.min(), f_acq.max()
            if f_max > f_min:
                f_norm = (f_acq - f_min) / (f_max - f_min)
            else:
                # all values identical → no preference from the acquisition
                f_norm = np.zeros_like(f_acq)

            dev_pred = dev_gp.predict(embs)
            combo = (1 - cfg.dev_weight) * f_norm + cfg.dev_weight * dev_pred
            idx = int(np.argmax(combo))
            seq_sel = unique_seqs[idx]
            x_sel = embs[idx].reshape(1, -1)

            # idx indexes unique_seqs, not batch_df: look the row up by sequence
            # and keep only its first occurrence.
            first_row = batch_df.index[(batch_df["sequence"] == seq_sel).to_numpy()][0]
            seq_id_sel = int(batch_df.at[first_row, "seq_id"])

            # 3) measure true values
            y_sel = ground_truth_fn(x_sel)[0]
            dev_true = score_sequences([seq_sel], [seq_id_sel], dev_dir=Path(dev_dir))[0]

            # 4) annotate the single selected row
            sel_row = batch_df.loc[[first_row]].copy()
            sel_row["pca_embed"] = [embed_cached(seq_sel).tolist()]
            sel_row["y_high"] = y_sel
            sel_row["dev_score"] = dev_true  # logged unscaled
            sel_row["iter"] = it + 1
            sel_row["batch"] = b + 1
            sel_row["selected"] = True

            new_rows.append(sel_row)
            selected_seqs.add(seq_sel)
            last_selected = seq_sel

        # 5) append & save interim JSON
        if new_rows:
            df_all = pd.concat([df_all, *new_rows], ignore_index=True)
        df_all[df_all["selected"]].to_json(processed_path, orient="records", lines=True)
        if not new_rows:
            # Nothing new was measured: models and seed stay as they are.
            continue

        # 6) rebuild GPs: seed data + every candidate measured so far.
        #    Seed rows (iter == 0) are excluded here because their y_high /
        #    dev_score are NaN in df_all and they are already part of X0 / Y0.
        picked = df_all[df_all["selected"] & (df_all["iter"] > 0)]
        X_new = np.vstack(picked["pca_embed"].tolist()).astype(float)
        y_new = picked["y_high"].to_numpy(dtype=float)
        dev_new = (picked["dev_score"].to_numpy(dtype=float) - dev_min) / dev_span

        fit_ok = np.isfinite(y_new)
        X_mf = np.vstack([X_seed_mf, X_new[fit_ok]])
        y_mf = np.concatenate([y_seed_mf, y_new[fit_ok]])
        F_mf = np.concatenate([F0, np.full(int(fit_ok.sum()), 2.0)])
        mf_gp = MultiFidelityGP(X_mf, y_mf, F_mf, [0, 1, 2])
        # Cached fitness predictions belong to the previous model.
        fit_one.cache_clear()

        # developability GP is trained on developability scores (same scale as
        # the seed scores), not on the fitness measurements
        dev_ok = np.isfinite(dev_new)
        dev_gp = DevelopabilityGP(
            np.vstack([X_dev_seed, X_new[dev_ok]]),
            np.concatenate([y_dev_seed, dev_new[dev_ok]]),
        )

        # 7) next seed
        seed_seq = last_selected

    print("→ Bayesian optimization complete.")
    return df_all, mf_gp.y


In [None]:
@dataclass
class PipelineConfig:
    """Tunable settings for one optimisation run.

    Field order is part of the positional constructor signature, so new
    fields should be appended rather than inserted.
    """

    # --- acquisition / candidate generation ---
    acq: str = "ei"                     # acquisition function identifier
    seq_opt: str = "gs"                 # sequence optimiser identifier
    seq_proposals: int | None = None    # proposal count; None = caller's fallback
    dev_weight: float = 0.3             # weight of the developability term
    xi: float = 0.1                     # acquisition parameter `xi`
    kappa: float = 0.0                  # acquisition parameter `kappa`

    # --- loop sizes ---
    n_iter: int = 4                     # number of outer iterations
    batch_k: int = 50                   # batches per iteration

    # --- embedding ---
    embed_components: int = 1024        # number of embedding components
    bounds_scale: float = 1.0

    # --- optimiser-specific knobs ---
    gamma: float = 10.0                 # `gamma` for the "gs" optimiser
    gibbs_iters: int = 5                # iteration count for the "gs" optimiser
    ga_generations: int | None = None   # generation count for "ga"; None = fallback


if __name__ == "__main__":
    # Run with the default settings; override PipelineConfig fields here to tune.
    cfg = PipelineConfig()

    # raw_json and embed_file point at the same JSONL file in this run.
    df_all, fitness_history = run_bo(
        cfg=cfg,
        raw_json="../data/interim/cd98_10k/cd98_biophi_1024.jsonl",
        embed_file="../data/interim/cd98_10k/cd98_biophi_1024.jsonl",
        dev_file="../data/interim/cd98_10k/cd98_biophi_new.jsonl",
        dev_dir="../data/biophi/cd98_10k",
        ground_truth_fn=toy_ground_truth,
    )

    # Persist the full table returned by run_bo as JSON lines (one record per row).
    df_all.to_json(
        path_or_buf="../data/processed/cd98_10k/cd98_final_1024_low.jsonl",
        orient="records",
        lines=True,
    )

→ Embedding sequences…
Embeddings ready: 64 / 64
→ Computing developability…
Developability ready: 64 / 64
→ Initializing GP models…


  check_min_max_scaling(
  check_standardization(Y=train_Y, raise_on_fail=raise_on_fail)


  ✔ GP models ready.


BO Iterations:   0%|          | 0/2 [00:00<?, ?it/s]
[A
[A
Gibbs iterations: 100%|██████████| 3/3 [00:27<00:00,  9.19s/it]
  check_min_max_scaling(
  check_standardization(Y=train_Y, raise_on_fail=raise_on_fail)

[A
[A
[A
Gibbs iterations: 100%|██████████| 3/3 [00:40<00:00, 13.38s/it]
  check_min_max_scaling(
  check_standardization(Y=train_Y, raise_on_fail=raise_on_fail)
BO Iterations:  50%|█████     | 1/2 [01:09<01:09, 69.70s/it]
[A
[A
[A
Gibbs iterations: 100%|██████████| 3/3 [00:33<00:00, 11.31s/it]
  check_min_max_scaling(
  check_standardization(Y=train_Y, raise_on_fail=raise_on_fail)

[A
[A
[A
Gibbs iterations: 100%|██████████| 3/3 [00:37<00:00, 12.50s/it]
  check_min_max_scaling(
  check_standardization(Y=train_Y, raise_on_fail=raise_on_fail)
BO Iterations: 100%|██████████| 2/2 [02:23<00:00, 71.79s/it]

→ Bayesian optimization complete.





'@dataclass\nclass PipelineConfig:\n    acq: str = "ei"\n    seq_opt: str = "gs"\n    seq_proposals: int | None = None\n    dev_weight: float = 0.3\n    xi: float = 0.01\n    kappa: float = 2.0\n    n_iter: int = 1\n    batch_k: int = 1\n    embed_components: int = 64\n    bounds_scale: float = 1.0\n\ncfg = PipelineConfig()\n\ndf_all, fitness_history = run_bo(cfg, "../data/raw/cd98_64_seq.json", "../data/interim/cd98_64_embed.jsonl", "../data/interim/cd98_64_biophi.jsonl", toy_ground_truth)\ndf_all.to_json("../data/processed/cd98_64_2.jsonl", orient="records", lines=True,)'