In [None]:
# Cell 1: Imports & ground_truth
import pandas as pd
import numpy as np

import sys
sys.path.append('../src/')
from embeddings.esmc_encoder import embed_sequences, embed_single

def ground_truth(x: np.ndarray) -> np.ndarray:
    """
    Evaluate the toy objective on a batch of points.

    Each row is scored as the sum of squared offsets from 0.5 plus
    ``sin(5 * row_sum)``; the sine term makes the surface nonconvex.
    This is a placeholder: swap in the real assay when it is available.

    Args:
        x: array of shape (N, D), one point per row.

    Returns:
        Array of shape (N,) with one objective value per row of ``x``.
    """
    offsets = x - 0.5
    bowl = np.sum(offsets ** 2, axis=1)      # quadratic term, per row
    ripple = np.sin(5 * np.sum(x, axis=1))   # oscillation driven by the row sum
    return bowl + ripple

In [None]:
# Cell 2: Load your sequences JSON
# Assumes: one record per line (JSON Lines), with at least a "sequence" field.
df = pd.read_json("../data/raw/input_sequences.json", lines=True)

# Fail fast: later cells pass seq_col="sequence" to the embedder and index
# df["sequence"], so a missing column should surface here rather than after
# the slow embedding step.
if "sequence" not in df.columns:
    raise KeyError(
        'input_sequences.json has no "sequence" column; '
        f"found columns: {list(df.columns)}"
    )
df.head(3)

In [3]:
# Cell 3: Embed all sequences
# Slow step: the recorded run below took roughly 11 minutes for 2000 sequences.
# NOTE(review): the next cell reads df["embedding_pca1024"]; presumably that column
# name is derived from n_components=1024 — confirm in esmc_encoder before changing it.
df = embed_sequences(df, seq_col="sequence", n_components=1024)

ESM-C embedding:   0%|          | 0/2000 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

  state_dict = torch.load(
ESM-C embedding: 100%|██████████| 2000/2000 [11:17<00:00,  2.95it/s]


In [4]:
# Stack the per-row vectors of the PCA embedding column into one float32 matrix.
pca_rows = df["embedding_pca1024"].values
X = np.vstack(pca_rows).astype(np.float32)

In [5]:
# Cell 4: Compute noiseless ground truth
# y_true holds one ground_truth value per row of X; noise is added to it in the next cell.
# NOTE(review): ground_truth as defined in this notebook cannot return less than -1
# (a sum of squares plus a sine), yet the saved output of the last cell shows values
# near -270. The stored outputs look stale — restart the kernel and re-run all cells.
y_true = ground_truth(X)

In [6]:
# Cell 5: Add three fidelity levels via noise
# e.g. low fidelity = heavy noise, high fidelity = light noise
NOISE_SEED = 0  # fixed seed so the saved dataset is identical on every Restart & Run All
rng = np.random.default_rng(NOISE_SEED)

# Column name -> standard deviation of the zero-mean Gaussian noise added to y_true.
noise_levels = {
    "y_low": 0.3,      # large noise
    "y_medium": 0.1,   # moderate noise
    "y_high": 0.01,    # small noise
}
for col, sigma in noise_levels.items():
    # Independent draw per column, one noise value per row.
    df[col] = y_true + rng.normal(0.0, sigma, size=len(y_true))

# Remove every embedding_* column from df; X was already extracted from it earlier.
drop_cols = [col for col in df.columns if col.startswith("embedding_")]
df = df.drop(columns=drop_cols)

In [7]:
# Cell 6: Quick check & save
# Show the first rows of the sequence and the three noisy targets, then write
# the frame out as JSON Lines (one record per line).
preview_cols = ["sequence", "y_low", "y_medium", "y_high"]
print(df[preview_cols].head())
df.to_json(
    "../data/raw/cd98_2000_seq.jsonl",
    orient="records",
    lines=True,
)

                                            sequence       y_low    y_medium  \
0  EVQLVESGGGLVQPGGSLRLSCAASGFTFKSYAMDWVRQAPGKQRE... -267.424127 -267.561455   
1  EVQLVESGGGLVQPGGSLRLSCAASGFTFKSYAMDWVRQAPGKQRE... -267.612351 -267.202450   
2  EVQLVESGGGLVQPGGSLRLSCAASGFTFKSYAMDWVRQAPGKQRE... -270.539386 -270.320750   
3  EVQLVESGGGLVQPGGSLRLSCAASGFTFKSYAMDWVRQAPGKQRE... -274.479366 -274.162201   
4  EVQLVESGGGLVQPGGSLRLSCAASGFTFKSYAMDWVRQAPGKQRE... -274.999357 -274.873697   

       y_high  
0 -267.465345  
1 -267.164703  
2 -270.202944  
3 -274.071817  
4 -274.816321  
