In [20]:
import sys
python = f"{sys.executable}"

!{python} -m pip install --force-reinstall fair-esm rdkit

Collecting fair-esm
  Using cached fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Collecting rdkit
  Downloading rdkit-2025.9.3-cp312-cp312-win_amd64.whl.metadata (4.3 kB)
Collecting numpy (from rdkit)
  Using cached numpy-2.3.5-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting Pillow (from rdkit)
  Downloading pillow-12.0.0-cp312-cp312-win_amd64.whl.metadata (9.0 kB)
Using cached fair_esm-2.0.0-py3-none-any.whl (93 kB)
Downloading rdkit-2025.9.3-cp312-cp312-win_amd64.whl (23.7 MB)
   ---------------------------------------- 0.0/23.7 MB ? eta -:--:--
   ------- -------------------------------- 4.2/23.7 MB 42.3 MB/s eta 0:00:01
   ----------------------- ---------------- 13.6/23.7 MB 45.2 MB/s eta 0:00:01
   -------------------------------------- - 23.1/23.7 MB 45.6 MB/s eta 0:00:01
   ---------------------------------------- 23.7/23.7 MB 42.9 MB/s  0:00:00
Using cached numpy-2.3.5-cp312-cp312-win_amd64.whl (12.8 MB)
Downloading pillow-12.0.0-cp312-cp312-win_amd64.whl (7.0 MB)
   

  You can safely remove it manually.
  You can safely remove it manually.


In [1]:
import os

import numpy as np
import pandas as pd
import torch, esm
from rdkit import Chem
from rdkit.Chem import AllChem

# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)

if not cuda_ok:
    device = torch.device("cpu")
else:
    # Report which GPU (index 0) will be used.
    print("Using:", torch.cuda.get_device_name(0))
    device = torch.device("cuda")

CUDA available: True
Using: NVIDIA RTX PRO 6000 Blackwell Workstation Edition


# Embedding Functions


In [2]:
# Load the pretrained ESM checkpoint `esm2_t33_650M_UR50D` together with its
# alphabet, and build the converter that turns (label, sequence) pairs into
# token tensors.
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()

# Move the weights to the selected device and switch to evaluation mode
# (`eval()` returns the module itself, so the calls can be chained).
model = model.to(device).eval()

In [3]:
def esm_embed(seq: str) -> tuple[torch.Tensor, torch.Tensor]:
    """Embed a single protein sequence with the module-level ESM model.

    Relies on the notebook globals ``batch_converter``, ``model`` and
    ``device``.

    Args:
        seq: Amino-acid sequence as a plain string.

    Returns:
        A ``(cls, residue)`` pair taken from the layer-33 representations of
        the only sequence in the batch:

        * ``cls`` is the vector at token position 0.
        * ``residue`` holds every position except the first and the last.

        For the 35-residue example run in this notebook the shapes are
        ``(1280,)`` and ``(35, 1280)``. No gradients are tracked.
    """
    # The converter is given a (label, sequence) pair; the label is not used
    # afterwards, so the sequence is passed as its own label.
    _, _, tokens = batch_converter([(seq, seq)])
    tokens = tokens.to(device)

    with torch.no_grad():
        res = model(tokens, repr_layers=[33], return_contacts=False)

    # Index 0 selects the single sequence of this one-element batch.
    representations = res["representations"][33][0]

    # Strip the first and last token positions, presumably the special
    # start/end tokens added by the alphabet -- TODO confirm.
    residue = representations[1:-1]
    cls = representations[0]
    return cls, residue

def morgan_fingerprint(smile: str, radius: int = 2, nBits: int = 2048) -> np.ndarray:
    """Compute a Morgan bit-vector fingerprint for a SMILES string.

    Args:
        smile: SMILES representation of the molecule.
        radius: Morgan radius passed to RDKit.
        nBits: Length of the bit vector.

    Returns:
        A 1-D NumPy array of length ``nBits`` built from the RDKit bit vector.

    Raises:
        ValueError: If RDKit cannot parse ``smile`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of deep inside the fingerprint call.
        raise ValueError(f"Could not parse SMILES string: {smile!r}")
    return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits))

# Smoke test: run both featurizers once on small inputs, then show the
# resulting shapes.
cls, residue = esm_embed("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAN")
fingerprint = morgan_fingerprint("C1CCCCC1")

print(cls.shape, residue.shape)
print(fingerprint.shape)

torch.Size([1280]) torch.Size([35, 1280])
(2048,)




# Training Set (Raw & Embed)

In [4]:

def to_np(x):
    """Convert a torch tensor to a NumPy array; pass anything else through.

    Tensors are detached from the autograd graph and moved to the CPU before
    conversion. Non-tensor inputs are returned unchanged (same object).
    """
    return x.detach().cpu().numpy() if isinstance(x, torch.Tensor) else x

boltz = pd.read_csv("../boltz_affinity_results.csv")

iml = pd.read_csv("../iml1515_ddg_global_fba_results.csv")


def _mean_reaction_flux(encoded: str) -> float:
    """Average the numeric values of a ``;``-separated list of ``<key>=<value>`` entries."""
    entries = encoded.split(";")
    total = 0
    for entry in entries:
        # The number is the part after the first "=".
        total += float(entry.split("=")[1])
    return total / len(entries)


# Lookup table: "<protein_id>$<variant_id>" -> mean of that row's reaction fluxes.
fluxes = {}
for _, row0 in iml.iterrows():
    fluxes[row0["protein_id"] + "$" + row0["variant_id"]] = _mean_reaction_flux(
        row0["reaction_fluxes"]
    )


master = pd.read_csv("../master_dataset.csv")


# Human-readable records (sequences, ligand SMILES, target) for each kept row.
raw_set = []

# Parallel feature lists: index k of each list describes the same sample.
X_wt = []   # embedding vector (first element of esm_embed) of the wild-type sequence
X_mut = []  # embedding vector (first element of esm_embed) of the mutant sequence
X_lig = []  # Morgan fingerprint of the ligand as float32
y = []      # mean flux looked up in `fluxes`

# Wild-type embeddings keyed by sequence, so a sequence that appears in
# several rows is only run through the model once. Entries that share a
# sequence share the same array object; treat the arrays as read-only.
wt_embeddings = {}

# Every row is processed (no early exit), so the saved set covers all of
# `master` rather than just its first usable row.
for _, row in master.iterrows():
    wt = row["original_sequence"]
    mut = row["protein_sequence"]
    ligand = row["ligand_smiles"]

    # Same "<protein_id>$<variant_id>" key format used when building `fluxes`.
    flux_key = row["protein_id"] + "$" + row["variant_id"]
    if flux_key not in fluxes:
        print(f"Missing flux for {row['protein_id']} {row['variant_id']}")
        continue
    flux = fluxes[flux_key]

    raw_set.append({
        "wildtype": wt,
        "mutant": mut,
        "ligand": ligand,
        "flux": flux
    })

    if wt not in wt_embeddings:
        wt_embeddings[wt] = to_np(esm_embed(wt)[0])

    X_wt.append(wt_embeddings[wt])
    X_mut.append(to_np(esm_embed(mut)[0]))
    X_lig.append(morgan_fingerprint(ligand).astype(np.float32))
    y.append(flux)


# Create the output directory up front: neither `to_csv` nor `torch.save`
# creates missing parent directories.
os.makedirs("./data", exist_ok=True)

# Human-readable copy of the samples that made it into the training set.
raw_df = pd.DataFrame(raw_set)
raw_df.to_csv("./data/raw_set.csv", index=False)

# NOTE(review): the values are Python lists of NumPy arrays / floats, not
# tensors; loading this file may require `weights_only=False` on recent
# PyTorch versions -- confirm against the consumer of this file.
torch.save({
    "wildtype": X_wt,
    "mutant": X_mut,
    "ligand": X_lig,
    "flux": y
}, "./data/training_set.pt")




