In [1]:
import os, sys 
import gzip
import shutil
import tempfile
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm
import requests

import torch
import esm
from esm.inverse_folding.util import (
    load_structure,
    extract_coords_from_structure,
    get_encoder_output,
    CoordBatchConverter
)

import training_utils.partitioning_utils as pat_utils


warnings.simplefilter(action="ignore", category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
modified_to_canonical = {
    # A
    '2AS': 'ASP', '3AH': 'HIS', '4BF': 'PHE', '5HP': 'GLU', '5OW': 'LYS',
    '8LJ': 'LEU',  "05O": "PRO", "6M6": "MET", "9MN": "MET", "6CW": "TRP",
    "2CO" : "CYS", "6FL" : "PHE", "2MR": "ARG", "02A": "SER", "7ID": "TYR",
    "5CT": "CYS", "4MM": "MET",
    
    'ACL': 'ARG', 'AGM': 'ARG', 'AIB': 'ALA', 'ALM': 'ALA', 'ALO': 'THR',
    'ALY': 'LYS', 'ARM': 'ARG', 'ASA': 'ASP', 'ASB': 'ASP', 'ASK': 'ASP',
    'ASL': 'ASP', 'ASQ': 'ASP', 'AYA': 'ALA', 'AZK': 'LYS', "A8E": "ALA",
    "ACA": "ALA", "AAR": "ARG", "AME" : "ALA", "API": "ILE",

    # B
    'BCS': 'CYS', 'BHD': 'ASP', 'B3A': 'ALA', 'B3E': 'GLU', 'B3K': 'LYS',
    'B3S': 'SER', 'B3X': 'TRP', 'B3Y': 'TYR', 'BE2': 'GLU', "B3M" : "MET",
    'BNN': 'ALA', 'BUG': 'LEU', "BFD": "ASP", "B3T": "THR", 'BIL' : 'ILE',
    "B3L": "LEU", "DBB": "ASN",

    # C
    'C5C': 'CYS', 'C6C': 'CYS', 'CAS': 'CYS', 'CSD': 'CYS',
    'CSO': 'CYS', 'CSP': 'CYS', 'CSS': 'CYS', 'CSU': 'CYS',
    'CSW': 'CYS', 'CSX': 'CYS', 'CY1': 'CYS', 'CY3': 'CYS',
    'CYG': 'CYS', 'CYM': 'CYS', 'CYQ': 'CYS', 'CXM': 'MET',
    'CME': 'CYS', 'CSA': 'CYS', 'CXM': 'MET', "CCS": "CYS",
    "CSK": "LYS", "CMT": "CYS",

    # D
    'DAH': 'PHE', 'DAL': 'ALA', 'DAR': 'ARG', 'DAS': 'ASP', 
    'DGL': 'GLU', 'DGN': 'GLN', 'DHA': 'ALA', 'DHI': 'HIS',
    'DIL': 'ILE', 'DIV': 'VAL', 'DLE': 'LEU', 'DLY': 'LYS',
    'DNP': 'ALA', 'DPN': 'PHE', 'DPR': 'PRO', 'DSN': 'SER',
    'DSP': 'ASP', 'DTH': 'THR', 'DTR': 'TRP', 'DTY': 'TYR',
    'DVA': 'VAL', 'DV7': 'ASP', "D0Q" : "ASP", 'DCY': 'CYS',
    "DAM": "ASP", "DBB": "ASN",

    # E
    'EFC': 'CYS', "EO2": "GLU",

    # F
    'FLA': 'ALA', 'FME': 'MET', "F2Y": "TYR", "FY3": "TYR",
    "FTR": "TRP",    

    # G
    'GGL': 'GLU', 'GL3': 'GLY', 'GLZ': 'GLY', 'GMA': 'GLU',
    'GSC': 'GLY',

    # H
    'HAC': 'ALA', 'HAR': 'ARG', 'HIC': 'HIS', 'HIP': 'HIS',
    'HMR': 'ARG', 'HP9': 'HIS', 'HPQ': 'PHE', 'HTR': 'TRP',
    'HYP': 'PRO', "HCS": "CYS", "HZP": "PRO",

    # I
    'IAS': 'ASP', 'IIL': 'ILE', 'IYR': 'TYR', "I2F" : "PHE",

    # K
    'KCX': 'LYS', 'KPI': 'LYS', 'KYN': 'TRP', "KFP": "LYS",
    "KYQ" : "LYS", "KGC": "LYS", "KHB": "LYS", "KCR": "LYS", 

    # L
    'LLP': 'LYS', 'LLY': 'LYS', 'LPS': 'SER', "LCK" : "LYS",
    'LTR': 'TRP', 'LYM': 'LYS', 'LYR': 'LYS', 'LYZ': 'LYS',
    'LDH': 'LEU', 'LPD': 'LYS',

    # M
    'MAA': 'ALA', 'MEN': 'ASN',   # Your earlier MEN=LYS was incorrect; MEN is ASN-derivative
    'MHS': 'HIS', 'MIS': 'SER', 'MK8': 'LEU', 'MLE': 'LEU',
    'MLY': 'LYS', 'MLZ': 'LYS', 'MPQ': 'GLY', 'MSA': 'GLY',
    'MSE': 'MET', 'MHO': 'MET', 'MVA': 'VAL', "6M6": "MET",
    "MSO": "MET", "MLL": "LEU", "M3L": "LYS", "M0H": "MET", 

    # N
    'NEM': 'HIS', 'NEP': 'HIS', 'NLE': 'LEU', 'NLN': 'LEU',
    'NLP': 'LEU', 'NMC': 'GLY', 'NFA': 'PHE', 'SNN': 'ASN',
    "NIY": "TYR", "N7P": "PRO",

    # O
    'OAS': 'SER', 'OCS': 'CYS', 'OMT': 'MET', 'ONL': 'LYS',
    '0W6': 'TRP', "ORN": "LYS", "OCY" : "CYS", "ONH": "ASN",

    # P
    'PAQ': 'TYR', 'PCA': 'GLU', 'PEC': 'CYS', 'PHI': 'PHE',
    'PHL': 'PHE', 'PR3': 'CYS', 'PRR': 'ALA', 'PTR': 'TYR',
    'PVO': 'PRO', 'PYX': 'CYS', "PHD": "ASP", "P1L": "PRO",

    # Q
    "QPA": "PHE", "QCS": "CYS", 

    # R
    'RPI': 'ARG', "RGP": "ARG",

    # S
    'SAC': 'SER', 'SAR': 'SER', 'SCH': 'CYS', 'SCS': 'CYS',
    'SCY': 'CYS', 'SEL': 'SER', 'SEP': 'SER', 'SET': 'SER',
    'SHC': 'CYS', 'SHR': 'LYS', 'SMC': 'CYS', 'SME': 'MET',
    'SOC': 'CYS', 'STY': 'TYR', 'SVA': 'SER', 'SNC': 'CYS',
    'S2C': 'CYS', "SAH": "ALA", "SEB": "SER", "SVV": "VAL",
    

    # T
    'TIH': 'ALA', 'TPL': 'TRP', 'TPO': 'THR', 'TPQ': 'ALA',
    'TRG': 'LYS', 'TRO': 'TRP', 'TSY': 'TYR', 'TYS': 'TYR',
    'TYB': 'TYR', 'TYI': 'TYR', 'TYQ': 'TYR', 'TYY': 'TYR',
    "TRQ": "TRP", "TY2": "TYR", "TYE": "TYR", "TYC": "TYR",
    'THC' : 'THR',

    # Y
    'YCM': 'CYS',

    # V
    "VLM": "VAL",

    # X
    "X2W": "TRP",

    # Z
    'ZBZ': 'ALA', "ZIQ": "GLN",
}

In [6]:
def download_pdb(pdb_id: str, out_dir: str) -> str:
    """
    Download PDB file (.pdb) from RCSB if not already cached.
    Returns path to the downloaded file.
    """
    pdb_id = pdb_id.lower()
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{pdb_id}.pdb.gz")

    if os.path.exists(out_path):
        return out_path

    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    # print(f"Downloading {pdb_id} from RCSB...", file=sys.stderr)

    r = requests.get(url)
    if r.status_code != 200:
        raise FileNotFoundError(f"Failed to download {pdb_id} from RCSB")

    # Save gzipped version for compatibility with your loader
    with gzip.open(out_path, "wb") as f:
        f.write(r.content)

    return out_path

def index_pdb_files(root_dir: str):
    """
    Walk the PDB mirror and build:
        pdb_id (4-letter, lowercase) -> full path to .pdb.gz

    Adjust filename parsing if your mirror uses a different naming scheme.
    """
    pdb_index = {}

    for dirpath, dirnames, filenames in os.walk(root_dir):
        for fn in filenames:
            fn_lower = fn.lower()
            if not fn_lower.endswith(".pdb.gz"):
                continue

            # strip ".pdb.gz"
            stem = fn_lower[:-7]

            # Common naming patterns:
            # - "1abc.pdb.gz"           -> pdb_id = "1abc"
            # - "pdb1abc.ent.gz" etc.   -> adjust below if needed
            if stem.startswith("pdb") and len(stem) >= 7:
                # e.g., "pdb1abc" -> "1abc"
                pdb_id = stem[3:7]
            else:
                # assume first 4 chars are the PDB id
                pdb_id = stem[:4]

            pdb_index[pdb_id] = os.path.join(dirpath, fn)

    return pdb_index


def load_structure_from_gz(path_gz: str, chain_id: str):
    """
    Decompress a .pdb.gz to a temporary file, load with esm's load_structure,
    and return the Structure object.
    """
    if not os.path.isfile(path_gz):
        raise FileNotFoundError(f"PDB file not found: {path_gz}")

    with tempfile.NamedTemporaryFile(suffix=".pdb") as tmp:
        with gzip.open(path_gz, "rb") as fin, open(tmp.name, "wb") as fout:
            shutil.copyfileobj(fin, fout)
            
        structure = load_structure(tmp.name, chain=chain_id)
        for mod, canon in modified_to_canonical.items():
            mask = structure.res_name == mod
            structure.res_name[mask] = canon
    
    return structure

def calculate_esm_if_embeddings(model, alphabet, pdb_path: str, chain_id: str, device="cuda") -> np.ndarray:
    """
    Returns per-residue ESM-IF encoder embeddings for a single chain.
    Shape: [L, D]
    """
    structure = load_structure_from_gz(pdb_path, chain_id)
    coords, _seq = extract_coords_from_structure(structure)
    coords = torch.as_tensor(coords, dtype=torch.float32, device=device)
    
    device = next(model.parameters()).device
    batch_converter = CoordBatchConverter(alphabet)
    batch = [(coords, None, None)]
    coords, confidence, strs, tokens, padding_mask = batch_converter(batch, device=device)
    with torch.no_grad():
        encoder_out = model.encoder.forward(coords, padding_mask, confidence, return_all_hiddens=False)
    # remove beginning and end (bos and eos tokens)
    assert (len(_seq)+2 ==  encoder_out['encoder_out'][0][:, 0].cpu().numpy().shape[0])
    return encoder_out['encoder_out'][0][:, 0].cpu().numpy()

In [5]:
# Load ESM-IF model
print("Loading ESM-IF1 model...", file=sys.stderr)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, alphabet = esm.pretrained.esm_if1_gvp4_t16_142M_UR50()
model = model.to(device).eval()
print(f"Model loaded on device: {device}", file=sys.stderr)

Loading ESM-IF1 model...
Model loaded on device: cuda


## Meta-analysis

In [25]:
dataframe_path = "/work3/s232958/data/meta_analysis/interaction_df_metaanal.csv"
pdb_download_dir  = "/work3/s232958/data/meta_analysis/input_pdbs/"

# Where to save ESM-IF embeddings (.npy per chain)
path_to_output_embeddings_targets = "/work3/s232958/data/meta_analysis/esmif_embeddings_targets"
path_to_output_embeddings_binders = "/work3/s232958/data/meta_analysis/esmif_embeddings_binders"

In [26]:
# Creating dirs
os.makedirs(path_to_output_embeddings_targets, exist_ok=True)
os.makedirs(path_to_output_embeddings_binders, exist_ok=True)
os.makedirs(pdb_download_dir, exist_ok=True)

# Loading Df
print(f"Reading dataframe from: {dataframe_path}", file=sys.stderr)
sequence_df = pd.read_csv(dataframe_path)
sequence_df["seq_len_binder"] = sequence_df["A_seq"].apply(len)
sequence_df["seq_len_target"] = sequence_df["B_seq"].apply(len)
sequence_df = sequence_df.rename(columns={
    "A_seq" : "binder_seq",
    "B_seq" : "target_seq",
})

Reading dataframe from: /work3/s232958/data/meta_analysis/interaction_df_metaanal.csv


#### binders embeddings

In [27]:
# ---- Main loop ---- Meta Anlalysis
# Set of already computed embeddings ----
already_calculated_files = {
    fname[:-4] for fname in os.listdir(path_to_output_embeddings_binders) if fname.endswith(".npy")
}
print(f"Found {len(already_calculated_files)} existing .npy files", file=sys.stderr)

pdb_len_binders = []

for idx, row in tqdm(sequence_df.iterrows(), total=len(sequence_df)):
    name = row["target_binder_ID"]     # e.g. "1ABC_A"id
    if name in already_calculated_files:
        continue

    pdb_path = os.path.join(pdb_download_dir, f"{row['binder_id']}.pdb.gz")
    chain_id = "A"  # e.g. "A"
    try:
        embeddings = calculate_esm_if_embeddings(model, alphabet, pdb_path, chain_id)
    except Exception as e:
        print(f"Failed for {name}: {e}", file=sys.stderr)
        continue

    out_path = os.path.join(path_to_output_embeddings_binders, name + ".npy")
    np.save(out_path, embeddings)
    pdb_len_binders.append(embeddings.shape[0])

print("Done.", file=sys.stderr)

Found 0 existing .npy files
  F.pad(torch.tensor(cd), (0, 0, 0, 0, 1, 1), value=np.inf)
100%|█████████████████████████████████████████████████████████████████████████████████████| 3532/3532 [05:41<00:00, 10.34it/s]
Done.


#### targets embeddings

In [28]:
### Target embeddings
targets_unique = sequence_df.target_id_mod.unique()
targets = {}
for __, row in sequence_df.iterrows():
    if row.target_id_mod not in targets.keys():
        targets[row.target_id_mod] = row.binder_id
targets

{'VirB8': 'EHEE_rd4_0110_min_rise1_21_000000015_0001',
 'FGFR2': 'Grafting_Motif0040_ems_3hM_3083_0001_0002',
 'IL7Ra': 'IL7Ra_binder_AF2_10',
 'InsulinR': 'Insulin_receptor_binder_AF2_16',
 'EGFR': 'Motif0092_bcov_4helix_12405_0001_0001',
 'SARS_CoV2_RBD': 'Motif0141_ems_3hM_2999_0001',
 'Pdl1': 'Pdl1_binder_AF2_20',
 'EGFR_2': 'Reed_HARRISON_model_56_119_0',
 'TrkA': 'Trka_binder_AF2_1',
 'IL10Ra': 'il10ra_site1_normal_mot_b26a303360aa2756ef5b79665ce52f45_0001_000000179_0001_1_11_H___HHH_b2_00927_0001_0000100005_0000001_0',
 'LTK': 'ltk_site1_normal_mot_HHH_b1_03302_af2_000000020_0001_3_19_H___HHH_b1_02056_af2_0001_0000500019_0000001_0',
 'Mdm2': 'p53_AF2_30',
 'EGFR_3': 'nside_mot_359448c81ec7171dd466f2d64631b9c5_0001_1_000000012_0001_35_44_H___HHH_eva_0380_0001_0001',
 'sntx': 'SVT000__T10_C10_consensus_267_dldesign_0_cycle0_1_af2pred',
 'sntx_2': 'SVT000__short_native_hotspot_bulky_2_strand_5_46_dldesign_0_dldesign_0_cycle1_af2pred',
 'IL2Ra': 'IL2RaEarlyBind_lt1005'}

In [29]:
# ---- Main loop ---- Meta Anlalysis

already_calculated_files = {
    fname[:-4] for fname in os.listdir(path_to_output_embeddings_targets) if fname.endswith(".npy")
}
print(f"Found {len(already_calculated_files)} existing .npy files", file=sys.stderr)

pdb_len_targets = []

for key, value in tqdm(targets.items(), total=len(targets)):
    name = key
    if name in already_calculated_files:
        continue

    pdb_path = os.path.join(pdb_download_dir, f"{value}.pdb.gz")
    chain_id = "B"  # e.g. "A"
    try:
        embeddings = calculate_esm_if_embeddings(model, alphabet, pdb_path, chain_id)
    except Exception as e:
        print(f"Failed for {name}: {e}", file=sys.stderr)
        continue

    out_path = os.path.join(path_to_output_embeddings_targets, name + ".npy")
    np.save(out_path, embeddings)
    pdb_len_targets.append(embeddings.shape[0])

print("Done.", file=sys.stderr)

Found 0 existing .npy files
100%|█████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.00it/s]
Done.


In [155]:
pdb_len_targets

[138, 101, 193, 150, 191, 195, 115, 621, 101, 207, 301, 85, 157, 60, 60, 165]

In [156]:
target_pdblen = {}
for i, (key, __) in enumerate(targets.items()):
    target_pdblen[key] = pdb_len_targets[i]
target_pdblen

{'VirB8': 138,
 'FGFR2': 101,
 'IL7Ra': 193,
 'InsulinR': 150,
 'EGFR': 191,
 'SARS_CoV2_RBD': 195,
 'Pdl1': 115,
 'EGFR_2': 621,
 'TrkA': 101,
 'IL10Ra': 207,
 'LTK': 301,
 'Mdm2': 85,
 'EGFR_3': 157,
 'sntx': 60,
 'sntx_2': 60,
 'IL2Ra': 165}

In [157]:
sequence_df["pdb_len_binder"] = pdb_len_binders
sequence_df["pdb_len_target"] = [target_pdblen[row.target_id_mod] for __, row in sequence_df.iterrows()]
sequence_df

Unnamed: 0,binder_id,target_id,binder_chain,target_chains,binder,binder_seq,target_seq,target_id_mod,target_binder_ID,seq_len_binder,seq_len_target,pdb_len_binder,pdb_len_target
0,EHEE_rd4_0110_min_rise1_21_000000015_0001,VirB8,A,"[""B""]",False,LDFIVFAGPEKAIKFYKEMAKRNLEVKIWIDGDWAVVQVK,ANPYISVANIMLQNYVKQREKYNYDTLKEQFTFIKNASTSIVYMQF...,VirB8,VirB8_1,40,138,40,138
1,Grafting_Motif0040_ems_3hM_3083_0001_0002,FGFR2,A,"[""B""]",False,SEQDETMHRIVRSVIQHAYKHNDEMAEYFAQNAAEIYKEQNKSEEA...,RSPHRPILQAGLPANASTVVGGDVEFVCKVYSDAQPHIQWIKHVPY...,FGFR2,FGFR2_1,62,101,62,101
2,Grafting_Motif0042_ems_3hM_148_0001,FGFR2,A,"[""B""]",False,DYKQLKKHATKLLELAKKDPSSKRDLLRTAASYANKVLFEDSDPRA...,RSPHRPILQAGLPANASTVVGGDVEFVCKVYSDAQPHIQWIKHVPY...,FGFR2,FGFR2_2,61,101,61,101
3,Grafting_Motif0042_ems_3hM_1661_0001,FGFR2,A,"[""B""]",False,DEKEELERRANRVAFLAIQIQNEEYHRILAELYVQFMKAAENNDTE...,RSPHRPILQAGLPANASTVVGGDVEFVCKVYSDAQPHIQWIKHVPY...,FGFR2,FGFR2_3,64,101,64,101
4,Grafting_Motif0042_ems_3hM_257_0001,FGFR2,A,"[""B""]",False,PDNKEKLMSIAVQLILRINEAARSEEQWRYANRAAFAAVEASSGSD...,RSPHRPILQAGLPANASTVVGGDVEFVCKVYSDAQPHIQWIKHVPY...,FGFR2,FGFR2_4,64,101,64,101
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3527,il2ra_site1_2b5i_sap_19_mot_HHH_b2_06055_af2_0...,IL2Ra,A,"[""B""]",False,DLRKYAAELVDRLAEKYNLDSDQYNALVRLASELVWQGKSKEEIEK...,ELCDDDPPEIPHATFKAMAYKEGTMLNCECKRGFRRIKSGSLYMLC...,IL2Ra,IL2Ra_62,55,165,55,165
3528,il2ra_site1_2b5i_sap_20_mot_HHH_b1_01934_af2_0...,IL2Ra,A,"[""B""]",False,SKEEIKKEAEELIEELKKKGYNLPLRILEFALKEIEETNSEKYYEQ...,ELCDDDPPEIPHATFKAMAYKEGTMLNCECKRGFRRIKSGSLYMLC...,IL2Ra,IL2Ra_63,56,165,56,165
3529,il2ra_site1_2b5i_sap_23_mot_88fc943612ced423dc...,IL2Ra,A,"[""B""]",False,SPEYKKFLELIKEAEAARKAGDLDKAKELLEKALELAKKMKAKSLI...,ELCDDDPPEIPHATFKAMAYKEGTMLNCECKRGFRRIKSGSLYMLC...,IL2Ra,IL2Ra_64,56,165,56,165
3530,il2ra_site1_2b5i_sap_25_mot_HHH_b2_01943_00000...,IL2Ra,A,"[""B""]",False,DPLLAYKLLKLSQKALEKAYAEDRERAEELLEEAEAALRSLGDEAG...,ELCDDDPPEIPHATFKAMAYKEGTMLNCECKRGFRRIKSGSLYMLC...,IL2Ra,IL2Ra_65,57,165,57,165


In [158]:
sequence_df.to_csv("/work3/s232958/data/meta_analysis/interaction_df_metaanal_w_pbd_lens.csv", index=False)

In [159]:
esmif_random = sequence_df.sample()["target_binder_ID"].item()
esmif_random = np.load(os.path.join(path_to_output_embeddings_binders, esmif_random + ".npy"))
esmif_random.shape

(65, 512)

## PPint

In [15]:
# dataframe_path = "/work3/s232958/data/PPint_DB/disordered_interfaces_no_cutoff_filtered_nonredundant80_3å_5.csv.gz"
dataframe_path = "/work3/s232958/data/PPint_DB/PPint_test.csv"
# dataframe_path = "/work3/s232958/data/PPint_DB/PPint_train.csv"
pdbs_download_dir = "/work3/s232958/data/PPint_DB/pdb_cache"
# name_column = "PDB_chain_name"
# sequence_column = "sequence"

# Where to save ESM-IF embeddings (.npy per chain)
path_to_output_embeddings = "/work3/s232958/data/PPint_DB/esmif_embeddings_noncanonical"

# Root of your local PDB mirror (with subfolders a0, a1, etc.)
root_pdb_dir = "/novo/users/cpjb/rdd/PDB_mirror_pdb"

In [16]:
# Creating dirs
os.makedirs(path_to_output_embeddings, exist_ok=True)
os.makedirs(pdbs_download_dir, exist_ok=True)

# Loading Df
print(f"Reading dataframe from: {dataframe_path}", file=sys.stderr)
sequence_df = pd.read_csv(dataframe_path)

sequence_df["PDB"] = [str(row.interface_id).split("_")[0] for __, row in sequence_df.iterrows()]

sequence_df["target_chain"] = [str(row.ID1).split("_")[-1] for __, row in sequence_df.iterrows()]
sequence_df["binder_chain"] = [str(row.ID2).split("_")[-1] for __, row in sequence_df.iterrows()]

sequence_df["pdb_path"] = [str(row.PDB+".pdb.gz").lower() for __, row in sequence_df.iterrows()]

sequence_df

Reading dataframe from: /work3/s232958/data/PPint_DB/PPint_test.csv


Unnamed: 0,interface_id,seq_target,seq_binder,ID1,ID2,dimer,seq_target_len,seq_binder_len,target_binder_id,PDB,target_chain,binder_chain,pdb_path
0,1NNW_0,VYVAVLANIAGNLPALTAALSRIEEMREEGYEIEKYYILGNIVGLF...,VYVAVLANIAGNLPALTAALSRIEEMREEGYEIEKYYILGNIVGLF...,1NNW_0_A,1NNW_0_B,True,251,251,1NNW_0_A_1NNW_0_B,1NNW,A,B,1nnw.pdb.gz
1,3UCN_0,TADLSPLLEANRKWADECAAKDSTYFSKVAGSQAPEYLYIGCADSR...,TADLSPLLEANRKWADECAAKDSTYFSKVAGSQAPEYLYIGCADSR...,3UCN_0_A,3UCN_0_B,True,222,222,3UCN_0_A_3UCN_0_B,3UCN,A,B,3ucn.pdb.gz
2,1POV_1,QHRSRSESSIESFFARGACVTIMTVDNPASTTNKDKLFAVWKITYK...,GLPVMNTPGSNQYLTADNFQSPCALPEFDVTPPIDIPGEVKNMMEL...,1POV_1_1,1POV_1_3,False,235,238,1POV_1_1_1POV_1_3,1POV,1,3,1pov.pdb.gz
3,3R6Y_2,VRIEKDFLGEKEIPKDAYYGVQTIRATENFPITGYRIHPELIKSLG...,VRIEKDFLGEKEIPKDAYYGVQTIRATENFPITGYRIHPELIKSLG...,3R6Y_2_C,3R6Y_2_D,True,383,390,3R6Y_2_C_3R6Y_2_D,3R6Y,C,D,3r6y.pdb.gz
4,5YHI_0,PMRYPVDVYTGKIQVDGELMLTELGLEGDGPDRALCHYPREHYLYW...,PMRYPVDVYTGKIQVDGELMLTELGLEGDGPDRALCHYPREHYLYW...,5YHI_0_A,5YHI_0_B,True,202,201,5YHI_0_A_5YHI_0_B,5YHI,A,B,5yhi.pdb.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...
489,3GXE_0,DQCIVDDITYNVQDTFHKKHEEGHMLNCTCFGQGRGRWKCDPVDQC...,GLGMKGHRGF,3GXE_0_B,3GXE_0_F,False,89,10,3GXE_0_B_3GXE_0_F,3GXE,B,F,3gxe.pdb.gz
490,6LY5_21,PSPIFGGSTGGWLRKAQVEEKYVITWDSPKEQIFEMPTGGAAIMRE...,ANFIKPYNDDPFVGHLATPITSSAVTRSLLKNLPAYRFGLTPLLRG...,6LY5_21_d,6LY5_21_l,False,132,144,6LY5_21_d_6LY5_21_l,6LY5,d,l,6ly5.pdb.gz
491,5MLK_0,ARISKVLVANRGEIAVRVIRAARDAGLPSVAVYAEPDAESPHVRLA...,ARISKVLVANRGEIAVRVIRAARDAGLPSVAVYAEPDAESPHVRLA...,5MLK_0_A,5MLK_0_B,True,451,384,5MLK_0_A_5MLK_0_B,5MLK,A,B,5mlk.pdb.gz
492,8BS4_0,HPVLEKLKAAHSYNPKEFEWNLKSGRVFIIKSYSEDDIHRSIKYSI...,GHPVLEKLKAAHSYNPKEFEWNLKSGRVFIIKSYSEDDIHRSIKYS...,8BS4_0_A,8BS4_0_B,True,195,193,8BS4_0_A_8BS4_0_B,8BS4,A,B,8bs4.pdb.gz


In [17]:
### 'Failed for 6BJP: structure has multiple atoms with same names'
sequence_df = sequence_df[~sequence_df["PDB"].str.startswith("6BJP")]

In [18]:
# Downloading PDBs
for __, row in tqdm(sequence_df.iterrows(), total=len(sequence_df)):
    pid = row.PDB
    download_pdb(pid, pdbs_download_dir)

100%|█████████████████████████████████████████████████████████████████████████████████████| 494/494 [00:00<00:00, 5612.13it/s]


In [19]:
# Drop rows where we don't have a PDB file
missing_mask = sequence_df["pdb_path"].isna()
if missing_mask.any():
    missing_ids = sequence_df.loc[missing_mask, "PDB"].unique()
    print("Warning: no PDB file found for these IDs:", file=sys.stderr)
    for pid in missing_ids:
        print(f"  {pid}", file=sys.stderr)
    sequence_df = sequence_df[~missing_mask]

In [20]:
# Set of already computed embeddings ----
already_calculated_files = {
    fname[:-4] for fname in os.listdir("/work3/s232958/data/PPint_DB/esmif_embeddings_noncanonical") if fname.endswith(".npy")
}
print(f"Found {len(already_calculated_files)} existing .npy files", file=sys.stderr)

Found 3949 existing .npy files


In [21]:
# Set of already computed embeddings ----
pdb_len_binders, pdb_len_targets = [], []

for idx, row in tqdm(sequence_df.iterrows(), total=len(sequence_df)):
    PDB_ID = row["PDB"]     # e.g. "1ABC_A"id
    chain_binder = row["binder_chain"]
    chain_target = row["target_chain"]
    name_binder = str(PDB_ID)+"_"+chain_binder
    name_target = str(PDB_ID)+"_"+chain_target

    # if name_binder in already_calculated_files and name_target in already_calculated_files:
    #     continue

    pdb_path = os.path.join(pdbs_download_dir, f"{row['pdb_path']}")
    try:
        embeddings_binder = calculate_esm_if_embeddings(model, alphabet, pdb_path, chain_binder)
        embeddings_target = calculate_esm_if_embeddings(model, alphabet, pdb_path, chain_target)
    except Exception as e:
        print(f"Failed for {PDB_ID}: {e}", file=sys.stderr)
        continue

    pdb_len_binders.append(embeddings_binder.shape[0])
    pdb_len_targets.append(embeddings_target.shape[0])
    
    out_path_binder = os.path.join(path_to_output_embeddings, name_binder + ".npy")
    out_path_target = os.path.join(path_to_output_embeddings, name_target + ".npy")
    
    np.save(out_path_binder, embeddings_binder)
    np.save(out_path_target, embeddings_target)

print("Done.", file=sys.stderr)

  F.pad(torch.tensor(cd), (0, 0, 0, 0, 1, 1), value=np.inf)
100%|███████████████████████████████████████████████████████████████████████████████████████| 494/494 [04:15<00:00,  1.93it/s]
Done.


In [14]:
pdb_id_seqs = {}

In [40]:
for idx, row in tqdm(sequence_df.iterrows(), total=len(sequence_df)):
    interface_id = row["interface_id"]
    if interface_id in pdb_id_seqs.keys():
        continue
    else:
        PDB_ID = row["PDB"]     # e.g. "1ABC_A"id
        chain_binder = row["binder_chain"]
        chain_target = row["target_chain"]
        name_binder = str(PDB_ID)+"_"+chain_binder
        name_target = str(PDB_ID)+"_"+chain_target
        pdb_path = os.path.join(pdbs_download_dir, f"{row['pdb_path']}")
    
        structure_bidner = load_structure_from_gz(pdb_path, chain_binder)
        structure_target = load_structure_from_gz(pdb_path, chain_target)
    
        __, binder_seq = extract_coords_from_structure(structure_bidner)
        __, target_seq = extract_coords_from_structure(structure_target)
        
        pdb_id_seqs[interface_id] = (target_seq, binder_seq)

100%|█████████████████████████████████████████████████████████████████████████████████████| 1977/1977 [05:11<00:00,  6.35it/s]


In [41]:
sequence_df["seq_pdb_target"] = [value[0] for key, value in pdb_id_seqs.items()]
sequence_df["seq_pdb_binder"] = [value[1] for key, value in pdb_id_seqs.items()]

sequence_df["pdb_target_len"] = [len(row.seq_pdb_target) for i, row in sequence_df.iterrows()]
sequence_df["pdb_binder_len"] = [len(row.seq_pdb_binder) for i, row in sequence_df.iterrows()]

sequence_df = sequence_df[[
    "interface_id", 
    "PDB", "ID1", "ID2", 
    "seq_target", "seq_target_len", "seq_pdb_target", "pdb_target_len", "target_chain", 
    "seq_binder", "seq_binder_len", "seq_pdb_binder", "pdb_binder_len", "binder_chain", 
    "pdb_path"]]
    

sequence_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sequence_df["seq_pdb_target"] = [value[0] for key, value in pdb_id_seqs.items()]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sequence_df["seq_pdb_binder"] = [value[1] for key, value in pdb_id_seqs.items()]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sequence_df["pdb_target_len"] = [len(row.se

Unnamed: 0,interface_id,PDB,ID1,ID2,seq_target,seq_target_len,seq_pdb_target,pdb_target_len,target_chain,seq_binder,seq_binder_len,seq_pdb_binder,pdb_binder_len,binder_chain,pdb_path
0,6IDB_0,6IDB,6IDB_0_A,6IDB_0_B,DKICLGHHAVSNGTKVNTLTERGVEVVNATETVERTNIPRICSKGK...,317,DKICLGHHAVSNGTKVNTLTERGVEVVNATETVERTNIPRICSKGK...,317,A,GLFGAIAGFIENGWEGLIDGWYGFRHQNAQGEGTAADYKSTQSAID...,172,GLFGAIAGFIENGWEGLIDGWYGFRHQNAQGEGTAADYKSTQSAID...,172,B,6idb.pdb.gz
1,2WZP_3,2WZP,2WZP_3_D,2WZP_3_G,VQLQESGGGLVQAGGSLRLSCTASRRTGSNWCMGWFRQLAGKEPEL...,122,VQLQESGGGLVQAGGSLRLSCTASRRTGSNWCMGWFRQLAGKEPEL...,122,D,TIKNFTFFSPNSTEFPVGSNNDGKLYMMLTGMDYRTIRRKDWSSPL...,266,TIKNFTFFSPNSTEFPVGSNNDGKLYMMLTGMDYRTIRRKDWSSPL...,266,G,2wzp.pdb.gz
2,1ZKP_0,1ZKP,1ZKP_0_A,1ZKP_0_C,LYFQSNAKTVVGFWGGFPEAGEATSGYLFEHDGFRLLVDCGSGVLA...,246,LYFQSNAMKMTVVGFWGGFPEAGEATSGYLFEHDGFRLLVDCGSGV...,251,A,AKTVVGFWGGFPEAGEATSGYLFEHDGFRLLVDCGSGVLAQLQKYI...,240,AMKMTVVGFWGGFPEAGEATSGYLFEHDGFRLLVDCGSGVLAQLQK...,245,C,1zkp.pdb.gz
3,6GRH_3,6GRH,6GRH_3_C,6GRH_3_D,SKHELSLVEVTHYTDPEVLAIVKDFHVRGNFASLPEFAERTFVSAV...,266,SKHELSLVEVTHYTDPEVLAIVKDFHVRGNFASLPEFAERTFVSAV...,266,C,MINVYSNLMSAWPATMAMSPKLNRNMPTFSQIWDYERITPASAAGE...,396,MINVYSNLMSAWPATMAMSPKLNRNMPTFSQIWDYERITPASAAGE...,396,D,6grh.pdb.gz
4,8R57_1,8R57,8R57_1_M,8R57_1_f,DLMTALQLVMKKSSAHDGLVKGLREAAKAIEKHAAQICVLAEDCDQ...,118,DLMTALQLVMKKSSAHDGLVKGLREAAKAIEKHAAQICVLAEDCDQ...,118,M,PKKQKHKHKKVKLAVLQFYKVDDATGKVTRLRKECPNADCGAGTFM...,64,PKKQKHKHKKVKLAVLQFYKVDDATGKVTRLRKECPNADCGAGTFM...,64,f,8r57.pdb.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1973,4YO8_0,4YO8,4YO8_0_A,4YO8_0_B,HENLYFQGVQKIGILGAMREEITPILELFGVDFEEIPLGGNVFHKG...,238,HENLYFQGVQKIGILGAMREEITPILELFGVDFEEIPLGGNVFHKG...,238,A,HHHHHENLYFQGVQKIGILGAMREEITPILELFGVDFEEIPLGGNV...,242,HHHHHENLYFQGVQKIGILGAMREEITPILELFGVDFEEIPLGGNV...,242,B,4yo8.pdb.gz
1974,3CKI_0,3CKI,3CKI_0_A,3CKI_0_B,DPMKNTCKLLVVADHRFYRYMGRGEESTTTNYLIELIDRVDDIYRN...,256,DPMKNTCKLLVVADHRFYRYMGRGEESTTTNYLIELIDRVDDIYRN...,256,A,CTCSPSHPQDAFCNSDIVIRAKVVGKKLVKEGPFGTLVYTIKQMKM...,121,CTCSPSHPQDAFCNSDIVIRAKVVGKKLVKEGPFGTLVYTIKQMKM...,121,B,3cki.pdb.gz
1975,7MHY_1,7MHY,7MHY_1_M,7MHY_1_N,QVQLRQSGAELAKPGASVKMSCKASGYTFTNYWLHWIKQRPGQGLE...,118,QVQLRQSGAELAKPGASVKMSCKASGYTFTNYWLHWIKQRPGQGLE...,118,M,DVLMTQTPLSLPVSLGDQVSISCRSSQSIVHNTYLEWYLQKPGQSP...,109,DVLMTQTPLSLPVSLGDQVSISCRSSQSIVHNTYLEWYLQKPGQSP...,109,N,7mhy.pdb.gz
1976,7MHY_2,7MHY,7MHY_2_O,7MHY_2_P,IQLVQSGPELVKISCKASGYTFTNYGMNWVRQAPGKGLKWMGWINT...,100,IQLVQSGPELVKISCKASGYTFTNYGMNWVRQAPGKGLKWMGWINT...,100,O,VLMTQTPLSLPVSISCRSSQSIVHSNGNTYLEWYLQKPGQSPKLLI...,94,VLMTQTPLSLPVSISCRSSQSIVHSNGNTYLEWYLQKPGQSPKLLI...,94,P,7mhy.pdb.gz


In [43]:
sequence_df[sequence_df.seq_binder_len != sequence_df.pdb_binder_len]

Unnamed: 0,interface_id,PDB,ID1,ID2,seq_target,seq_target_len,seq_pdb_target,pdb_target_len,target_chain,seq_binder,seq_binder_len,seq_pdb_binder,pdb_binder_len,binder_chain,pdb_path
2,1ZKP_0,1ZKP,1ZKP_0_A,1ZKP_0_C,LYFQSNAKTVVGFWGGFPEAGEATSGYLFEHDGFRLLVDCGSGVLA...,246,LYFQSNAMKMTVVGFWGGFPEAGEATSGYLFEHDGFRLLVDCGSGV...,251,A,AKTVVGFWGGFPEAGEATSGYLFEHDGFRLLVDCGSGVLAQLQKYI...,240,AMKMTVVGFWGGFPEAGEATSGYLFEHDGFRLLVDCGSGVLAQLQK...,245,C,1zkp.pdb.gz
6,3KOQ_0,3KOQ,3KOQ_0_A,3KOQ_0_B,GNFVELAKKRYSCRNYQDRKVEKEKLEKVLDVARIAPTGGNRQPQR...,169,GMNFVELAKKRYSCRNYQDRKVEKEKLEKVLDVARIAPTGGNRQPQ...,173,A,NFVELAKKRYSCRNYQDRKVEKEKLEKVLDVARIAPTGGNRQPQRL...,168,MNFVELAKKRYSCRNYQDRKVEKEKLEKVLDVARIAPTGGNRQPQR...,172,B,3koq.pdb.gz
21,3DTZ_3,3DTZ,3DTZ_3_D,3DTZ_3_E,TEIYTSVLSYRLLEGKAYSDADTRSLDRRSIDEFFSANPGYINFHI...,215,TEIYTSVLSYRLLEGKAYSDADTRSLDRMMRSIDEFFSANPGYINF...,223,D,TEIYTSVLSYRLLEGKAYSDADTRSLDRRSIDEFFSANPGYINFHI...,215,TEIYTSVLSYRLLEGKAYSDADTRSLDRMMRSIDEFFSANPGYINF...,223,E,3dtz.pdb.gz
23,7OEY_0,7OEY,7OEY_0_A,7OEY_0_B,TILSDVKALGQQIWLDNLSRSLVQSGELAQMLKQGVGVTSNPAIFQ...,349,TILSDVKALGQQIWLDNLSRSLVQSGELAQMLKQGVCGVTSNPAIF...,350,A,TILSDVKALGQQIWLDNLSRSLVQSGELAQMLKQGVGVTSNPAIFQ...,349,TILSDVKALGQQIWLDNLSRSLVQSGELAQMLKQGVCGVTSNPAIF...,350,B,7oey.pdb.gz
37,6DXE_0,6DXE,6DXE_0_A,6DXE_0_C,SSLDEIRQAQRADGPAGILAIGTANPENHVLQAEYPDYYFRITNSE...,388,SSLDEIRQAQRADGPAGILAIGTANPENHVLQAEYPDYYFRITNSE...,389,A,SSLDEIRQAQRADGPAGILAIGTANPENHVLQAEYPDYYFRITNSE...,388,SSLDEIRQAQRADGPAGILAIGTANPENHVLQAEYPDYYFRITNSE...,389,C,6dxe.pdb.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1951,5NKM_0,5NKM,5NKM_0_A,5NKM_0_B,DIAKWVEHARTCYSTQLDTKIKVIGVIGKDYPDHGKGDNINCYLRE...,335,MDIAKWVEHARTCYSTQLDTKIKVIGVIGKDYPDHGKGDNINCYLR...,345,A,KESVRFLTDFGEISDAISDLLTSSPNFNVISAIGPQGAGKSTLLSL...,252,MKESVRFLTDFGEISDAISDLLTSSPNFNVISAIGPQGAGKSTLLS...,260,B,5nkm.pdb.gz
1955,1S3Q_1,1S3Q,1S3Q_1_B,1S3Q_1_D,SISEKVEALNRQINAEIYSAYLYLSASYFDSIGLKGFSNWRVQWQE...,154,SISEKMVEALNRQINAEIYSAYLYLSMASYFDSIGLKGFSNWMRVQ...,162,B,SISEKVEALNRQINAEIYSAYLYLSASYFDSIGLKGFSNWRVQWQE...,154,SISEKMVEALNRQINAEIYSAYLYLSMASYFDSIGLKGFSNWMRVQ...,162,D,1s3q.pdb.gz
1956,2RDM_1,2RDM,2RDM_1_B,2RDM_1_C,EAVTILLADDEAILLLDFESTLTDAGFLVTAVSSGAKAIELKSGAA...,120,EAVTILLADDEAILLLDFESTLTDAGFLVTAVSSGAKAIEMLKSGA...,122,B,LEAVTILLADDEAILLLDFESTLTDAGFLVTAVSSGAKAIELKSGA...,121,LEAVTILLADDEAILLLDFESTLTDAGFLVTAVSSGAKAIEMLKSG...,123,C,2rdm.pdb.gz
1963,5U5I_0,5U5I,5U5I_0_A,5U5I_0_B,AAPVVNDCTGKVGQAVAEAAVAAGLRLVPLSLTGPGRGGKRVVIGN...,263,AAPVMVNDCTGKVGQAVAEAAVAAGLRLVPLSLTGPGRGGKRVVIG...,272,A,LAAAPVVNDCTGKVGQAVAEAAVAAGLRLVPLSLTGPGRGGKRVVI...,269,LAAAPVMVNDCTGKVGQAVAEAAVAAGLRLVPLSLTGPGRGGKRVV...,278,B,5u5i.pdb.gz


In [44]:
# Extracting sequences form pdbs
# sequence_df.to_csv("/work3/s232958/data/PPint_DB/PPint_test_w_pbd_lens.csv")
sequence_df.to_csv("/work3/s232958/data/PPint_DB/PPint_train_w_pbd_lens.csv")

In [59]:
# cols = ["interface_id", "dimer", "target_binder_id"]

# # Create long format for target
# df_target = pd.DataFrame({
#     "interface_id": sequence_df["interface_id"],
#     "role": "target",
#     "ID": sequence_df["ID1"],
#     "sequence": sequence_df["seq_target"],
#     "length": sequence_df["seq_target_len"],
#     "dimer": sequence_df["dimer"],
#     "target_binder_id": sequence_df["target_binder_id"]
# })

# # Create long format for binder
# df_binder = pd.DataFrame({
#     "interface_id": sequence_df["interface_id"],
#     "role": "binder",
#     "ID": sequence_df["ID2"],
#     "sequence": sequence_df["seq_binder"],
#     "length": sequence_df["seq_binder_len"],
#     "dimer": sequence_df["dimer"],
#     "target_binder_id": sequence_df["target_binder_id"]
# })

# # Combine
# sequence_df_LONG = pd.concat([df_target, df_binder], ignore_index=True)
# sequence_df_LONG["PDB"] = [row["interface_id"].split("_")[0] for __, row in sequence_df_LONG.iterrows()]
# sequence_df_LONG["chainname"] = [row["ID"].split("_")[-1] for __, row in sequence_df_LONG.iterrows()]

In [119]:
# sequence_df_LONG[name_column] = (sequence_df_LONG["PDB"] + "_" + sequence_df_LONG["chainname"]).tolist()

# # Basic cleaning
# sequence_df_LONG = sequence_df_LONG[sequence_df_LONG[sequence_column].notna()]
# sequence_df_LONG = sequence_df_LONG.drop_duplicates(subset=[name_column, sequence_column])
# sequence_df_LONG["pdb_path"] = [f'{row["PDB"].lower()}.pdb.gz' for __, row in sequence_df_LONG.iterrows()]
# sequence_df_LONG

In [120]:
# random_sample = sequence_df_LONG.sample()
# seq = random_sample.sequence.item()
# interface_id = random_sample.interface_id
# print(f"Seq: {seq}")
# print(f"PDB_ID: {interface_id.item().split('_')[0]}")

In [26]:
# pdb_path = os.path.join(download_dir, "6bjp.pdb.gz")
# clean_pdb_path = os.path.join(download_dir, "6bjp_clean.pdb")
# # clean_pdb_path = os.path.join(download_dir, "6bjp_clean_clean.pdb")

# clean_lines = []

# # Read gzipped PDB
# with gzip.open(pdb_path, "rt") as f:
#     lines = f.readlines()

# # Filter ANISOU lines
# for line in lines:
#     if not line.startswith("ANISOU"):
#         clean_lines.append(line)

# # Write cleaned PDB (uncompressed)
# with open(clean_pdb_path, "w") as f:
#     f.writelines(clean_lines)

In [None]:
# clean_pdb_path = os.path.join(download_dir, "6bjp_clean_clean.pdb")

# clean_lines = []

# with open(clean_pdb_path, "r") as f:
#     lines = f.readlines()
    
# for i, line in enumerate(lines):
#     if line.startswith("ATOM") and len(line.split()[3]) != 4:
#         clean_lines.append(line)
#     if line.startswith("ATOM") and len(line.split()[3]) == 4 and line.split()[3][0] == "A":
#         new_line = line[:16]+" "+line[17:]
#         # print(new_line)
#         clean_lines.append(new_line)

# with open(clean_pdb_path, "w") as f:
#     f.writelines(clean_lines)

In [None]:
# for i, line in enumerate(clean_lines):
#     if line.startswith("ATOM"):
#         a = clean_lines[i].split()[2]
#         b = clean_lines[i+1].split()[2]
#         if a == b:
#             print(i, clean_lines[i][:-1])
#             print(i+1, clean_lines[i+1][:-1])

In [46]:
# name = "6bjp_clean_clean.pdb"
# try:
#     embeddings = calculate_esm_if_embeddings(model, alphabet, pdb_path, chain_id)
# except Exception as e:
#     print(f"Failed for {name}: {e}", file=sys.stderr)
    
# out_path = os.path.join("/work3/s232958/data/PPint_DB/esmif_embeddings_noncanonical", "6BJP_A" + ".npy")
# # np.save(out_path, embeddings)

In [None]:
### ! Failed for 8HRH_B: 'LPD'
# name = "8HRH_B"
# pdb_path = os.path.join(download_dir, f'{name.lower().split("_")[0]}.pdb.gz')
# structure = load_structure_from_gz(pdb_path, "B")[:-3]
# coords, _seq = extract_coords_from_structure(structure)
# coords = torch.as_tensor(coords, dtype=torch.float32, device=device)

# device = next(model.parameters()).device
# batch_converter = CoordBatchConverter(alphabet)
# batch = [(coords, None, None)]
# coords, confidence, strs, tokens, padding_mask = batch_converter(batch, device=device)
# with torch.no_grad():
#     encoder_out = model.encoder.forward(coords, padding_mask, confidence, return_all_hiddens=False)
# # remove beginning and end (bos and eos tokens)
# embeddings = encoder_out['encoder_out'][0][1:-1, 0].cpu().numpy()
# out_path = os.path.join("/work3/s232958/data/PPint_DB/esmif_embeddings_noncanonical", "8HRH_B" + ".npy")
# np.save(out_path, embeddings)