# Fold Candidate Sequences with ESM3 and Boltz-2

In [1]:
import pandas as pd
import os

In [2]:
## Read in candidates data (both sheets in a single pass over the workbook)
workbook_sheets = pd.read_excel(
    '../candidates.xlsx',
    sheet_name=['Antibody Candidates', 'Antigens'],
)
candidates_df = workbook_sheets['Antibody Candidates']
antigens_df = workbook_sheets['Antigens']

## Get the antigen sequence for Nipah Glycoprotein G
antigen_matches = antigens_df.loc[antigens_df['antigen_id'] == 'nipah_gpG', 'antigen_sequence']
if antigen_matches.empty:
    ## Fail with a message that names the missing row instead of a bare IndexError
    raise ValueError("No row with antigen_id 'nipah_gpG' found in the 'Antigens' sheet.")
antigen_seq = antigen_matches.values[0]
print(f"Using Antigen Sequence: {antigen_seq}")

Using Antigen Sequence: ICLQKTSNQILKPKLISYTLPVVGQSGTCITDPLLAMDEGYFAYSHLERIGSCSRGVSKQRIIGVGEVLDRGDEVPSLFMTNVWTPPNPNTVYHCSAVYNNEFYYVLCAVSTVGDPILNSTYWSGSLMMTRLAVKPKSNGGGYNQHQLALRSIEKGRYDKVMPYGPSGIKQGDTLYFPAVGFLVRTEFKYNDSNCPITKCQYSKPENCRLSMGIRPNSHYILRSGLLKYNLSDGENPKVVFIEISDQRLSIGSPSKIYDSLGQPVFYQASFSWDTMIKFGDVLTVNPLVVNWRNNTVISRPGQSQCPRFNTCPEICWEGVYNDAFLIDRINWISAGVFLDSNQTAENPVFTVFKDNEILYRAQLASEDTNAQKTITNCFLLKNKIWCISLVEIYDTGDNVIRPKLFAVKIPEQCTH


## ...with ESM3

In [3]:
## Define sequence folding function for ESM3
from esm.sdk import client
from esm.sdk.api import ESMProtein, GenerationConfig

## Take the ESM Forge API token from the environment; never store it in the notebook.
## If ESM_API_TOKEN is not set, prompt for it so the value is not saved with the file.
## NOTE(review): a token was previously committed on this line and should be revoked.
if not os.environ.get("ESM_API_TOKEN"):
    from getpass import getpass
    os.environ["ESM_API_TOKEN"] = getpass("ESM Forge API token: ")

def fold_sequence_esm3(sequence:str, model_name:str, token:str, soc:bool) -> str:
    """Predict a structure for `sequence` with a remote ESM3 model and return it as PDB text.

    Args:
        sequence: Amino-acid sequence to fold. Spaces and newlines are stripped
            before submission; every other character is sent as-is.
        model_name: Name of the model requested from the Forge endpoint.
        token: API token passed to the Forge client.
        soc: Forwarded as `potential_sequence_of_concern` on the ESMProtein input.

    Returns:
        The predicted structure as a PDB-formatted string, or "" if anything
        went wrong during generation (the error is printed, not raised).
    """
    ## Load the ESM model
    model = client(model=model_name, url="https://forge.evolutionaryscale.ai", token=token)
    ## Prepare the sequence
    sequence = sequence.replace(" ", "").replace("\n", "")
    ## Generate the structure
    try:
        protein = ESMProtein(sequence=sequence, potential_sequence_of_concern=soc)
        config = GenerationConfig(track="structure", num_steps=10, temperature=0.1)
        generation = model.generate(protein, config)
        ## NOTE(review): the Forge client appears to hand back an error object rather
        ## than raising on some failures - confirm against the SDK. Checking the type
        ## here makes the printed message describe the real failure.
        if not isinstance(generation, ESMProtein):
            raise RuntimeError(f"ESM3 generation did not return a protein: {generation}")
        protein_complex = generation.to_protein_complex()
        return protein_complex.to_pdb_string()
    except Exception as e:
        ## Best-effort contract: report the failure and signal it with an empty string
        print(f"Error folding sequence {sequence}: {e}")
        return ""

In [5]:
## For each candidate, fold the antibody-antigen complex
token = os.getenv("ESM_API_TOKEN")
esm3_model_name = "esm3-medium-multimer-2024-09"

## Make sure the output directory exists before any PDB is written
esm3_output_dir = '../data/candidates/structures_esm3'
os.makedirs(esm3_output_dir, exist_ok=True)

for idx, row in candidates_df.iterrows():
    ## No client is built here: fold_sequence_esm3 creates its own for each call
    antibody_id = row['antibody_id']
    print(f"Folding Candidate: {antibody_id}")

    test_chotia_pass = row['test_chotia_pass']
    if not test_chotia_pass:
        print(f"\tSkipping Candidate {row['antibody_id']} due to Chotia test failure.")
        continue

    output_pdb_path = f'{esm3_output_dir}/{antibody_id}_complex.pdb'
    ## Check if the PDB already exists. A zero-byte file is treated as missing so
    ## that an empty file left by an earlier failed run does not block a retry.
    if os.path.exists(output_pdb_path) and os.path.getsize(output_pdb_path) > 0:
        print(f"\tPDB for Candidate {antibody_id} already exists. Skipping folding.")
        continue

    h_chain_seq = row['h_chain']
    l_chain_seq = row['l_chain']

    ## Heavy chain, light chain and antigen are joined with '|' into one input
    sequence = f'{h_chain_seq}|{l_chain_seq}|{antigen_seq}'

    pdb_str = fold_sequence_esm3(sequence, esm3_model_name, token, soc=True)

    ## fold_sequence_esm3 returns "" on failure; writing that would create an
    ## empty PDB that later runs would mistake for a finished result.
    if not pdb_str:
        print(f"\tFolding failed for Candidate {antibody_id}. No PDB written.")
        continue

    with open(output_pdb_path, 'w') as f:
        f.write(pdb_str)

Folding Candidate: sbio-nipahgpg-001
	PDB for Candidate sbio-nipahgpg-001 already exists. Skipping folding.
Folding Candidate: sbio-nipahgpg-002
	PDB for Candidate sbio-nipahgpg-002 already exists. Skipping folding.
Folding Candidate: sbio-nipahgpg-003
	PDB for Candidate sbio-nipahgpg-003 already exists. Skipping folding.
Folding Candidate: sbio-nipahgpg-004
	PDB for Candidate sbio-nipahgpg-004 already exists. Skipping folding.
Folding Candidate: sbio-nipahgpg-005
	PDB for Candidate sbio-nipahgpg-005 already exists. Skipping folding.
Folding Candidate: sbio-nipahgpg-006
	Skipping Candidate sbio-nipahgpg-006 due to Chotia test failure.
Folding Candidate: sbio-nipahgpg-007
	Skipping Candidate sbio-nipahgpg-007 due to Chotia test failure.
Folding Candidate: sbio-nipahgpg-008
	PDB for Candidate sbio-nipahgpg-008 already exists. Skipping folding.


## ...with Boltz-2

In [None]:
import yaml

## Define function to create Boltz-2 YAML configuration for multimer folding

def create_boltz_yaml(h_chain_seq: str, l_chain_seq: str, antigen_seq: str, output_path: str):
    """Write a three-chain Boltz-2 input file to `output_path`.

    The file holds `version: 1` and a `sequences` list with one `protein`
    entry per chain: id "H" for the heavy chain, "L" for the light chain and
    "A" for the antigen, in that order.

    Args:
        h_chain_seq: Heavy-chain sequence, stored under id "H".
        l_chain_seq: Light-chain sequence, stored under id "L".
        antigen_seq: Antigen sequence, stored under id "A".
        output_path: Destination path; the file is opened for writing and overwritten.
    """
    labelled_chains = (
        ("H", h_chain_seq),
        ("L", l_chain_seq),
        ("A", antigen_seq),
    )

    protein_entries = []
    for chain_id, chain_sequence in labelled_chains:
        protein_entries.append({"protein": {"id": chain_id, "sequence": chain_sequence}})

    boltz_input = {"version": 1, "sequences": protein_entries}

    with open(output_path, 'w') as handle:
        yaml.dump(boltz_input, handle)

In [None]:
## Define function for running Boltz-2 folding using the cford38 Docker image

def run_boltz_folding(yaml_config_path: str, output_pdb_path: str):
    """Run `boltz predict` inside the cford38/boltz Docker image.

    The current working directory is bind-mounted at /mnt/ in the container and
    predictions are written under /mnt/<directory of output_pdb_path>.

    Args:
        yaml_config_path: Path to the Boltz YAML input, passed to `boltz predict`
            unchanged.
            NOTE(review): it is not prefixed with /mnt/, so it only resolves if the
            container's working directory lines up with the mount - confirm.
        output_pdb_path: Only its directory part is used, to build `--out_dir`.

    Returns:
        The docker process exit code (0 on success), or 127 if docker could not
        be launched at all.
    """
    import subprocess  # local import keeps this cell runnable on its own

    ## Argument list instead of a shell string: paths containing spaces or shell
    ## metacharacters are passed through verbatim. `-it` is left out because a
    ## notebook kernel has no TTY to attach.
    docker_command = [
        "docker", "run",
        "--gpus", "all",
        "-v", f"{os.getcwd()}:/mnt/",  # absolute host path for the bind mount
        "--name", "boltz",
        "--rm",
        "cford38/boltz:2.1.1_withweights",
        "boltz", "predict", yaml_config_path,
        "--out_dir", f"/mnt/{os.path.dirname(output_pdb_path)}",
        "--use_msa_server",
    ]

    try:
        completed = subprocess.run(docker_command, check=False)
    except OSError as exc:
        ## e.g. the docker executable is not installed or not on PATH
        print(f"Could not launch docker: {exc}")
        return 127

    if completed.returncode != 0:
        print(f"Boltz-2 folding failed for {yaml_config_path} (exit code {completed.returncode})")
    return completed.returncode