# Post Analyses of Generated Antibody Sequences

In [3]:
import pandas as pd
from abnumber import Chain
import os
from esm.sdk import client
from esm.sdk.api import ESMProtein, GenerationConfig
from esm.utils.structure.protein_chain import ProteinChain

In [4]:
test_cases_df = pd.read_excel('test_cases.xlsx', sheet_name='test_cases')
test_cases_df

Unnamed: 0,pdb_id,source,h_chain_id,l_chain_id,antigen_ids,h_chain_seq,l_chain_seq,antigen_seqs,antibody_sequences,highlighted_epitope_sequences,Generated_001,Generated_002,Generated_003
0,ma-ccy4e,https://www.modelarchive.org/doi/10.5452/ma-ccy4e,H,L,Y,DVQLQESGPSLVKPSQTLSLTCSVTGDSITSDYWSWIRKFPGNRLE...,DIVLTQSPATLSVTPGNSVSLSCRASQSIGNNLHWYQQKSHESPRL...,KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRN...,DVQLQESGPSLVKPSQTLSLTCSVTGDSITSDYWSWIRKFPGNRLE...,KVFGRCELAAAM[K][R]HGL[D][N][Y]RG[Y][S]LG[N]WVC...,QVQLQQPGAELVKPGASVKMSCKASGYTFTSYWMHWIKQRPGQGLE...,VQLLESGAEVKKPGASVKVSCKASGYTFISYYMNWVRQAPGQRLEW...,EVQLVESGGGLVKPGGSLKLSCAASGFAFTSYDMSWVRQTPEKRLE...
1,ma-dpr9i,https://www.modelarchive.org/doi/10.5452/ma-dpr9i,H,L,A,DVQLQESGPSLVKPSQTLSLTCSVTGDSITSDYWSWIRKFPGNRLE...,DIVLTQSPATLSVTPGNSVSLSCRASQSIGNNLHWYQQKSHESPRL...,NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKC...,DVQLQESGPSLVKPSQTLSLTCSVTGDSITSDYWSWIRKFPGNRLE...,NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKC...,QVQLVESGGGVVQPGRSLRLSCAASGFTFSNYGMHWVRQAPGKGLE...,EVQLVESGGGLIQPGGSLRLSCAASEFIVSRNYMSWVRQAPGKGLE...,QMQLVESGGGVVQPGRSLRLSCAASAFTFSNIHWMSWVRQPPGKGL...


In [5]:
def test_numbering(seq:str="", chain_type:str="") -> str:
    try:
        chain = Chain(seq, scheme="chothia")
        if chain_type == "heavy":
            if not chain.is_heavy_chain():
                return 'FAIL'
            else:
                return 'PASS'
        elif chain_type == "light":
            if not chain.is_light_chain():
                return 'FAIL'
            else:
                return 'PASS'
        else:
            return 'PASS'
    except Exception as e:
        print(f"Error with sequence {seq}: {e}")
        return 'FAIL'

In [6]:
def fix_multichain_pdb_str(pdb_str: str, chain_a_end_index: int) -> str:
    """Returns a fixed pdb string where all chains get unique identifiers.
    At the moment there is a bug where all the chains are written out as
    "Chain A". This function adjusts the second chain to have a unique chain
    identifier.
    Adapted from: https://colab.research.google.com/gist/thomas-a-neil/720683f97de624bc6822bf6e9629e298/forward_fold_multimer_dec_2024.ipynb#scrollTo=HDPnd-5xnAo2
    By Neil Thomas
    """
    fixed_lines = []
    for line in pdb_str.splitlines():
        if line.startswith("ATOM") or line.startswith("HETATM"):
            residue = line[17:20] # Residue name (4th column)
            chain = line[21]  # Chain identifier (5th column)
            residue_index = int(line[22:26].strip())  # Residue index (6th column)
            ## Replace the chain identifier with "H" or "L" depending on the split
            if residue_index > chain_a_end_index:
                # Replace the chain with "L"
                line = line[:21] + "L" + line[22:]
            else:
                line = line[:21] + "H" + line[22:]
            ## If residue is "UNK", remove the line
            if residue == "UNK":
                line = ""
        fixed_lines.append(line)
    return "\n".join(fixed_lines)

def fold_sequence(sequence:str, model_name:str, token:str) -> str:
    ## Load the ESM model
    model = client(model=model_name, url="https://forge.evolutionaryscale.ai", token=token)
    ## Prepare the sequence
    sequence = sequence.replace(" ", "").replace("\n", "")
    ## Generate the structure
    try:
        structure_prediction = model.generate(
            ESMProtein(sequence=sequence),
            GenerationConfig(
                track="structure", num_steps=len(sequence) // 4, temperature=0
            ),
        )
        ## Fix the PDB string to have L/H chain identifiers and remove "UNK" residues
        CHAIN_A_END_INDEX = structure_prediction.sequence.index("|")
        pdb_str = fix_multichain_pdb_str(structure_prediction.to_pdb_string(), CHAIN_A_END_INDEX)
        return pdb_str
    except Exception as e:
        print(f"Error folding sequence {sequence}: {e}")
        return ""

## Run Tests

In [None]:
## Set your ESM token
os.environ['ESM_TOKEN'] = ""

In [11]:
for idx, row in test_cases_df.iterrows():
    antigen_seqs = row['antigen_seqs']
    h_chain_seq = row['Generated_001'].split('|')[0]
    l_chain_seq = row['Generated_001'].split('|')[1]
    ## Test numbering
    test_cases_df.at[idx, 'test_h_chain_numbering'] = test_numbering(seq=h_chain_seq, chain_type="heavy")
    test_cases_df.at[idx, 'test_l_chain_numbering'] = test_numbering(seq=l_chain_seq, chain_type="light")
    ## Fold the sequences
    combined_seq = f"{h_chain_seq}|{l_chain_seq}|{antigen_seqs}"
    pdb_str = fold_sequence(combined_seq, model_name="esm3-medium-multimer-2024-09", token=os.getenv("ESM_TOKEN"))
    ## Save the PDB string
    test_cases_df.at[idx, 'pdb_str'] = pdb_str


Error folding sequence QVQLVESGGGVVQPGRSLRLSCAASGFTFSNYGMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRADDTAVYYCARGDKGYSDYWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPK|DIQMTQSPSSLSASVGDRVTITCRASQSISTHLNWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPWTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGE|NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKCYGVSPTKLNDLCFTNVYADSFVIRGNEVSQIAPGQTGNIADYNYKLPDDFTGCVIAWNSNKLDSKPSGNYNYLYRLLRKSKLKPFERDISTEIYQAGNKPCNGVAGPNCYSPLQSYGFRPTYGVGHQPYRVVVLSFELLHAPATVCGP: 'ESMProteinError' object has no attribute 'sequence'


In [13]:
test_cases_df

Unnamed: 0,pdb_id,source,h_chain_id,l_chain_id,antigen_ids,h_chain_seq,l_chain_seq,antigen_seqs,antibody_sequences,highlighted_epitope_sequences,Generated_001,Generated_002,Generated_003,test_h_chain_numbering,test_l_chain_numbering,pdb_str
0,ma-ccy4e,https://www.modelarchive.org/doi/10.5452/ma-ccy4e,H,L,Y,DVQLQESGPSLVKPSQTLSLTCSVTGDSITSDYWSWIRKFPGNRLE...,DIVLTQSPATLSVTPGNSVSLSCRASQSIGNNLHWYQQKSHESPRL...,KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRN...,DVQLQESGPSLVKPSQTLSLTCSVTGDSITSDYWSWIRKFPGNRLE...,KVFGRCELAAAM[K][R]HGL[D][N][Y]RG[Y][S]LG[N]WVC...,QVQLQQPGAELVKPGASVKMSCKASGYTFTSYWMHWIKQRPGQGLE...,VQLLESGAEVKKPGASVKVSCKASGYTFISYYMNWVRQAPGQRLEW...,EVQLVESGGGLVKPGGSLKLSCAASGFAFTSYDMSWVRQTPEKRLE...,PASS,PASS,ATOM 1 N GLN H 1 -4.677 -21.887...
1,ma-dpr9i,https://www.modelarchive.org/doi/10.5452/ma-dpr9i,H,L,A,DVQLQESGPSLVKPSQTLSLTCSVTGDSITSDYWSWIRKFPGNRLE...,DIVLTQSPATLSVTPGNSVSLSCRASQSIGNNLHWYQQKSHESPRL...,NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKC...,DVQLQESGPSLVKPSQTLSLTCSVTGDSITSDYWSWIRKFPGNRLE...,NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKC...,QVQLVESGGGVVQPGRSLRLSCAASGFTFSNYGMHWVRQAPGKGLE...,EVQLVESGGGLIQPGGSLRLSCAASEFIVSRNYMSWVRQAPGKGLE...,QMQLVESGGGVVQPGRSLRLSCAASAFTFSNIHWMSWVRQPPGKGL...,PASS,PASS,


## Visualize

In [16]:
import py3Dmol

In [19]:
pdb_data = test_cases_df['pdb_str'].dropna().tolist()[0]
pdb_data

'ATOM      1  N   GLN H   1      -4.677 -21.887   3.860  1.00  0.82           N  \nATOM      2  CA  GLN H   1      -3.406 -21.250   4.188  1.00  0.82           C  \nATOM      3  C   GLN H   1      -2.888 -20.420   3.016  1.00  0.82           C  \nATOM      4  CB  GLN H   1      -2.367 -22.298   4.588  1.00  0.82           C  \nATOM      5  O   GLN H   1      -2.927 -20.867   1.868  1.00  0.82           O  \nATOM      6  CG  GLN H   1      -1.828 -22.121   6.001  1.00  0.82           C  \nATOM      7  CD  GLN H   1      -1.515 -23.441   6.680  1.00  0.82           C  \nATOM      8  NE2 GLN H   1      -0.388 -23.495   7.381  1.00  0.82           N  \nATOM      9  OE1 GLN H   1      -2.281 -24.404   6.577  1.00  0.82           O  \nATOM     10  N   VAL H   2      -2.365 -19.365   3.237  1.00  0.84           N  \nATOM     11  CA  VAL H   2      -1.727 -18.500   2.250  1.00  0.84           C  \nATOM     12  C   VAL H   2      -0.407 -19.120   1.796  1.00  0.84           C  \nATOM     13  CB

In [20]:
with open('test_case.pdb', 'w') as f:
    f.write(pdb_data)

In [None]:
view = py3Dmol.view()
view.addModel(pdb_data)
view.setStyle( {'chain':'A'}, { 'cartoon': {'color': '#005035' }})
view.setStyle( {'chain':'B'}, { 'cartoon': {'color': '#A49665'}})
view.zoomTo()

<py3Dmol.view at 0x16f4c6450>