# Post Analyses of Generated Antibody Sequences

In [5]:
import pandas as pd
from abnumber import Chain
import os
from esm.sdk import client
from esm.sdk.api import ESMProtein, GenerationConfig
from esm.utils.structure.protein_chain import ProteinChain

In [6]:
test_antigens_df = pd.read_excel('test_cases.xlsx', sheet_name='test_cases')
test_antigens_df.head()

Unnamed: 0,antigen_id,pdb_id,source,antigen_source,antigen_name,antigen_ids,antigen_seqs,highlighted_epitope_seqs
0,BHRF1,2wh6,https://www.rcsb.org/structure/2WH6,virus,Epstein-Barr virus strain ag876 BHRF1,A,MGSHHHHHHSQDPMAYSTREILLALCIRDSRVHGNGTLHPVLELAA...,AYSTREILLALCIRDSRVHGNGTLHPVLELAARETPLRLSPEDTVV...
1,EGFR,8hgo,https://www.rcsb.org/structure/8HGO,human,EGFR/HER2 ectodomain,B,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,QVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLELTYLPTNA...
2,IL-7Ra,3di3,https://www.rcsb.org/structure/3DI3,human,glycosylated human interleukin-7 receptor alph...,A,MGDCDIEGKDGKQYESVLMVSIDQLLDSMKEIGSNCLNNEFNFFKR...,KDG[K][Q]YE[S][V][L]M[V]SID[Q]LLDSMKEIGSNCLNNE...
3,MBP,1peb (1nl5),https://www.rcsb.org/structure/1PEB,bacteria,maltose,A,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...,KIEEGKLVIWING[D][K]GYNGLAEVGKKFEKDTGIKVTVEHPDK...
4,PD-L1,4z18,https://www.rcsb.org/structure/4Z18,human,programmed cell death ligand 1,A,MFTVTVPKDLYVVEYGSNMTIECKFPVEKQLDLAALIVYWEMEDKN...,[M]F[T]V[T]VPKDLYVVEYGSNMTIECKF[P][V]EKQLDLAAL...


In [7]:
generated_seqs_df = pd.read_excel('test_cases.xlsx', sheet_name='generated_seqs')
generated_seqs_df.head()

Unnamed: 0,antigen_id,model,seq_id,h_chain,l_chain
0,ma-ccy4e,peleke-phi-4,peleke-phi4_ma-ccy4e_01,QVQLQQPGAELVKPGASVKMSCKASGYTFTSYWMHWIKQRPGQGLE...,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPG...
1,ma-ccy4e,peleke-phi-4,peleke-phi4_ma-ccy4e_02,VQLLESGAEVKKPGASVKVSCKASGYTFISYYMNWVRQAPGQRLEW...,DIVLTQSPDSLAVSLGERATINCKSSQNNKNYLAWYQQKPGQPPKV...
2,ma-ccy4e,peleke-phi-4,peleke-phi4_ma-ccy4e_03,EVQLVESGGGLVKPGGSLKLSCAASGFAFTSYDMSWVRQTPEKRLE...,DIVMTQSPLSLPVTLGQPASISCRSSQDGNTFLHWYQQKPDGTVLL...
3,ma-dpr9i,peleke-phi-4,peleke-phi4_ma-dpr9i_01,QVQLVESGGGVVQPGRSLRLSCAASGFTFSNYGMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQSISTHLNWYQQKPGKAPKL...
4,ma-dpr9i,peleke-phi-4,peleke-phi4_ma-dpr9i_02,EVQLVESGGGLIQPGGSLRLSCAASEFIVSRNYMSWVRQAPGKGLE...,DIQMTQSPSSLSAAVGDRVTITCRASQSISTHLHWYQQKPGKAPKL...


In [8]:
## Join dataframes on test_case_id
test_cases_df = pd.merge(generated_seqs_df, test_antigens_df, on='antigen_id', suffixes=('_gen', '_test'))

test_cases_df.drop(columns=['model', 'pdb_id', 'source', 'antigen_name', 'antigen_source', 'antigen_ids', 'highlighted_epitope_seqs'], inplace=True)

## Filter to seq_id = 1
# test_cases_df = test_cases_df[test_cases_df['seq_id'] == 1].reset_index(drop=True)

test_cases_df.head()

Unnamed: 0,antigen_id,seq_id,h_chain,l_chain,antigen_seqs
0,PD-1,peleke-phi4_PD-1_0a,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYGISWVRQAPGQGLE...,DIQMTQSPSTLSASVGDRVTITCRASQSIGAWLAWYQQKPGKAPKL...,NPPTFSPALLVVTEGDNATFTCSFSNTSESFVLNWYRMSPSNQTDK...
1,BHRF1,peleke-phi4_BHRF1_0a,QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYFHWVRQAPGQGPE...,DIQMTQSPSSLSASVGDRVTITCRASQDISDYLNWYQQKPGKAPKL...,MGSHHHHHHSQDPMAYSTREILLALCIRDSRVHGNGTLHPVLELAA...
2,MBP,peleke-phi4_MBP_0a,VQLVQSGAEVKKPGSSVKVSCKASGGTFNSYAFSWVRQAPGQGLEW...,IVMTQSPLSLPVTLGQPASISCRSSQSLVHSNGNTYLEWYLQKPGQ...,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...
3,BHRF1,peleke-phi-4_BHRF1_01,QVQLVQSGAEVKKPGSSVKVSCKASGGTFNTGISWVRQAPGQGLEW...,DIVLTQSPATLSVTPGDSVSLSCRASQTISKNNLHWYQQKSHESPR...,MGSHHHHHHSQDPMAYSTREILLALCIRDSRVHGNGTLHPVLELAA...
4,BHRF1,peleke-phi-4_BHRF1_02,EVQLEESGGGLVQPGGSLRLSCAASGFNVVDFSLHWVRQAPGKGLE...,DIQMTQTTSSLSASLGDRVTISCRASQDISNYLNWYQQKPDGTVKL...,MGSHHHHHHSQDPMAYSTREILLALCIRDSRVHGNGTLHPVLELAA...


In [9]:
def test_numbering(seq:str="", chain_type:str="") -> str:
    try:
        chain = Chain(seq, scheme="chothia")
        if chain_type == "heavy":
            if not chain.is_heavy_chain():
                return 'FAIL'
            else:
                return 'PASS'
        elif chain_type == "light":
            if not chain.is_light_chain():
                return 'FAIL'
            else:
                return 'PASS'
        else:
            return 'PASS'
    except Exception as e:
        print(f"Error with sequence {seq}: {e}")
        return 'FAIL'

In [14]:
def fold_sequence(sequence:str, model_name:str, token:str, soc:bool) -> str:
    ## Load the ESM model
    model = client(model=model_name, url="https://forge.evolutionaryscale.ai", token=token)
    ## Prepare the sequence
    sequence = sequence.replace(" ", "").replace("\n", "")
    ## Generate the structure
    try:
        input = ESMProtein(sequence=sequence, potential_sequence_of_concern=soc)
        # input = ESMProtein(sequence=sequence)
        config = GenerationConfig(track="structure", num_steps=10, temperature=0.1)
        generation = model.generate(input, config)
        protein_complex = generation.to_protein_complex()
        pdb_str = protein_complex.to_pdb_string()
        return pdb_str
    except Exception as e:
        print(f"Error folding sequence {sequence}: {e}")
        return ""

## Run Tests

### Numbering Tests

In [None]:
for idx, row in test_cases_df.iterrows():
    antigen_seqs = row['antigen_seqs']
    h_chain_seq = row['h_chain']
    l_chain_seq = row['l_chain']
    ## Test numbering
    test_cases_df.at[idx, 'test_h_chain_numbering'] = test_numbering(seq=h_chain_seq, chain_type="heavy")
    test_cases_df.at[idx, 'test_l_chain_numbering'] = test_numbering(seq=l_chain_seq, chain_type="light")



### Structure Predictions

In [4]:
## Set your ESM token
os.environ['ESM_TOKEN'] = "3qnvcdQ7yVmBH7ImMRRNQb"

In [None]:
for idx, row in test_cases_df.iterrows():
    antigen_seqs = row['antigen_seqs']
    h_chain_seq = row['h_chain']
    l_chain_seq = row['l_chain']
    ## Fold the sequences
    combined_seq = f"{h_chain_seq}|{l_chain_seq}|{antigen_seqs}"

    ## Check if pdb_str already exists
    if 'pdb_str' in row and len(row['pdb_str']) > 0:
        print(f"Skipping folding for {row['seq_id']} as pdb_str already exists.")
        continue

    pdb_str = fold_sequence(combined_seq, model_name="esm3-medium-multimer-2024-09", token=os.getenv("ESM_TOKEN"), soc=False)
    ## Try again if the string is empty
    if pdb_str == "":
        fold_sequence(combined_seq, model_name="esm3-medium-multimer-2024-09", token=os.getenv("ESM_TOKEN"), soc=True)
    ## Save the PDB string
    test_cases_df.at[idx, 'pdb_str'] = pdb_str




Error folding sequence QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYFHWVRQAPGQGPEWVGWINGGNGDTSYAQKFQGRVTLTDDTSTSTAYMELSSLRSEDTAVYYCARETAYGWYFDYWGQGTLVTVSS|DIQMTQSPSSLSASVGDRVTITCRASQDISDYLNWYQQKPGKAPKLLIYAASRLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYNSYPLTFGQGTKVEIK|MGSHHHHHHSQDPMAYSTREILLALCIRDSRVHGNGTLHPVLELAARETPLRLSPEDTVVLRYHVLLEEIIERNSETFTETWNRFITHTEHVDLDFNSVFLEIFHRGDPSLGRALAWMAWCMHACRTLCCNQSTPYYVVDLSVRGMLEASEGLDGWIHQQGGWSTLIEDNIPG: 'ESMProteinError' object has no attribute 'to_protein_complex'
Error folding sequence QVQLVQSGAEVKKPGSSVKVSCKASGGTFNTGISWVRQAPGQGLEWMGGIIPIFTGKYAQKFQGRVTITADESTTTAYMELSSLRSEDTAVYYCARGGDYGDYWGQGTTVTVSS|DIVLTQSPATLSVTPGDSVSLSCRASQTISKNNLHWYQQKSHESPRLLIKYASQSISGIPSRFSGSGSGTDFTLSINSVETEDFGMYFCQQSNSWPYTFGGGTKLEIK|MGSHHHHHHSQDPMAYSTREILLALCIRDSRVHGNGTLHPVLELAARETPLRLSPEDTVVLRYHVLLEEIIERNSETFTETWNRFITHTEHVDLDFNSVFLEIFHRGDPSLGRALAWMAWCMHACRTLCCNQSTPYYVVDLSVRGMLEASEGLDGWIHQQGGWSTLIEDNIPG: 'ESMProteinError' object has no attribute 'to_protein_complex'
Error folding sequence EVQ

KeyboardInterrupt: 

In [None]:
## Show rows where pdb_str is not empty
test_struct_df = test_cases_df[test_cases_df['pdb_str'].str.len() > 0][['seq_id', 'pdb_str']]

In [None]:
## Write out the predicted complexes data frame
test_struct_df.to_csv("predicted_complexes.csv", index=False)

In [None]:
## Loop through the rows and write each PDB string to a file
predicted_complexes_dir = "./structures/predicted_complexes"

for idx, row in test_cases_df.iterrows():
    pdb_str = row['pdb_str']
    if len(pdb_str) > 0:
        with open(f"{predicted_complexes_dir}/predicted__{row['seq_id']}__esm3.pdb", "w") as f:
            f.write(pdb_str)

## Visualize

In [None]:
import py3Dmol

In [None]:
pdb_data = test_cases_df['pdb_str'].dropna().tolist()[0]
pdb_data

In [None]:
with open('test_case.pdb', 'w') as f:
    f.write(pdb_data)

In [None]:
view = py3Dmol.view()
view.addModel(pdb_data)
view.setStyle( {'chain':'A'}, { 'cartoon': {'color': '#005035' }})
view.setStyle( {'chain':'B'}, { 'cartoon': {'color': '#A49665'}})
view.zoomTo()