# Post-Analysis of Generated Antibody Sequences

In [8]:
import pandas as pd
from abnumber import Chain
import os
from esm.sdk import client
from esm.sdk.api import ESMProtein, GenerationConfig
from esm.utils.structure.protein_chain import ProteinChain

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the test-case table from the 'test_cases' sheet of the local workbook.
# NOTE(review): the rendered output of this cell shows an 'Unnamed: 0' column,
# which looks like a saved index read back as data; consider index_col=0 if
# nothing downstream relies on that column -- confirm before changing.
test_cases_df = pd.read_excel('test_cases.xlsx', sheet_name='test_cases')
# Bare last expression so the notebook renders the frame as a rich table.
test_cases_df

Unnamed: 0,pdb_id,source,h_chain_id,l_chain_id,antigen_ids,h_chain_seq,l_chain_seq,antigen_seqs,antibody_sequences,highlighted_epitope_sequences,Generated_001,Generated_002,Generated_003
0,ma-ccy4e,https://www.modelarchive.org/doi/10.5452/ma-ccy4e,H,L,Y,DVQLQESGPSLVKPSQTLSLTCSVTGDSITSDYWSWIRKFPGNRLE...,DIVLTQSPATLSVTPGNSVSLSCRASQSIGNNLHWYQQKSHESPRL...,KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRN...,DVQLQESGPSLVKPSQTLSLTCSVTGDSITSDYWSWIRKFPGNRLE...,KVFGRCELAAAM[K][R]HGL[D][N][Y]RG[Y][S]LG[N]WVC...,QVQLQQPGAELVKPGASVKMSCKASGYTFTSYWMHWIKQRPGQGLE...,VQLLESGAEVKKPGASVKVSCKASGYTFISYYMNWVRQAPGQRLEW...,EVQLVESGGGLVKPGGSLKLSCAASGFAFTSYDMSWVRQTPEKRLE...
1,ma-dpr9i,https://www.modelarchive.org/doi/10.5452/ma-dpr9i,H,L,A,DVQLQESGPSLVKPSQTLSLTCSVTGDSITSDYWSWIRKFPGNRLE...,DIVLTQSPATLSVTPGNSVSLSCRASQSIGNNLHWYQQKSHESPRL...,NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKC...,DVQLQESGPSLVKPSQTLSLTCSVTGDSITSDYWSWIRKFPGNRLE...,NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKC...,QVQLVESGGGVVQPGRSLRLSCAASGFTFSNYGMHWVRQAPGKGLE...,EVQLVESGGGLIQPGGSLRLSCAASEFIVSRNYMSWVRQAPGKGLE...,QMQLVESGGGVVQPGRSLRLSCAASAFTFSNIHWMSWVRQPPGKGL...


In [6]:
def test_numbering(seq:str="") -> bool:
    """Report whether ``seq`` can be numbered with the Chothia scheme.

    The check simply attempts to construct ``Chain(seq, scheme="chothia")``;
    the resulting object is discarded.

    Args:
        seq: Amino-acid sequence to check.

    Returns:
        True when the ``Chain`` constructor succeeds. If it raises any
        ``Exception``, the error is printed together with the sequence and
        False is returned instead.
    """
    try:
        # Only success or failure of the constructor matters here.
        Chain(seq, scheme="chothia")
    except Exception as e:
        print(f"Error with sequence {seq}: {e}")
        return False
    return True

In [None]:
def fix_multichain_pdb_str(pdb_str: str, chain_a_end_index: int) -> str:
    """Relabel the chains of a PDB string as "H"/"L" and drop "UNK" residues.

    At the moment there is a bug where all the chains are written out as
    "Chain A". This function rewrites the chain identifier of every ATOM /
    HETATM record: records whose residue number is greater than
    ``chain_a_end_index`` become chain "L", all others become chain "H".
    Records whose residue name is "UNK" are removed from the output entirely
    (no blank line is left behind). All other lines are passed through
    unchanged.

    Adapted from: https://colab.research.google.com/gist/thomas-a-neil/720683f97de624bc6822bf6e9629e298/forward_fold_multimer_dec_2024.ipynb#scrollTo=HDPnd-5xnAo2
    By Neil Thomas

    Args:
        pdb_str: PDB-formatted text.
        chain_a_end_index: Highest residue number that is still assigned to
            chain "H"; larger residue numbers are assigned to chain "L".

    Returns:
        The rewritten PDB text, lines joined with "\\n" (no trailing newline).

    Raises:
        ValueError: If a kept ATOM/HETATM record does not have an integer in
            the residue-number field (string positions 22-25).
    """
    fixed_lines = []
    for line in pdb_str.splitlines():
        if line.startswith(("ATOM", "HETATM")):
            # Residue name lives at string positions 17-19.
            if line[17:20] == "UNK":
                # Drop the record instead of emitting an empty line, which
                # would leave blank lines in the middle of the PDB text.
                continue
            # Residue number lives at string positions 22-25.
            residue_index = int(line[22:26].strip())
            chain_id = "L" if residue_index > chain_a_end_index else "H"
            # Position 21 is the one-character chain identifier.
            line = line[:21] + chain_id + line[22:]
        fixed_lines.append(line)
    return "\n".join(fixed_lines)

def fold_sequence(sequence:str, model_name:str, token:str) -> str:
    """Predict a structure for a "|"-separated two-chain sequence and return it as PDB text.

    Spaces and newlines are stripped from ``sequence``, the structure track is
    generated with the remote model, and the resulting PDB string is
    post-processed by ``fix_multichain_pdb_str`` so that residues before the
    "|" separator are labelled chain "H", residues after it chain "L", and
    "UNK" records are removed.

    Args:
        sequence: Amino-acid sequence with the two chains separated by "|".
        model_name: Model identifier passed to the ESM ``client``.
        token: API token passed to the ESM ``client``.

    Returns:
        The chain-relabelled PDB string of the predicted structure.

    Raises:
        ValueError: If the cleaned sequence contains no "|" chain separator.
        RuntimeError: If the model does not return an ``ESMProtein``.
    """
    ## Prepare the sequence
    sequence = sequence.replace(" ", "").replace("\n", "")
    # Fail fast, before the remote call: the chain split below needs the "|"
    # separator, and str.index would otherwise raise an unhelpful
    # "substring not found" ValueError only after the prediction has run.
    if "|" not in sequence:
        raise ValueError(
            'sequence must contain a "|" separating the heavy and light chains'
        )
    ## Load the ESM model
    model = client(model=model_name, url="https://forge.evolutionaryscale.ai", token=token)
    ## Generate the structure
    structure_prediction = model.generate(
        ESMProtein(sequence=sequence),
        GenerationConfig(
            # max(1, ...) keeps num_steps positive for very short inputs.
            track="structure", num_steps=max(1, len(sequence) // 4), temperature=0
        ),
    )
    # NOTE(review): the SDK appears to return an error object rather than
    # raising when generation fails -- confirm against the ESM SDK docs.
    if not isinstance(structure_prediction, ESMProtein):
        raise RuntimeError(f"Structure prediction failed: {structure_prediction}")
    ## Fix the PDB string to have L/H chain identifiers and remove "UNK" residues
    chain_a_end_index = structure_prediction.sequence.index("|")
    return fix_multichain_pdb_str(structure_prediction.to_pdb_string(), chain_a_end_index)