# Run Evaluation Tests on Candidates

In [4]:
import pandas as pd
import os, re, subprocess, json

In [5]:
## Read in candidates data
## Both sheets live in the same workbook, so parse the file once; passing a list
## of sheet names makes read_excel return a dict of DataFrames keyed by sheet name.
candidates_xlsx_path = '../candidates.xlsx'
workbook_sheets = pd.read_excel(
    candidates_xlsx_path,
    sheet_name=['Antibody Candidates', 'Antigens'],
)
candidates_df = workbook_sheets['Antibody Candidates']
antigens_df = workbook_sheets['Antigens']

## Get the antigen sequence for Nipah Glycoprotein G
target_antigen_id = 'nipah_gpG'
antigen_matches = antigens_df.loc[antigens_df['antigen_id'] == target_antigen_id, 'antigen_sequence']

## Fail with a readable message instead of an IndexError when the id is absent
if antigen_matches.empty:
    raise ValueError(
        f"Antigen id '{target_antigen_id}' not found in sheet 'Antigens' of {candidates_xlsx_path}"
    )

antigen_seq = antigen_matches.iloc[0]
print(f"Using Antigen Sequence: {antigen_seq}")

Using Antigen Sequence: ICLQKTSNQILKPKLISYTLPVVGQSGTCITDPLLAMDEGYFAYSHLERIGSCSRGVSKQRIIGVGEVLDRGDEVPSLFMTNVWTPPNPNTVYHCSAVYNNEFYYVLCAVSTVGDPILNSTYWSGSLMMTRLAVKPKSNGGGYNQHQLALRSIEKGRYDKVMPYGPSGIKQGDTLYFPAVGFLVRTEFKYNDSNCPITKCQYSKPENCRLSMGIRPNSHYILRSGLLKYNLSDGENPKVVFIEISDQRLSIGSPSKIYDSLGQPVFYQASFSWDTMIKFGDVLTVNPLVVNWRNNTVISRPGQSQCPRFNTCPEICWEGVYNDAFLIDRINWISAGVFLDSNQTAENPVFTVFKDNEILYRAQLASEDTNAQKTITNCFLLKNKIWCISLVEIYDTGDNVIRPKLFAVKIPEQCTH


## iPSAE

In [None]:
## Loop through each candidate and run the ipsae script
## Expected score columns, in the order they should appear in scores_df
score_columns = [
    'antibody_id',
    'average_pae',
    'average_pde',
    'average_plddt',
    'ipsae_score',
    'pae_score',
    'pde_score',
    'plddt_score'
    ]

## Collect one record per candidate and build the DataFrame once after the loop.
## Growing a frame with pd.concat on every iteration is quadratic, and
## concatenating onto an empty frame is deprecated in recent pandas releases.
score_records = []

for antibody_id in candidates_df['antibody_id']:
    print(f"Scoring Boltz-2 outputs for Candidate: {antibody_id}")
    prediction_dir = f'../data/candidates/structures_boltz2/boltz_results_{antibody_id}/predictions/{antibody_id}/'
    stem_name = f'{antibody_id}_model_0'

    ## Check if prediction directory exists
    if not os.path.exists(prediction_dir):
        print(f"Prediction directory not found for {antibody_id}, skipping...")
        continue

    ## Pass the arguments as a list so a path containing spaces is not split apart
    command = ['python', 'helper_scripts/boltz_ipsae_score.py', prediction_dir, stem_name]
    output = subprocess.run(command, capture_output=True, text=True)

    ## A failed run leaves nothing parseable on stdout; report stderr and move on
    if output.returncode != 0:
        print(f"ipSAE scoring failed for {antibody_id}, skipping...\n{output.stderr}")
        continue

    ## Single quotes are swapped for double quotes before JSON parsing
    ## (presumably the helper prints a Python dict repr -- TODO confirm against the script)
    score_dict = json.loads(output.stdout.replace('\'','\"'))
    score_records.append({**score_dict, 'antibody_id': antibody_id})

## Build the frame in one go; expected columns come first, any extra keys
## returned by the helper are kept after them.
scores_df = pd.DataFrame(score_records)
extra_columns = [col for col in scores_df.columns if col not in score_columns]
scores_df = scores_df.reindex(columns=score_columns + extra_columns)

In [24]:
## Display the per-candidate scores collected by the scoring loop
scores_df

Unnamed: 0,antibody_id,average_pae,average_pde,average_plddt,ipsae_score,pae_score,pde_score,plddt_score
0,sbio-nipahgpg-001,12.403783,4.492502,0.923735,0.492462,0.586541,0.85025,0.009237
1,sbio-nipahgpg-002,13.11187,5.195256,0.922231,0.475989,0.562938,0.826825,0.009222
2,sbio-nipahgpg-003,5.68208,1.471314,0.904497,0.612239,0.810597,0.950956,0.009045
3,sbio-nipahgpg-004,6.712657,1.952444,0.911649,0.593708,0.776245,0.934919,0.009116
4,sbio-nipahgpg-005,13.62081,5.730904,0.914657,0.463824,0.545973,0.80897,0.009147
5,sbio-nipahgpg-008,13.464549,5.305217,0.930099,0.470211,0.551182,0.823159,0.009301
6,sbio-nipahgpg-009,10.421204,3.52936,0.919072,0.528514,0.652627,0.882355,0.009191
7,sbio-nipahgpg-010,13.818919,5.666187,0.903362,0.461796,0.539369,0.811127,0.009034
8,sbio-nipahgpg-011,13.512441,5.077036,0.918171,0.471818,0.549585,0.830765,0.009182
9,sbio-nipahgpg-012,9.039774,2.962634,0.912894,0.552582,0.698674,0.901246,0.009129


## Generate PDBs from CIFs

In [6]:
## Convert .CIF files to .PDB format
from Bio import PDB
import os

def convert_cif_to_pdb(cif_file_path):
    """
    Read an mmCIF structure file and write it back out in PDB format.

    The output file is written next to the input, with the extension
    replaced by ``.pdb``.

    Parameters
    ----------
    cif_file_path : str
        Path to the input .cif file.

    Returns
    -------
    str
        Path to the generated .pdb file.

    Raises
    ------
    ValueError
        If ``cif_file_path`` does not end in ``.cif`` (case-insensitive).
    """
    ## Reject anything that is not a .cif file
    has_cif_suffix = cif_file_path.lower().endswith(".cif")
    if not has_cif_suffix:
        raise ValueError("Input file must have a .cif extension.")

    ## Output path: same location and stem, .pdb extension
    path_stem, _extension = os.path.splitext(cif_file_path)
    pdb_file_path = f"{path_stem}.pdb"

    ## Structure ID is the file name up to its first dot
    file_name = os.path.basename(cif_file_path)
    structure_id = file_name.partition('.')[0]

    ## Parse the mmCIF file
    cif_parser = PDB.MMCIFParser(QUIET=True)
    structure = cif_parser.get_structure(structure_id, cif_file_path)

    ## Write the parsed structure out in PDB format
    pdb_writer = PDB.PDBIO()
    pdb_writer.set_structure(structure)
    pdb_writer.save(pdb_file_path)

    print(f"Converted: {cif_file_path} → {pdb_file_path}")
    return pdb_file_path


In [9]:
## Convert each candidate's predicted structure (model 0) from .cif to .pdb
for antibody_id in candidates_df['antibody_id']:
    print(f"Making a PDB file for Candidate: {antibody_id}")
    structure_file_path = f'../data/candidates/structures_boltz2/boltz_results_{antibody_id}/predictions/{antibody_id}/{antibody_id}_model_0.cif'

    ## Check if the predicted structure file exists (the check is on the .cif
    ## file itself, so the message names the file rather than the directory)
    if not os.path.exists(structure_file_path):
        print(f"Structure file not found for {antibody_id}, skipping...")
        continue

    pdb_file_path = convert_cif_to_pdb(structure_file_path)

Making a PDB file for Candidate: sbio-nipahgpg-001
Converted: ../data/candidates/structures_boltz2/boltz_results_sbio-nipahgpg-001/predictions/sbio-nipahgpg-001/sbio-nipahgpg-001_model_0.cif → ../data/candidates/structures_boltz2/boltz_results_sbio-nipahgpg-001/predictions/sbio-nipahgpg-001/sbio-nipahgpg-001_model_0.pdb
Making a PDB file for Candidate: sbio-nipahgpg-002
Converted: ../data/candidates/structures_boltz2/boltz_results_sbio-nipahgpg-002/predictions/sbio-nipahgpg-002/sbio-nipahgpg-002_model_0.cif → ../data/candidates/structures_boltz2/boltz_results_sbio-nipahgpg-002/predictions/sbio-nipahgpg-002/sbio-nipahgpg-002_model_0.pdb
Making a PDB file for Candidate: sbio-nipahgpg-003
Converted: ../data/candidates/structures_boltz2/boltz_results_sbio-nipahgpg-003/predictions/sbio-nipahgpg-003/sbio-nipahgpg-003_model_0.cif → ../data/candidates/structures_boltz2/boltz_results_sbio-nipahgpg-003/predictions/sbio-nipahgpg-003/sbio-nipahgpg-003_model_0.pdb
Making a PDB file for Candidate: s

## HADDOCK3

In [10]:
## Define HADDOCK3 Scoring Function
def haddock3_score(pdb_path:str) -> dict:
    """
    Score a structure with the ``haddock3-score`` CLI and parse its output.

    Runs ``haddock3-score --full <pdb_path>`` and extracts, from stdout:

    * ``score`` -- the value following ``HADDOCK-score (emscoring) =``
    * every ``name=value`` pair whose value is a number (e.g. ``vdw``, ``elec``)
    * ``total`` -- ``vdw + elec``, added only when both terms were found

    The ``air`` term is dropped from the result when present.

    Parameters
    ----------
    pdb_path : str
        Path to the PDB file to score.

    Returns
    -------
    dict
        Mapping of metric name to float. Empty when the CLI is missing,
        exits with a non-zero status, or prints nothing recognisable.
    """
    ## Run haddock3-score CLI
    command = ["haddock3-score", "--full", pdb_path]
    try:
        sp_result = subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print("HADDOCK3 Error occurred:", e.stderr)
        return {}
    except FileNotFoundError as e:
        ## Executable not on PATH: same best-effort contract as a failed run
        print("HADDOCK3 Error occurred:", e)
        return {}

    ## Parse result
    metrics = {}

    ## Only well-formed numbers are captured (optional sign, decimal point and
    ## exponent), so float() cannot fail on fragments such as "-" or "1.2.3".
    number = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

    ## Extract HADDOCK score
    match = re.search(rf"HADDOCK-score \(emscoring\) = ({number})", sp_result.stdout)
    if match:
        metrics["score"] = float(match.group(1))

    ## Extract individual energy terms
    for key, value in re.findall(rf"(\w+)=({number})", sp_result.stdout):
        metrics[key] = float(value)

    ## Calculate total score; skip it rather than raise KeyError if a term is missing
    if "vdw" in metrics and "elec" in metrics:
        metrics["total"] = metrics["vdw"] + metrics["elec"]
    else:
        print("HADDOCK3 output is missing 'vdw' and/or 'elec'; 'total' not computed")

    ## Remove air (tolerate its absence)
    metrics.pop("air", None)

    return metrics

In [None]:
## Loop through each candidate and run HADDOCK3 scoring on its PDB file
## One record per scored candidate; the DataFrame is built once after the loop
haddock_records = []

for antibody_id in candidates_df['antibody_id']:
    print(f"Running HADDOCK for Candidate: {antibody_id}")
    pdb_path = os.path.abspath(f'../data/candidates/structures_boltz2/boltz_results_{antibody_id}/predictions/{antibody_id}/{antibody_id}_model_0.pdb')

    ## Check if the PDB file exists (the check is on the file, so the message names it)
    if not os.path.exists(pdb_path):
        print(f"PDB file not found for {antibody_id}, skipping...")
        continue

    print(f"Scoring PDB file at: {pdb_path}")

    haddock_dict = haddock3_score(pdb_path = pdb_path)
    print(haddock_dict)

    ## Keep the metrics so they end up in haddock_df instead of only being printed
    haddock_records.append({'antibody_id': antibody_id, **haddock_dict})

## Fall back to an empty frame with just the id column when nothing was scored
if haddock_records:
    haddock_df = pd.DataFrame(haddock_records)
else:
    haddock_df = pd.DataFrame(columns=['antibody_id'])
