# Run Evaluation Tests on Candidates

In [1]:
import pandas as pd
import os, re, subprocess, json

In [2]:
## Read in candidates data
candidates_df = pd.read_excel('../candidates.xlsx', sheet_name='Antibody Candidates')
antigens_df = pd.read_excel('../candidates.xlsx', sheet_name='Antigens')

## Get the antigen sequence for Nipah Glycoprotein G
antigen_seq = antigens_df.loc[antigens_df['antigen_id'] == 'nipah_gpG', 'antigen_sequence'].values[0]
print(f"Using Antigen Sequence: {antigen_seq}")

Using Antigen Sequence: ICLQKTSNQILKPKLISYTLPVVGQSGTCITDPLLAMDEGYFAYSHLERIGSCSRGVSKQRIIGVGEVLDRGDEVPSLFMTNVWTPPNPNTVYHCSAVYNNEFYYVLCAVSTVGDPILNSTYWSGSLMMTRLAVKPKSNGGGYNQHQLALRSIEKGRYDKVMPYGPSGIKQGDTLYFPAVGFLVRTEFKYNDSNCPITKCQYSKPENCRLSMGIRPNSHYILRSGLLKYNLSDGENPKVVFIEISDQRLSIGSPSKIYDSLGQPVFYQASFSWDTMIKFGDVLTVNPLVVNWRNNTVISRPGQSQCPRFNTCPEICWEGVYNDAFLIDRINWISAGVFLDSNQTAENPVFTVFKDNEILYRAQLASEDTNAQKTITNCFLLKNKIWCISLVEIYDTGDNVIRPKLFAVKIPEQCTH


## iPSAE

In [3]:
## Loop through each candidate and run the ipsae script
scores_df = pd.DataFrame(
    columns=[
        'antibody_id',
        'average_pae',
        'average_pde',
        'average_plddt',
        'ipsae_score',
        'pae_score',
        'pde_score',
        'plddt_score'
        ]
    )

for idx, row in candidates_df.iterrows():
    antibody_id = row['antibody_id']
    print(f"Scoring Boltz-2 outputs for Candidate: {antibody_id}")
    prediction_dir = f'../data/candidates/structures_boltz2_frankenchain/boltz_results_{antibody_id}/predictions/{antibody_id}/'
    stem_name = f'{antibody_id}_model_0'

    ## Check if prediction directory exists
    if not os.path.exists(prediction_dir):
        print(f"Prediction directory not found for {antibody_id}, skipping...")
        continue

    command = f'python helper_scripts/boltz_ipsae_score.py {prediction_dir} {stem_name}'
    # output = os.system(command)

    output = subprocess.run(command.split(' '), capture_output=True, text=True)
    # print(output.stdout)

    score_dict = json.loads(output.stdout.replace('\'','\"'))
    score_df_row = pd.DataFrame([score_dict])
    score_df_row['antibody_id'] = antibody_id

    scores_df = pd.concat([scores_df, score_df_row], ignore_index=True)

Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-001
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-002


  scores_df = pd.concat([scores_df, score_df_row], ignore_index=True)


Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-003
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-004
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-005
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-006
Prediction directory not found for sbio-nipahgpg-006, skipping...
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-007
Prediction directory not found for sbio-nipahgpg-007, skipping...
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-008
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-009
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-010
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-011
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-012
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-013
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-014
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-015
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-016
Scoring Boltz-2 outputs for Candidate: sbio-nipahgpg-017
Scoring Boltz

In [4]:
scores_df

Unnamed: 0,antibody_id,average_pae,average_pde,average_plddt,ipsae_score,pae_score,pde_score,plddt_score
0,sbio-nipahgpg-001,7.79013,2.350078,0.903231,0.575341,0.740329,0.921664,0.009032
1,sbio-nipahgpg-002,13.333527,4.954325,0.898034,0.475371,0.555549,0.834856,0.00898
2,sbio-nipahgpg-003,14.439913,5.500241,0.863144,0.455055,0.51867,0.816659,0.008631
3,sbio-nipahgpg-004,11.430137,3.962842,0.897653,0.510663,0.618995,0.867905,0.008977
4,sbio-nipahgpg-005,13.146399,4.797989,0.899243,0.479433,0.561787,0.840067,0.008992
5,sbio-nipahgpg-008,12.917172,4.952446,0.909328,0.480975,0.569428,0.834918,0.009093
6,sbio-nipahgpg-009,11.204225,3.741708,0.894594,0.515877,0.626526,0.875276,0.008946
7,sbio-nipahgpg-010,14.631048,6.022535,0.881599,0.447339,0.512298,0.799249,0.008816
8,sbio-nipahgpg-011,9.604836,3.514567,0.903587,0.539501,0.679839,0.882848,0.009036
9,sbio-nipahgpg-012,13.131809,4.763036,0.895497,0.479965,0.562273,0.841232,0.008955


## Generate PDBs from CIFs

In [6]:
## Convert .CIF files to .PDB format
from Bio import PDB
import os

def convert_cif_to_pdb(cif_file_path):
    """
    Convert a .cif (mmCIF) protein structure file to .pdb format.

    Parameters
    ----------
    cif_file_path : str
        Path to the input .cif file.

    Returns
    -------
    str
        Path to the generated .pdb file.
    """
    ## Validate file extension
    if not cif_file_path.lower().endswith(".cif"):
        raise ValueError("Input file must have a .cif extension.")

    ## Determine output file path
    pdb_file_path = os.path.splitext(cif_file_path)[0] + ".pdb"

    ## Initialize parser and writer
    parser = PDB.MMCIFParser(QUIET=True)
    io = PDB.PDBIO()

    ## Extract structure ID from filename
    structure_id = os.path.basename(cif_file_path).split('.')[0]

    ## Parse structure
    structure = parser.get_structure(structure_id, cif_file_path)

    ## Write to PDB format
    io.set_structure(structure)
    io.save(pdb_file_path)

    print(f"Converted: {cif_file_path} → {pdb_file_path}")
    return pdb_file_path


In [9]:
for idx, row in candidates_df.iterrows():
    antibody_id = row['antibody_id']
    print(f"Making a PDB file for Candidate: {antibody_id}")
    structure_file_path = f'../data/candidates/structures_boltz2/boltz_results_{antibody_id}/predictions/{antibody_id}/{antibody_id}_model_0.cif'

    ## Check if prediction directory exists
    if not os.path.exists(structure_file_path):
        print(f"Prediction directory not found for {antibody_id}, skipping...")
        continue

    pdb_file_path = convert_cif_to_pdb(structure_file_path)

Making a PDB file for Candidate: sbio-nipahgpg-001
Converted: ../data/candidates/structures_boltz2/boltz_results_sbio-nipahgpg-001/predictions/sbio-nipahgpg-001/sbio-nipahgpg-001_model_0.cif → ../data/candidates/structures_boltz2/boltz_results_sbio-nipahgpg-001/predictions/sbio-nipahgpg-001/sbio-nipahgpg-001_model_0.pdb
Making a PDB file for Candidate: sbio-nipahgpg-002
Converted: ../data/candidates/structures_boltz2/boltz_results_sbio-nipahgpg-002/predictions/sbio-nipahgpg-002/sbio-nipahgpg-002_model_0.cif → ../data/candidates/structures_boltz2/boltz_results_sbio-nipahgpg-002/predictions/sbio-nipahgpg-002/sbio-nipahgpg-002_model_0.pdb
Making a PDB file for Candidate: sbio-nipahgpg-003
Converted: ../data/candidates/structures_boltz2/boltz_results_sbio-nipahgpg-003/predictions/sbio-nipahgpg-003/sbio-nipahgpg-003_model_0.cif → ../data/candidates/structures_boltz2/boltz_results_sbio-nipahgpg-003/predictions/sbio-nipahgpg-003/sbio-nipahgpg-003_model_0.pdb
Making a PDB file for Candidate: s

## HADDOCK3

In [10]:
## Define HADDOCK3 Scoring Function
def haddock3_score(pdb_path:str) -> dict:

  try:
    ## Run haddock3-score CLI
    command = ["haddock3-score", "--full", pdb_path]
    sp_result = subprocess.run(command, capture_output=True, text=True, check=True)

    ## Parse result
    metrics = {}

    ## Extract HADDOCK score
    match = re.search(r"HADDOCK-score \(emscoring\) = ([\-\d\.]+)", sp_result.stdout)
    if match:
        metrics["score"] = float(match.group(1))

    ## Extract individual energy terms
    matches = re.findall(r"(\w+)=([\-\d\.]+)", sp_result.stdout)
    for key, value in matches:
        metrics[key] = float(value)

    ## Calculate total score
    metrics["total"] = metrics["vdw"] + metrics["elec"]

    ## Remove air
    del metrics["air"]

    return metrics

  except subprocess.CalledProcessError as e:
    print("HADDOCK3 Error occurred:", e.stderr)
    return {}

In [None]:
## Loop through each candidate and run the ipsae script
haddock_df = pd.DataFrame(
    columns=[
        'antibody_id',
        ]
    )

for idx, row in candidates_df.iterrows():
    antibody_id = row['antibody_id']
    print(f"Running HADDOCK for Candidate: {antibody_id}")
    pdb_path = os.path.abspath(f'../data/candidates/structures_boltz2/boltz_results_{antibody_id}/predictions/{antibody_id}/{antibody_id}_model_0.pdb')

    ## Check if prediction directory exists
    if not os.path.exists(pdb_path):
        print(f"Prediction directory not found for {antibody_id}, skipping...")
        continue

    print(f"Scoring PDB file at: {pdb_path}")

    haddock_dict = haddock3_score(pdb_path = pdb_path)
    # haddock_df_row = pd.DataFrame([haddock_dict])
    # haddock_df_row['antibody_id'] = antibody_id
    print(haddock_dict)

    # haddock_df = pd.concat([haddock_df, haddock_df_row], ignore_index=True)
