In [49]:
from pandaprot import PandaProt
from biopandas.pdb import PandasPdb
import os
import pandas as pd
from Bio.SeqUtils import seq1

In [50]:

# Table of antibody/antigen entries. Later cells read the columns pdb_id,
# h_chain_id, l_chain_id, antigen_ids and antigen_seqs from it.
sequences_df = pd.read_csv("sabdab_sequences.csv")
# Directory holding the structures; later cells look for <pdb_id>.pdb.gz in here.
base_path_to_pdbs = "./pdbs"

In [51]:
def get_epitope_residues_pandaprot(pdb_file, h_chain_id, l_chain_id, antigen_ids):
    """
    Collect antigen residues that PandaProt reports as interacting with the antibody.

    Parameters
    ----------
    pdb_file : str
        Path to a PDB file. A path ending in '.gz' is decompressed to the same
        path without the '.gz' suffix (overwriting any file of that name) and
        that copy is removed again before the function returns.
    h_chain_id, l_chain_id : str
        Chain identifiers of the antibody heavy and light chains.
    antigen_ids : list of str
        Chain identifiers of the antigen chains.

    Returns
    -------
    list of str
        Sorted, de-duplicated entries of the form "<antigen chain>:<residue>",
        e.g. 'A:ARG 176'. An empty list is returned when a required chain is
        absent from the ATOM records, when no antigen-antibody interaction is
        found, or when any exception occurs (the error is printed, not raised).
    """
    temp_pdb = None  # decompressed copy created by this call, if any
    try:
        # Decompress if needed
        if pdb_file.endswith('.gz'):
            import gzip, shutil
            decompressed_path = pdb_file[:-3]
            with gzip.open(pdb_file, 'rt') as f_in, open(decompressed_path, 'w') as f_out:
                # Only mark the copy for cleanup once both files are open, so a
                # failed open never leads to deleting a file we did not write.
                temp_pdb = decompressed_path
                shutil.copyfileobj(f_in, f_out)
            pdb_path = decompressed_path
        else:
            pdb_path = pdb_file

        # Get available chains
        pdb_df = PandasPdb().read_pdb(pdb_path)
        available_chains = set(str(c).strip() for c in pdb_df.df['ATOM']['chain_id'].unique())

        # Only use chains that are present
        h_chain_id = h_chain_id if h_chain_id in available_chains else None
        l_chain_id = l_chain_id if l_chain_id in available_chains else None
        antigen_ids = [c for c in antigen_ids if c in available_chains]
        chains = [c for c in [h_chain_id, l_chain_id] if c] + antigen_ids

        if not h_chain_id or not l_chain_id or not antigen_ids:
            print(f"Skipping {pdb_file}: Required chains not found. Available: {available_chains}")
            return []

        analyzer = PandaProt(pdb_path, chains=chains)
        interactions = analyzer.map_interactions()

        antibody_chains = (h_chain_id, l_chain_id)
        epitope_residues = []
        relevant_interactions = []
        for interaction_type, interactions_list in interactions.items():
            for interaction in interactions_list:
                # Interaction dicts use either chain1/chain2 or donor/acceptor keys.
                chain1 = interaction.get('chain1', interaction.get('donor_chain', ''))
                chain2 = interaction.get('chain2', interaction.get('acceptor_chain', ''))
                res1 = interaction.get('residue1', interaction.get('donor_residue', ''))
                res2 = interaction.get('residue2', interaction.get('acceptor_residue', ''))
                # Only consider antigen-antibody interactions; antigen-antigen and
                # antibody-antibody contacts fall through to `continue`.
                for antigen_id in antigen_ids:
                    if chain1 == antigen_id and chain2 in antibody_chains:
                        antigen_residue = res1
                    elif chain2 == antigen_id and chain1 in antibody_chains:
                        antigen_residue = res2
                    else:
                        continue
                    epitope_residues.append(f"{antigen_id}:{antigen_residue}")
                    relevant_interactions.append((interaction_type, interaction))

        # Print only relevant interactions
        if relevant_interactions:
            print(f"Relevant interactions for {os.path.basename(pdb_file)}:")
            for interaction_type, interaction in relevant_interactions:
                print(f"  - {interaction_type}: {interaction}")
        else:
            print(f"No relevant interactions found for {os.path.basename(pdb_file)}.")

        return sorted(set(epitope_residues))
    except Exception as e:
        print(f"Error processing {pdb_file}: {e}")
        return []
    finally:
        # Runs on every exit path (success, skip, error) so the decompressed
        # copy is never left behind next to the archive.
        if temp_pdb is not None and os.path.exists(temp_pdb):
            try:
                os.remove(temp_pdb)
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {temp_pdb}: {cleanup_error}")

# Run on the first 30 entries
for idx, row in sequences_df.head(30).iterrows():
    pdb_file = f"{base_path_to_pdbs}/{row['pdb_id']}.pdb.gz"
    h_chain_id = row['h_chain_id']
    l_chain_id = row['l_chain_id']
    # An empty antigen_ids cell is read by pandas as NaN (a float), which has no
    # .split(); treat it as "no antigen chains" so the entry is skipped cleanly.
    antigen_field = row['antigen_ids']
    antigen_ids = [c.strip() for c in str(antigen_field).split('|')] if pd.notnull(antigen_field) else []
    residues = get_epitope_residues_pandaprot(pdb_file, h_chain_id, l_chain_id, antigen_ids)
    print(f"{row['pdb_id']} epitope residues: {', '.join(residues) if residues else 'None found'}")

Successfully loaded structure from ./pdbs/8xa4.pdb
Found 1392 interactions:
  - hydrogen_bonds: 44
  - ionic_interactions: 12
  - hydrophobic_interactions: 78
  - pi_stacking: 2
  - pi_cation: 0
  - salt_bridges: 12
  - cation_pi: 0
  - ch_pi: 38
  - disulfide_bridges: 0
  - sulfur_aromatic: 0
  - water_mediated: 0
  - metal_coordination: 0
  - halogen_bonds: 0
  - amide_aromatic: 4
  - van_der_waals: 1194
  - amide_amide: 8
Relevant interactions for 8xa4.pdb.gz:
  - hydrogen_bonds: {'type': 'hydrogen_bond', 'donor_chain': 'A', 'donor_residue': 'GLU 149', 'donor_atom': 'O', 'acceptor_chain': 'C', 'acceptor_residue': 'SER 104', 'acceptor_atom': 'OG', 'distance': np.float32(3.2159338)}
  - hydrogen_bonds: {'type': 'hydrogen_bond', 'donor_chain': 'A', 'donor_residue': 'ARG 176', 'donor_atom': 'NH1', 'acceptor_chain': 'C', 'acceptor_residue': 'HIS 107', 'acceptor_atom': 'NE2', 'distance': np.float32(3.037882)}
  - hydrogen_bonds: {'type': 'hydrogen_bond', 'donor_chain': 'A', 'donor_residue

In [53]:
def build_resnum_to_seq_idx_map(pdb_file, chain_id):
    """
    Map PDB residue numbers of one chain to 1-based positions along that chain.

    Parameters
    ----------
    pdb_file : str
        Path passed to PandasPdb().read_pdb.
    chain_id : str
        Chain whose ATOM records are indexed.

    Returns
    -------
    dict
        {residue_number (int): position (int, 1-based)} where the position counts
        residues in order of first appearance in the chain's ATOM records.

    Notes
    -----
    Residues are identified by (residue_number, insertion code), so residues such
    as 100, 100A and 100B each advance the position counter, and alternate
    conformations that carry different residue names are counted once. Because
    the returned keys are plain integers, a number shared by several
    insertion-coded residues maps to the first of them.

    NOTE(review): positions count only residues present in ATOM records. If the
    sequence this map is used with also contains residues absent from the
    structure, positions will not line up - confirm against the sequence source.
    """
    pdb = PandasPdb().read_pdb(pdb_file)
    atom_df = pdb.df['ATOM']
    # Only keep rows for the specified chain
    chain_df = atom_df[atom_df['chain_id'] == chain_id]
    # One row per physical residue, in order of appearance
    residues = chain_df[['residue_number', 'insertion']].drop_duplicates()
    resnum_to_idx = {}
    for idx, resnum in enumerate(residues['residue_number'], 1):  # 1-based index
        # setdefault keeps the first position for a number reused with insertion codes
        resnum_to_idx.setdefault(int(resnum), idx)
    return resnum_to_idx

In [54]:
import re

In [None]:
def highlight_epitope_in_sequence(sequence, chain_id, epitope_residues, resnum_to_idx):
    """
    Places brackets around epitope residues in the antigen sequence.

    sequence: str, full antigen sequence (1-letter code)
    chain_id: str, chain identifier (e.g., 'A')
    epitope_residues: list of str, e.g., ['A:ARG 176', ...]; entries for other
        chains or in another format are ignored
    resnum_to_idx: dict mapping PDB residue numbers to sequence indices (1-based)

    Returns the sequence with every epitope position written as '[X]'. Residue
    numbers missing from resnum_to_idx, and indices beyond the end of the
    sequence, are silently skipped.
    """
    # chain_id is escaped so it is matched literally; the optional '-' lets
    # negative PDB residue numbers (e.g. 'A:MET -1') be recognised.
    pattern = re.compile(rf"^{re.escape(str(chain_id))}:(?:\w+)\s*(-?\d+)$")
    epitope_seq_indices = set()
    for res in epitope_residues:
        m = pattern.match(res)
        if m is None:
            continue
        seq_idx = resnum_to_idx.get(int(m.group(1)))
        if seq_idx is not None:
            epitope_seq_indices.add(seq_idx)
    return "".join(
        f"[{aa}]" if i in epitope_seq_indices else aa
        for i, aa in enumerate(sequence, 1)
    )

# Group by pdb_id and aggregate antigen_ids and antigen_seqs using '|'.
# The antigen_seqs aggregation is only requested when the column exists:
# naming a missing column in the agg spec raises KeyError no matter what the
# aggregation function itself would return.
has_antigen_seqs = 'antigen_seqs' in sequences_df.columns
agg_spec = {
    'h_chain_id': 'first',
    'l_chain_id': 'first',
    'antigen_ids': lambda x: '|'.join(x.astype(str)),
}
if has_antigen_seqs:
    agg_spec['antigen_seqs'] = lambda x: '|'.join(x.astype(str))
grouped_df = sequences_df.groupby('pdb_id').agg(agg_spec).reset_index()
if not has_antigen_seqs:
    # Keep the column present so the loop below can read it uniformly.
    grouped_df['antigen_seqs'] = ''

combined_results = []
for idx, row in grouped_df.iterrows():
    pdb_file = f"{base_path_to_pdbs}/{row['pdb_id']}.pdb.gz"
    h_chain_id = row['h_chain_id']
    l_chain_id = row['l_chain_id']
    antigen_ids = [c.strip() for c in row['antigen_ids'].split('|')]
    antigen_seqs = str(row['antigen_seqs']).split('|') if pd.notnull(row['antigen_seqs']) else []
    residues = get_epitope_residues_pandaprot(pdb_file, h_chain_id, l_chain_id, antigen_ids)
    chain_list, seq_list, res_list = [], [], []
    # antigen_ids and antigen_seqs are matched by position within the '|' lists.
    for i, antigen_chain in enumerate(antigen_ids):
        antigen_sequence = antigen_seqs[i] if i < len(antigen_seqs) else None
        highlighted_seq = None
        # 'nan' is what astype(str) produced for a missing sequence during grouping.
        if antigen_sequence and antigen_sequence != 'nan':
            try:
                resnum_to_idx = build_resnum_to_seq_idx_map(pdb_file, antigen_chain)
                highlighted_seq = highlight_epitope_in_sequence(antigen_sequence, antigen_chain, residues, resnum_to_idx)
            except Exception as e:
                print(f"Error mapping for {row['pdb_id']} chain {antigen_chain}: {e}")
        chain_list.append(antigen_chain)
        seq_list.append(highlighted_seq if highlighted_seq else "")
        chain_residues = [r for r in residues if r.startswith(f"{antigen_chain}:")]
        res_list.append('|'.join(chain_residues))
    combined_results.append({
        'pdb_id': row['pdb_id'],
        'antigen_chains': '|'.join(chain_list),
        'highlighted_epitope_sequences': '|'.join(seq_list),
        'epitope_residues': '|'.join(res_list)
    })


# Collect the per-structure results into a DataFrame (one row per pdb_id)
highlight_df = pd.DataFrame(combined_results)
# Optional, currently disabled: merge back onto the original table by pdb_id
# merged_df = pd.merge(sequences_df, highlight_df, left_on=['pdb_id'], right_on=['pdb_id'], how='left')

# Save to new CSV (recommended to avoid overwriting original)
# merged_df.to_csv("sabdab_sequences_with_epitope.csv", index=False)

Successfully loaded structure from ./pdbs/8xa4.pdb
Found 1392 interactions:
  - hydrogen_bonds: 44
  - ionic_interactions: 12
  - hydrophobic_interactions: 78
  - pi_stacking: 2
  - pi_cation: 0
  - salt_bridges: 12
  - cation_pi: 0
  - ch_pi: 38
  - disulfide_bridges: 0
  - sulfur_aromatic: 0
  - water_mediated: 0
  - metal_coordination: 0
  - halogen_bonds: 0
  - amide_aromatic: 4
  - van_der_waals: 1194
  - amide_amide: 8
Relevant interactions for 8xa4.pdb.gz:
  - hydrogen_bonds: {'type': 'hydrogen_bond', 'donor_chain': 'A', 'donor_residue': 'GLU 149', 'donor_atom': 'O', 'acceptor_chain': 'C', 'acceptor_residue': 'SER 104', 'acceptor_atom': 'OG', 'distance': np.float32(3.2159338)}
  - hydrogen_bonds: {'type': 'hydrogen_bond', 'donor_chain': 'A', 'donor_residue': 'ARG 176', 'donor_atom': 'NH1', 'acceptor_chain': 'C', 'acceptor_residue': 'HIS 107', 'acceptor_atom': 'NE2', 'distance': np.float32(3.037882)}
  - hydrogen_bonds: {'type': 'hydrogen_bond', 'donor_chain': 'A', 'donor_residue

In [56]:
# Display the epitope table assembled from combined_results.
highlight_df



Unnamed: 0,pdb_id,antigen_chains,highlighted_epitope_sequences,epitope_residues
0,8xa4,A|B,SCNGLYYQGSCYI[L]HSD[Y]KSFEDAKANCAAESSTLPNKSDVL...,A:ARG 176|A:ASP 146|A:ASP 150|A:ASP 170|A:GLN ...
1,8z3y,A|B,TLSAEDKAAVERSKMIEKQLQKDKQVYRATHRLLLLGADNSGKSTI...,|
2,9cph,A,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...,A:ALA 1116|A:ALA 1122|A:ALA 1128|A:ALA 900|A:A...
3,9d7i,E,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...,E:ARG 429|E:ARG 469|E:ASN 177|E:ASN 197|E:ASN ...
4,9d7i,C,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...,C:ARG 469|C:ASN 197|C:ASN 280|C:ASN 425|C:ASP ...
5,9d7o,E,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...,E:ARG 429|E:ARG 469|E:ASN 197|E:ASN 280|E:ASN ...
6,9d7p,E,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...,E:ARG 429|E:ARG 469|E:ASN 197|E:ASN 280|E:ASN ...
7,9d7p,C,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...,C:ARG 469|C:ASN 280|C:ASN 425|C:ASP 368|C:ASP ...
8,9hb3,R,LLARAELALLSIVFVAVALSNGLVLAALARRGRRGHWAPIHVFIGH...,R:ALA 539|R:ARG 538|R:ASN 584|R:ASP 543|R:ASP ...
9,9iut,C,[M][P][I][W][K][F][P][D],C:ASP 618|C:ILE 613|C:LYS 615|C:MET 611|C:PHE ...


In [57]:
# Write the epitope table to a new CSV file, omitting the DataFrame index column.
highlight_df.to_csv("sabdab_highlighted_epitopes.csv", index=False)