# Find Polar Contacts

In [13]:
from biopandas.pdb import PandasPdb
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
import os

In [None]:
## Read in Fabs list
fabs = pd.read_csv('../data/sabdab/sabdab_with_sequences.tsv', sep='\t')#.head(1)

## Filter NaN sequences
fabs = fabs[~fabs['Hchain'].isna()]
fabs = fabs[~fabs['Lchain'].isna()]
fabs = fabs[~fabs['AntigenChains'].isna()]

In [10]:
fabs

Unnamed: 0,pdb,Hchain,Lchain,AntigenChains,HeavySeq,LightSeq,AntigenSeq
4,8xa4,C,D,A | B,QLQLQESGPGLVKPSETLSLTCTVSGGSISSNNDYWGWIRQPPGKG...,EIVLTQSPGTLSLSPGERVTLSCRASQRVSSTYLAWYQQKPGQAPR...,SCNGLYYQGSCYILHSDYKSFEDAKANCAAESSTLPNKSDVLTTWL...
5,8z3y,S,s,A | B,VQLVESGGGLVQPGGSRKLSCSASGFAFSSFGMHWVRQAPEKGLEW...,,TLSAEDKAAVERSKMIEKQLQKDKQVYRATHRLLLLGADNSGKSTI...
9,9cph,H,L,A,EVQLVESGGGLVQPGGSLRLSCAASGFNLSSSSIHWVRQAPGKGLE...,AQMTQSPSSLSASVGDRVTITCRASQSVSSAVAWYQQKPGKAPKLL...,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...
10,9d7i,H,G,E,VQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLEW...,YELTQPPSVSVSPGQTATITCSGASTNVCWYQVKPGQSPEVVIFEN...,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...
11,9d7i,J,I,C,VQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLEW...,YELTQPPSVSVSPGQTATITCSGASTNVCWYQVKPGQSPEVVIFEN...,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...
...,...,...,...,...,...,...,...
18297,6ejm,H,h,B,VMLVESGGGFVKPGGSLKLSCAASGFTFRSYIMSWVRQTPEKRLEW...,,FVNKDQIAKDVKQFYDQALQQAVVDNNAKAVVKTFHETLDCCGSST...
18298,7lo6,J,I,C,EVQLVESGAEVKKPGSSVKVSCKASGDTFIRYSFTWVRQAPGQGLE...,DIVMTQSPATLSVSPGERATLSCRASESVSSDLAWYQQKPGQAPRL...,NLWVTVYYGVPVWKDAETTLFCASDAKAYETEKHNVWATHACVPTD...
18299,3vi3,H,L,D,QVHLQQSGAELMKPGASVKISCKATGYTFTSYWIEWVKQRPGHGLE...,DIVMTQATPSIPVTPGESVSISCRSNKSLLHSNGNTYLYWFLQRPG...,NRCLKANAKSCGECIQAGPNCGWCTNSTFRCDDLEALKKKGCPPDD...
18300,6zdg,F,G,D,EVQLVESGGGVVQPGRSLRLSCAASAFTFSSYDMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKL...,TNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFK...


In [4]:
def find_polar_contacts(pdb_path: str, chain1: str, chain2: str, cutoff=3.5):
    """
    Find polar contacts between two input chains in a PDB file using BioPandas.
    
    Parameters:
    pdb_path (str): Path to the PDB file.
    chain1 (str): Identifier for the first chain.
    chain2 (str): Identifier for the second chain.
    cutoff (float): Distance cutoff for identifying polar contacts.
    
    Returns:
    dict: A dictionary with chain identifiers as keys and lists of residue numbers involved in polar contacts as values.
    """
    ## Load PDB file
    ppdb = PandasPdb().read_pdb(pdb_path)
    df = ppdb.df['ATOM']

    ## Select polar atoms (N, O) from each chain
    polar_atoms_chain1 = df[(df['chain_id'] == chain1) & (df['element_symbol'].isin(['N', 'O']))]
    polar_atoms_chain2 = df[(df['chain_id'] == chain2) & (df['element_symbol'].isin(['N', 'O']))]

    ## Calculate pairwise distances between polar atoms
    distances = cdist(polar_atoms_chain1[['x_coord', 'y_coord', 'z_coord']].values, 
                      polar_atoms_chain2[['x_coord', 'y_coord', 'z_coord']].values)

    ## Find pairs of atoms within the distance cutoff
    pairs = np.argwhere(distances <= cutoff)

    ## Find corresponding residue numbers and names
    residue_numbers_chain1 = polar_atoms_chain1['residue_number'].values
    residue_numbers_chain2 = polar_atoms_chain2['residue_number'].values

    residue_names_chain1 = polar_atoms_chain1['residue_name'].values
    residue_names_chain2 = polar_atoms_chain2['residue_name'].values

    polar_contact_residues = []
    for pair in pairs:
        polar_contact_residues.append([
            residue_names_chain1[pair[0]],
            residue_numbers_chain1[pair[0]],
            residue_names_chain2[pair[1]],
            residue_numbers_chain2[pair[1]]
            ])

    ## Make polar contact residues a set
    polar_contact_residues = set(map(tuple, polar_contact_residues))

    ## Assemble dictionary
    polar_contact_residues_dict = {
        f"resn_chain{chain1}": [],
        f"resi_chain{chain1}": [],
        f"resn_chain{chain2}": [],
        f"resi_chain{chain2}": []
        }
    
    for pair in polar_contact_residues:
        polar_contact_residues_dict[f"resn_chain{chain1}"].append(pair[0])
        polar_contact_residues_dict[f"resi_chain{chain1}"].append(pair[1])
        polar_contact_residues_dict[f"resn_chain{chain2}"].append(pair[2])
        polar_contact_residues_dict[f"resi_chain{chain2}"].append(pair[3])

    ## Convert to dataframe
    polar_contact_residues_df = pd.DataFrame(polar_contact_residues_dict)

    return polar_contact_residues_df

In [33]:
## Make dataframe
positions = pd.DataFrame(columns=["pdb", "pdb_path", "antigen_contacts"])

## Loop through experiments and get interfacing residues
for index, fab in fabs.iterrows():
    cutoff = 3
    pdb_id = fab['pdb']
    pdb_path = f"../data/sabdab/pdbs_test/{pdb_id}.pdb.gz"
    h_chain_id = fab['Hchain']
    l_chain_id = fab['Lchain']
    antigen_chain_ids = fab['AntigenChains'].split('|')  # Assuming the first antigen chain is used

    if not os.path.exists(pdb_path):
        # print(f"[SKIP] File not found: {pdb_file}")
        continue

    antigen_contacts = []

    print(f"Processing {pdb_id}: H chain `{h_chain_id}`, L chain `{l_chain_id}`, and antigen chains `{antigen_chain_ids}`")
    for antigen_chain_id in antigen_chain_ids:
        ## Find polar contacts between the H/L chains and the antigen chain
        antigen_contacts_with_h = find_polar_contacts(pdb_path, h_chain_id, antigen_chain_id, cutoff)[f'resi_chain{antigen_chain_id}'].tolist()
        antigen_contacts_with_l = find_polar_contacts(pdb_path, l_chain_id, antigen_chain_id, cutoff)[f'resi_chain{antigen_chain_id}'].tolist()
        antigen_contacts.extend(antigen_contacts_with_h + antigen_contacts_with_l)

    positions.loc[index, "pdb"] = pdb_id
    positions.loc[index, "pdb_path"] = pdb_path
    positions.loc[index, "antigen_contacts"] = list(sorted(set(antigen_contacts)))

Processing 1a3r: H chain `H`, L chain `L`, and antigen chains `['P']`
Processing 1a2y: H chain `B`, L chain `A`, and antigen chains `['C']`
Processing 1a3l: H chain `H`, L chain `L`, and antigen chains `['L']`
Processing 1a4k: H chain `H`, L chain `L`, and antigen chains `['H']`
Processing 1a4k: H chain `B`, L chain `A`, and antigen chains `['B']`
Processing 1a14: H chain `H`, L chain `L`, and antigen chains `['N']`


In [34]:
positions

Unnamed: 0,pdb,pdb_path,antigen_contacts
8512,1a3r,../data/sabdab/pdbs_test/1a3r.pdb.gz,"[157, 159, 160, 161, 163, 165, 169]"
9540,1a2y,../data/sabdab/pdbs_test/1a2y.pdb.gz,"[24, 102, 117, 119, 121]"
11763,1a3l,../data/sabdab/pdbs_test/1a3l.pdb.gz,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
13113,1a4k,../data/sabdab/pdbs_test/1a4k.pdb.gz,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
17884,1a4k,../data/sabdab/pdbs_test/1a4k.pdb.gz,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
17985,1a14,../data/sabdab/pdbs_test/1a14.pdb.gz,"[329, 330, 370]"


In [7]:
positions.to_csv("polar_contacts.csv", index=False)