# Find Polar Contacts

In [None]:
from biopandas.pdb import PandasPdb
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
import os

In [None]:
## Load the Fab table (tab-separated file)
fabs = pd.read_csv('sabdab_sequences_only_dedup.tsv', sep='\t')

## Keep only the rows where the heavy chain, light chain and antigen chains are all present
required_columns = ['Heavy_chain', 'Light_chain', 'Antigen_Chains']
fabs = fabs.dropna(subset=required_columns)

In [None]:
## Display the filtered Fabs table (rows with a missing heavy chain, light chain or antigen chain removed)
fabs

In [None]:
def _residue_keys(atoms):
    """Return one ``(residue_number, insertion_code)`` tuple per atom row of *atoms*."""
    numbers = atoms['residue_number'].tolist()
    if 'insertion' in atoms.columns:
        codes = atoms['insertion'].fillna('').astype(str).str.strip().tolist()
    else:
        ## Tables without an insertion-code column are treated as having no insertion codes
        codes = [''] * len(numbers)
    return list(zip(numbers, codes))


def find_polar_contacts(pdb_path: "str | pd.DataFrame", chain1: str, chain2: str, cutoff=3.5):
    """
    Find polar contacts between two input chains in a PDB file using BioPandas.

    A polar contact is any pair of N/O atoms, one from each chain, whose distance
    is less than or equal to ``cutoff``.

    Residues of chain2 (antigen) are renumbered to start at 1. The numbering runs
    over every residue of chain2 that has an ATOM record, ordered by
    (residue_number, insertion code), so residues that differ only by insertion
    code (e.g. 100 and 100A) receive distinct indices. Residues of chain1 keep
    their original ``residue_number`` (the insertion code is not reported).

    Parameters
    ----------
    pdb_path : path of the PDB file to read, or an already-parsed ATOM table
        (a DataFrame with the BioPandas column names) so a caller can parse a
        structure once and query it several times.
    chain1, chain2 : chain identifiers; they must differ.
    cutoff : maximum N/O-N/O distance, in the units of the coordinates.

    Returns
    -------
    DataFrame with the columns ``resn_chain{chain1}``, ``resi_chain{chain1}``,
    ``resn_chain{chain2}``, ``resi_chain{chain2}``; one row per unique residue
    pair, sorted by those columns. Empty (same columns) when there is no contact.

    Raises
    ------
    ValueError : if ``chain1 == chain2`` (the output column names would collide).
    """
    if chain1 == chain2:
        raise ValueError(
            f"chain1 and chain2 must be different chains, got {chain1!r} for both"
        )

    ## Load PDB file (or reuse an already-parsed ATOM table)
    if isinstance(pdb_path, pd.DataFrame):
        df = pdb_path
    else:
        from biopandas.pdb import PandasPdb
        df = PandasPdb().read_pdb(pdb_path).df['ATOM']

    ## Select polar atoms (N, O) from each chain
    atoms_chain2 = df[df['chain_id'] == chain2]
    is_polar = df['element_symbol'].isin(['N', 'O'])
    polar_atoms_chain1 = df[(df['chain_id'] == chain1) & is_polar]
    polar_atoms_chain2 = atoms_chain2[atoms_chain2['element_symbol'].isin(['N', 'O'])]

    columns = [
        f"resn_chain{chain1}",
        f"resi_chain{chain1}",
        f"resn_chain{chain2}",
        f"resi_chain{chain2}",
    ]

    ## Nothing to compare when either chain has no polar atoms (e.g. unknown chain id)
    if polar_atoms_chain1.empty or polar_atoms_chain2.empty:
        return pd.DataFrame({column: [] for column in columns})

    ## Map every residue of chain2 to a sequential index starting at 1.
    ## All atoms are used (not only N/O) so a residue without polar atoms still counts,
    ## and the insertion code is part of the key so 100 and 100A do not collapse.
    residue_index_chain2 = {
        key: idx + 1 for idx, key in enumerate(sorted(set(_residue_keys(atoms_chain2))))
    }

    ## Calculate pairwise distances between polar atoms
    coord_columns = ['x_coord', 'y_coord', 'z_coord']
    distances = cdist(polar_atoms_chain1[coord_columns].to_numpy(),
                      polar_atoms_chain2[coord_columns].to_numpy())

    ## Row/column positions of every atom pair within the distance cutoff
    atom_idx_chain1, atom_idx_chain2 = np.nonzero(distances <= cutoff)

    ## Translate atom positions into residue names / numbers
    keys_chain2 = _residue_keys(polar_atoms_chain2)
    contacts = pd.DataFrame({
        columns[0]: polar_atoms_chain1['residue_name'].to_numpy()[atom_idx_chain1],
        columns[1]: polar_atoms_chain1['residue_number'].to_numpy()[atom_idx_chain1],
        columns[2]: polar_atoms_chain2['residue_name'].to_numpy()[atom_idx_chain2],
        columns[3]: [residue_index_chain2[keys_chain2[j]] for j in atom_idx_chain2],  # mapped index
    })

    ## One row per residue pair, in a deterministic order
    return contacts.drop_duplicates().sort_values(columns).reset_index(drop=True)

In [None]:
## Distance cutoff passed to find_polar_contacts for every structure
cutoff = 3

## One record per processed Fab; the dataframe is built once after the loop.
## (Growing a dataframe cell by cell with .loc is slow, and assigning a list to a
## single .loc cell is fragile.)
position_records = []
position_index = []

## Loop through experiments and get interfacing residues
for index, fab in fabs.iterrows():
    pdb_id = fab['pdb']
    pdb_path = f"pdbs/{pdb_id}.pdb.gz"
    h_chain_id = fab['Heavy_chain']
    l_chain_id = fab['Light_chain']
    ## Every antigen chain is used. Whitespace around the '|' separator is stripped so
    ## that values such as "A | B" yield the chain ids "A" and "B"; empty entries are dropped.
    antigen_chain_ids = [
        chain_id.strip() for chain_id in fab['Antigen_Chains'].split('|') if chain_id.strip()
    ]

    ## Skip structures that have not been downloaded
    if not os.path.exists(pdb_path):
        continue

    ## When heavy and light chain share one id, query that chain only once
    antibody_chain_ids = list(dict.fromkeys([h_chain_id, l_chain_id]))

    ## NOTE(review): contact indices are renumbered per antigen chain, so indices coming
    ## from different antigen chains are pooled into one set and cannot be told apart.
    antigen_contacts = set()

    print(f"Processing {pdb_id}: H chain `{h_chain_id}`, L chain `{l_chain_id}`, and antigen chains `{antigen_chain_ids}`")
    for antigen_chain_id in antigen_chain_ids:
        resi_column = f'resi_chain{antigen_chain_id}'
        ## Find polar contacts between the H/L chains and the antigen chain
        for antibody_chain_id in antibody_chain_ids:
            contacts = find_polar_contacts(pdb_path, antibody_chain_id, antigen_chain_id, cutoff)
            antigen_contacts.update(contacts[resi_column].tolist())

    position_records.append({
        "pdb": pdb_id,
        "pdb_path": pdb_path,
        "antigen_contacts": sorted(antigen_contacts),
    })
    position_index.append(index)

## Make dataframe (rows keep the index of the corresponding row in `fabs`)
positions = pd.DataFrame(
    position_records,
    index=position_index,
    columns=["pdb", "pdb_path", "antigen_contacts"],
)

In [None]:
## Display the per-structure antigen contact table
positions

In [None]:
## Save the contact table without the index column.
## NOTE(review): no `sep` is given, so despite the ".tsv" extension the file is written
## with the default comma separator. The later cell reads it back with
## pd.read_csv("polar_contacts.tsv") (also the default separator), so both sides must be
## changed together if this is switched to tabs.
positions.to_csv("polar_contacts.tsv", index=False)

In [1]:
import pandas as pd

# File locations used by this cell
sabdab_path = "sabdab_sequences_only_dedup.tsv"
contacts_path = "polar_contacts.tsv"
merged_path = "sabdab_sequences_only_dedup_with_contacts.tsv"

# Deduplicated SAbDab table (tab-separated)
sabdab_df = pd.read_csv(sabdab_path, sep="\t")

# Polar contact positions (read with the default separator)
polar_contacts_df = pd.read_csv(contacts_path)

# Left join: keep every SAbDab row and attach its contacts by 'pdb'
# NOTE(review): if 'pdb' is not unique in both tables this join multiplies rows - confirm
contact_columns = polar_contacts_df[["pdb", "antigen_contacts"]]
merged_df = pd.merge(sabdab_df, contact_columns, how="left", on="pdb")

# Write the merged table as a tab-separated file
merged_df.to_csv(merged_path, sep="\t", index=False)
print(f"Merged file saved as {merged_path}")

FileNotFoundError: [Errno 2] No such file or directory: 'polar_contacts.tsv'