In [1]:
import pickle

# P2Rank pocket table, downloaded from LIGYSIS data storage: https://zenodo.org/records/13121414
# NOTE(review): pickle.load can execute arbitrary code - only unpickle files from a trusted source.
pickle_path = "/home/vit/Projects/cryptoshow-analysis/data/E-regular-binding-site-predictor/P2Rank_CONS_pockets_DEF_TRANS.pkl"
with open(pickle_path, "rb") as pickle_file:
    data = pickle.load(pickle_file)

# Show the rows of a single representative chain as an example.
is_example_chain = data['rep_chain'] == '1auk_A'
data[is_example_chain]

Unnamed: 0,rep_chain,ID,RANK,score,prob,n_sas_points,n_surf_atoms,aas,centre,n_aas,RoG,centre_trans,up_aas,SASA,VOL
17,1auk_A,1,1,4.96,0.235,55,35,"[105, 107, 11, 12, 132, 134, 135, 155, 211, 21...","(10.7094, 2.6381, 0.9206)",20,8.75,"(8.84, 2.537, 2.848)","[123, 125, 29, 30, 150, 152, 153, 173, 229, 23...",423.22,829.0
18,1auk_A,2,2,4.4,0.198,13,14,"[259, 296, 297, 298, 318, 321, 340, 344, 41]","(-12.5886, -4.513, 0.017)",9,6.88,"(-12.901, -4.557, 1.343)","[277, 314, 315, 316, 336, 339, 358, 362, 59]",56.55,63.0
19,1auk_A,3,3,1.52,0.022,35,15,"[268, 275, 276, 277, 278, 280, 282, 391, 393]","(8.3387, -7.7317, 14.2516)",9,6.45,"(7.563, -8.247, 13.14)","[286, 293, 294, 295, 296, 298, 300, 409, 411]",246.18,164.0


# Chain mapping
Build a lookup from (PDB ID, auth chain ID) to the corresponding label chain ID (`struct_asym_id`) using the LIGYSIS sites DataFrame.

In [2]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code - only unpickle files from a trusted source.
with open('/home/vit/Projects/cryptoshow-analysis/data/E-regular-binding-site-predictor/LIGYSIS_sites_DEF_TRANS.pkl', 'rb') as handle:
    df = pickle.load(handle)

# Lookup: (pdb_id, auth chain id) -> label chain id (struct_asym_id).
# dict() over zipped pairs keeps the last row for a repeated key, exactly like
# assigning into the dictionary row by row.
chain_keys = zip(df['pdb_id'], df['auth_asym_id'])
auth_to_label_chain_mapping = dict(zip(chain_keys, df['struct_asym_id']))

# Save P2Rank predictions
Save the P2Rank predictions in the same format as in `/home/vit/Projects/cryptoshow-analysis/data/A-cluster-ligysis-data/cryptobench-clustered-binding-sites.txt`

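The residue numbering in the output is AUTH (author) numbering; the predicted residues are converted from label numbering in the code below.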

In [3]:
# Directory handed to the RCSB fetch call as the download target for mmCIF files.
CIF_FILES_PATH = '/home/vit/Projects/deeplife-project/data/cif_files'

# Three-letter residue code (first letter upper-case, rest lower-case) -> one-letter code.
# Besides the standard amino acids this covers a number of modified residues;
# entries without a usable one-letter equivalent are mapped to 'X'.
mapping = {'Aba': 'A', 'Ace': 'X', 'Acr': 'X', 'Ala': 'A', 'Aly': 'K', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cas': 'C',
           'Ccs': 'C', 'Cme': 'C', 'Csd': 'C', 'Cso': 'C', 'Csx': 'C', 'Cys': 'C', 'Dal': 'A', 'Dbb': 'T', 'Dbu': 'T',
           'Dha': 'S', 'Gln': 'Q', 'Glu': 'E', 'Gly': 'G', 'Glz': 'G', 'His': 'H', 'Hse': 'S', 'Ile': 'I', 'Leu': 'L',
           'Llp': 'K', 'Lys': 'K', 'Men': 'N', 'Met': 'M', 'Mly': 'K', 'Mse': 'M', 'Nh2': 'X', 'Nle': 'L', 'Ocs': 'C',
           'Pca': 'E', 'Phe': 'F', 'Pro': 'P', 'Ptr': 'Y', 'Sep': 'S', 'Ser': 'S', 'Thr': 'T', 'Tih': 'A', 'Tpo': 'T',
           'Trp': 'W', 'Tyr': 'Y', 'Unk': 'X', 'Val': 'V', 'Ycm': 'C', 'Sec': 'U', 'Pyl': 'O', 'Mhs': 'H', 'Snm': 'S',
           'Mis': 'S', 'Seb': 'S', 'Hic': 'H', 'Fme': 'M', 'Asb': 'D', 'Sah': 'C', 'Smc': 'C', 'Tpq': 'Y', 'Onl': 'X',
           'Tox': 'W', '5x8': 'X', 'Ddz': 'A'}


def three_to_one(three_letter_code):
    """Convert a three-letter residue code to its one-letter code.

    The lookup is case-insensitive: the code is normalised to "first character
    upper-case, remainder lower-case" before it is looked up in ``mapping``.

    Args:
        three_letter_code: residue name such as ``'ALA'``, ``'ala'`` or ``'Ala'``.

    Returns:
        The one-letter code, or ``'X'`` when the code is not in ``mapping``
        (including the empty string).
    """
    # Slicing with [:1] instead of indexing with [0] keeps an empty string from
    # raising IndexError; for non-empty input the key is identical.
    key = three_letter_code[:1].upper() + three_letter_code[1:].lower()
    return mapping.get(key, 'X')

def map_label_to_auth(pdb_id: str, auth_chain_id: str, label_chain_id: str, binding_residues: set) -> set:
    """Translate label-numbered binding residues of one chain into auth-numbered residue labels.

    The mmCIF file of ``pdb_id`` is fetched from RCSB into ``CIF_FILES_PATH`` and
    read twice, once with author fields and once with label fields. The residues
    of the requested chain are then paired position by position, i.e. the i-th
    residue of the auth chain is taken to be the i-th residue of the label chain.

    Args:
        pdb_id: PDB entry ID passed to ``rcsb.fetch``.
        auth_chain_id: chain ID in author (auth) naming.
        label_chain_id: ID of the same chain in label naming.
        binding_residues: label residue IDs of the binding site. Any iterable is
            accepted; entries are compared by their ``str()`` form, so both
            integer and string IDs match.

    Returns:
        A set of strings ``<one-letter residue code><auth residue id>`` (for
        example ``'A123'``), one per binding residue that was found in the chain.
        Insertion codes are not part of the returned labels.
    """
    import warnings

    import biotite.database.rcsb as rcsb
    import biotite.structure.io.pdbx as pdbx
    from biotite.structure import get_residues
    from biotite.structure.io.pdbx import get_structure

    def _ca_residues(cif_file, chain_id, use_author_fields):
        # Residue IDs (as strings) and residue names of `chain_id`, one entry per
        # residue that has a carbon atom named "CA". The element test presumably
        # keeps calcium ions (also named "CA") out of the selection.
        atoms = get_structure(cif_file, model=1, use_author_fields=use_author_fields)
        ca_atoms = atoms[(atoms.atom_name == "CA")
                         & (atoms.element == "C")
                         & (atoms.chain_id == chain_id)]
        residue_ids, residue_names = get_residues(ca_atoms)
        return [str(residue_id) for residue_id in residue_ids], residue_names

    cif_file_path = rcsb.fetch(pdb_id, "cif", CIF_FILES_PATH)
    cif_file = pdbx.CIFFile.read(cif_file_path)

    auth_residue_ids, residue_types = _ca_residues(cif_file, auth_chain_id, use_author_fields=True)
    label_residue_ids, _ = _ca_residues(cif_file, label_chain_id, use_author_fields=False)

    # zip() below silently stops at the shorter list; make a mismatch visible
    # because it means the positional auth/label pairing may be shifted.
    if len(auth_residue_ids) != len(label_residue_ids):
        warnings.warn(
            f'{pdb_id}: auth chain {auth_chain_id} has {len(auth_residue_ids)} CA residues but '
            f'label chain {label_chain_id} has {len(label_residue_ids)}; '
            'positional label->auth mapping may be misaligned',
            stacklevel=2,
        )

    # Residue IDs read from the structure are strings, so normalise the query
    # to strings as well; integer IDs would otherwise never match.
    binding_residue_ids = {str(residue_id) for residue_id in binding_residues}

    # Walk auth and label residues in lockstep and keep the binding ones.
    mapped_binding_residues = set()
    for auth_residue_id, label_residue_id, residue_type in zip(auth_residue_ids, label_residue_ids, residue_types):
        if label_residue_id in binding_residue_ids:
            mapped_binding_residues.add(f'{three_to_one(residue_type)}{auth_residue_id}')
    return mapped_binding_residues


# For every representative chain collect its predicted pockets; each pocket is
# the set of residue labels returned by map_label_to_auth.
predicted_binding_sites = {}
for rep_chain, pocket_residues in zip(data['rep_chain'], data['aas']):
    # e.g. '1auk_A' -> protein_id '1aukA' -> pdb_id '1auk', auth_chain_id 'A'
    protein_id = rep_chain.replace('_', '')
    pdb_id = protein_id[:4]
    auth_chain_id = protein_id[4:]

    try:
        label_chain_id = auth_to_label_chain_mapping[(pdb_id, auth_chain_id)]
    except KeyError:
        print(f'Warning: Missing chain mapping for {pdb_id} {auth_chain_id}')
        continue

    # Register the chain first, then convert the pocket from label to auth
    # numbering and store it.
    pockets = predicted_binding_sites.setdefault(protein_id, [])
    pockets.append(map_label_to_auth(pdb_id, auth_chain_id, label_chain_id, pocket_residues))




# Why
Why was P2Rank run on more structures than the number of structures in LIGYSIS?

In [6]:
# Number of keys in predicted_binding_sites, i.e. distinct chains (PDB ID + auth chain ID) with stored predictions.
len(predicted_binding_sites)

3115