# Extract Uniprot Ids
Use the IDs to scan the LIGYSIS-web database.

In [None]:
import json

DATA_PATH = '/home/vit/Projects/cryptoshow-analysis/data/data-extraction'
DATASET_PATH = F'{DATA_PATH}/cryptobench-dataset/folds/test.json'

In [None]:
with open(DATASET_PATH, 'r') as f:
    data = json.load(f)

uniprot_ids = [entry[0]['uniprot_id'] for key, entry in data.items() if '-' not in entry[0]['uniprot_id']]

with open(f'{DATA_PATH}/uniprot_ids.txt', 'w') as f_out:
    for uid in uniprot_ids:
        f_out.write(f"{uid}\n")

# Check Uniprot availability in LIGYSIS-web
Check which Uniprot IDs are available in UniProt. [Source](github.com/bartongroup/LIGYSIS-web/blob/master/static/data/LIGYSIS_protein_names_dict_RF3.pkl)

In [None]:
import pickle
import json

DATASET_PATH = f'{DATA_PATH}/cryptobench-dataset/folds/test.json'

with open(DATASET_PATH, 'r') as f:
    data = json.load(f)

uniprot_ids = [entry[0]['uniprot_id'] for key, entry in data.items() if '-' not in entry[0]['uniprot_id']]

with open(f'{DATA_PATH}/LIGYSIS-files/LIGYSIS_protein_names_dict_RF3.pkl', 'rb') as f:
    pkl_data = pickle.load(f)

In [2]:
set_pkl_ids = set(pkl_data.values())

## Which UniProt IDs were missing in the LIGYSIS-web
214/222 Uniprot IDs are available in LIGYSIS-web.

In [3]:
for uniprot_id in uniprot_ids:
    if uniprot_id not in set_pkl_ids:
        print(uniprot_id)

D2B3F1
Q9H0M0
Q8NB16
A9WIU3
A0A3E2YLT4
P50286
Q8DQ84
Q86W50


# Load LIGYSIS data
Analyze the data from LIGYSIS-web database

In [None]:
import pickle
with open(f'{DATA_PATH}/LIGYSIS-files/LIGYSIS_master_fps_dict.pkl', 'rb') as f:
    pkl_data = pickle.load(f)

## Filter relevant ligands
See CryptoBench study which ligands are considered relevant.

In [None]:
import os, csv
import sys
import pandas as pd
from rdkit import Chem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
sys.path.append('/home/vit/Projects/cryptoshow-analysis/src/utils')
import cryptoshow_utils

IGNORED_GROUPS_LIST = ['HOH', 'DOD', 'WAT', 'UNK', 'ABA', 'MPD', 'GOL', 'SO4', 'PO4']
P2RANK_ATOMS_NUM_THRESHOLD = 5

# load all ligand smiles
ligand_smiles_df = pd.read_csv(f'{DATA_PATH}/ligand.tsv', sep='\t')
cached_smiles = {}


def check_ligand_atom_count(smiles):
    for i in str(smiles).split(';'):
        i = i.strip()
        if i in cached_smiles.keys():
            if cached_smiles[i]:
                return True
            else:
                continue
        try:
            molecule = Chem.MolFromSmiles(i)
            atoms_count = molecule.GetNumAtoms()
        except:
            cached_smiles[i] = False
            continue

        is_valid_smiles = True
        if atoms_count < P2RANK_ATOMS_NUM_THRESHOLD:
            is_valid_smiles = False

        cached_smiles[i] = is_valid_smiles
        return is_valid_smiles

    return False

def is_in_ignored_group(ligand):
    return ligand in IGNORED_GROUPS_LIST

def is_valid_ligand(ligand):
    if ligand in ligand_smiles_df['#CCD'].values:
        smiles_versions = ligand_smiles_df.loc[ligand_smiles_df['#CCD'] == ligand, 'SMILES'].values[0].split(';')
        for smiles in smiles_versions:
            # check if in cached smiles:
            if smiles in cached_smiles:
                return cached_smiles[smiles]
            # check if in ignored groups
            if is_in_ignored_group(ligand):
                cached_smiles[smiles] = False
                return False
        # check atom count
        return check_ligand_atom_count(smiles_versions)
    else:
        return False

def initialize_cached_smiles():
    # read cached smiles
    cached_smiles_path = f'{DATA_PATH}/cached_smiles.csv'
    if os.path.exists(cached_smiles_path):
        with open(cached_smiles_path, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter=';')
            for row in reader:
                cached_smiles[row[0].strip()] = row[1] == 'True'

def filter_ligysis_data(data):
    initialize_cached_smiles()
    filtered_data = {}
    for uniprot_id in data.keys():
        for segment_id in data[uniprot_id].keys():
            for POI in data[uniprot_id][segment_id].keys():
                [_, ligand, _, _] = POI.split('_')
                if is_valid_ligand(ligand):
                    if uniprot_id not in filtered_data:
                        filtered_data[uniprot_id] = {}
                    if segment_id not in filtered_data[uniprot_id]:
                        filtered_data[uniprot_id][segment_id] = {}
                    filtered_data[uniprot_id][segment_id][POI] = data[uniprot_id][segment_id][POI]
    return filtered_data

ligysis = filter_ligysis_data(pkl_data)

## Map binding sites onto the CryptoBench apo structures
Take the output of pdb2uniprot tool to map the binding sites from UniProt sequences onto the PDB structures.

In [None]:
import json

DATASET_PATH = f'{DATA_PATH}/cryptobench-dataset/folds/test.json'

with open(DATASET_PATH, 'r') as f:
    cryptobench_data = json.load(f)

cryptobench_data = {key: entry for key, entry in cryptobench_data.items() if '-' not in entry[0]['apo_chain']}

In [None]:
import csv

# mapping of UniProt to PDB positions
uniprot_to_pdb = {}
with open(f'{DATA_PATH}/pdb2uniprot.tsv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader)  # Skip header 
    for row in reader:
        pdb_id = row[0]
        chain_id = row[1]
        protein_id = pdb_id + chain_id
        uniprot_id = row[4]
        # UniProt has to be in LIGYSIS - the LIGYSIS contains UniProts only from the CryptoBench test set
        # or, the protein is not in the cryptobench TEST dataset, because the pdb2uniprot.tsv also contains entries from the TRAIN/VALIDATION dataset
        assert uniprot_id in ligysis or protein_id not in cryptobench_data, f"{uniprot_id} not in ligysis and {protein_id} in cryptobench_data"
        if uniprot_id not in uniprot_to_pdb:
            uniprot_to_pdb[uniprot_id] = {}
        pdb_position = row[3]
        if pdb_position == 'null':
            continue
        uniprot_position = row[6]
        uniprot_to_pdb[uniprot_id][uniprot_position] = pdb_position


## Merge LIGYSIS and CryptoBench
Map LIGYSIS binding sites onto the CryptoBench APO structures if the LIGYSIS binding site residues are present in the APO structure

In [5]:
number_of_POIs = 0
number_of_valid_POIs = 0

cryptobench_binding_sites = {}
# Add LIGYSIS binding sites to the CryptoBench data if all LIGYSIS residues are present in the CryptoBench apo structure
for pdb_id, entries in cryptobench_data.items():
    cryptobench_binding_sites[pdb_id] = {}
    uniprot_id = entries[0]['uniprot_id']

    assert uniprot_id in uniprot_to_pdb, f"{uniprot_id} not in uniprot_to_pdb"
    # 214 out of 222 structures from the CryptoBench TEST set have a record in LIGYSIS
    if uniprot_id in ligysis:
        # add LIGYSIS binding sites if all LIGYSIS residues are present in the CryptoBench apo structure
        for segment_ids in ligysis[uniprot_id].keys():
            for segment_id in segment_ids:
                for POI in ligysis[uniprot_id][segment_id].keys():
                    # check if the LIGYSIS residues are present in the CryptoBench apo structure
                    all_residues_present = True
                    # map the LIGYSIS residues from UniProt to PDB positions
                    mapped_residues = []
                    for residue in ligysis[uniprot_id][segment_id][POI]:
                        if str(residue) not in uniprot_to_pdb[uniprot_id]:
                            all_residues_present = False
                            break
                        mapped_residues.append(uniprot_to_pdb[uniprot_id][str(residue)])
                    if all_residues_present:
                        cryptobench_binding_sites[pdb_id][POI + '_LIGYSIS'] = mapped_residues
                        number_of_valid_POIs += 1
                    number_of_POIs += 1
print(f"{number_of_valid_POIs}/{number_of_POIs} valid POIs ({number_of_valid_POIs/number_of_POIs*100:.2f}%) where all LIGYSIS residues are present in the CryptoBench apo structures (removal of binding sites with missing residues in APO structures)")

# Add CryptoBench binding sites
for pdb_id, entries in cryptobench_data.items():
    for entry in entries:
        holo_pdb_id = entry['holo_pdb_id']
        ligand = entry['ligand']
        ligand_index = entry['ligand_index']
        ligand_chain_id = entry['ligand_chain']
        residues = [i.split('_')[1] for i in entry['apo_pocket_selection']]

        POI = '_'.join([holo_pdb_id, ligand, ligand_chain_id, ligand_index, 'CryptoBench'])
        if pdb_id not in cryptobench_binding_sites:
            cryptobench_binding_sites[pdb_id] = {}
        cryptobench_binding_sites[pdb_id][POI] = residues

5042/5326 valid POIs (94.67%) where all LIGYSIS residues are present in the CryptoBench apo structures (removal of binding sites with missing residues in APO structures)


## Cluster binding sites
Use LIGYSIS methodology to cluster binding sites together.

In [10]:
CLUSTER_DISTANCE = 0.5 # distance threshold to cut the hierarchical clustering tree, threshold follows the LBS comparison paper: https://jcheminf.biomedcentral.com/articles/10.1186/s13321-024-00923-z)
CLUSTERING_METHOD = 'average' # method to use for hierarchical clustering; see https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html

# This code was taken from the LIGYSIS repository (https://github.com/bartongroup/LIGYSIS/blob/running-arpeggio/ligysis.py); see LICENSE file

def get_intersect_rel_matrix(binding_ress):
    """
    Given a set of ligand binding residues, calculates a
    similarity matrix between all the different sets of ligand
    binding residues.
    """
    inters = {i: {} for i in range(len(binding_ress))}
    for i in range(len(binding_ress)):
        inters[i][i] = intersection_rel(binding_ress[i], binding_ress[i])
        for j in range(i+1, len(binding_ress)):
            inters[i][j] = intersection_rel(binding_ress[i], binding_ress[j])
            inters[j][i] = inters[i][j]
    return inters

def intersection_rel(l1, l2):
    """
    Calculates relative intersection.
    """
    len1 = len(l1)
    len2 = len(l2)
    I_max = min([len1, len2])
    I = len(list(set(l1).intersection(l2)))
    return I/I_max

def get_binding_site_clusters(binding_sites):
    """Get binding site clusters.

    Args:
        binding_sites (list): List of binding sites.
    """
    import scipy
    import pandas as pd
    
    irel_matrix = get_intersect_rel_matrix(binding_sites)
    irel_df = pd.DataFrame(irel_matrix)
    dist_df = 1 - irel_df # distance matrix in pd.Dataframe() format
    condensed_dist_mat = scipy.spatial.distance.squareform(dist_df) # condensed distance matrix to be used for clustering
    linkage = scipy.cluster.hierarchy.linkage(condensed_dist_mat, method=CLUSTERING_METHOD, optimal_ordering=True)
    return scipy.cluster.hierarchy.cut_tree(linkage, height=CLUSTER_DISTANCE)

### Create annotations
Create sequences and map the PDB labeling onto those sequences. Get distance matrix of all the residues.

In [None]:
import numpy as np

with open(f'{DATA_PATH}/cryptobench-clustered-binding-sites.csv', 'w') as out_f:
    for pdb_id, binding_sites in cryptobench_binding_sites.items():
        POIs = list(binding_sites.keys())
        binding_sites = list(binding_sites.values())
        # get clusters of binding sites
        if len(binding_sites) > 1:
            clusters = get_binding_site_clusters(binding_sites).reshape(-1)
        else:
            clusters = [0]
        # loop over each cluster, merge it together, collect the ligands and check if it is cryptic
        for cluster_id in range(max(clusters) + 1):
            is_cryptic = False
            binding_ligands = set()
            binding_residues = set()
            for i, binding_site in enumerate(binding_sites):
                assert len(clusters) == len(POIs), "Length mismatch"
                if clusters[i] == cluster_id:
                    # collect the ligand
                    [_, ligand, _, _, source_dataset] = POIs[i].split('_')
                    binding_ligands.add(ligand)
                    # check if it is cryptic
                    if source_dataset == 'CryptoBench':
                        is_cryptic = True
                    # collect the residues
                    binding_residues.update(binding_site)
            
            chain_id = cryptobench_data[pdb_id][0]['apo_chain']

            # retrieve sequence from mmCIF file and map the residues to the mmCIF numbering
            binding_residues, sequence = cryptoshow_utils.map_auth_to_mmcif_numbering(pdb_id, chain_id, binding_residues)

            # get coordinates
            coordinates = cryptoshow_utils.get_coordinates(pdb_id, chain_id)

            # write to file
            out_f.write(f"{pdb_id}{cryptobench_data[pdb_id][0]['apo_chain']};{' '.join(binding_ligands)};{'CRYPTIC' if is_cryptic else 'NON_CRYPTIC'};{' '.join(binding_residues)};{sequence}\n")
            np.save(f'{DATA_PATH}/coordinates/{pdb_id}{chain_id}.npy', coordinates)

# Extract CryptoBench train set
Extract per-pocket annotations of the CryptoBench train set.

In [None]:
import json

DATASET_PATH = [f'{DATA_PATH}/folds/train-fold-0.json',
                f'{DATA_PATH}/folds/train-fold-1.json',
                f'{DATA_PATH}/folds/train-fold-2.json',
                f'{DATA_PATH}/folds/train-fold-3.json']

cryptobench_data = {}
for DATASET_PATH_i in DATASET_PATH:
    with open(DATASET_PATH_i, 'r') as f:
        cryptobench_data_i = json.load(f)
    cryptobench_data.update(cryptobench_data_i)

cryptobench_data = {key: entry for key, entry in cryptobench_data.items() if '-' not in entry[0]['apo_chain']}

### Load mapping 
Get numbering mapping from sequence to structure.

In [None]:
import csv

# mapping of UniProt to PDB positions
uniprot_to_pdb = {}
with open(f'{DATA_PATH}/pdb2uniprot.tsv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader)  # Skip header 
    for row in reader:
        pdb_id = row[0]
        chain_id = row[1]
        protein_id = pdb_id + chain_id
        uniprot_id = row[4]

        if uniprot_id not in uniprot_to_pdb:
            uniprot_to_pdb[uniprot_id] = {}
        pdb_position = row[3]
        if pdb_position == 'null':
            continue
        uniprot_position = row[6]
        uniprot_to_pdb[uniprot_id][uniprot_position] = pdb_position


### Extract from JSON
Extract relevant data from the CryptoBench JSON.

In [5]:
cryptobench_binding_sites = {}

for pdb_id, entries in cryptobench_data.items():
    for entry in entries:
        holo_pdb_id = entry['holo_pdb_id']
        ligand = entry['ligand']
        ligand_index = entry['ligand_index']
        ligand_chain_id = entry['ligand_chain']
        residues = [i.split('_')[1] for i in entry['apo_pocket_selection']]

        POI = '_'.join([holo_pdb_id, ligand, ligand_chain_id, ligand_index, 'CryptoBench'])
        if pdb_id not in cryptobench_binding_sites:
            cryptobench_binding_sites[pdb_id] = {}
        cryptobench_binding_sites[pdb_id][POI] = residues

In [None]:
import numpy as np

with open(f'{DATA_PATH}/cryptobench-train-clustered-binding-sites.txt', 'w') as out_f:
    for pdb_id, binding_sites in cryptobench_binding_sites.items():
        POIs = list(binding_sites.keys())
        binding_sites = list(binding_sites.values())
        # get clusters of binding sites
        if len(binding_sites) > 1:
            clusters = get_binding_site_clusters(binding_sites).reshape(-1)
        else:
            clusters = [0]
        # loop over each cluster, merge it together, collect the ligands and check if it is cryptic
        for cluster_id in range(max(clusters) + 1):
            binding_ligands = set()
            binding_residues = set()
            for i, binding_site in enumerate(binding_sites):
                assert len(clusters) == len(POIs), "Length mismatch"
                if clusters[i] == cluster_id:
                    # collect the ligand
                    [_, ligand, _, _, source_dataset] = POIs[i].split('_')
                    binding_ligands.add(ligand)
                    # collect the residues
                    binding_residues.update(binding_site)
            
            chain_id = cryptobench_data[pdb_id][0]['apo_chain']

            # retrieve sequence from mmCIF file and map the residues to the mmCIF numbering
            binding_residues, sequence = cryptoshow_utils.map_auth_to_mmcif_numbering(pdb_id, chain_id, binding_residues)

            # get coordinates
            coordinates = cryptoshow_utils.get_coordinates(pdb_id, chain_id)

            # write to file
            out_f.write(f"{pdb_id}{cryptobench_data[pdb_id][0]['apo_chain']};{' '.join(binding_ligands)};'CRYPTIC';{' '.join(binding_residues)};{sequence}\n")
            np.save(f'{DATA_PATH}/coordinates/{pdb_id}{chain_id}.npy', coordinates)