In [None]:
from moldf import read_mol2
import os

PATH = '/home/skrhakv/Near-Hit-Scoring/data/scPDB'


# loop over mol2 files
Each entry directory contains two files that we need:
1. `site.mol2`: pocket residues
2. `protein.mol2`: all residues

We load both and extract the sequence and the numbering of the binding residues.

In [None]:
# Lookup table from a three-letter residue code (first letter upper-case, rest
# lower-case, e.g. 'Ala') to its one-letter code. Besides the standard amino
# acids it lists a number of modified residues; several entries map to 'X'.
mapping = {'Aba': 'A', 'Ace': 'X', 'Acr': 'X', 'Ala': 'A', 'Aly': 'K', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cas': 'C',
           'Ccs': 'C', 'Cme': 'C', 'Csd': 'C', 'Cso': 'C', 'Csx': 'C', 'Cys': 'C', 'Dal': 'A', 'Dbb': 'T', 'Dbu': 'T',
           'Dha': 'S', 'Gln': 'Q', 'Glu': 'E', 'Gly': 'G', 'Glz': 'G', 'His': 'H', 'Hse': 'S', 'Ile': 'I', 'Leu': 'L',
           'Llp': 'K', 'Lys': 'K', 'Men': 'N', 'Met': 'M', 'Mly': 'K', 'Mse': 'M', 'Nh2': 'X', 'Nle': 'L', 'Ocs': 'C',
           'Pca': 'E', 'Phe': 'F', 'Pro': 'P', 'Ptr': 'Y', 'Sep': 'S', 'Ser': 'S', 'Thr': 'T', 'Tih': 'A', 'Tpo': 'T',
           'Trp': 'W', 'Tyr': 'Y', 'Unk': 'X', 'Val': 'V', 'Ycm': 'C', 'Sec': 'U', 'Pyl': 'O', 'Mhs': 'H', 'Snm': 'S',
           'Mis': 'S', 'Seb': 'S', 'Hic': 'H', 'Fme': 'M', 'Asb': 'D', 'Sah': 'C', 'Smc': 'C', 'Tpq': 'Y', 'Onl': 'X',
           'Tox': 'W', '5x8': 'X', 'Ddz': 'A'}


def three_to_one(three_letter_code):
    """Translate a three-letter residue code into its one-letter code.

    The lookup is case-insensitive: the code is normalised to "first character
    upper-case, remainder lower-case" before it is looked up in ``mapping``.

    Args:
        three_letter_code: residue code such as ``'ALA'``, ``'ala'`` or ``'Mse'``.

    Returns:
        The one-letter code from ``mapping``, or ``'X'`` when the code is not
        listed (this includes the empty string).
    """
    # Slicing with [:1] instead of indexing with [0] keeps an empty string from
    # raising IndexError; it simply falls through to the 'X' default.
    normalized_code = three_letter_code[:1].upper() + three_letter_code[1:].lower()
    return mapping.get(normalized_code, 'X')

# directory name -> (one-letter sequence, binding residue labels).
# A label is '<one-letter code><0-based position in the sequence>'.
sequences = {}
for directory in os.listdir(PATH):
    site_mol_file = read_mol2(mol2_file=f'{PATH}/{directory}/site.mol2')
    site = site_mol_file['ATOM']
    # One entry per residue (its CA atom). Kept as a set so the membership test
    # in the loop below is O(1) instead of a scan over a list.
    # NOTE(review): residues are matched on 'subst_name' only - confirm that the
    # same subst_name cannot occur in two different chains of protein.mol2.
    binding_residues = set(site[site['atom_name'] == 'CA']['subst_name'].tolist())

    protein_mol_file = read_mol2(mol2_file=f'{PATH}/{directory}/protein.mol2')
    protein = protein_mol_file['ATOM']
    all_residues = protein[protein['atom_name'] == 'CA']['subst_name'].tolist()
    sequence_letters = []
    binding_residues_arr = []

    for i, res in enumerate(all_residues):
        # The first three characters of subst_name are used as the residue code.
        one_letter = three_to_one(res[:3])
        sequence_letters.append(one_letter)
        if res in binding_residues:
            binding_residues_arr.append(f'{one_letter}{i}')

    # Join once at the end rather than growing a string inside the loop.
    sequences[directory] = (''.join(sequence_letters), binding_residues_arr)

# Use 90_SI subset
The scPDB was checked for sequence identity. It was shown that the optimal subset of scPDB for training is the one filtered for 90% sequence identity (https://github.com/podleyan/Near-Hit-Scoring).

Let's use this subset for training as well.

In [48]:
import pandas as pd

csv_path = '/home/skrhakv/Near-Hit-Scoring/data/input/proteins_SI_90.csv'
proteins_si = pd.read_csv(csv_path, low_memory=False)

# split entries like "1iki_A" into pdb_id and chain
proteins_si[['pdb_id', 'chain']] = proteins_si.iloc[:, 0].str.split('_', expand=True)
proteins_90_SI = proteins_si['pdb_id'].tolist()
# Set copy of the subset ids: the loop below tests membership once per entry.
proteins_90_SI_lookup = set(proteins_90_SI)

# PDB ids already written to the 90% SI file; only the first entry encountered
# for each PDB id is kept there.
proteins_added_90_SI = set()
with open('/home/skrhakv/Near-Hit-Scoring/data/input/scPDB_all.csv', 'w') as f, \
        open('/home/skrhakv/Near-Hit-Scoring/data/input/scPDB_90_SI.csv', 'w') as f90:
    for protein_id, (sequence, binding_residues) in sequences.items():
        # protein_id is a directory name of the form '<pdb_id>_<suffix>'.
        # NOTE(review): the suffix is written into the chain column - confirm it
        # really is a chain id and not a site index.
        pdb_id, chain_id = protein_id.split('_')
        # Row layout: pdb_id;chain_id;UNKNOWN;<space-separated residues>;<sequence>
        line = f'{pdb_id};{chain_id};UNKNOWN;{" ".join(binding_residues)};{sequence}\n'
        f.write(line)
        if pdb_id in proteins_90_SI_lookup and pdb_id not in proteins_added_90_SI:
            f90.write(line)
            proteins_added_90_SI.add(pdb_id)

# generate distance matrices

In [None]:
import pandas as pd
import sys
sys.path.append('/home/skrhakv/cryptoshow-analysis/src/B-evaluate-cryptoshow')
import eval_utils

DISTANCE_MATRICES_PATH = '/home/skrhakv/cryptoshow-analysis/data/E-regular-binding-site-predictor/scPDB-distance-matrices'
csv_path = '/home/skrhakv/Near-Hit-Scoring/data/input/proteins_SI_90.csv'
proteins_si = pd.read_csv(csv_path, low_memory=False)

# split entries like "1iki_A" into pdb_id and chain
proteins_si[['pdb_id', 'chain']] = proteins_si.iloc[:, 0].str.split('_', expand=True)
proteins_90_SI = proteins_si['pdb_id'].tolist()
# Set copy of the subset ids for O(1) membership tests in the loop below.
proteins_90_SI_lookup = set(proteins_90_SI)

import numpy as np
for directory in os.listdir(PATH):
    pdb_id = directory.split('_')[0]
    # Filter before parsing: entries outside the 90% SI subset are skipped
    # without reading their protein.mol2 at all.
    if pdb_id not in proteins_90_SI_lookup:
        continue
    protein_mol_file = read_mol2(mol2_file=f'{PATH}/{directory}/protein.mol2')
    protein_id = directory.replace('_', '')
    protein = protein_mol_file['ATOM']
    # One (x, y, z) coordinate triple per CA atom.
    all_residues = protein[protein['atom_name'] == 'CA'][['x', 'y', 'z']].values.tolist()
    distance_matrix = eval_utils.compute_distance_matrix(np.array(all_residues))
    np.save(f'{DISTANCE_MATRICES_PATH}/{protein_id}.npy', distance_matrix)


# generate sequences
Write each sequence to its own text file; these files are the input for generating the ESM2 embeddings.

In [None]:
import csv

# Write the sequence of every row of the 90% SI CSV to '<pdb_id><chain_id>.txt'.
# NOTE(review): this CSV is read from the cryptoshow-analysis tree, whereas the
# cell that creates scPDB_90_SI.csv writes it under Near-Hit-Scoring/data/input -
# confirm the file is copied there before this cell runs.
with open('/home/skrhakv/cryptoshow-analysis/data/E-regular-binding-site-predictor/scPDB_90_SI.csv', 'r') as subset_csv:
    for record in csv.reader(subset_csv, delimiter=";"):
        # Field 0 and field 1 together form the file name; field 4 is the sequence.
        protein_id = f'{record[0]}{record[1]}'
        sequence = record[4]
        target_path = f'/home/skrhakv/esm2/data/scPDB/{protein_id}.txt'
        with open(target_path, 'w') as seq_file:
            seq_file.write(sequence)

# Extract the correct chain IDs
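The chain IDs from `proteins_SI_90.csv` are incorrect. Extract the correct ones from each entry's `IFP.txt`: the chain that contributes the most binding-site residues is taken as the chain of the entry.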

In [41]:
import os
import re
import string
# One-letter codes of the 20 standard amino acids; anything else is skipped below.
letters = 'ACDEFGHIKLMNPQRSTVWY'
# NOTE(review): this rebinds PATH to a different home directory than the first
# cell of the notebook; cells re-run after this one will see the new value.
PATH = '/home/vit/Projects/Near-Hit-Scoring/data/scPDB'


# '<pdb_id><chain_id>' -> residue numbers (as strings) of the binding site.
# Several entry directories can share one key; their residues are merged.
binding_sites = {}
for directory in os.listdir(PATH):
    pdb_id = directory.split('_')[0]
    # Only the first line of IFP.txt is used: '|'-separated entries, the first
    # of which is dropped.
    with open(f'{PATH}/{directory}/IFP.txt') as f:
        binding_site = f.readline()
    binding_site = binding_site.strip().split('|')[1:]

    # Collapse whitespace runs and tokenise each entry once:
    # 'B  S215' -> 'B S215' -> ['B', 'S215']
    tokenized_site = [re.sub(r'\s+', ' ', binding_residue).split(' ') for binding_residue in binding_site]

    # determine "main" chain id (the one with most residues in binding site)
    chain_ids = {}
    for tokens in tokenized_site:
        chain_ids[tokens[0]] = chain_ids.get(tokens[0], 0) + 1
    # max() returns the first key with the highest count, i.e. ties go to the
    # chain seen first; chain_id is None when the binding site is empty.
    chain_id = max(chain_ids, key=chain_ids.get, default=None)

    reformated_binding_site = []
    for binding_residue_parts in tokenized_site:
        if len(binding_residue_parts) < 2: # skip malformed entries ('BNAD601' etc)
            continue
        if binding_residue_parts[0] != chain_id:
            continue
        parts = re.findall(r'\D+|\d+', binding_residue_parts[1]) # split into letters and numbers ('S215' -> ['S', '215'])
        if len(parts) < 2: # nothing follows the first run ('S', '215', '') -> no residue number to take
            continue
        if len(parts[0]) > 1 or parts[0] not in letters: # skip non-standard amino acids (not sure what can occur here)
            continue
        reformated_binding_site.append(parts[1])

    # The dictionary is keyed by pdb id + chain id, so the "already seen" test
    # has to use that same key (testing the bare pdb_id never matches).
    key = f'{pdb_id}{chain_id}'
    if key not in binding_sites:
        binding_sites[key] = reformated_binding_site
    else:
        # Merge with the residues collected from an earlier entry of the same
        # chain; dict.fromkeys drops duplicates while keeping first-seen order.
        binding_sites[key] = list(dict.fromkeys(binding_sites[key] + reformated_binding_site))

with open('/home/vit/Projects/cryptoshow-analysis/data/E-regular-binding-site-predictor/full_scPDB.csv', 'w') as output_file:
    for key in binding_sites:
        # The first four characters of the key are written as the PDB id, the rest as the chain id.
        output_file.write(f'{key[:4]};{key[4:]};UNKNOWN;{" ".join(binding_sites[key])};UNKNOWN\n')

# Map auth to mmcif numbering
Map the AUTH residue numbering to the mmCIF numbering and extract the sequences as well.
## WARNING: here we run the enhancement using the AHoJ-DB (see branch [scPDB_enhancement in CryptoBench](https://github.com/skrhakv/CryptoBench/tree/scPDB_enhancement))


In [19]:
# TODO: take code from visualize.ipynb and adapt here
import sys
import numpy as np

# Input CSV; used as the default `data_path` of read_test_binding_residues,
# which reads it with ';' as the delimiter.
DATA_PATH = '/home/vit/Projects/cryptoshow-analysis/data/E-regular-binding-site-predictor/scPDB_enhanced_binding_sites.csv'
# NOTE(review): PRECOMPUTED is not referenced in the visible part of this notebook.
PRECOMPUTED = True

# Make the project-local helper modules importable. The paths are absolute and
# machine-specific.
sys.path.append('/home/vit/Projects/cryptoshow-analysis/src/D-visualize')
import vis_utils

sys.path.append('/home/vit/Projects/cryptoshow-analysis/src/B-evaluate-cryptoshow')
sys.path.append('/home/vit/Projects/cryptoshow-analysis/src')
import eval_utils
import cryptoshow_utils
import csv

def read_test_binding_residues(data_path=DATA_PATH, pocket_types=('CRYPTIC',)) -> dict[str, list[list[str]]]:
    """Read binding residues of the requested pocket types from a ';'-separated file.

    Each row is indexed as: field 0 = PDB id, field 1 = chain id,
    field 2 = pocket type, field 3 = space-separated residue entries.

    Args:
        data_path: path of the file to read.
        pocket_types: collection of pocket-type strings; only rows whose
            field 2 is contained in it are kept.

    Returns:
        A dict mapping ``'<pdb_id><chain_id>'`` to a list of pockets. Each
        pocket is a list of ``'<chain_id>_<residue entry>'`` strings, one
        pocket per matching row.
    """
    binding_residues_by_protein = {}

    with open(data_path, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        for row in reader:
            # csv.reader yields an empty list for a blank line; skip it instead
            # of failing on the index access below.
            if not row:
                continue

            chain_id = row[1]
            pdb_id = row[0]
            protein_id = f'{pdb_id}{chain_id}'

            # Rows without any binding residue carry no information.
            if row[3] == '':
                continue

            binding_residue_indices = [f'{chain_id}_{i}' for i in row[3].split(' ')]
            if row[2] in pocket_types:
                binding_residues_by_protein.setdefault(protein_id, []).append(binding_residue_indices)

    return binding_residues_by_protein

def reformat_binding_residues(binding_residues: dict) -> dict:
    """Flatten the single pocket of every protein into an array of digit strings.

    Args:
        binding_residues: maps a protein id to a list holding exactly one
            pocket; the pocket is a list of ``'<prefix>_<residue>'`` strings.

    Returns:
        A dict keyed by the protein id with every ``'_'`` removed. Each value is
        a numpy array with, per residue, only the digit characters of the part
        after the first ``'_'``.

    Raises:
        AssertionError: if a protein does not have exactly one pocket.
    """
    result = {}
    for raw_protein_id, pockets in binding_residues.items():
        assert len(pockets) == 1, "Expected only one pocket per protein in scPDB"
        digit_strings = []
        for label in pockets[0]:
            residue_part = label.split('_')[1]
            # Keep the digits only, e.g. 'S215' -> '215'.
            digit_strings.append(''.join(ch for ch in residue_part if ch.isdigit()))
        result[raw_protein_id.replace('_', '')] = np.array(digit_strings)
    return result

# Load the ground-truth binding residues of the NON_CRYPTIC rows and reduce each
# protein's single pocket to an array of residue-number strings.
# NOTE(review): the original comment said these carry mmcif numbering and need
# mapping to auth, while the helper called later is named
# map_auth_to_mmcif_numbering - confirm which direction is intended.
non_cryptic_pockets = read_test_binding_residues(data_path=DATA_PATH, pocket_types=['NON_CRYPTIC'])
binding_residues_mmcifed = reformat_binding_residues(non_cryptic_pockets)


In [None]:
# Write one row per protein: pdb_id;chain_id;UNKNOWN;<mapped residues>;<sequence>
with open("/home/vit/Projects/cryptoshow-analysis/data/E-regular-binding-site-predictor/scPDB_enhanced_binding_sites_translated.csv", 'w') as translated_csv:
    for protein_id, pocket in binding_residues_mmcifed.items():
        # The first four characters of the id are passed as the PDB id, the rest as the chain id.
        pdb_id, chain_id = protein_id[:4], protein_id[4:]
        binding_residues_mapped, sequence = cryptoshow_utils.map_auth_to_mmcif_numbering(pdb_id, chain_id, pocket)
        residue_field = " ".join(map(str, binding_residues_mapped))
        translated_csv.write(f'{pdb_id};{chain_id};UNKNOWN;{residue_field};{sequence}\n')