In [13]:
import sys
import pymol
# Remember the current stdout/stderr so they can be put back after PyMOL starts
# (presumably the launch swaps out the notebook's streams — TODO confirm).
_stdouterr = sys.stdout, sys.stderr
# Launch PyMOL with the given argv; '-q' is PyMOL's quiet-startup flag.
pymol.finish_launching(['/usr/bin/pymol', '-q'])
# Restore the streams captured above so later cell output uses them again.
sys.stdout, sys.stderr = _stdouterr

# PyMOL command API; the visualization loop drives the PyMOL session through it.
from pymol import cmd

# Directory handed to PyMOL as its `fetch_path` in the visualization loop.
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
CIF_FILES = '/home/vit/Projects/deeplife-project/data/cif_files'

In [2]:
import numpy as np
import os

def read_predictions(data_path: str, protein_ids: list[str]) -> dict[str, np.ndarray]:
    '''Load the pickled predictions of every requested protein.

    For each protein ID the file name is the ID with all underscores removed,
    and `<data_path>/<file name>.pkl` is unpickled.

    Note: `pickle.load` can execute arbitrary code, so only point this at
    prediction files you produced or otherwise trust.

    Args:
        data_path (str): Directory that holds the prediction pickle files.
        protein_ids (list of str): Protein IDs whose predictions are loaded.
    Returns:
        dict: Maps each protein ID (exactly as passed in) to the object
            unpickled from its file.
    '''
    import pickle

    def _load_one(protein_id):
        # File names carry no underscore, e.g. '1abc_A' -> '1abcA.pkl'.
        stem = protein_id.replace('_', '')
        with open(f'{data_path}/{stem}.pkl', 'rb') as handle:
            return pickle.load(handle)

    return {protein_id: _load_one(protein_id) for protein_id in protein_ids}

def reformat_binding_residues(binding_residues: dict) -> dict:
    '''Convert raw binding-residue labels into integer residue-number arrays.

    Args:
        binding_residues (dict): Maps a protein ID to a list of pockets. Each
            pocket is an iterable of residue labels whose second
            underscore-separated field is the residue number (e.g. 'A_123';
            presumably '<chain>_<number>' — confirm against the reader).
    Returns:
        dict: Maps the protein ID with underscores removed to a list holding
            one integer NumPy array of residue numbers per pocket.
    Raises:
        IndexError: If a residue label contains no underscore.
        ValueError: If the second field of a label is not an integer.
    '''
    reformatted = {}
    for protein_id, pockets in binding_residues.items():
        # dtype=int keeps empty pockets integer-typed; NumPy's default for an
        # empty list is float64, which would turn residue numbers into floats
        # (e.g. 10.0) once pockets are concatenated.
        reformatted[protein_id.replace('_', '')] = [
            np.array([int(residue.split('_')[1]) for residue in pocket], dtype=int)
            for pocket in pockets
        ]
    return reformatted

In [27]:
import sys

# Make the project's helper modules importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
for _src_dir in (
    '/home/vit/Projects/cryptoshow-analysis/src/B-evaluate-cryptoshow',
    '/home/vit/Projects/cryptoshow-analysis/src',
):
    if _src_dir not in sys.path:
        sys.path.append(_src_dir)
import eval_utils
import cryptoshow_utils

CBS_DATA_PATH = '/home/vit/Projects/cryptoshow-analysis/data/A-cluster-ligysis-data/clustered-binding-sites.txt'
PREDICTIONS_PATH = '/home/vit/Projects/cryptoshow-analysis/data/D-visualize/predictions'


def _map_pockets_to_auth(pockets_by_protein: dict) -> dict:
    '''Map every pocket of every protein from mmCIF to auth residue numbering.

    Each key is split into its first four characters and the remainder, which
    are passed to `cryptoshow_utils.map_mmcif_numbering_to_auth` together with
    the pocket (presumably PDB ID and chain ID — confirm against that helper).

    Args:
        pockets_by_protein (dict): Maps a protein ID to a list of pockets.
    Returns:
        dict: Same keys, each mapped to the list of converted pockets.
    '''
    return {
        protein_id: [
            cryptoshow_utils.map_mmcif_numbering_to_auth(protein_id[:4], protein_id[4:], pocket)
            for pocket in pockets
        ]
        for protein_id, pockets in pockets_by_protein.items()
    }


# load ground truth binding residues: these have mmcif numbering and need to be mapped to auth labeling
binding_residues, _ = eval_utils.read_test_binding_residues(data_path=CBS_DATA_PATH)
binding_residues = reformat_binding_residues(binding_residues)

# load predictions for exactly the proteins that have ground truth
p2rank_predictions = read_predictions(data_path=f'{PREDICTIONS_PATH}/p2rank', protein_ids=binding_residues.keys())
model_predictions = read_predictions(data_path=f'{PREDICTIONS_PATH}/finetuning-with-smoothing', protein_ids=binding_residues.keys())

# map ground truth and both prediction sets to auth labeling
binding_residues = _map_pockets_to_auth(binding_residues)
p2rank_predictions = _map_pockets_to_auth(p2rank_predictions)
model_predictions = _map_pockets_to_auth(model_predictions)

In [None]:
def generate_pymol_algebra_selection(protein_id: str, residues: np.ndarray) -> str:
    '''Build a PyMOL selection expression for the given residues of one object.

    Args:
        protein_id (str): Name placed first in the selection expression.
        residues (np.ndarray): Residue identifiers; each is converted with
            `str` and the results are joined with '+'.
    Returns:
        str: '<protein_id> and resi <r1>+<r2>+...'. With no residues the
            string ends right after 'resi '.
    '''
    residue_list = "+".join(map(str, residues))
    return f'{protein_id} and resi {residue_list}'


def _flatten_pockets(pockets) -> np.ndarray:
    '''Concatenate per-pocket residue arrays into one flat array (empty array if there are no pockets).'''
    return np.concatenate(pockets) if pockets else np.array([])


# Step through the proteins one at a time: ground truth in red, P2Rank
# predictions in blue, model predictions in green. Later colours overwrite
# earlier ones on residues that appear in more than one set.
for protein_id in binding_residues.keys():
    true_binding_residues = _flatten_pockets(binding_residues[protein_id])
    # Without ground-truth residues there is nothing to inspect: skip before
    # touching PyMOL so the structure is not fetched only to be discarded.
    if len(true_binding_residues) == 0:
        continue

    this_p2rank_predictions = _flatten_pockets(p2rank_predictions[protein_id])
    this_model_predictions = _flatten_pockets(model_predictions[protein_id])

    true_selection = generate_pymol_algebra_selection(protein_id, true_binding_residues)
    print(true_selection)

    # Start from a clean session. fetch_path is set after every reinitialize
    # because reinitialize restores PyMOL's default settings.
    cmd.reinitialize()
    cmd.set('fetch_path', cmd.exp_path(CIF_FILES), quiet=0)
    cmd.fetch(protein_id)
    cmd.zoom(protein_id)
    cmd.color('grey', protein_id)
    cmd.color('red', true_selection)

    if len(this_p2rank_predictions) != 0:
        cmd.color('blue', generate_pymol_algebra_selection(protein_id, this_p2rank_predictions))
    if len(this_model_predictions) != 0:
        cmd.color('green', generate_pymol_algebra_selection(protein_id, this_model_predictions))

    # Only the ground-truth residues are rendered as a surface.
    cmd.show('surface', true_selection)
    user_input = input(">Press Enter for the next protein (press 'q' to quit)...\n")
    if user_input.lower() == 'q':
        break