## Define the structure and pocket
For the **1jwpA** apo structure, the pocket residues were identified using [AHoJ-DB](https://apoholo.cz/db/) by comparing the apo structure to its holo counterpart, the [1fqgA](https://apoholo.cz/db/entry/1fqg-A-PNM-523) structure. You can download the AHoJ-DB results; the pocket selection used below is listed in them.

In [4]:
# Apo structure under study: PDB entry and chain.
pdb_id, chain_id = '1jwp', 'A'

# Pocket residues as space-separated '<chain>_<residue number>' tokens.
pocket = 'A_69 A_70 A_73 A_105 A_130 A_132 A_170 A_216 A_234 A_235 A_236 A_237 A_238 A_240 A_244 A_272'
# Keep only the residue numbers, as integers.
pocket = [int(token.split('_')[1]) for token in pocket.split(' ')]

Retrieve the sequence and the (0-based) indices of the binding residues within the sequence

In [4]:
import biotite.database.rcsb as rcsb
import biotite.structure.io.pdbx as pdbx
from biotite.structure.io.pdbx import get_structure
from biotite.sequence import ProteinSequence
import numpy as np

# Target path handed to rcsb.fetch() for the downloaded mmCIF file.
CIF_FILES_PATH = '/home/vit/Projects/deeplife-project/data/cif_files'

# Fetch the entry in mmCIF format and parse the resulting file.
cif_file_path = rcsb.fetch(pdb_id, "cif", CIF_FILES_PATH)
cif_file = pdbx.CIFFile.read(cif_file_path)

# Reduce the first model to the carbon atoms named "CA" of the selected
# chain, so that iterating `protein` below yields one atom per residue.
# NOTE(review): the element check presumably excludes calcium ions, whose
# atom name is also "CA" — confirm.
protein = get_structure(cif_file, model=1)
protein = protein[
    (protein.chain_id == chain_id)
    & (protein.element == "C")
    & (protein.atom_name == "CA")
]

# One-letter amino-acid sequence, in the order of the selected atoms.
sequence = ''.join(
    ProteinSequence.convert_letter_3to1(atom.res_name) for atom in protein
)

# One '<chain>_<one-letter code><position>' token per pocket residue, where
# <position> is the 0-based index of the residue among the selected atoms
# (and therefore within `sequence`).
binding_indices = [
    f'{atom.chain_id}_{ProteinSequence.convert_letter_3to1(atom.res_name)}{position}'
    for position, atom in enumerate(protein)
    if atom.res_id in pocket
]

Create the annotation and sequence files

In [7]:
# Sanity check: the number of annotation tokens must match the number of
# pocket residues.
assert len(pocket) == len(binding_indices)

# Sequence file, named '<pdb_id><chain_id>.txt'.
with open(f'{pdb_id}{chain_id}.txt', 'w') as sequence_file:
    sequence_file.write(sequence)

# Annotation file holding a single record:
# '<pdb_id>;<chain_id>;UNK;<space-separated binding tokens>'.
with open('annotation.txt', 'w') as annotation_file:
    annotation_file.write(f'{pdb_id};{chain_id};UNK;{" ".join(binding_indices)}')

# ⚠️ CAUTION: ESM2-3B Embedding computation required!
For optimal performance, use a **GPU-equipped machine** when computing ESM2-3B embeddings, especially if processing multiple structures. While computation on a CPU-only machine should be possible, I haven't tested it. 

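*Note: Computation of the ESM2 embedding is not part of this script. To generate embeddings, you may find [this script](https://github.com/skrhakv/esm2-generator/blob/master/compute-esm.py) in the [esm2-generator repository](https://github.com/skrhakv/esm2-generator) useful. The prediction cell below loads the embedding from `{pdb_id}{chain_id}.npy` (here `1jwpA.npy`) in the working directory, so save the computed embedding under that name.*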


### Run the prediction and evaluate

In [None]:
# This script is similar to the script provided in the CryptoBench dataset repository (https://osf.io/pz4a9/).

import numpy as np
from tensorflow import keras
import tensorflow_addons as tfa
import sys

# CAUTION: You need to specify the path to the CryptoBench dataset! It is available at: https://osf.io/pz4a9/
CRYPTOBENCH_PATH = '/path/to/cryptobench'
# Make the dataset's `scripts` directory importable; the `Protein` class used
# by evaluate() is imported from there, so this must run before the import.
sys.path.append(f'{CRYPTOBENCH_PATH}/scripts')
from Protein import Protein

# Location of the trained model inside the dataset; passed to
# keras.models.load_model() in load_model().
MODEL_PATH = f'{CRYPTOBENCH_PATH}/benchmark/best_trained'
# Identifier of the evaluated chain (PDB id + chain id, both defined in the
# first cell). It is also the base name of the embedding file
# '<STRUCTURE_ID>.npy' read by load_data().
STRUCTURE_ID = f'{pdb_id}{chain_id}'

# 0.95 decision threshold was used in the CryptoBench paper
DECISION_THRESHOLD = 0.95


def load_model():
    """Load the trained CryptoBench model stored at ``MODEL_PATH``.

    The model is loaded with ``compile=False``, and the Matthews correlation
    coefficient metric from ``tensorflow_addons`` is supplied under the
    ``'MatthewsCorrelationCoefficient'`` key of ``custom_objects``.

    Returns:
        The object returned by ``keras.models.load_model``.
    """
    print("Loading CryptoBench model ...")
    mcc_metric = tfa.metrics.MatthewsCorrelationCoefficient(num_classes=2)
    custom_objects = {'MatthewsCorrelationCoefficient': mcc_metric}
    return keras.models.load_model(
        MODEL_PATH, custom_objects=custom_objects, compile=False)


def predict(X, model):
    """Run ``model`` on ``X`` and return its output unchanged.

    Args:
        X: Input handed as-is to ``model.predict``.
        model: Object exposing a ``predict`` method.

    Returns:
        Whatever ``model.predict(X)`` returns.
    """
    print("Making prediction ...")
    scores = model.predict(X)
    return scores


def load_data():
    """Load the embeddings and build the per-row binding labels.

    Reads ``<STRUCTURE_ID>.npy`` and ``annotation.txt`` using relative paths,
    i.e. from the current working directory.

    Returns:
        A tuple ``(embeddings, y)``: the array loaded from the ``.npy`` file
        and a list with one entry per row of ``embeddings``, set to ``1`` at
        the annotated positions and ``0`` everywhere else.
    """
    print("Loading data - embeddings and annotations ...")
    embeddings = np.load(f'{STRUCTURE_ID}.npy')

    with open('annotation.txt', 'r') as annotation_file:
        record = annotation_file.read()
    # The fourth ';'-separated field holds the space-separated residue tokens.
    tokens = record.split(';')[3].split(' ')

    # Token format, e.g. 'A_G210': chain 'A', residue type 'G' (Glycine) and
    # 210 = index of the residue's embedding in the embeddings array.
    # All tokens are parsed before any label is written.
    positions = []
    for token in tokens:
        residue = token.split('_')[1]
        positions.append(int(residue[1:]))

    y = [0] * embeddings.shape[0]
    for position in positions:
        y[position] = 1

    return embeddings, y


def print_evaluation(evaluation):
    """Print the evaluation metrics of a single structure.

    Args:
        evaluation: Object providing the ``id``, ``auc``, ``auprc``,
            ``accuracy``, ``mcc`` and ``f1`` attributes and the
            ``get_TPR()`` / ``get_FPR()`` methods.
    """
    print(
        f'\n\n\nEvaluation for {evaluation.id} with decision threshold = {DECISION_THRESHOLD}:\n')

    # Each value is read only when its line is printed, in the listed order.
    metrics = (
        ('AUC', lambda: evaluation.auc),
        ('AUPRC', lambda: evaluation.auprc),
        ('ACC', lambda: evaluation.accuracy),
        ('TPR', lambda: evaluation.get_TPR()),
        ('FPR', lambda: evaluation.get_FPR()),
        ('MCC', lambda: evaluation.mcc),
        ('F1', lambda: evaluation.f1),
    )
    for label, read_value in metrics:
        print(f'{label}: {read_value()}')


def evaluate(prediction, actual):
    """Evaluate a prediction against the true labels and print the metrics.

    Args:
        prediction: Model output for the structure.
        actual: Ground-truth labels for the structure.

    Returns:
        The ``Protein`` object created for ``STRUCTURE_ID`` with the
        ``DECISION_THRESHOLD`` threshold.
    """
    result = Protein(
        STRUCTURE_ID, prediction, actual, threshold=DECISION_THRESHOLD)
    print_evaluation(result)
    return result

# Run the whole pipeline: load the model and the data, predict, then
# evaluate and print the metrics.
# NOTE: `annotations` here is the 0/1 label list returned by load_data().
model = load_model()
embeddings, annotations = load_data()
predictions = predict(embeddings, model)
evaluation = evaluate(predictions, annotations)




Loading CryptoBench model ...
Loading data - embeddings and annotations ...
Making prediction ...



Evaluation for 1jwpA with decision threshold = 0.95:

AUC: 0.9375
AUPRC: 0.6728923909788846
ACC: 0.9391634980988594
TPR: 0.0
FPR: 0.0
MCC: 0.0
F1: 0.9686274509803922
