In [1]:
import sys
sys.path.append('../../utils/')
import biotite_utils
import dataset_utils
import statistics

def count_amino_acids(dataset, skip_ids=('8j1kA',)):
    """Count binding and non-binding residues for each structure in ``dataset``.

    Args:
        dataset: Mapping from a structure identifier to the collection of
            binding residue ids annotated for it. The whole identifier is
            passed to ``biotite_utils.load_structure`` and everything after
            its fourth character is used as the chain id.
        skip_ids: Identifiers to leave out of the statistics. Defaults to
            ``('8j1kA',)``, a structure that raised a not-yet-investigated
            error during processing.

    Returns:
        A ``(binding_amino_acids, nonbinding_amino_acids)`` tuple of two
        lists. Each list holds one count per processed structure, in the
        iteration order of ``dataset``: the number of entries whose
        ``res_id`` is, respectively is not, among the annotated binding
        residues.
    """
    binding_amino_acids = []
    nonbinding_amino_acids = []

    for structure_id, binding_residues in dataset.items():
        # Skip known-problematic structures before doing any loading work.
        if structure_id in skip_ids:
            continue

        protein = biotite_utils.load_structure(structure_id)
        chain_id = structure_id[4:]
        # NOTE(review): presumably yields one entry per residue of the chain --
        # confirm against biotite_utils.get_protein_backbone.
        protein_structure = biotite_utils.get_protein_backbone(protein, chain_id)

        binding_count = 0
        nonbinding_count = 0
        for residue in protein_structure:
            if residue.res_id in binding_residues:
                binding_count += 1
            else:
                nonbinding_count += 1

        binding_amino_acids.append(binding_count)
        nonbinding_amino_acids.append(nonbinding_count)

    return binding_amino_acids, nonbinding_amino_acids
   


# Avg. number of binding and non-binding residues per structure

In [2]:
def _report_residue_counts(label, binding_counts, nonbinding_counts):
    """Print mean +- sample standard deviation of per-structure residue counts."""
    for kind, counts in (('binding', binding_counts), ('nonbinding', nonbinding_counts)):
        average = statistics.mean(counts)
        spread = statistics.stdev(counts)
        print(f'{label} {kind}: {average} +- {spread}')


# RIGID dataset: load the annotations and summarise the residue counts.
DATASET = 'rigid-dataset'
DATASET_PATH = f'../../../datasets/{DATASET}'
rigid_ids = dataset_utils.get_annotations(DATASET_PATH)
binding_aas, nonbinding_aas = count_amino_acids(rigid_ids)
_report_residue_counts('RIGID', binding_aas, nonbinding_aas)

# CryptoBench dataset: same summary; DATASET, DATASET_PATH and the count
# lists are rebound, so after this cell they refer to CryptoBench.
DATASET = 'cryptobench-dataset'
DATASET_PATH = f'../../../datasets/{DATASET}'
cryptobench_ids = dataset_utils.get_annotations(DATASET_PATH)
binding_aas, nonbinding_aas = count_amino_acids(cryptobench_ids)
_report_residue_counts('CryptoBench', binding_aas, nonbinding_aas)


RIGID binding: 11.043478260869565 +- 6.696454588748716
RIGID nonbinding: 368.3416149068323 +- 198.28585255307013
CryptoBench binding: 17.542893725992318 +- 10.55715901591945
CryptoBench nonbinding: 286.61075544174133 +- 147.51514337585076


# avg. pRMSD

In [3]:
import json


# For each dataset, report mean +- sample standard deviation of the pRMSD
# values stored in its dataset.json, restricted to the annotated structures.
for annotations, dataset_name in ((rigid_ids, 'rigid-dataset'), (cryptobench_ids, 'cryptobench-dataset')):
    # dataset.json keys are matched against the first four characters of the
    # annotation ids. A set keeps the membership test below constant-time
    # instead of rescanning a list for every key.
    ids = {annotation_id[:4] for annotation_id in annotations}

    dataset_path = f'../../../datasets/{dataset_name}/dataset.json'
    with open(dataset_path) as f:
        dataset = json.load(f)

    pRMSDs = []
    for apo_id, holo_structures in dataset.items():
        if apo_id not in ids:
            continue
        # Every holo structure listed under the key contributes one value.
        for holo_structure in holo_structures:
            pRMSDs.append(holo_structure['pRMSD'])
    print(f'{dataset_name}: {statistics.mean(pRMSDs)} +- {statistics.stdev(pRMSDs)}')

rigid-dataset: 0.2611739502999143 +- 0.11367286157237538
cryptobench-dataset: 3.016845360824742 +- 0.8605282026929327


# number of structures

In [4]:
# Report how many annotated structures each dataset contains.
for dataset_label, annotations in (('CryptoBench', cryptobench_ids), ('RIGID dataset', rigid_ids)):
    structure_count = len(annotations)
    print(f'number of protein structures in {dataset_label}: {structure_count}')

number of protein structures in CryptoBench: 782
number of protein structures in RIGID dataset: 483
