In [1]:
import json
import os
import requests
import statistics
from itertools import groupby
from sklearn import preprocessing
import numpy as np


In [2]:
def get_entity_id(pdb_id, chain_id, timeout=30):
    """Return the PDBe entity ID of the molecule that contains a chain.

    Queries the PDBe ``entry/molecules`` endpoint for ``pdb_id`` and scans
    the returned entities for one whose ``in_chains`` list includes
    ``chain_id``.

    Args:
        pdb_id: PDB entry identifier, e.g. ``'1a4u'``.
        chain_id: Chain identifier to look up, e.g. ``'B'``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        The ``entity_id`` of the first entity listing ``chain_id``, or
        ``None`` when the entry is absent from the response or no entity
        lists that chain.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        f"https://www.ebi.ac.uk/pdbe/api/pdb/entry/molecules/{pdb_id}",
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    pdb_info = response.json()

    # The response is keyed by entry ID. Try the ID exactly as given first,
    # then its lower-case form.
    # NOTE(review): PDBe appears to key responses by lower-case ID - confirm.
    entities = pdb_info.get(pdb_id)
    if entities is None:
        entities = pdb_info.get(pdb_id.lower(), [])

    for entity in entities:
        if chain_id in entity.get('in_chains', ()):
            return entity['entity_id']
    return None

def get_conservation_score(pdb_id, chain_id, timeout=30):
    """Fetch the PDBe sequence-conservation data for one chain of an entry.

    The chain is first resolved to its entity ID with ``get_entity_id``;
    the conservation endpoint is then queried for that entity.

    Args:
        pdb_id: PDB entry identifier, e.g. ``'1a4u'``.
        chain_id: Chain identifier within that entry, e.g. ``'B'``.
        timeout: Seconds to wait for the conservation request.

    Returns:
        The ``'data'`` member of the JSON response. Code in this notebook
        reads its ``'index'`` and ``'conservation_score'`` entries.

    Raises:
        ValueError: If ``chain_id`` cannot be resolved to an entity.
        requests.HTTPError: If the server answers with an error status.
    """
    entity_id = get_entity_id(pdb_id, chain_id)
    if entity_id is None:
        # Without this guard the literal string "None" ends up in the URL.
        raise ValueError(f"chain {chain_id!r} not found in PDB entry {pdb_id!r}")

    response = requests.get(
        f"https://www.ebi.ac.uk/pdbe/graph-api/pdb/sequence_conservation/{pdb_id}/{entity_id}",
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()['data']

# Spot-check the helper on chain B of entry 1a4u; as the cell's last
# expression, the returned data is shown as the cell output.
get_conservation_score(pdb_id='1a4u', chain_id='B')


{'index': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157,


In [3]:
# Load the two residue-label datasets from JSON files in the working directory.
with open('label_dataset.json') as cryptic_file:
    cryptic_ds = json.load(cryptic_file)

with open('label_noncryptic_dataset.json') as noncryptic_file:
    noncryptic_ds = json.load(noncryptic_file)

In [39]:
def compute_avg_conservation_score(data, cryptic_residues, noncryptic_residues):
    """Split per-residue conservation scores into three residue classes.

    Despite its name this function does not average anything: it only
    partitions the scores, in the order they appear in ``data``.

    Args:
        data: Mapping with parallel sequences ``'index'`` (residue numbers)
            and ``'conservation_score'`` (one score per residue).
        cryptic_residues: Iterable of residue numbers labelled cryptic.
        noncryptic_residues: Iterable of residue numbers labelled
            non-cryptic. A residue present in both collections is counted
            as cryptic.

    Returns:
        A tuple ``(cryptic_score, noncryptic_score, otherwise_score)`` of
        lists holding the scores of cryptic, non-cryptic and all remaining
        residues respectively.
    """
    # Snapshot both collections as sets: membership tests become O(1)
    # instead of a list scan per residue, and one-shot iterables
    # (generators) are no longer exhausted by the first lookup.
    cryptic = set(cryptic_residues)
    noncryptic = set(noncryptic_residues)

    cryptic_score, noncryptic_score, otherwise_score = [], [], []
    for label_seq_id, score in zip(data['index'], data['conservation_score']):
        if label_seq_id in cryptic:
            cryptic_score.append(score)
        elif label_seq_id in noncryptic:
            noncryptic_score.append(score)
        else:
            otherwise_score.append(score)

    return cryptic_score, noncryptic_score, otherwise_score


# Fetch conservation scores for every entry of the cryptic dataset and cache
# them on disk as three files per entry (cryptic / non-cryptic / remaining
# residues). 'cryptic_scores' is listed last on purpose: its file is the
# "already processed" marker checked below, so it must be written last.
score_dirs = ('otherwise_scores', 'noncryptic_scores', 'cryptic_scores')
for score_dir in score_dirs:
    # open(..., 'w') does not create missing parent directories.
    os.makedirs(score_dir, exist_ok=True)

for pdb_id, cryptic_residues in cryptic_ds.items():
    # Skip entries that a previous run already finished.
    if os.path.isfile(f'cryptic_scores/{pdb_id}.txt'): continue

    # group by chain ID (labels are split as "<chain>_<residue number>")
    chain_groups = [list(g) for _, g in groupby(sorted(cryptic_residues), key=lambda x: x.split('_')[0])]

    # Accumulate over all chains of the entry. Writing the files inside the
    # chain loop with mode 'w' kept only the last chain's scores.
    entry_scores = {score_dir: [] for score_dir in score_dirs}

    for chain_group in chain_groups:
        chain_id = chain_group[0].split('_')[0]
        print(pdb_id, chain_id)
        noncryptic_residues = [] if pdb_id not in noncryptic_ds else [
            int(i.split('_')[1]) for i in noncryptic_ds[pdb_id] if i.split('_')[0] == chain_id
        ]

        data = get_conservation_score(pdb_id, chain_id)
        cryptic_score, noncryptic_score, otherwise_score = compute_avg_conservation_score(
            data, [int(i.split('_')[1]) for i in chain_group], noncryptic_residues
        )
        print(cryptic_score, noncryptic_score, otherwise_score)

        entry_scores['cryptic_scores'].extend(cryptic_score)
        entry_scores['noncryptic_scores'].extend(noncryptic_score)
        entry_scores['otherwise_scores'].extend(otherwise_score)

    # Write once per entry, only after every chain succeeded, so a failure
    # midway never leaves a partial entry that later runs would skip.
    for score_dir in score_dirs:
        with open(f'{score_dir}/{pdb_id}.txt', 'w') as f:
            f.write(str(entry_scores[score_dir]))


In [2]:
def read_score(path):
    """Parse the first line of a cached score file into a list of floats.

    The line is expected to hold the ``str()`` form of a list, e.g.
    ``[0.5, 1.0, 3]``. An empty line or an empty list yields ``[]``.
    """
    with open(path) as handle:
        first_line = handle.readline()

    # Peel off the newline, then the list brackets, then leftover whitespace.
    payload = first_line.strip('\n')
    for bracket in ('[', ']'):
        payload = payload.strip(bracket)
    payload = payload.strip()

    if not payload:
        return []
    return list(map(float, payload.split(',')))

# def normalize(values):
#     if len(values) == 0: return values
#     min_actual = min(values)
#     max_actual = max(values)
#     if max_actual == 0: return values 
#     if max_actual == min_actual: return [i / max_actual for i in values]
#     return [0 + (x - min_actual) * (1 - 0) / (max_actual - min_actual) for x in values]

def _print_summary(label, scores):
    """Print ``label mean +- stdev`` for ``scores``, tolerating tiny samples."""
    if len(scores) < 2:
        # statistics.mean needs at least one value and statistics.stdev at
        # least two; report the shortage instead of raising StatisticsError.
        print(label, f'not enough data ({len(scores)} value(s))')
        return
    print(label, statistics.mean(scores), '+-', statistics.stdev(scores))


cryptic_scores, noncryptic_scores, otherwise_scores = [], [], []

# Pool the cached per-entry scores of all three residue classes. Entries are
# discovered through the 'otherwise_scores' directory; sorting keeps the
# pooled lists in a reproducible order.
for filename in sorted(os.listdir('otherwise_scores')):
    paths = [f'{score_dir}/{filename}' for score_dir in ('cryptic_scores', 'noncryptic_scores', 'otherwise_scores')]
    # Skip anything that is not a complete entry (stray directories, or an
    # entry missing one of its three files) rather than crash half-way.
    if not all(os.path.isfile(path) for path in paths):
        print('Skipping incomplete entry:', filename)
        continue
    cryptic_scores.extend(read_score(paths[0]))
    noncryptic_scores.extend(read_score(paths[1]))
    otherwise_scores.extend(read_score(paths[2]))

_print_summary('Mean of cryptic pockets: ', cryptic_scores)
_print_summary('Mean of non-cryptic pockets: ', noncryptic_scores)
_print_summary('Mean of non-pocket: ', otherwise_scores)

Mean of cryptic pockets:  0.9972856552782203 +- 1.2407066345102165
Mean of non-cryptic pockets:  0.6131017806111233 +- 1.0341066324027792
Mean of non-pocket:  0.41381406219648975 +- 0.7810614727323248
