# DoGSite: Binding site prediction in protein complexes

In [1]:
# imports
import json
import io
from pathlib import Path
import requests
import sys
import time
import urllib.request
import pandas as pd

from IPython.display import Image
from Bio.PDB import *
from Bio.PDB.PDBExceptions import PDBConstructionWarning
import nglview as nv
import numpy as np
from rdkit import Chem



In [2]:
# constants

# directory holding the example input files used throughout this notebook
TEST_FILES = Path('../test_files/')

# base URL of the server; every endpoint below is derived from it
PROTEINS_PLUS_URL = 'http://localhost:8000/'

# molecule handler endpoints
UPLOAD = f'{PROTEINS_PLUS_URL}molecule_handler/upload/'
UPLOAD_JOBS = f'{PROTEINS_PLUS_URL}molecule_handler/upload/jobs/'
PROTEINS = f'{PROTEINS_PLUS_URL}molecule_handler/proteins/'
PROTEINSITES = f'{PROTEINS_PLUS_URL}molecule_handler/protein_sites/'
ELECTRONDENSITYMAP = f'{PROTEINS_PLUS_URL}molecule_handler/electron_density_maps/'
LIGANDS = f'{PROTEINS_PLUS_URL}molecule_handler/ligands/'

# DoGSite endpoints
DOGSITE = f'{PROTEINS_PLUS_URL}dogsite/'
DOGSITE_INFO = f'{PROTEINS_PLUS_URL}dogsite/info/'
DOGSITE_JOBS = f'{PROTEINS_PLUS_URL}dogsite/jobs/'

In [3]:
# utils

# make sure the server at PROTEINS_PLUS_URL answers before any job is submitted
try:
    response = requests.get(PROTEINS_PLUS_URL)
except requests.ConnectionError as connection_error:
    # print a readable hint when the connection was refused, then re-raise
    # the original error in every case so the cell still fails visibly
    connection_refused = 'Connection refused' in str(connection_error)
    if connection_refused:
        print('WARNING: could not establish a connection to the server', file=sys.stderr)
    raise
    
def poll_job(job_id, poll_url, poll_interval=1, max_polls=10):
    """Poll the progress of a job

    Continuously polls the server in regular intervals and updates the job information, especially the status.
    Polling stops as soon as the job status is neither 'pending' nor 'running', or once max_polls
    requests have been sent, whichever happens first.

    :param job_id: UUID of the job to poll
    :type job_id: str
    :param poll_url: URL to send the polling request to
    :type poll_url: str
    :param poll_interval: time interval between polls in seconds
    :type poll_interval: int
    :param max_polls: maximum number of times to poll before exiting
    :type max_polls: int
    :return: polled job (still 'pending' or 'running' if max_polls was reached)
    :rtype: dict
    :raises requests.HTTPError: if the server answers a polling request with an error status code
    """
    def fetch_job():
        # fail with a clear HTTP error instead of trying to read an error page as a job
        response = requests.get(poll_url + job_id)
        response.raise_for_status()
        return response.json()

    job = fetch_job()
    status = job['status']
    current_poll = 0
    while status in ('pending', 'running'):
        print(f'Job {job_id} is { status }')
        current_poll += 1
        if current_poll >= max_polls:
            # give up but still hand back the latest job state so the caller can inspect it
            print(f'Job {job_id} has not completed after {max_polls} polling requests'
                  f' and {poll_interval * max_polls} seconds')
            return job
        time.sleep(poll_interval)
        job = fetch_job()
        status = job['status']
    print(f'Job {job_id} completed with { status }')
    return job

## Predict, visualize and display statistics of binding pockets

We can upload a file and start a DoGSite job like this:

In [5]:
# Upload the protein file and start a DoGSite job.
# The file is opened in binary mode, which requests recommends for multipart
# uploads so that the request body is sent exactly as stored on disk.
with open(TEST_FILES / '4agm.pdb', 'rb') as upload_file:
    query = {'protein_file': upload_file}
    job_submission = requests.post(DOGSITE, files=query).json()

# the upload has finished once the POST returns, so the file can be closed before polling
dogsite_job = poll_job(job_submission['job_id'], DOGSITE_JOBS)

Job 099b98f8-4d5e-45cd-b7e9-7f9d13d6b68f completed with success


Then we can output all residues of e.g. the first pocket:

In [6]:
# take the identifier of the first pocket reported by the job and fetch its site record
first_pocket = dogsite_job['output_pockets'][0]
site = requests.get(PROTEINSITES + first_pocket).json()

# print each residue entry of that pocket on its own line
residues_first_pocket = site['site_description']['residue_ids']
for residue in residues_first_pocket:
    print(residue)

{'name': 'VAL', 'chain': 'A', 'position': '97'}
{'name': 'PRO', 'chain': 'A', 'position': '98'}
{'name': 'SER', 'chain': 'A', 'position': '99'}
{'name': 'GLN', 'chain': 'A', 'position': '100'}
{'name': 'LYS', 'chain': 'A', 'position': '101'}
{'name': 'SER', 'chain': 'A', 'position': '166'}
{'name': 'MET', 'chain': 'A', 'position': '169'}
{'name': 'ALA', 'chain': 'B', 'position': '138'}
{'name': 'ASP', 'chain': 'B', 'position': '186'}
{'name': 'LEU', 'chain': 'B', 'position': '188'}
{'name': 'ARG', 'chain': 'B', 'position': '196'}
{'name': 'VAL', 'chain': 'B', 'position': '197'}
{'name': 'GLY', 'chain': 'B', 'position': '199'}
{'name': 'ASN', 'chain': 'B', 'position': '200'}
{'name': 'LEU', 'chain': 'B', 'position': '201'}
{'name': 'ALA', 'chain': 'B', 'position': '203'}
{'name': 'TYR', 'chain': 'B', 'position': '205'}
{'name': 'ASN', 'chain': 'B', 'position': '235'}
{'name': 'MET', 'chain': 'B', 'position': '237'}


In [12]:
# parse the local PDB file into a Biopython structure so it can be shown next to the density
protein_structure = PDBParser().get_structure('4agm', TEST_FILES / '4agm.pdb')

# look up the first density reported by the DoGSite job; its 'file' entry is used as the download URL below
first_density = dogsite_job['output_densities'][0]
density_to_view = requests.get(ELECTRONDENSITYMAP + first_density).json()
ccp4_file_path = density_to_view['file']
# stream the density file into a viewer (loaded with ext='ccp4') and add the protein as a further component
with urllib.request.urlopen(ccp4_file_path) as response:
    view2 = nv.show_file(response, ext='ccp4')
    view2.add_component(nv.BiopythonStructure(protein_structure))
# last expression of the cell, so the widget is displayed as the cell output
view2



NGLWidget()

In [13]:
# drop the current representations, then draw a translucent contoured surface
view2.clear_representations()
view2.add_surface(
    wrap=True,
    contour=True,
    color="skyblue",
    opacity=0.8,
    isolevelType="value",
    isolevel=0.0001,
)

We can also get pocket statistics for all the predicted pockets. In the following, a few exemplary properties are shown. Note that no ligand is associated with any pocket because we did not provide a ligand name or file.

In [9]:
# fetch the info record referenced by the finished job and pull out the per-pocket statistics
info_url = DOGSITE_INFO + dogsite_job['dogsite_info']
dogsite_info = requests.get(info_url).json()
pockets_statistics = dogsite_info['info']

# tabulate the statistics and display a selection of columns
df = pd.DataFrame(pockets_statistics)
df[['name', 'depth', 'volume', 'donor', 'accept', 'lig_name']]

Unnamed: 0,name,depth,volume,donor,accept,lig_name
0,P_1,15.2,287.744,8,9,
1,P_2,16.819,200.704,5,9,
2,P_3,14.6205,181.76,4,7,
3,P_4,11.9197,115.2,4,5,
4,P_5,10.3073,115.2,4,4,
5,P_6,9.92774,109.056,9,5,
6,P_7,10.1509,104.96,9,5,


## Examples for other parameter options of DoGSite

In the following example we add a ligand so that we can find the pocket that contains it. We also want to output the subpockets that make up the predicted pockets.

In [26]:
# Submit the protein together with a ligand file and ask for subpockets as well.
# Both files are opened in binary mode, which requests recommends for multipart
# uploads so that the request body is sent exactly as stored on disk.
with open(TEST_FILES / 'NXG_A_1294.sdf', 'rb') as upload_ligand_file:
    with open(TEST_FILES / '4agm.pdb', 'rb') as upload_file:
        query = {'protein_file': upload_file, 'ligand_file': upload_ligand_file}
        options = {'calc_subpockets': True}
        job_submission = requests.post(DOGSITE, files=query, data=options).json()

# the upload has finished once the POST returns, so the files can be closed before polling
dogsite_job2 = poll_job(job_submission['job_id'], DOGSITE_JOBS)

# fetch the info record referenced by the job and pull out the per-pocket statistics
dogsite_info2 = requests.get(DOGSITE_INFO + dogsite_job2['dogsite_info']).json()
pockets_statistics2 = dogsite_info2['info']

# tabulate the statistics, this time including the ligand coverage column
df = pd.DataFrame(pockets_statistics2)
df[['name', 'depth', 'volume', 'donor', 'accept', 'lig_name', 'lig_cov']]

Job 441c8690-2a95-4859-9fb9-8dcd766d51cf is running
Job 441c8690-2a95-4859-9fb9-8dcd766d51cf is running
Job 441c8690-2a95-4859-9fb9-8dcd766d51cf is running
Job 441c8690-2a95-4859-9fb9-8dcd766d51cf completed with success


Unnamed: 0,name,depth,volume,donor,accept,lig_name,lig_cov
0,P_1,15.2,287.744,8,9,non,0.0
1,P_2,16.819,200.704,5,9,non,0.0
2,P_3,14.6205,181.76,4,7,NXG_A_1294,43.3962
3,P_4,11.9197,115.2,4,5,non,0.0
4,P_4_1,9.86306,91.648,4,4,non,0.0
5,P_4_2,7.54718,23.552,0,1,non,0.0
6,P_5,10.3073,115.2,4,4,non,0.0
7,P_6,9.92774,109.056,9,5,non,0.0
8,P_7,10.1509,104.96,9,5,non,0.0


We could also specify for which chain we would like to predict pockets. In the output you can see that fewer pockets were found because of the restriction to chain 'A'. If we already know the ligand position we can use the ligand-bias option to tell DoGSite to focus the pocket prediction on the protein environment near the ligand. This is especially useful when the ligand is partially protruding into the solvent (see the different depth, volume and ligand coverage values in the output below): 

In [23]:
# Submit the protein and ligand, restricted to chain 'A' and with ligand bias enabled.
# Both files are opened in binary mode, which requests recommends for multipart
# uploads so that the request body is sent exactly as stored on disk.
with open(TEST_FILES / 'NXG_A_1294.sdf', 'rb') as upload_ligand_file:
    with open(TEST_FILES / '4agm.pdb', 'rb') as upload_file:
        query = {'protein_file': upload_file, 'ligand_file': upload_ligand_file}
        options = {'chain_id': 'A', 'ligand_bias': True}
        job_submission = requests.post(DOGSITE, files=query, data=options).json()

# show the raw submission response (job id and whether the result was retrieved from the cache)
print(job_submission)

# the upload has finished once the POST returns, so the files can be closed before polling
dogsite_job3 = poll_job(job_submission['job_id'], DOGSITE_JOBS)

# fetch the info record referenced by the job and pull out the per-pocket statistics
dogsite_info3 = requests.get(DOGSITE_INFO + dogsite_job3['dogsite_info']).json()
pockets_statistics3 = dogsite_info3['info']

# tabulate the statistics including the ligand coverage column
df = pd.DataFrame(pockets_statistics3)
df[['name', 'depth', 'volume', 'donor', 'accept', 'lig_name', 'lig_cov']]

{'job_id': '37f99230-f65c-4db3-a427-9787c6f29667', 'retrieved_from_cache': True}
Job 37f99230-f65c-4db3-a427-9787c6f29667 completed with success


Unnamed: 0,name,depth,volume,donor,accept,lig_name,lig_cov
0,P_1,14.4444,246.272,7,8,NXG_A_1294,54.717
1,P_2,10.245,115.712,9,5,non,0.0
