In [1]:
# imports
import io
from pathlib import Path
import requests
import sys
import time
from urllib.parse import urljoin

from IPython.display import Image
from Bio.PDB import PDBParser
import nglview as nv
from rdkit import Chem



In [2]:
# constants

# directory with the test files
TEST_FILES = Path('../test_files/')

# root URL of the server; every endpoint below is resolved against it
PROTEINS_PLUS_URL = 'http://localhost:8000/'

# molecule handler endpoints
UPLOAD = urljoin(PROTEINS_PLUS_URL, 'molecule_handler/upload/')
UPLOAD_JOBS = urljoin(PROTEINS_PLUS_URL, 'molecule_handler/upload/jobs/')
PROTEINS = urljoin(PROTEINS_PLUS_URL, 'molecule_handler/proteins/')
LIGANDS = urljoin(PROTEINS_PLUS_URL, 'molecule_handler/ligands/')

# protoss endpoints
PROTOSS = urljoin(PROTEINS_PLUS_URL, 'protoss/')
PROTOSS_JOBS = urljoin(PROTEINS_PLUS_URL, 'protoss/jobs/')

In [3]:
# utils

# check server connection
try:
    response = requests.get(PROTEINS_PLUS_URL)
except requests.ConnectionError as connection_error:
    # print a readable hint when the connection was refused, then re-raise unchanged
    connection_refused = 'Connection refused' in str(connection_error)
    if connection_refused:
        print('WARNING: could not establish a connection to the server', file=sys.stderr)
    raise
    
def poll_job(job_id, poll_url, poll_interval=1, max_polls=10):
    """Poll the progress of a job

    Continuously polls the server in regular intervals and updates the job information, especially the status.
    Polling stops as soon as the status is neither "pending" nor "running", or once max_polls is reached. In the
    latter case the last polled (still unfinished) job is returned.

    :param job_id: UUID of the job to poll
    :type job_id: str
    :param poll_url: URL to send the polling request to, the job ID is appended to it as is
    :type poll_url: str
    :param poll_interval: time interval between polls in seconds
    :type poll_interval: int
    :param max_polls: maximum number of times to poll before exiting
    :type max_polls: int
    :return: polled job
    :rtype: dict
    :raises requests.HTTPError: if the server answers a polling request with a 4xx/5xx status
    """
    def fetch_job():
        """Request the current state of the job and decode the JSON body"""
        response = requests.get(poll_url + job_id)
        # fail with the HTTP status instead of an opaque KeyError/JSON error on error responses
        response.raise_for_status()
        return response.json()

    job = fetch_job()
    status = job['status']
    current_poll = 0
    while status in ('pending', 'running'):
        print(f'Job {job_id} is {status}')
        current_poll += 1
        if current_poll >= max_polls:
            # give up and hand back the unfinished job, callers can inspect its status
            print(f'Job {job_id} has not completed after {max_polls} polling requests'
                  f' and {poll_interval * max_polls} seconds')
            return job
        time.sleep(poll_interval)
        job = fetch_job()
        status = job['status']
    print(f'Job {job_id} completed with {status}')
    return job

def print_data_fields(model):
    """List the field names of a model, one per line

    Each name is printed as a quoted bullet point. Values are not printed.

    :param model: data model
    :type model: dict
    """
    for field_name in model:
        print(f' - "{field_name}"')

The molecule handler is an entrypoint to working with molecular data. It is largely optional because most other API calls can be made without a round trip to the molecule handler. It will register a protein in the database so that it can be referred to by only its ID. It will also detect ligands and generate 2D images for them.

Let's start with a PDB entry. To work with a PDB entry we only need to POST the PDB code to the server and it will query the PDB for us.

In [4]:
# POST the PDB code to the upload endpoint and decode the JSON job submission
query = {'pdb_code': '4agm'}
upload_response = requests.post(UPLOAD, data=query)
job_submission = upload_response.json()

This call is equivalent to the following file upload:

In [5]:
# with open(TEST_FILES / '4agm.pdb') as upload_file:
#     query = {'protein_file': upload_file}
#     job_submission = requests.post(UPLOAD, files=query).json()

We have immediately parsed the JSON response and can now keep working with a python dict containing the job submission data.

In [6]:
job_id = job_submission['job_id']
# the submission reports whether the server answered from its job cache
was_cached = job_submission['retrieved_from_cache']
if was_cached:
    print(f'Job {job_id} could be retrieved from cache')

Job 6f8eec30-443a-4467-bb76-80e1b362981b could be retrieved from cache


The job submission data contains the job ID (a UUID) and a flag indicating whether the job was retrieved from cache. Caching jobs saves the server and you a lot of CPU time. Chances are, if you are working on a PDB entry, it has already been processed and you can retrieve the result instantly. Let's do that now:

In [7]:
# poll the upload job and list the fields of the returned job
job = poll_job(job_id=job_id, poll_url=UPLOAD_JOBS)
print('Job data fields:')
print_data_fields(model=job)

# "output_protein" holds the ID of the preprocessed protein
print()
protein_id = job['output_protein']
print(f'Preprocessed protein ID: {protein_id}')

Job 6f8eec30-443a-4467-bb76-80e1b362981b completed with success
Job data fields:
 - "id"
 - "status"
 - "date_created"
 - "date_last_accessed"
 - "error"
 - "protein_name"
 - "pdb_code"
 - "output_protein"
 - "protein_string"
 - "ligand_string"

Preprocessed protein ID: 403bf8fb-f5c3-42db-8ca7-1e2946695ee9


A job has a number of data fields, many of which are shared across jobs, such as "status" or "date_created". You can find a full list of fields in the [reference documentation](http://localhost:8000/api/schema/swagger-ui/). In this case we have preprocessed a PDB entry and so are interested in the "output_protein". This will be the ID of our protein. Let's retrieve our protein:

In [8]:
# retrieve the protein by its ID and list its fields
protein_response = requests.get(PROTEINS + protein_id)
protein = protein_response.json()
print('Protein data fields:')
print_data_fields(model=protein)

Protein data fields:
 - "id"
 - "name"
 - "pdb_code"
 - "file_type"
 - "ligand_set"
 - "file_string"
 - "date_created"
 - "date_last_accessed"


As you can see, the protein has a "file_string". We can use it to load the protein with biopython and display it in nglview:

In [9]:
# wrap the file string in a file-like object so biopython can parse it
protein_file = io.StringIO(protein['file_string'])
pdb_parser = PDBParser()
protein_structure = pdb_parser.get_structure(protein['name'], protein_file)

# show the parsed structure with an additional ball+stick representation
view = nv.show_biopython(protein_structure)
view.add_representation(repr_type='ball+stick', selection='protein')
view



NGLWidget()

You can see that we're missing the ligands in the structure. The ligands are associated with the protein via the "ligand_set" field. Let's retrieve them:

In [10]:
ligand_ids = protein['ligand_set']
print(f'Ligand IDs: {ligand_ids}')
ligand = requests.get(LIGANDS + ligand_ids[0]).json()  # get the first ligand
other_ligand = requests.get(LIGANDS + ligand_ids[1]).json()  # get the second ligand
Image(url=ligand['image'], width=400, height=400)  # freely scalable SVG

Ligand IDs: ['62af3113-6ff7-487e-9664-6347d0e4d8eb', 'bfa44f98-71b1-45e4-a68e-3ccc13f792f0']


Preprocessing a structure splits the ligands from the protein and tries to generate 2D images for them. We can also load the ligand structures into nglview, together with the protein:

In [11]:
# parse both ligand file strings with RDKit, keeping the hydrogens
ligand_structure = Chem.MolFromMolBlock(ligand['file_string'], removeHs=False)
other_ligand_structure = Chem.MolFromMolBlock(other_ligand['file_string'], removeHs=False)

# add the two ligands first, then the protein, to one empty widget
view = nv.NGLWidget()
for rdkit_molecule in (ligand_structure, other_ligand_structure):
    view.add_structure(nv.RdkitStructure(rdkit_molecule))
view.add_structure(nv.BiopythonStructure(protein_structure))
view

NGLWidget()

We can use the IDs of the ligands and the protein with other tools on [proteins.plus](https://proteins.plus), for example protoss:

In [12]:
# run protoss on the server (detailed explanation in the protoss example)
query = {'protein_id': protein['id']}  # our preprocessed protein ID
protoss_job_submission = requests.post(PROTOSS, data=query).json()
protoss_job = poll_job(protoss_job_submission['job_id'], PROTOSS_JOBS)
protossed_protein = requests.get(PROTEINS + protoss_job['output_protein']).json()
protossed_protein_file = io.StringIO(protossed_protein['file_string'])

# load and visualize the protein with protoss hydrogens
protossed_protein_structure = PDBParser().get_structure(protossed_protein['name'], protossed_protein_file)
view = nv.show_biopython(protossed_protein_structure)
view.add_representation(repr_type='ball+stick', selection='protein')
view

Job 77a71fc0-2ddd-4bb3-859b-63c4f2c05cb8 completed with success




NGLWidget()

Notice how all we had to do was give the server the ID of the protein. The server will keep such entries for about a week after they were last accessed.