In [1]:
# imports
import io
from pathlib import Path
import requests
import sys
import time

from Bio.PDB import PDBParser
import nglview as nv
from rdkit import Chem
import pandas as pd




In [2]:
def table_style(row):
    """Compute the CSS background color of every cell in one table row.

    Written for ``DataFrame.style.apply(table_style, axis=1)``, which calls it
    once per row and expects one CSS declaration per cell in return.

    Coloring rules, checked in this order:

    * rows labelled ``EDIAm`` or ``residueEDIATest`` are light blue throughout;
    * rows whose label contains ``Test`` or ``Clash``, or is exactly
      ``noCrystalContacts`` or ``noAltLocs``, are red in every cell equal to
      ``False`` and white elsewhere;
    * every other row is white throughout.

    :param row: table row; ``row.name`` is the row label, ``row.values`` the cells
    :type row: pandas.Series
    :return: one ``background-color`` declaration per cell of the row
    :rtype: list
    """
    # Row labels are not guaranteed to be strings (e.g. a default integer
    # index); normalise so the substring checks below cannot raise TypeError.
    label = row.name if isinstance(row.name, str) else str(row.name)
    n_cells = len(row.values)

    if label in ('EDIAm', 'residueEDIATest'):
        return ['background-color: lightblue'] * n_cells

    is_pass_fail_row = (
        'Test' in label
        or 'Clash' in label
        or label in ('noCrystalContacts', 'noAltLocs')
    )
    if not is_pass_fail_row:
        return ['background-color: white'] * n_cells

    # `== False` rather than `is False` / `not value`: it also matches numpy
    # booleans, while None and empty strings stay white.
    return [
        'background-color: ' + ('red' if value == False else 'white')  # noqa: E712
        for value in row.values
    ]
        
    

In [3]:
# constants
# Local directory holding the example input files
TEST_FILES = Path('./test_files/')

# Base URL of the server; every endpoint below is built from it
PROTEINS_PLUS_URL = 'http://localhost:8000/'

# Molecule handler endpoints
UPLOAD = f'{PROTEINS_PLUS_URL}molecule_handler/upload/'
UPLOAD_JOBS = f'{PROTEINS_PLUS_URL}molecule_handler/upload/jobs/'
PROTEINS = f'{PROTEINS_PLUS_URL}molecule_handler/proteins/'
LIGANDS = f'{PROTEINS_PLUS_URL}molecule_handler/ligands/'

# Structureprofiler endpoints: job submission, job polling, result retrieval
STRUCTUREPROFILER = f'{PROTEINS_PLUS_URL}structureprofiler/'
STRUCTUREPROFILER_JOBS = f'{PROTEINS_PLUS_URL}structureprofiler/jobs/'
OUTPUT_DATA = f'{PROTEINS_PLUS_URL}structureprofiler/output/'

In [4]:
# utils

# check server connection
# Fail early, before any job is submitted, if the server cannot be reached.
try:
    # Without a timeout requests waits indefinitely, so an unresponsive host
    # would hang the notebook instead of raising.
    response = requests.get(PROTEINS_PLUS_URL, timeout=10)
except requests.ConnectionError as error:
    # Print a readable hint for the common "server not started" case, then
    # re-raise so the cell still fails visibly.
    if 'Connection refused' in str(error):
        print('WARNING: could not establish a connection to the server', file=sys.stderr)
    raise
    
def poll_job(job_id, poll_url, poll_interval=1, max_polls=10, request_timeout=30):
    """Poll the progress of a job

    Continuously polls the server in regular intervals and updates the job information, especially the status.
    Polling stops as soon as the status is neither 'pending' nor 'running', or once max_polls requests
    have been sent, whichever comes first. In the latter case the job is returned unfinished.

    :param job_id: UUID of the job to poll
    :type job_id: str
    :param poll_url: URL to send the polling request to; the job id is appended to it directly
    :type poll_url: str
    :param poll_interval: time interval between polls in seconds
    :type poll_interval: int
    :param max_polls: maximum number of times to poll before exiting
    :type max_polls: int
    :param request_timeout: seconds to wait for the server on each single request
    :type request_timeout: int or float
    :return: polled job
    :rtype: dict
    :raises requests.HTTPError: if the server answers a poll with an error status code
    """
    def fetch_job():
        # A timeout keeps one stalled request from blocking the loop forever,
        # and raise_for_status reports a server-side error directly instead of
        # as a confusing KeyError/JSON error further down.
        response = requests.get(poll_url + job_id, timeout=request_timeout)
        response.raise_for_status()
        return response.json()

    job = fetch_job()
    status = job['status']
    current_poll = 0
    while status == 'pending' or status == 'running':
        print(f'Job {job_id} is { status }')
        current_poll += 1
        if current_poll >= max_polls:
            # Give up and hand back the unfinished job; callers can inspect job['status']
            print(f'Job {job_id} has not completed after {max_polls} polling requests' \
                  f' and {poll_interval * max_polls} seconds')
            return job
        time.sleep(poll_interval)
        job = fetch_job()
        status = job['status']
    print(f'Job {job_id} completed with { status }')
    return job

You can use Structureprofiler for automatic, objective and customizable profiling of X-ray protein structures. The given protein structure is evaluated against the most frequently applied selection criteria. Results are given for the Complex, the ActiveSites and the Ligands. For more information, see the Structureprofiler documentation.

Let's take a look at the first protein:


In [5]:
# Parse the example structure from disk and hand it to the interactive viewer
pdb_parser = PDBParser()
protein_structure = pdb_parser.get_structure('4agm', TEST_FILES / '4agm.pdb')
view = nv.show_biopython(protein_structure)
# Add a ball+stick representation for the 'ligand' selection
view.add_representation(repr_type='ball+stick', selection='ligand')
view



NGLWidget()

We can upload a protein file and start a job like this:

Note: Uploading a protein file is mandatory. However, there is an additional option to include an electron density map as well as a specific ligand. 

But let's keep it simple for now. 


In [6]:
# Upload the structure and start a Structureprofiler job.
# The file is opened in binary mode, as requests recommends for uploads:
# in text mode the Content-Length it derives can be wrong.
with open(TEST_FILES / '4agm.pdb', 'rb') as upload_file:
    query = {'protein_file': upload_file}
    job_submission = requests.post(STRUCTUREPROFILER, files=query).json()
# Wait for the job, then fetch the result record it points to
structureprofiler_job = poll_job(job_submission['job_id'], STRUCTUREPROFILER_JOBS)
output_data = requests.get(OUTPUT_DATA + structureprofiler_job['output_data']).json()

Job e308a431-8308-4633-8b99-2ef6b1fc039b completed with success


The job has now completed successfully. The Structureprofiler produces a single output called "output_data", which can be divided into three tables, one each for Complex, Active Sites and Ligands. Where possible, the specific values are shown; otherwise it is indicated whether a test is passed, meaning that the filter criterion is fulfilled. Failed criteria are marked in red.
The number of columns in the Ligands and ActiveSites tables differs between proteins, depending on the number of given ligands and active sites. The protein currently in question has two ligands, shown in the structure above.

In [7]:
# One-row frame built from the complex results, flipped so each criterion is a row
complex_data = pd.DataFrame.from_dict([output_data['output_data']['complex']]).transpose()
# Row-wise styling: table_style returns one background color per cell
complex_data.style.apply(table_style, axis=1)


Unnamed: 0,0
DPI,0.115000
rFree,0.197000
rFactor,0.173000
resolution,1.520000
overfittingTest,True
significanceTest,True
complexStructureProfilerTests,True


In [8]:
ligand_data = pd.DataFrame.from_dict(output_data['output_data']['ligands'])
# Number the columns from the data itself: a hard-coded list of names raises a
# length-mismatch error as soon as a structure has a different number of ligands.
ligand_data.columns = [f'Ligand_{number}' for number in range(1, ligand_data.shape[1] + 1)]
ligand_data.style.apply(table_style, axis=1)

Unnamed: 0,Ligand_1,Ligand_2
ID,400,400
NROT,5,5
OWAB,18.200000,20.100000
logP,1.070000,1.070000
name,P86_A_400,P86_B_400
chain,A,B
HETCode,P86,P86
noAltLocs,True,True
heavyAtoms,21,21
stereoCenters,0,0


In [9]:
active_site_data = pd.DataFrame.from_dict(output_data['output_data']['active_sites'])
# Number the columns from the data itself: a hard-coded list of names raises a
# length-mismatch error as soon as a structure has a different number of active sites.
active_site_data.columns = [
    f'Active_site_{number}' for number in range(1, active_site_data.shape[1] + 1)
]
active_site_data.style.apply(table_style, axis=1)

Unnamed: 0,Active_site_1,Active_site_2
chains,"A,B","A,B"
ligand,P86_A_400,P86_B_400
noAltLocs,True,True
uniprotID,P04637,P04637
bondAnglesTest,False,False
bondLengthsTest,True,True
bFactorRatioTest,True,True
noIntermolecularClash,True,True
noIntramolecularClash,True,True
activeSiteStructureProfilerTests,False,False


Let's try the same protein as before. This time we also upload an electron density map as well as a non-native ligand. The ligand NXG_A_1294 is from another protein (PDB code: 4AGN).

In [10]:
# Upload the structure together with a local electron density map and an extra ligand.
# All three files are opened in binary mode, as requests recommends for uploads,
# and in a single `with` so they are closed together.
with open(TEST_FILES / '4agm.pdb', 'rb') as upload_file, \
        open(TEST_FILES / '4agm.ccp4', 'rb') as density_file, \
        open(TEST_FILES / 'NXG_A_1294.sdf', 'rb') as upload_ligand_file:
    query = {
        'protein_file': upload_file,
        'electron_density_map': density_file,
        'ligand_file': upload_ligand_file,
    }
    job_submission = requests.post(STRUCTUREPROFILER, files=query).json()
# Larger polling budget than the defaults: up to 100 polls, 5 seconds apart
structureprofiler_job = poll_job(job_submission['job_id'], STRUCTUREPROFILER_JOBS, poll_interval=5, max_polls=100)
output_data = requests.get(OUTPUT_DATA + structureprofiler_job['output_data']).json()

Job d984d624-12bb-4279-b7fc-603d69663013 completed with success


This call uses a local density file. We could also use a PDB Code, which is then used to retrieve the electron density map. This would look like this:

In [11]:
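#with open(TEST_FILES / '4agm.pdb') as upload_file:
    #query = {'protein_file': upload_file}
    #params = {'pdb_code': '4agm'}
    #job_submission = requests.post(STRUCTUREPROFILER, files=query, data=params).json()  # NOTE: assumes pdb_code is sent as a form field - confirm against the API
#structureprofiler_job = poll_job(job_submission['job_id'], STRUCTUREPROFILER_JOBS)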

In [12]:
# Build one raw (unstyled) table per result section of the second job
profiler_results = output_data['output_data']
complex_data = pd.DataFrame.from_dict([profiler_results['complex']])
ligand_data = pd.DataFrame.from_dict(profiler_results['ligands'])
active_sites_data = pd.DataFrame.from_dict(profiler_results['active_sites'])


In [13]:
# One-row frame built from the complex results, flipped so each criterion is a row
complex_data = pd.DataFrame.from_dict([output_data['output_data']['complex']]).transpose()
# Row-wise styling: table_style returns one background color per cell
complex_data.style.apply(table_style, axis=1)

Unnamed: 0,0
DPI,0.115000
rFree,0.197000
rFactor,0.173000
resolution,1.520000
overfittingTest,True
significanceTest,True
complexStructureProfilerTests,True


In [14]:
ligand_data = pd.DataFrame.from_dict(output_data['output_data']['ligands'])
# Number the columns from the data itself: a hard-coded list of names raises a
# length-mismatch error as soon as a job reports a different number of ligands.
ligand_data.columns = [f'Ligand_{number}' for number in range(1, ligand_data.shape[1] + 1)]
ligand_data.style.apply(table_style, axis=1)

Unnamed: 0,Ligand_1,Ligand_2,Ligand_3
ID,400,400,1294
NROT,5,5,5
OWAB,18.200000,20.100000,0.000000
logP,2.490000,1.070000,2.650000
name,P86_A_400,P86_B_400,NXG_A_1294
EDIAm,0.320000,0.710000,0.380000
chain,A,B,A
HETCode,P86,P86,NXG
noAltLocs,True,True,True
heavyAtoms,21,21,24


In [15]:
active_site_data = pd.DataFrame.from_dict(output_data['output_data']['active_sites'])
# Number the columns from the data itself: a hard-coded list of names raises a
# length-mismatch error as soon as a job reports a different number of active sites.
active_site_data.columns = [
    f'Active_site_{number}' for number in range(1, active_site_data.shape[1] + 1)
]
active_site_data.style.apply(table_style, axis=1)

Unnamed: 0,Active_site_1,Active_site_2,Active_site_3
chains,"A,B","A,B","A,B"
ligand,P86_A_400,P86_B_400,NXG_A_1294
noAltLocs,True,True,True
uniprotID,P04637,P04637,P04637
bondAnglesTest,False,False,False
bondLengthsTest,True,True,True
residueEDIATest,True,False,True
bFactorRatioTest,True,True,True
noIntermolecularClash,True,True,True
noIntramolecularClash,True,True,True
