# SIENA: Efficient Compilation of Selective Protein Binding Site Ensembles

In this notebook we show how to use the SIENA tool to compile a binding site ensemble for a given binding site.
The tool needs a protein structure and a definition of the query binding site as input. The query binding
site can be given through a reference ligand or by specifying a set of amino acid residues.

In just a few seconds SIENA scans the PDB for similar binding sites and automatically generates a structure
ensemble of the results with the query site.

Stefan Bietz and Matthias Rarey
Journal of Chemical Information and Modeling 2016 56 (1), 248-259
https://doi.org/10.1021/acs.jcim.5b00588

In [1]:
# imports
import json
import io
from pathlib import Path
import requests
import sys
import time
import warnings

from IPython.display import Image
from Bio.PDB import *
from Bio.PDB.PDBExceptions import PDBConstructionWarning
import nglview as nv
import numpy as np
from rdkit import Chem



In [2]:
# constants

# directory holding the example input files used in this notebook
TEST_FILES = Path('../test_files/')

# base URL of the server; every endpoint below is derived from it
PROTEINS_PLUS_URL = 'http://localhost:8000/'

# molecule handler endpoints
UPLOAD = f'{PROTEINS_PLUS_URL}molecule_handler/upload/'
UPLOAD_JOBS = f'{UPLOAD}jobs/'
PROTEINS = f'{PROTEINS_PLUS_URL}molecule_handler/proteins/'
LIGANDS = f'{PROTEINS_PLUS_URL}molecule_handler/ligands/'

# SIENA endpoints
SIENA = f'{PROTEINS_PLUS_URL}siena/'
SIENA_JOBS = f'{SIENA}jobs/'

In [3]:
# utils

# Verify up front that the server answers, so connection problems surface here
# and not in the middle of a later cell.
try:
    response = requests.get(PROTEINS_PLUS_URL)
except requests.ConnectionError as connection_error:
    connection_refused = 'Connection refused' in str(connection_error)
    if connection_refused:
        print('WARNING: could not establish a connection to the server', file=sys.stderr)
    # the error is always re-raised; the warning above is only an additional hint
    raise
    
def poll_job(job_id, poll_url, poll_interval=1, max_polls=10):
    """Poll the progress of a job

    Continuously polls the server in regular intervals and updates the job information, especially the status.
    Polling stops as soon as the status is neither 'pending' nor 'running', or once max_polls polling
    requests have been sent. In the latter case the last polled, still unfinished job is returned.

    :param job_id: UUID of the job to poll
    :type job_id: str
    :param poll_url: URL to send the polling request to
    :type poll_url: str
    :param poll_interval: time interval between polls in seconds
    :type poll_interval: int
    :param max_polls: maximum number of times to poll before exiting
    :type max_polls: int
    :return: polled job
    :rtype: dict
    :raises requests.HTTPError: if the server answers a polling request with an error status code
    """
    def fetch_job():
        # one place for the request, so the first and all later polls are checked the same way
        response = requests.get(poll_url + job_id)
        # surface HTTP errors directly instead of failing later on a missing 'status' key
        response.raise_for_status()
        return response.json()

    job = fetch_job()
    status = job['status']
    current_poll = 0
    while status in ('pending', 'running'):
        print(f'Job {job_id} is { status }')
        current_poll += 1
        if current_poll >= max_polls:
            # current_poll requests were sent, but only current_poll - 1 sleeps happened in between
            print(f'Job {job_id} has not completed after {current_poll} polling requests' \
                  f' and {poll_interval * (current_poll - 1)} seconds')
            return job
        time.sleep(poll_interval)
        job = fetch_job()
        status = job['status']
    print(f'Job {job_id} completed with { status }')
    return job

def print_data_fields(model):
    """Print the fields of a model

    Every key of the model is written on its own line as a quoted bullet point.

    :param model: data model
    :type model: dict
    """
    bullet_lines = [f' - "{field}"' for field in model.keys()]
    # nothing is printed at all for a model without fields
    if bullet_lines:
        print('\n'.join(bullet_lines))

In [4]:
# Parse the query protein from the test files and open it in a 3D viewer
query_pdb_path = TEST_FILES / '4agm.pdb'
protein_structure = PDBParser().get_structure('4agm', query_pdb_path)
view = nv.show_biopython(protein_structure)
view



NGLWidget()

We can use a reference ligand to run a binding site search

In [5]:
# Load the ligand that serves as reference for the query binding site and add it to the view
with open(TEST_FILES / 'NXG_A_1294.sdf') as upload_ligand_file:
    ligand_mol_block = upload_ligand_file.read()

# removeHs=False: the reference ligand is shown with hydrogens
ligand_structure = Chem.MolFromMolBlock(ligand_mol_block, removeHs=False)
view.add_structure(nv.RdkitStructure(ligand_structure))

view

NGLWidget()

Using Biopython and RDKit the residues of the binding site can be illustrated easily

In [7]:
def get_amino_acid_atoms(protein_structure):
    """Collects all atoms of the structure that belong to an amino acid residue.

    An atom is kept if is_aa accepts its parent residue.

    :param protein_structure: the protein structure
    :type protein_structure: Bio.PDB.Structure.Structure
    :return: the amino acid atoms, in the order the structure yields them
    :rtype: list
    """
    amino_acid_atoms = []
    for atom in protein_structure.get_atoms():
        if is_aa(atom.parent):
            amino_acid_atoms.append(atom)
    return amino_acid_atoms

def conformer_to_matrix(conformer):
    """Copies the atom coordinates of the conformer into a numpy matrix.

    :param conformer: the conformer
    :type conformer: rdkit.Chem.rdchem.Conformer
    :return: matrix with one row per atom and the three coordinates as columns
    :rtype: numpy.ndarray
    """
    num_atoms = conformer.GetNumAtoms()
    positions = np.empty((num_atoms, 3))
    # every row is a view into the matrix, so assigning to it fills the matrix in place
    for atom_index, row in enumerate(positions):
        row[:] = conformer.GetAtomPosition(atom_index)
    return positions

def get_close_residues(protein_structure, ligand_structure, distance_threshold):
    """Determines protein residues close to ligand.

    A residue is close if at least one of its atoms is closer than
    distance_threshold to a ligand atom.

    :param protein_structure: the protein structure
    :type protein_structure: Bio.PDB.Structure.Structure
    :param ligand_structure: the ligand structure
    :type ligand_structure: rdkit.Chem.rdchem.Mol
    :param distance_threshold: distance threshold for close residues.
    :type distance_threshold: float
    :return: the close residues without duplicates, ordered by the first appearance of
        one of their close atoms in the protein structure
    :rtype: list
    """

    # Get all amino acid atoms in the input structure
    aa_atoms = get_amino_acid_atoms(protein_structure)

    # Extract the atom coordinates of the ligand into a numpy matrix
    lig_coord_matrix = conformer_to_matrix(ligand_structure.GetConformer())

    # Get 'close' residues by checking protein-ligand atom distances.
    # For illustrative purposes this is still an all-vs-all comparison, but the distances of one
    # protein atom to all ligand atoms are computed in a single numpy call.
    close_residues = []  # keeps a reproducible order, unlike converting a set to a list
    seen_residues = set()
    for protein_atom in aa_atoms:
        residue = protein_atom.parent
        if residue in seen_residues:
            # the residue is already known to be close, its remaining atoms cannot change that
            continue
        distances = np.linalg.norm(lig_coord_matrix - protein_atom.coord, axis=1)
        # np.any is False for a ligand without atoms, so no residue is reported in that case
        if np.any(distances < distance_threshold):
            seen_residues.add(residue)
            close_residues.append(residue)
    return close_residues

# The pocket consists of all residues with an atom closer than 6.5 to a ligand atom
close_residues = get_close_residues(protein_structure, ligand_structure, distance_threshold=6.5)
num_close_residues = len(close_residues)
f'Found {num_close_residues} close residues'

'Found 24 close residues'

With NGL's selection language we can visualize the side chains of the close residues in the 3D view

In [8]:
def residues_to_selection(residues):
    """Converts a list of biopython residues to a NGL selection string.

    Every residue becomes one term '(:<chain id> and <residue number>)'. If the residue has an
    insertion code the number is followed by '^<insertion code>'. The terms are joined with ' or '.

    :param residues: the residue list
    :type residues: List[Bio.PDB.Residue.Residue]
    :return: the selection string, which is empty for an empty residue list
    :rtype: str
    """
    terms = []
    for residue in residues:
        # the id is unpacked as (hetero flag, number, insertion code); only the last two are
        # part of the selection, the hetero flag is dropped
        _, number, icode = residue.get_id()
        icode = icode.strip()
        # NGL's selection language marks insertion codes with a leading '^'
        position = f'{number}^{icode}' if icode else f'{number}'
        terms.append(f'(:{residue.parent.id} and {position})')
    return ' or '.join(terms)

# Draw the pocket residues in ball+stick representation
pocket_selection = residues_to_selection(close_residues)
view.add_representation("ball+stick", selection=pocket_selection)
view

NGLWidget()

### Binding site search with a ligand query
Here we start the SIENA job on the server by uploading a PDB and ligand SDF file

In [9]:
# Upload the protein and the reference ligand to start the SIENA job.
# The files are opened in binary mode, as the requests documentation recommends for uploads
# (text mode can lead to a wrong Content-Length header).
with open(TEST_FILES / 'NXG_A_1294.sdf', 'rb') as upload_ligand_file, \
        open(TEST_FILES / '4agm.pdb', 'rb') as upload_file:
    query = {'protein_file': upload_file, 'ligand_file': upload_ligand_file}
    submission_response = requests.post(SIENA, files=query)

# stop with the HTTP error of a rejected submission instead of a KeyError on 'job_id'
submission_response.raise_for_status()
job_submission = submission_response.json()
siena_job = poll_job(job_submission['job_id'], SIENA_JOBS)

Job 29fb6230-f8b9-41a8-8853-1b681d2519f3 completed with success


Let's look at the ensemble of the first 5 binding sites found in the PDB search

In [10]:
# Fetch the first five proteins of the result ensemble and add each of them to the existing view
protein_hits = []
for protein_id in siena_job['output_proteins'][:5]:
    hit = requests.get(PROTEINS + protein_id).json()
    protein_hits.append(hit)
    hit_name = hit['name']
    print('Showing hit:', hit_name)

    # the PDB content arrives as a string, so it is wrapped in a file-like object for the parser;
    # construction warnings of the parser are silenced while parsing
    hit_file = io.StringIO(hit['file_string'])
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', PDBConstructionWarning)
        hit_structure = PDBParser().get_structure(hit_name, hit_file)
    view.add_structure(nv.BiopythonStructure(hit_structure))
view

Showing hit: 2J1X_1
Showing hit: 2J1X_2
Showing hit: 2VUK_3
Showing hit: 2VUK_4
Showing hit: 2X0U_5


NGLWidget()

### Binding site search with a residue list query
A binding site can also be defined by a list of residues which we provide to SIENA as a JSON string

Let's start by making a new 3D viewer and show the side chains of the residues we want to use as binding site definition. 
We just re-use the close_residues we determined before.

In [11]:
# Second viewer for the residue list query, with the pocket residues drawn in ball+stick
pocket_selection = residues_to_selection(close_residues)
view2 = nv.show_biopython(protein_structure)
view2.add_representation("ball+stick", selection=pocket_selection)
view2

NGLWidget()

In [12]:
def biopython_residue_to_dict(residue):
    """Converts a biopython residue to a dict.

    The position is the residue number directly followed by the insertion code, if there is one.

    :param residue: the residue
    :type residue: Bio.PDB.Residue.Residue
    :return: dict with the keys 'name', 'position' and 'chain'
    :rtype: dict
    """
    _hetero_flag, seq_number, insertion_code = residue.get_id()
    # a blank insertion code is stripped and therefore contributes nothing to the position
    position = '{}{}'.format(seq_number, insertion_code.strip())
    return dict(name=residue.get_resname(), position=position, chain=residue.parent.id)

We can simply convert a list of biopython residues to JSON and send it to the server to start a SIENA job with it

In [13]:
# Build the JSON query from the close residues determined before
residue_query_dict = {'residue_ids': [biopython_residue_to_dict(r) for r in close_residues]}
residue_query_json = json.dumps(residue_query_dict)

# call SIENA job again but with a protein_site_json query
# The file is opened in binary mode, as the requests documentation recommends for uploads
# (text mode can lead to a wrong Content-Length header).
with open(TEST_FILES / '4agm.pdb', 'rb') as upload_file:
    query = {'protein_file': upload_file}
    other_submission_response = requests.post(SIENA, files=query, data={'protein_site_json': residue_query_json})

# stop with the HTTP error of a rejected submission instead of a KeyError on 'job_id'
other_submission_response.raise_for_status()
other_job_submission = other_submission_response.json()
other_siena_job = poll_job(other_job_submission['job_id'], SIENA_JOBS)
other_job_submission

Job 96706925-7057-41c6-a13a-ac9b186eedf9 completed with success


{'job_id': '96706925-7057-41c6-a13a-ac9b186eedf9',
 'retrieved_from_cache': True}

Again we visualize the first retrieved binding sites 

In [14]:
# Fetch the first five proteins of the second result ensemble and add each of them to view2
protein_hits2 = []
for protein_id in other_siena_job['output_proteins'][:5]:
    hit = requests.get(PROTEINS + protein_id).json()
    protein_hits2.append(hit)
    hit_name = hit['name']
    print('Showing hit:', hit_name)

    # the PDB content arrives as a string, so it is wrapped in a file-like object for the parser;
    # construction warnings of the parser are silenced while parsing
    hit_file = io.StringIO(hit['file_string'])
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', PDBConstructionWarning)
        hit_structure = PDBParser().get_structure(hit_name, hit_file)
    view2.add_structure(nv.BiopythonStructure(hit_structure))
view2

Showing hit: 2J1X_1
Showing hit: 2J1X_2
Showing hit: 2VUK_3
Showing hit: 2VUK_4
Showing hit: 2X0U_5


NGLWidget()