# Find candidate proteins for protein-ligand free energy benchmarks

Start with the protein-ligand validation sets in BindingDB, filter for:

- Small(ish)
- No metals
- No histidines near the ligand

It would also be nice to filter for a surface-exposed binding pocket, but I am not sure exactly how to do that yet.

In [2]:
import os
import subprocess as sp
import urllib
import urllib.error
import urllib.request
from io import StringIO

import pandas as pd


In [3]:
%load_ext blackcellmagic

## Read in the protein-ligand validation sets from BindingDB
http://bindingdb.org/validation_sets/index.jsp

In [4]:
# Read the list as a single column called "PDB". `names` has to be an ordered
# collection (pandas rejects a set), so it is passed as a list.
validation_set = pd.read_csv("validation_sets_PDBs.tsv", names=["PDB"])


Extract just the PDBs from this list.

In [5]:
# Plain Python list of the values in the "PDB" column.
pdbs = validation_set["PDB"].tolist()

In [6]:
# Report how many PDB entries the pipeline starts from.
print(f"Starting with {len(pdbs)} structures...")

Starting with 778 structures...


## Filter for proteins less than 200 AAs

In [7]:
def filter_chain_length(pdbs, length=200):
    """Return the entries of `pdbs` that pass an RCSB sequence-length query.

    A two-stage composite XML query is POSTed to the RCSB REST search
    service: stage 0 restricts the search to the given structure IDs and
    stage 1 keeps hits whose chain length lies between 1 and `length`.

    NOTE(review): this targets the legacy `www.rcsb.org/pdb/rest` service,
    which RCSB has reportedly retired in favour of newer APIs -- confirm the
    endpoint still responds before relying on this function.

    Parameters
    ----------
    pdbs : iterable of str
        PDB IDs to filter.
    length : int, optional
        Upper bound sent as `v_sequence.chainLength.max`.

    Returns
    -------
    list of str or None
        The whitespace-separated tokens of the response body, an empty list
        when `pdbs` is empty, or None when the server answers with an HTTP
        error (the error is printed).
    """
    pdbs = list(pdbs)
    if not pdbs:
        # Filtering nothing yields nothing; skip the network round trip.
        return []

    url = "http://www.rcsb.org/pdb/rest/search"
    query_text = f"""
<orgPdbCompositeQuery version="1.0">
 <queryRefinement>
   <queryRefinementLevel>0</queryRefinementLevel>
      <orgPdbQuery>
        <version>head</version>
        <queryType>org.pdb.query.simple.StructureIdQuery</queryType>
        <structureIdList>{' '.join(pdbs)}</structureIdList>
      </orgPdbQuery>
 </queryRefinement>
 <queryRefinement>
   <queryRefinementLevel>1</queryRefinementLevel>
       <orgPdbQuery>
        <version>head</version>
        <queryType>org.pdb.query.simple.SequenceLengthQuery</queryType>
        <v_sequence.chainLength.min>1</v_sequence.chainLength.min>
        <v_sequence.chainLength.max>{length}</v_sequence.chainLength.max>
      </orgPdbQuery>
  </queryRefinement>
</orgPdbCompositeQuery>
"""
    # Supplying `data` makes this a POST request.
    request = urllib.request.Request(url, data=query_text.encode())
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(request) as response:
            page = response.read()
    except urllib.error.HTTPError as e:
        print(f"PDB error: {e}")
        return None

    return page.decode("utf-8").split()


In [None]:
# Keep only entries that pass the chain-length query. The helper returns None
# (after printing a message) when the PDB server reports an HTTP error.
small_pdbs = filter_chain_length(pdbs, length=200)

In [None]:
# Keep only the part of each returned token that precedes the first ":".
small_pdbs = [entry.partition(":")[0] for entry in small_pdbs]

In [None]:
# Report how many structures survive the chain-length filter.
print(f"Filtered to {len(small_pdbs)} structures...")

At this point, I originally planned to filter for metals in the structures.

But this turns out to be tricky: it is fine if a chlorine atom is part of the ligand, but we don't want chlorine atoms floating around the structure that are cofactors or are otherwise not part of the ligand substructure.

Therefore, instead of searching the PDB now, I do search for metals later, using Chimera.

```python
def eliminate_metals(pdbs, disallowed_elements=""):
    """Run an RCSB chemical-formula query restricted to `pdbs`.

    A two-stage composite XML query is POSTed to the RCSB REST search
    service: stage 0 restricts the search to the given structure IDs and
    stage 1 applies a `ChemCompFormulaQuery` with `disallowed_elements` as
    the formula.

    NOTE(review): whether the formula stage returns the structures that
    contain the formula or the ones that lack it is not visible here --
    confirm before trusting the function name.

    Parameters
    ----------
    pdbs : iterable of str
        PDB IDs to search within.
    disallowed_elements : str, optional
        Text placed in the `<formula>` element of the query.

    Returns
    -------
    list of str or None
        The whitespace-separated tokens of the response body, an empty list
        when `pdbs` is empty, or None when the server answers with an HTTP
        error (the error is printed).
    """
    pdbs = list(pdbs)
    if not pdbs:
        # Nothing to search within; skip the network round trip.
        return []

    url = "http://www.rcsb.org/pdb/rest/search"
    query_text = f"""
<orgPdbCompositeQuery version="1.0">
 <queryRefinement>
   <queryRefinementLevel>0</queryRefinementLevel>
      <orgPdbQuery>
        <version>head</version>
        <queryType>org.pdb.query.simple.StructureIdQuery</queryType>
        <structureIdList>{" ".join(pdbs)}</structureIdList>
      </orgPdbQuery>
 </queryRefinement>
 <queryRefinement>
   <queryRefinementLevel>1</queryRefinementLevel>
       <orgPdbQuery>
        <version>head</version>
        <queryType>org.pdb.query.simple.ChemCompFormulaQuery</queryType>
        <formula>{disallowed_elements}</formula>
      </orgPdbQuery>
 </queryRefinement>
</orgPdbCompositeQuery>
"""
    # Supplying `data` makes this a POST request.
    request = urllib.request.Request(url, data=query_text.encode())
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(request) as response:
            page = response.read()
    except urllib.error.HTTPError as e:
        print(f"PDB error: {e}")
        return None

    return page.decode("utf-8").split()
```

In [10]:
def get_structures_with_ligands(pdbs):
    """Return the entries of `pdbs` that the RCSB reports as having free ligands.

    A two-stage composite XML query is POSTed to the RCSB REST search
    service: stage 0 restricts the search to the given structure IDs and
    stage 1 applies a `NoLigandQuery` with `haveLigands` set to `yes`.

    Parameters
    ----------
    pdbs : iterable of str
        PDB IDs to filter.

    Returns
    -------
    list of str or None
        The whitespace-separated tokens of the response body, an empty list
        when `pdbs` is empty, or None when the server answers with an HTTP
        error (the error is printed).
    """
    pdbs = list(pdbs)
    if not pdbs:
        # Filtering nothing yields nothing; skip the network round trip.
        return []

    url = "http://www.rcsb.org/pdb/rest/search"
    query_text = f"""
<orgPdbCompositeQuery version="1.0">
 <queryRefinement>
  <queryRefinementLevel>0</queryRefinementLevel>
  <orgPdbQuery>
    <version>head</version>
    <queryType>org.pdb.query.simple.StructureIdQuery</queryType>
    <structureIdList>{" ".join(pdbs)}</structureIdList>
  </orgPdbQuery>
 </queryRefinement>
 <queryRefinement>
  <queryRefinementLevel>1</queryRefinementLevel>
  <conjunctionType>and</conjunctionType>
  <orgPdbQuery>
    <version>head</version>
    <queryType>org.pdb.query.simple.NoLigandQuery</queryType>
    <description>Ligand Search : Has free ligands=yes</description>
    <haveLigands>yes</haveLigands>
  </orgPdbQuery>
 </queryRefinement>
</orgPdbCompositeQuery>
"""
    # Supplying `data` makes this a POST request.
    request = urllib.request.Request(url, data=query_text.encode())
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(request) as response:
            page = response.read()
    except urllib.error.HTTPError as e:
        print(f"PDB error: {e}")
        return None

    return page.decode("utf-8").split()


In [11]:
# Narrow the small structures down to those reported as having free ligands.
with_ligands = get_structures_with_ligands(small_pdbs)

In [12]:
# Report how many structures survive the ligand filter.
print(f"Filtered to {len(with_ligands)} structures...")

Filtered to 97 structures...


## Combine the small protein structures with their ligand(s) into a single `pandas` DataFrame

In [13]:
def combine_structure_and_ligand(pdbs):
    """Download a per-ligand report for `pdbs` from the RCSB custom-report service.

    The report is requested as CSV with the columns `structureId`,
    `uniprotAcc`, `ligandName`, `ligandId` and `ligandMolecularWeight`.

    Parameters
    ----------
    pdbs : iterable of str
        PDB IDs; they are comma-joined into the `pdbids` query parameter.

    Returns
    -------
    pandas.DataFrame or None
        The parsed CSV report, or None when the server answers with an HTTP
        error (the error is printed).
    """
    report_columns = "structureId,uniprotAcc,ligandName,ligandId,ligandMolecularWeight"
    url = (
        "https://www.rcsb.org/pdb/rest/customReport.xml"
        f"?pdbids={','.join(pdbs)}"
        f"&customReportColumns={report_columns}"
        "&service=wsfile&format=csv"
    )
    request = urllib.request.Request(url)
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(request) as response:
            page = response.read()
    except urllib.error.HTTPError as e:
        print(f"PDB error: {e}")
        return None

    return pd.read_csv(StringIO(page.decode("utf-8")), sep=",")


In [14]:
# One row per reported ligand of every remaining structure (None on HTTP error).
ligand_details = combine_structure_and_ligand(with_ligands)


In [15]:
# Discard report rows that carry no ligand name.
ligand_details = ligand_details.dropna(subset=["ligandName"])


Many structures will report multiple "ligands": cosolvents, glycerol, ions, and a true small molecule ligand. We're just interested in structures with a small molecule ligand, so for each structure we'll sort the "ligands" by molecular weight and take the protein + ligand complex with the highest molecular weight.

In [16]:
# For every structure keep only its heaviest "ligand" row. The winning rows are
# collected in a list and turned into a DataFrame once: growing a frame with
# `DataFrame.append` inside a loop copies it on every iteration, and that
# method no longer exists in pandas >= 2.0.
# `sort=False` keeps the structures in order of first appearance.
winners = [
    group.sort_values(by="ligandMolecularWeight", ascending=False).iloc[0]
    for _, group in ligand_details.groupby("structureId", sort=False)
]
# Passing `columns` keeps the expected columns even when there are no winners.
structures_with_ligands = pd.DataFrame(
    winners, columns=ligand_details.columns
).reset_index(drop=True)


## Filter for complexes that do not have histidine in the binding site

In [17]:
def run_chimera(input_file="tmp/chimera.com"):
    """Run the `chimera` executable on a command script and wait for it to exit.

    Parameters
    ----------
    input_file : str, optional
        Path of the Chimera command script to execute.

    Returns
    -------
    None
        The exit status of the process is not inspected.

    Raises
    ------
    FileNotFoundError
        If no `chimera` executable can be found on the search path.
    """
    # Pass argv as a list (no shell) so that a path containing spaces or shell
    # metacharacters reaches Chimera as one unmodified argument.
    sp.call(["chimera", input_file])


In [18]:
def find_histidines(
    pdb, ligand, input_file="tmp/contacts.com", output_file="contacts.txt"
):
    """Write a Chimera command script that saves what lies near a ligand.

    The script opens `pdb`, selects `:{ligand} z<5`, writes that selection to
    `output_file` with `writesel`, and ends with `stop`. Nothing is executed
    here; the script is only written to `input_file`, and no histidine
    filtering happens at this stage.
    """
    script_lines = [
        "",
        f"open {pdb}",
        f"select :{ligand} z<5",
        f"writesel {output_file}",
        "stop",
        "        ",  # the script ends with a whitespace-only line, no newline
    ]
    with open(input_file, "w") as script:
        script.write("\n".join(script_lines))


def get_histidines(
    histidines=["HIS", "HSE", "HSD", "HID", "HIP", "HIE"], input_file="tmp/contacts.txt"
):
    """Count the rows of a selection file whose residue name is a histidine.

    `input_file` is parsed as space-separated text with three columns
    (`Model`, `Res`, `ResID.Chain`); a row counts when its `Res` value is one
    of `histidines`.
    """
    column_names = ["Model", "Res", "ResID.Chain"]
    selection = pd.read_csv(input_file, sep=" ", names=column_names)
    is_histidine = selection["Res"].isin(histidines)
    return int(is_histidine.sum())


In [19]:
# Write one Chimera contact query per complex, run it only when its result
# file is missing, and record how many histidine rows each result contains.
# NOTE(review): the result is looked up under `queries/` although `writesel`
# receives a bare file name -- presumably Chimera resolves it relative to the
# script's directory; confirm.
path = "queries"
good_pocket = {}
for pdb, ligand in zip(
    structures_with_ligands["structureId"], structures_with_ligands["ligandId"]
):
    chimera_in = f"queries/{pdb}-his.com"
    chimera_out = f"{pdb}-his.txt"
    result_file = os.path.join(path, chimera_out)

    find_histidines(pdb, ligand, input_file=chimera_in, output_file=chimera_out)
    if not os.path.exists(result_file):
        run_chimera(input_file=chimera_in)
    good_pocket[pdb] = get_histidines(input_file=result_file)


In [20]:
# Turn the {structure: count} mapping into a table and expose the index as a
# regular "structureId" column so it can be merged on.
structures_with_histidines = pd.DataFrame.from_dict(
    good_pocket, orient="index", columns=["Histidines"]
).assign(structureId=lambda frame: frame.index)


In [21]:
# Join on the column names both tables share (pandas' default), then keep the
# complexes with no histidine row in the contact selection.
structures = structures_with_ligands.merge(structures_with_histidines)
structures = structures[structures["Histidines"] == 0]

In [22]:
# Report how many complexes survive the histidine filter.
print(f"Filtered to {len(structures)} structures...")

Filtered to 95 structures...


## Filter for complexes that don't have an unbound metal ion

In [23]:
def find_metals_not_in_ligand(
    pdb,
    ligand,
    disallowed_mask=(
        ":He | :Li | :Be | :B | :F | :Ne | :Na | :Mg | :Al | :Si | :Cl | :Ar | "
        ":K | :Ca | :Sc | :Ti | :V | :Cr | :Mn | :Fe | :Co | :Ni | :Cu | :Zn | "
        ":Ga | :Ge | :As | :Se | :Br | :Kr | :Rb | :Sr | :Y | :Zr | :Nb | :Mo | "
        ":Tc | :Ru | :Rh | :Pd | :Ag | :Cd | :In"
    ),
    input_file="tmp/metals.com",
    output_file="metals.txt",
):
    """Write a Chimera command script that saves disallowed species outside the ligand.

    The script opens `pdb`, selects `{disallowed_mask} & ~:{ligand}`, writes
    that selection to `output_file` with `writesel`, and ends with `stop`.
    Nothing is executed here; the script is only written to `input_file`.
    """
    script_lines = [
        "",
        f"open {pdb}",
        f"select {disallowed_mask} & ~:{ligand}",
        f"writesel {output_file}",
        "stop",
        "        ",  # the script ends with a whitespace-only line, no newline
    ]
    with open(input_file, "w") as script:
        script.write("\n".join(script_lines))


def get_metals_not_in_ligand(input_file):
    """Return the number of rows in a selection file.

    `input_file` is parsed as space-separated text with three columns
    (`Model`, `Res`, `ResID.Chain`); every row counts as one hit.
    """
    column_names = ["Model", "Res", "ResID.Chain"]
    selection = pd.read_csv(input_file, sep=" ", names=column_names)
    return selection.shape[0]


```python
# Symbols that may appear anywhere in a structure.
allowed_elements = ["H", "C", "N", "O", "P", "S"]

# NOTE(review): `elements` is not defined in the visible notebook -- presumably
# an iterable of element objects exposing `.number` and a symbol via `str()`;
# confirm which library it comes from.
disallowed_mask_list = [
    str(element)
    for element in elements
    if str(element) not in allowed_elements and 0 < element.number < 50
]
# Same symbols, upper-cased and each followed by "0 ".
disallowed_elements = "".join(
    f"{symbol.upper()}0 " for symbol in disallowed_mask_list
)


In [24]:
# Write one Chimera metal query per complex, run it only when its result file
# is missing, and record how many rows each result contains.
path = "queries"
metals = {}
for pdb, ligand in zip(structures["structureId"], structures["ligandId"]):
    chimera_in = f"queries/{pdb}-metal.com"
    chimera_out = f"{pdb}-metal.txt"
    result_file = os.path.join(path, chimera_out)

    find_metals_not_in_ligand(
        pdb, ligand, input_file=chimera_in, output_file=chimera_out
    )
    if not os.path.exists(result_file):
        run_chimera(input_file=chimera_in)
    metals[pdb] = get_metals_not_in_ligand(input_file=result_file)

In [25]:
# Turn the {structure: count} mapping into a table and expose the index as a
# regular "structureId" column so it can be merged on.
structures_with_metals = pd.DataFrame.from_dict(
    metals, orient="index", columns=["Metals"]
).assign(structureId=lambda frame: frame.index)


In [26]:
# Join on the column names both tables share (pandas' default), then keep the
# complexes whose metal query returned no rows.
structures = structures.merge(structures_with_metals)
structures = structures[structures["Metals"] == 0]

In [27]:
# Report how many complexes survive the metal filter.
print(f"Filtered to {len(structures)} structures...")

Filtered to 60 structures...


In [28]:
# Preview the first rows of the filtered table.
structures.head()

Unnamed: 0,chainId,ligandId,ligandMolecularWeight,ligandName,structureId,uniprotAcc,Histidines,Metals
0,H,MID,521.63,1-[N-(naphthalen-2-ylsulfonyl)glycyl-4-carbami...,1ETS,P00735,0,0
3,B,YZ9,234.21,7-HYDROXY-2-OXO-CHROMENE-3-CARBOXYLIC ACID ETH...,1GCZ,P14174,0,0
6,A,MAO,339.35,5'-DEOXY-5'-[N-METHYL-N-(2-AMINOOXYETHYL) AMIN...,1I72,P17707,0,0
7,A,C2P,323.2,CYTIDINE-2'-MONOPHOSPHATE,1JVU,P61823,0,0
8,A,MGP,538.22,7-METHYL-GUANOSINE-5'-TRIPHOSPHATE,1L8B,P63073,0,0


In [29]:
def chain_length(pdbs):
    """Download chain lengths for `pdbs` from the RCSB custom-report service.

    The report is requested as CSV with the columns `structureId` and
    `chainLength`.

    Parameters
    ----------
    pdbs : iterable of str
        Identifiers that are comma-joined into the `pdbids` query parameter.
        The notebook passes `PDB.CHAIN` strings.

    Returns
    -------
    pandas.DataFrame or None
        The parsed CSV report, or None when the server answers with an HTTP
        error (the error is printed).
    """
    url = (
        "https://www.rcsb.org/pdb/rest/customReport.xml"
        f"?pdbids={','.join(pdbs)}"
        "&customReportColumns=structureId,chainLength"
        "&service=wsfile&format=csv"
    )
    request = urllib.request.Request(url)
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(request) as response:
            page = response.read()
    except urllib.error.HTTPError as e:
        print(f"PDB error: {e}")
        return None

    return pd.read_csv(StringIO(page.decode("utf-8")), sep=",")


## Filter to make sure the chain the ligand is part of is under the threshold

In [30]:
# One "PDB.CHAIN" identifier per remaining complex.
queries = [
    ".".join(pdb_and_chain)
    for pdb_and_chain in zip(structures["structureId"], structures["chainId"])
]

In [31]:
# Fetch the chain-length report for the "PDB.CHAIN" identifiers (None on HTTP error).
chain_lengths = chain_length(queries)

In [32]:
# Attach the chain lengths, joining on the column names both tables share.
structures = structures.merge(chain_lengths)

In [78]:
# Display the rows whose chainLength is below 200.
# NOTE(review): the selection is only displayed, not assigned back to
# `structures` -- confirm whether the filter was meant to be applied.
structures[structures.chainLength < 200]

Unnamed: 0,chainId,ligandId,ligandMolecularWeight,ligandName,structureId,uniprotAcc,Histidines,Metals,chainLength
1,B,YZ9,234.21,7-HYDROXY-2-OXO-CHROMENE-3-CARBOXYLIC ACID ETH...,1GCZ,P14174,0,0,122
3,A,C2P,323.2,CYTIDINE-2'-MONOPHOSPHATE,1JVU,P61823,0,0,124
4,A,MGP,538.22,7-METHYL-GUANOSINE-5'-TRIPHOSPHATE,1L8B,P63073,0,0,190
9,B,U2P,324.18,"PHOSPHORIC ACID MONO-[2-(2,4-DIOXO-3,4-DIHYDRO...",1O0M,P61823,0,0,124
10,B,U3P,324.18,3'-URIDINEMONOPHOSPHATE,1O0N,P61823,0,0,124
11,A,852,629.66,2-{4-[2-ACETYLAMINO-2-(1-BIPHENYL-4-YLMETHYL-2...,1O44,P12931,0,0,108
12,A,903,661.67,2-{4-[2-ACETYLAMINO-2-(1-BIPHENYL-4-YLMETHYL-2...,1O46,P12931,0,0,108
13,A,822,613.59,"N-ACETYL-N-[1-(1,1'-BIPHENYL-4-YLMETHYL)-2-OXO...",1O47,P12931,0,0,108
14,A,853,585.65,5-[2-ACETYLAMINO-2-(1-BIPHENYL-4-YLMETHYL-2-OX...,1O48,P12931,0,0,108
15,A,493,637.62,{4-[2-ACETYLAMINO-2-(1-BIPHENYL-4-YLMETHYL-2-O...,1O49,P12931,0,0,108


In [79]:
# Save the table, including its index column, to structures.csv.
structures.to_csv("structures.csv")

In [34]:
# Number of rows currently in the table.
len(structures)

60

In [35]:
import nglview

In [45]:
# Fetch entry 4AC4 into an interactive NGL widget with the GUI enabled.
view = nglview.show_pdbid("4AC4", gui=True)
# Overlay a semi-transparent surface on the protein selection.
view.add_surface(selection="protein", opacity=0.3)

# Last expression of the cell, so the notebook displays the widget.
view

NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [76]:
def render(
    pdb,
    ligand,
    input_file="tmp/snapshot.com",
):
    """Write a Chimera command script that saves a snapshot of a complex.

    The script opens `pdb`, issues a fixed sequence of display and colouring
    commands for the protein and for `:{ligand}`, centres on the ligand,
    saves `{pdb}.png` via `copy file`, and ends with `stop`. Nothing is
    executed here; the script is only written to `input_file`.
    """
    script_lines = [
        "",
        f"open {pdb}",
        "~show",
        "~display",
        "ribbon",
        "color grey protein",
        "surface protein",
        "surftransp 90",
        f"display :{ligand}",
        f"color purple :{ligand}",
        f"color byhet :{ligand}",
        f"center :{ligand}",
        f"copy file {pdb}.png supersample 3",
        "stop",
        "        ",  # the script ends with a whitespace-only line, no newline
    ]
    with open(input_file, "w") as script:
        script.write("\n".join(script_lines))


In [77]:
# Name the script after the structure being rendered. The path used to be
# built from `pdb`, a variable left over from an earlier loop, so the 4AC4
# script was written under another structure's name.
snapshot_pdb = "4AC4"
snapshot_script = f"queries/{snapshot_pdb}-snapshot.com"
render(snapshot_pdb, "HKA", input_file=snapshot_script)
run_chimera(input_file=snapshot_script)


In [95]:
def render_pymol(
    pdb,
    ligand,
    input_file="tmp/snapshot.com",
):
    """Write a PyMOL script that saves a snapshot of a protein-ligand complex.

    The script fetches `pdb`, shows the protein as cartoon plus surface and
    the residue named `ligand` as sticks, colours them, centres and zooms on
    the ligand, and saves `{pdb}.png`. Nothing is executed here; the script
    is only written to `input_file`.

    Parameters
    ----------
    pdb : str
        Identifier passed to PyMOL's `fetch`; also the stem of the PNG name.
    ligand : str
        Residue name used in every `resn` selection.
    input_file : str, optional
        Path the script is written to.
    """
    with open(input_file, "w") as f:
        # The `color atomic` selection names the ligand explicitly; without a
        # residue name after `resn` the selection did not target the ligand.
        string = f"""\
fetch {pdb}, async=0

hide
show cartoon
show sticks, resn {ligand}

color green, resn {ligand}
color atomic, resn {ligand} and not elem C

color grey80, polymer
show surface, polymer
set transparency, 0.3

bg_color white

center resn {ligand}
zoom resn {ligand}, 30

set cartoon_fancy_helices
png {pdb}.png, dpi=150

        """
        f.write(string)


In [97]:

# Write and run one PyMOL snapshot script per remaining complex, skipping
# complexes whose script already exists.
for pdb, ligand in zip(structures["structureId"], structures["ligandId"]):

    pymol_in = f"queries/{pdb}-pymol.pml"
    path = "queries"

    # `pymol_in` already contains the `queries/` directory, so test it as-is.
    # Joining `path` in front of it pointed at `queries/queries/...`, which
    # never exists, so every complex was re-rendered on every run.
    if not os.path.exists(pymol_in):
        render_pymol(pdb=pdb, ligand=ligand, input_file=pymol_in)
        # argv list (no shell) so the script path is passed through verbatim.
        sp.call(["pymol", "-cq", pymol_in])

In [83]:
# Display the current table.
structures

Unnamed: 0,chainId,ligandId,ligandMolecularWeight,ligandName,structureId,uniprotAcc,Histidines,Metals,chainLength
0,H,MID,521.63,1-[N-(naphthalen-2-ylsulfonyl)glycyl-4-carbami...,1ETS,P00735,0,0,259
1,B,YZ9,234.21,7-HYDROXY-2-OXO-CHROMENE-3-CARBOXYLIC ACID ETH...,1GCZ,P14174,0,0,122
2,A,MAO,339.35,5'-DEOXY-5'-[N-METHYL-N-(2-AMINOOXYETHYL) AMIN...,1I72,P17707,0,0,267
3,A,C2P,323.2,CYTIDINE-2'-MONOPHOSPHATE,1JVU,P61823,0,0,124
4,A,MGP,538.22,7-METHYL-GUANOSINE-5'-TRIPHOSPHATE,1L8B,P63073,0,0,190
5,B,NAD,663.42,NICOTINAMIDE-ADENINE-DINUCLEOTIDE,1NHG,Q9BH77,0,0,229
6,A,UCN,482.53,7-HYDROXYSTAUROSPORINE,1NVQ,O14757,0,0,289
7,A,STU,466.53,STAUROSPORINE,1NVR,O14757,0,0,289
8,A,UCM,393.39,"REL-(9R,12S)-9,10,11,12-TETRAHYDRO-9,12-EPOXY-...",1NVS,O14757,0,0,289
9,B,U2P,324.18,"PHOSPHORIC ACID MONO-[2-(2,4-DIOXO-3,4-DIHYDRO...",1O0M,P61823,0,0,124
