# Find candidate proteins for protein-ligand free energy benchmarks

Start with the protein-ligand validation sets in BindingDB, then filter for:

- Small(ish)
- No metals
- No histidines near the ligand

It would also be nice to filter for a surface-exposed binding pocket, but I am not sure exactly how to do that yet.

In [131]:
import os
import subprocess as sp
import urllib
import urllib.error
import urllib.request
from io import StringIO

import pandas as pd
import periodictable
from periodictable import elements


In [132]:
%load_ext blackcellmagic

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic


## Read in the protein-ligand validation sets from BindingDB
http://bindingdb.org/validation_sets/index.jsp

In [133]:
# `names` has to be an ordered collection (a list, not a set): pandas rejects
# unordered containers for column names.
validation_set = pd.read_csv("validation_sets_PDBs.tsv", names=["PDB"])


Extract just the PDBs from this list.

In [134]:
# Plain Python list of the values in the "PDB" column.
pdbs = validation_set["PDB"].tolist()

In [135]:
# Baseline: number of entries read from the validation-set file, before any filtering.
print(f"Starting with {len(pdbs)} structures...")

Starting with 778 structures...


## Filter for proteins less than 200 AAs

In [136]:
def filter_chain_length(pdbs, length=200):
    """Query the RCSB search service for the given entries, limited by chain length.

    A two-level composite XML query is POSTed: level 0 restricts the search
    to the supplied structure IDs, level 1 is a ``SequenceLengthQuery`` with
    a minimum chain length of 1 and a maximum of ``length``.

    Parameters
    ----------
    pdbs : iterable of str
        Structure identifiers; they are joined with spaces into the query.
    length : int, optional
        Value sent as ``v_sequence.chainLength.max``.

    Returns
    -------
    list of str or None
        The whitespace-separated tokens of the response body, or ``None``
        when the server answers with an HTTP error (the status is printed).
    """
    # NOTE(review): this is RCSB's legacy REST search endpoint — confirm it is
    # still being served before relying on a fresh run of this notebook.
    url = "http://www.rcsb.org/pdb/rest/search"
    query_text = f"""
<orgPdbCompositeQuery version="1.0">
 <queryRefinement>
   <queryRefinementLevel>0</queryRefinementLevel>
      <orgPdbQuery>
        <version>head</version>
        <queryType>org.pdb.query.simple.StructureIdQuery</queryType>
        <structureIdList>{" ".join(pdbs)}</structureIdList>
      </orgPdbQuery>
 </queryRefinement>
 <queryRefinement>
   <queryRefinementLevel>1</queryRefinementLevel>
       <orgPdbQuery>
        <version>head</version>
        <queryType>org.pdb.query.simple.SequenceLengthQuery</queryType>
        <v_sequence.chainLength.min>1</v_sequence.chainLength.min>
        <v_sequence.chainLength.max>{length}</v_sequence.chainLength.max>
      </orgPdbQuery>
  </queryRefinement>
</orgPdbCompositeQuery>
"""
    request = urllib.request.Request(url, data=query_text.encode())
    try:
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen(request) as response:
            page = response.read()
    except urllib.error.HTTPError as e:
        # Surface the status so a failed query can be diagnosed.
        print(f"PDB error... (HTTP {e.code}: {e.reason})")
        return None

    return page.decode("utf-8").split()


In [163]:
# NOTE(review): the section heading mentions 200 residues, but this call uses
# a cutoff of 100 — confirm which threshold is intended.
small_pdbs = filter_chain_length(pdbs, length=100)

In [166]:
# Keep only the part of each hit before the first ":" (presumably the
# structure ID — verify against the search response), and drop repeats so a
# structure that matched more than once is counted a single time.
# dict.fromkeys preserves first-seen order.
small_pdbs = list(dict.fromkeys(hit.split(":")[0] for hit in small_pdbs))

In [167]:
# Number of entries left after the chain-length query.
print(f"Filtered to {len(small_pdbs)} structures...")

Filtered to 36 structures...


At this point, I originally planned to filter for metals in the structures.

But this turns out to be tricky: it is fine if there is a chlorine atom as part of the ligand, but we don't want there to be chlorine atoms floating around the structure as cofactors or otherwise not part of the ligand.

Therefore, instead of searching the PDB now, I do search for metals later, using Chimera.

```python
def eliminate_metals(pdbs,
                    disallowed_elements=""):
    """Run a chemical-formula query against the RCSB search service.

    Kept for reference only: this definition lives in a markdown cell and is
    not executed by the notebook.

    A two-level composite XML query is POSTed: level 0 restricts the search
    to the supplied structure IDs, level 1 is a ``ChemCompFormulaQuery``
    whose ``<formula>`` is ``disallowed_elements``.

    Parameters
    ----------
    pdbs : iterable of str
        Structure identifiers; they are joined with spaces into the query.
    disallowed_elements : str, optional
        Text inserted verbatim as the formula of the level-1 query.

    Returns
    -------
    list of str or None
        The whitespace-separated tokens of the response body, or ``None``
        when the server answers with an HTTP error.
        NOTE(review): whether the returned entries are the ones that match
        the formula or the ones that do not is decided by the server —
        confirm before using this as an exclusion filter.
    """
    url = "http://www.rcsb.org/pdb/rest/search"
    query_text = f"""
<orgPdbCompositeQuery version="1.0">
 <queryRefinement>
   <queryRefinementLevel>0</queryRefinementLevel>
      <orgPdbQuery>
        <version>head</version>
        <queryType>org.pdb.query.simple.StructureIdQuery</queryType>
        <structureIdList>{" ".join(pdbs)}</structureIdList>
      </orgPdbQuery>
 </queryRefinement>
 <queryRefinement>
   <queryRefinementLevel>1</queryRefinementLevel>
       <orgPdbQuery>
        <version>head</version>
        <queryType>org.pdb.query.simple.ChemCompFormulaQuery</queryType>
        <formula>{disallowed_elements}</formula>
      </orgPdbQuery>
 </queryRefinement>
</orgPdbCompositeQuery>
"""
    request = urllib.request.Request(url, data=query_text.encode())
    try:
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:
        # Only HTTP-level failures are handled; the error detail is not reported.
        print(f"PDB error...")
        return None

    page = response.read()
    # Split the decoded body on whitespace: one list item per token.
    page = page.decode("utf-8").split()
    return page
```

In [168]:
def get_structures_with_ligands(pdbs):
    """Query the RCSB search service for the given entries that have free ligands.

    A two-level composite XML query is POSTed: level 0 restricts the search
    to the supplied structure IDs, level 1 (conjunction ``and``) is a
    ``NoLigandQuery`` with ``haveLigands`` set to ``yes``.

    Parameters
    ----------
    pdbs : iterable of str
        Structure identifiers; they are joined with spaces into the query.

    Returns
    -------
    list of str or None
        The whitespace-separated tokens of the response body, or ``None``
        when the server answers with an HTTP error (the status is printed).
    """
    # NOTE(review): this is RCSB's legacy REST search endpoint — confirm it is
    # still being served before relying on a fresh run of this notebook.
    url = "http://www.rcsb.org/pdb/rest/search"
    query_text = f"""
<orgPdbCompositeQuery version="1.0">
 <queryRefinement>
  <queryRefinementLevel>0</queryRefinementLevel>
  <orgPdbQuery>
    <version>head</version>
    <queryType>org.pdb.query.simple.StructureIdQuery</queryType>
    <structureIdList>{" ".join(pdbs)}</structureIdList>
  </orgPdbQuery>
 </queryRefinement>
 <queryRefinement>
  <queryRefinementLevel>1</queryRefinementLevel>
  <conjunctionType>and</conjunctionType>
  <orgPdbQuery>
    <version>head</version>
    <queryType>org.pdb.query.simple.NoLigandQuery</queryType>
    <description>Ligand Search : Has free ligands=yes</description>
    <haveLigands>yes</haveLigands>
  </orgPdbQuery>
 </queryRefinement>
</orgPdbCompositeQuery>
"""
    request = urllib.request.Request(url, data=query_text.encode())
    try:
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen(request) as response:
            page = response.read()
    except urllib.error.HTTPError as e:
        # Surface the status so a failed query can be diagnosed.
        print(f"PDB error... (HTTP {e.code}: {e.reason})")
        return None

    return page.decode("utf-8").split()


In [169]:
# Narrow the short-chain entries to those the search service reports as having free ligands.
with_ligands = get_structures_with_ligands(small_pdbs)

In [170]:
# Number of entries left after the ligand query.
print(f"Filtered to {len(with_ligands)} structures...")

Filtered to 35 structures...


## Combine the small protein structures with their ligand(s) into a single `pandas` DataFrame

In [171]:
def combine_structure_and_ligand(pdbs):
    """Download a ligand report for the given structures as a DataFrame.

    Requests the RCSB custom report in CSV form with the columns
    ``structureId``, ``uniprotAcc``, ``ligandName``, ``ligandId`` and
    ``ligandMolecularWeight``.

    Parameters
    ----------
    pdbs : iterable of str
        Structure identifiers; they are joined with commas into the URL.

    Returns
    -------
    pandas.DataFrame or None
        The parsed CSV report, or ``None`` when the server answers with an
        HTTP error (the status is printed).
    """
    report_columns = "structureId,uniprotAcc,ligandName,ligandId,ligandMolecularWeight"
    # NOTE(review): this is RCSB's legacy custom-report endpoint — confirm it
    # is still being served before relying on a fresh run of this notebook.
    url = (
        "https://www.rcsb.org/pdb/rest/customReport.xml"
        f"?pdbids={','.join(pdbs)}"
        f"&customReportColumns={report_columns}"
        "&service=wsfile&format=csv"
    )
    request = urllib.request.Request(url)
    try:
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen(request) as response:
            page = response.read()
    except urllib.error.HTTPError as e:
        # Surface the status so a failed request can be diagnosed.
        print(f"PDB error... (HTTP {e.code}: {e.reason})")
        return None

    return pd.read_csv(StringIO(page.decode("utf-8")), sep=",")


In [172]:
# One report row per (structure, ligand) entry returned by the custom report.
ligand_details = combine_structure_and_ligand(with_ligands)


In [173]:
# Discard report rows whose ligandName is missing.
ligand_details = ligand_details.dropna(subset=["ligandName"])


Many structures will report multiple "ligands": cosolvents, glycerol, ions, and a true small molecule ligand. We're just interested in structures with a small molecule ligand, so for each structure we'll sort the "ligands" by molecular weight and take the protein + ligand complex with the highest molecular weight.

In [174]:
# For every structure keep the single report row with the largest ligand
# molecular weight. The winners are collected in a list and turned into a
# DataFrame once: DataFrame.append no longer exists in pandas >= 2.0, and
# growing a frame row by row is quadratic anyway.
# sort=False keeps the structures in order of first appearance.
heaviest_rows = [
    rows.sort_values(by="ligandMolecularWeight", ascending=False).iloc[0]
    for _, rows in ligand_details.groupby("structureId", sort=False)
]
structures_with_ligands = pd.DataFrame(heaviest_rows).reset_index(drop=True)


## Filter for complexes that do not have histidine in the binding site

In [175]:
def run_chimera(input_file="tmp/chimera.com"):
    """Run the ``chimera`` executable on a command script and wait for it.

    Parameters
    ----------
    input_file : str, optional
        Path of the script handed to ``chimera`` as its only argument.

    Returns
    -------
    None
        The exit status of the process is not inspected.

    Raises
    ------
    FileNotFoundError
        If no ``chimera`` executable can be found on the PATH.
    """
    # An argument list (no shell) passes the path through literally, so file
    # names containing spaces or shell metacharacters cannot break or alter
    # the command.
    sp.call(["chimera", input_file])


In [176]:
def find_histidines(
    pdb, ligand, input_file="tmp/contacts.com", output_file="contacts.txt"
):
    """Write a Chimera command script that selects around a ligand.

    Nothing is searched here; the function only creates the script. The
    script opens ``pdb``, issues ``select :<ligand> z<5`` (presumably the
    residues within 5 Å of the ligand — confirm against the Chimera
    documentation), writes the selection to ``output_file`` and stops.

    Parameters
    ----------
    pdb : str
        Value placed after ``open`` in the script.
    ligand : str
        Residue name used in the selection.
    input_file : str, optional
        Path of the script to create (overwritten if it exists).
    output_file : str, optional
        File name given to ``writesel`` inside the script.
    """
    script = (
        "\n"
        f"open {pdb}\n"
        f"select :{ligand} z<5\n"
        f"writesel {output_file}\n"
        "stop\n"
        "        "
    )
    with open(input_file, "w") as handle:
        handle.write(script)


def get_histidines(
    histidines=["HIS", "HSE", "HSD", "HID", "HIP", "HIE"], input_file="tmp/contacts.txt"
):
    """Count the rows of a selection file whose residue name is a histidine.

    The file is read as space-separated text with three columns named
    ``Model``, ``Res`` and ``ResID.Chain``; only ``Res`` is examined.

    Parameters
    ----------
    histidines : list of str, optional
        Residue names that count as histidine. The default list is shared
        between calls and must not be mutated.
    input_file : str, optional
        Path of the selection file to read.

    Returns
    -------
    int
        Number of rows whose ``Res`` value is in ``histidines``.
    """
    contacts = pd.read_csv(
        input_file, sep=" ", names=["Model", "Res", "ResID.Chain"]
    )
    is_histidine = contacts["Res"].isin(histidines)
    return int(is_histidine.sum())


In [177]:
# Chimera scripts and their results are kept together in this directory.
# Create it up front so writing the first script cannot fail on a fresh
# checkout.
path = "queries"
os.makedirs(path, exist_ok=True)

# Maps structure ID -> number of histidine rows found in its selection file.
good_pocket = dict()
for pdb, ligand in zip(
    structures_with_ligands["structureId"], structures_with_ligands["ligandId"]
):
    chimera_in = os.path.join(path, f"{pdb}-his.com")
    chimera_out = f"{pdb}-his.txt"
    # NOTE(review): the script names its output without a directory, yet the
    # result is read back from `path` — presumably Chimera resolves it
    # relative to the script's location; confirm.
    result_file = os.path.join(path, chimera_out)

    find_histidines(pdb, ligand, input_file=chimera_in, output_file=chimera_out)
    # An existing result file acts as a cache: Chimera is launched only when
    # there is no result for this structure yet.
    if not os.path.exists(result_file):
        run_chimera(input_file=chimera_in)
    good_pocket[pdb] = get_histidines(input_file=result_file)


In [178]:
# One row per structure: the histidine count, plus the structure ID (taken
# from the index) as an ordinary column.
structures_with_histidines = pd.DataFrame.from_dict(
    good_pocket, orient="index", columns=["Histidines"]
).assign(structureId=lambda frame: frame.index)


In [179]:
# Join on the structure ID explicitly instead of relying on pandas to pick
# whatever column names the two frames happen to share.
structures = pd.merge(
    structures_with_ligands, structures_with_histidines, on="structureId"
)
# Keep only complexes with no histidine counted near the ligand.
# NOTE(review): in the recorded outputs this filter removed nothing (35
# structures before and after) — worth checking that the Chimera selection
# files really have the three space-separated fields get_histidines expects.
structures = structures[structures.Histidines == 0]

In [180]:
# Number of entries left after the histidine filter.
print(f"Filtered to {len(structures)} structures...")

Filtered to 35 structures...


## Filter for complexes that don't have an unbound metal ion

In [181]:
def find_metals_not_in_ligand(
    pdb,
    ligand,
    disallowed_mask=":He | :Li | :Be | :B | :F | :Ne | :Na | :Mg | :Al | :Si | :Cl | :Ar | :K | :Ca | :Sc | :Ti | :V | :Cr | :Mn | :Fe | :Co | :Ni | :Cu | :Zn | :Ga | :Ge | :As | :Se | :Br | :Kr | :Rb | :Sr | :Y | :Zr | :Nb | :Mo | :Tc | :Ru | :Rh | :Pd | :Ag | :Cd | :In",
    input_file="tmp/metals.com",
    output_file="metals.txt",
):
    """Write a Chimera command script that selects disallowed species outside the ligand.

    Nothing is searched here; the function only creates the script. The
    script opens ``pdb``, selects ``<disallowed_mask> & ~:<ligand>``, writes
    the selection to ``output_file`` and stops.

    Parameters
    ----------
    pdb : str
        Value placed after ``open`` in the script.
    ligand : str
        Residue name excluded from the selection.
    disallowed_mask : str, optional
        Chimera selection expression placed before the ligand exclusion.
    input_file : str, optional
        Path of the script to create (overwritten if it exists).
    output_file : str, optional
        File name given to ``writesel`` inside the script.
    """
    script = (
        "\n"
        f"open {pdb}\n"
        f"select {disallowed_mask} & ~:{ligand}\n"
        f"writesel {output_file}\n"
        "stop\n"
        "        "
    )
    with open(input_file, "w") as handle:
        handle.write(script)


def get_metals_not_in_ligand(input_file):
    """Count the rows of a selection file.

    The file is read as space-separated text with three columns named
    ``Model``, ``Res`` and ``ResID.Chain``; every row counts, whatever its
    content.

    Parameters
    ----------
    input_file : str
        Path of the selection file to read.

    Returns
    -------
    int
        Number of rows in the file.
    """
    selection = pd.read_csv(
        input_file, sep=" ", names=["Model", "Res", "ResID.Chain"]
    )
    residue_names = selection["Res"]
    return len(residue_names)


```python
# Not executed (markdown cell); kept to show how the element lists were built.
# Elements that may appear anywhere in a structure.
allowed_elements = ["H", "C", "N", "O", "P", "S",]
# Space-separated "<NAME>0 " tokens, one per disallowed element
# (presumably a formula that requires zero atoms of each — confirm).
disallowed_elements = ""
# The same disallowed elements as a list of their string forms.
disallowed_mask_list = []
for element in elements:
    # Consider only atomic numbers 1-49 that are not explicitly allowed.
    if str(element) not in allowed_elements and element.number > 0 and element.number < 50:
        disallowed_elements += f"{str(element).upper()}0 "
        disallowed_mask_list.append(str(element))
```


In [182]:
# Chimera scripts and their results are kept together in this directory.
# Create it up front so writing the first script cannot fail on a fresh
# checkout.
path = "queries"
os.makedirs(path, exist_ok=True)

# Maps structure ID -> number of rows in its Chimera selection file.
metals = dict()
for pdb, ligand in zip(structures["structureId"], structures["ligandId"]):
    chimera_in = os.path.join(path, f"{pdb}-metal.com")
    chimera_out = f"{pdb}-metal.txt"
    # NOTE(review): the script names its output without a directory, yet the
    # result is read back from `path` — presumably Chimera resolves it
    # relative to the script's location; confirm.
    result_file = os.path.join(path, chimera_out)

    find_metals_not_in_ligand(
        pdb, ligand, input_file=chimera_in, output_file=chimera_out
    )
    # An existing result file acts as a cache: Chimera is launched only when
    # there is no result for this structure yet.
    if not os.path.exists(result_file):
        run_chimera(input_file=chimera_in)
    metals[pdb] = get_metals_not_in_ligand(input_file=result_file)

In [183]:
# One row per structure: the selection row count, plus the structure ID
# (taken from the index) as an ordinary column.
structures_with_metals = pd.DataFrame.from_dict(
    metals, orient="index", columns=["Metals"]
).assign(structureId=lambda frame: frame.index)


In [184]:
# Join on the structure ID explicitly instead of relying on pandas to pick
# whatever column names the two frames happen to share.
structures = pd.merge(structures, structures_with_metals, on="structureId")
# Keep only complexes whose Chimera selection came back empty.
structures = structures[structures.Metals == 0]

In [185]:
# Number of entries left after the metal filter.
print(f"Filtered to {len(structures)} structures...")

Filtered to 25 structures...


In [186]:
# Preview the first rows of the surviving candidates.
structures.head()

Unnamed: 0,chainId,ligandId,ligandMolecularWeight,ligandName,structureId,uniprotAcc,Histidines,Metals
0,H,MID,521.63,1-[N-(naphthalen-2-ylsulfonyl)glycyl-4-carbami...,1ETS,P00735,0,0
2,A,MAO,339.35,5'-DEOXY-5'-[N-METHYL-N-(2-AMINOOXYETHYL) AMIN...,1I72,P17707,0,0
5,B,NAD,663.42,NICOTINAMIDE-ADENINE-DINUCLEOTIDE,1NHG,Q9BH77,0,0
6,A,UCN,482.53,7-HYDROXYSTAUROSPORINE,1NVQ,O14757,0,0
7,A,STU,466.53,STAUROSPORINE,1NVR,O14757,0,0


In [196]:
def chain_length(pdbs):
    """Download a chain-length report for the given identifiers as a DataFrame.

    Requests the RCSB custom report in CSV form with the columns
    ``structureId`` and ``chainLength``.

    Parameters
    ----------
    pdbs : iterable of str
        Identifiers joined with commas into the URL. The notebook passes
        ``"<structureId>.<chainId>"`` strings.

    Returns
    -------
    pandas.DataFrame or None
        The parsed CSV report, or ``None`` when the server answers with an
        HTTP error (the status is printed).
    """
    # NOTE(review): this is RCSB's legacy custom-report endpoint — confirm it
    # is still being served before relying on a fresh run of this notebook.
    url = (
        "https://www.rcsb.org/pdb/rest/customReport.xml"
        f"?pdbids={','.join(pdbs)}"
        "&customReportColumns=structureId,chainLength"
        "&service=wsfile&format=csv"
    )
    request = urllib.request.Request(url)
    try:
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen(request) as response:
            page = response.read()
    except urllib.error.HTTPError as e:
        # Surface the status so a failed request can be diagnosed.
        print(f"PDB error... (HTTP {e.code}: {e.reason})")
        return None

    return pd.read_csv(StringIO(page.decode("utf-8")), sep=",")


## Filter to make sure the chain the ligand is part of is under the threshold

In [204]:
# "<structureId>.<chainId>" for the chain that carries each selected ligand.
queries = [
    ".".join(id_and_chain)
    for id_and_chain in zip(structures["structureId"], structures["chainId"])
]

In [206]:
# Fetch the chain-length report for the structure.chain identifiers built above.
chain_lengths = chain_length(queries)

In [207]:
# No `on=` is given, so pandas joins on every column name the two frames share.
# NOTE(review): confirm which columns the report returns so the join key is the intended one.
structures = pd.merge(structures, chain_lengths)

In [211]:
# Candidates whose ligand-bearing chain is shorter than 200 residues.
# The result is only displayed; it is not stored in a variable.
structures[structures.chainLength < 200]

Unnamed: 0,chainId,ligandId,ligandMolecularWeight,ligandName,structureId,uniprotAcc,Histidines,Metals,chainLength
10,A,MUT,637.74,"(5S)-3-(3-ACETYLPHENYL)-N-[(1S,2R)-1-BENZYL-2-...",2I0D,O38731,0,0,99
15,A,MGP,538.22,7-METHYL-GUANOSINE-5'-TRIPHOSPHATE,3AM7,P06730,0,0,191
