This notebook outlines the steps for creating the Astex Diverse Set dataset.



## Background

The Astex Diverse set is a well-established and commonly used benchmark for evaluating docking methods. Published in 2007, it is a set of hand-picked, relevant, diverse, and high-quality protein–ligand complexes from the PDB. The complexes were downloaded from the PDB as MMTF files, and PyMOL was used to remove solvents and all occurrences of the ligand of interest from the complexes before saving the proteins with their cofactors in PDB files and the ligands in SDF files.

## Data source
- Reference: [Hartshorn et al.](https://pubs.acs.org/doi/abs/10.1021/jm061277y)
- Original: https://zenodo.org/records/8278563
- Polaris: polaris-public/polaris-recipes/org-polaris/posebusters/posebusters_paper_data/astex_diverse_set


In [1]:
%load_ext autoreload
%autoreload 2
from pathlib import Path

import datamol as dm
import numpy as np
import pandas as pd
import platformdirs
from polaris.dataset import DatasetFactory
from polaris.dataset.converters import PDBConverter, SDFConverter, ZarrConverter
from polaris.utils.types import HubOwner

In [5]:
# Location of the extracted PoseBusters paper data (Zenodo record 8278563).
# Resolved from the current user's home directory rather than a hard-coded,
# user-specific absolute path, so the notebook can run on other machines.
DATA_ROOT = str(Path.home() / "Downloads" / "8278563" / "posebusters_paper_data")
dataset_name = "astex_diverse_set"

# One sub-folder per complex; the folder name identifies the complex.
# Only directories are kept (a stray file in the dataset folder would otherwise
# be treated as a complex), and the names are sorted so that every per-complex
# file list built from `proteins` has a deterministic order.
proteins = sorted(
    Path(folder).name
    for folder in dm.fs.glob(f"{DATA_ROOT}/{dataset_name}/*")
    if dm.fs.is_dir(folder)
)
file_list = ["ligand", "ligand_start_conf", "ligands", "protein"]


# Working directory for the preprocessed SDF files and the Zarr archive.
SAVE_DIR = dm.fs.join(
    platformdirs.user_cache_dir(appname="polaris-recipes"), "posebusters"
)

In [6]:
# Collect, for every complex, the path of each of its four input files.
# The four list names are consumed by later cells, so they are kept unchanged;
# entry i of every list belongs to proteins[i].
ligands, ligand_start_conf, ligand, protein = [], [], [], []
for pdb_id in proteins:
    complex_dir = (DATA_ROOT, dataset_name, pdb_id)
    ligands.append(dm.fs.join(*complex_dir, f"{pdb_id}_ligands.sdf"))
    ligand_start_conf.append(
        dm.fs.join(*complex_dir, f"{pdb_id}_ligand_start_conf.sdf")
    )
    ligand.append(dm.fs.join(*complex_dir, f"{pdb_id}_ligand.sdf"))
    protein.append(dm.fs.join(*complex_dir, f"{pdb_id}_protein.pdb"))

In [7]:
# Preprocess the multi-ligand SDF files: tag every molecule with the name of
# the complex folder it came from (stored as the "protein_group" property).
# The tag is what the later groupby uses to collapse all ligands of one protein
# into a single dataset row.
# The tagged copies are written under SAVE_DIR; the source files are not modified.
ligands_files = []
for sdf_path in ligands:
    group_name = Path(sdf_path).parent.name
    tagged_mols = dm.read_sdf(sdf_path)
    for tagged_mol in tagged_mols:
        tagged_mol.SetProp("protein_group", group_name)
    tagged_path = dm.fs.join(SAVE_DIR, group_name, Path(sdf_path).name)
    dm.to_sdf(tagged_mols, tagged_path)
    ligands_files.append(tagged_path)

In [8]:
# Create a new factory backed by a Zarr archive dedicated to this dataset.
save_dst = dm.fs.join(SAVE_DIR, f"{dataset_name}.zarr")
factory = DatasetFactory(zarr_root_path=save_dst)
factory.reset(save_dst)

# Load every tagged multi-ligand SDF file: one table row per molecule.
converter = SDFConverter(smiles_column="ligands_smiles", mol_column="ligands")
factory.register_converter("sdf", converter)
factory.add_from_files(paths=ligands_files, axis=0)


def agg_fn_ligands(x):
    """Collapse the ligand rows of one protein group into a single pointer.

    Only the row labels of ``x`` are used: a group with several rows yields
    ``"<first label>:<last label>"``, a single-row group yields just its label.

    NOTE(review): this assumes the rows of a group are contiguous in the table.
    Also confirm whether the pointer reader treats the end of the range as
    inclusive - if it is exclusive (Python-slice style), the last ligand of
    every multi-ligand group is dropped.
    """
    if len(x) > 1:
        index_range = f"{x.index[0]}:{x.index[-1]}"
    else:
        index_range = f"{x.index[0]}"
    return converter.get_pointer(converter.mol_column, index_range)


def agg_fn_ligands_smiles(x):
    """Return the unique SMILES of one protein group as a stringified list."""
    return str(np.unique(x).tolist())


# Aggregate the table so that each protein maps to one row holding all of its
# ligands. This reassigns the factory's private `_table` attribute directly.
factory._table = (
    factory._table.groupby("protein_group")
    .agg({"ligands": agg_fn_ligands, "ligands_smiles": agg_fn_ligands_smiles})
    .reset_index()
)

In [9]:
# Attach the remaining per-complex inputs to the aggregated table.
# NOTE(review): presumably the new rows are matched to the grouped table by
# position - confirm that the order of `proteins` equals the sorted
# `protein_group` order produced by the groupby above.

# Protein structures (PDB).
factory.register_converter("pdb", PDBConverter(pdb_column="protein"))
factory.add_from_files(paths=protein, axis=0)

# Single-ligand SDF inputs: the crystal ligand first, then its start conformer.
# Each registration replaces the converter previously bound to the "sdf"
# extension, which is why the factory logs an "overwriting" message.
single_ligand_inputs = (
    ("ligand_smiles", "ligand", ligand),
    ("ligand_start_conf_smiles", "ligand_start_conf", ligand_start_conf),
)
for smiles_col, mol_col, sdf_paths in single_ligand_inputs:
    sdf_converter = SDFConverter(
        smiles_column=smiles_col,
        mol_column=mol_col,
        mol_prop_as_cols=False,
    )
    factory.register_converter("sdf", sdf_converter)
    factory.add_from_files(paths=sdf_paths, axis=0)

[32m2024-08-27 11:52:30.551[0m | [1mINFO    [0m | [36mpolaris.dataset._factory[0m:[36mregister_converter[0m:[36m136[0m - [1mYou are overwriting the converter for the sdf extension.[0m
[32m2024-08-27 11:52:30.822[0m | [1mINFO    [0m | [36mpolaris.dataset._factory[0m:[36mregister_converter[0m:[36m136[0m - [1mYou are overwriting the converter for the sdf extension.[0m


In [10]:
# Build the dataset through the factory, from the columns accumulated on it in the cells above.
# `dataset` is the object that is annotated, named and uploaded in the rest of the notebook.
dataset = factory.build()

In [17]:
# Display the freshly built dataset (name, owner and descriptions are still empty at this point).
# NOTE(review): the saved output of this cell reports a zarr_root_path ending in
# `posebusters_benchmark_set.zarr` and an out-of-order execution count, so it looks
# stale - re-run the notebook top to bottom before committing.
dataset

[32m2024-08-27 11:23:39.048[0m | [1mINFO    [0m | [36mpolaris.mixins._checksum[0m:[36mmd5sum[0m:[36m27[0m - [1mComputing the checksum. This can be slow for large datasets.[0m
Finding all files in the Zarr archive: 100%|██████████| 14057/14057 [00:01<00:00, 8218.85it/s]


0,1
name,
description,
tags,
user_attributes,
owner,
polaris_version,0.8.0
default_adapters,ligandsBYTES_TO_MOLproteinARRAY_TO_PDBligandBYTES_TO_MOLligand_start_confBYTES_TO_MOL
zarr_root_path,/Users/lu.zhu/Library/Caches/polaris-recipes/posebusters/posebusters_benchmark_set.zarr
readme,
annotations,ligands_smilesis_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectcontent_typeSMILESprotein_groupis_pointerFalsemodalityUNKNOWNdescriptionNoneuser_attributesdtypeobjectcontent_typeNoneligandsis_pointerTruemodalityMOLECULE_3DdescriptionNoneuser_attributesdtypeobjectcontent_typeNoneproteinis_pointerTruemodalityPROTEIN_3DdescriptionNoneuser_attributesdtypeobjectcontent_typePDBligand_smilesis_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectcontent_typeSMILESligandis_pointerTruemodalityMOLECULE_3DdescriptionNoneuser_attributesdtypeobjectcontent_typeNoneligand_start_conf_smilesis_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectcontent_typeSMILESligand_start_confis_pointerTruemodalityMOLECULE_3DdescriptionNoneuser_attributesdtypeobjectcontent_typeNone

0,1
ligands,BYTES_TO_MOL
protein,ARRAY_TO_PDB
ligand,BYTES_TO_MOL
ligand_start_conf,BYTES_TO_MOL

0,1
ligands_smiles,is_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectcontent_typeSMILES
protein_group,is_pointerFalsemodalityUNKNOWNdescriptionNoneuser_attributesdtypeobjectcontent_typeNone
ligands,is_pointerTruemodalityMOLECULE_3DdescriptionNoneuser_attributesdtypeobjectcontent_typeNone
protein,is_pointerTruemodalityPROTEIN_3DdescriptionNoneuser_attributesdtypeobjectcontent_typePDB
ligand_smiles,is_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectcontent_typeSMILES
ligand,is_pointerTruemodalityMOLECULE_3DdescriptionNoneuser_attributesdtypeobjectcontent_typeNone
ligand_start_conf_smiles,is_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectcontent_typeSMILES
ligand_start_conf,is_pointerTruemodalityMOLECULE_3DdescriptionNoneuser_attributesdtypeobjectcontent_typeNone

0,1
is_pointer,False
modality,MOLECULE
description,
user_attributes,
dtype,object
content_type,SMILES

0,1
is_pointer,False
modality,UNKNOWN
description,
user_attributes,
dtype,object
content_type,

0,1
is_pointer,True
modality,MOLECULE_3D
description,
user_attributes,
dtype,object
content_type,

0,1
is_pointer,True
modality,PROTEIN_3D
description,
user_attributes,
dtype,object
content_type,PDB

0,1
is_pointer,False
modality,MOLECULE
description,
user_attributes,
dtype,object
content_type,SMILES

0,1
is_pointer,True
modality,MOLECULE_3D
description,
user_attributes,
dtype,object
content_type,

0,1
is_pointer,False
modality,MOLECULE
description,
user_attributes,
dtype,object
content_type,SMILES

0,1
is_pointer,True
modality,MOLECULE_3D
description,
user_attributes,
dtype,object
content_type,


### Update the annotations

In [11]:
# Human-readable descriptions for the structure columns, keyed by column name.
column_descriptions = {
    "protein": "The protein structure without the ligand of interest without solvents and with all cofactors.",
    "ligand": "One of the instances of the ligand of interest. This crystal pose marks the binding site for those docking methods that require a binding site.",
    "ligands": "All instances of the ligand of interest.",
    "ligand_start_conf": "One generated molecule conformation for the ligand of interest generated with RDKit's ETKDGv3 followed by an energy minimization with the UFF.",
}
for column_name, column_description in column_descriptions.items():
    dataset.annotations[column_name].description = column_description

In [14]:
# Hub metadata for the dataset: owner, versioned name, tags and a short description.
# `HubOwner` is imported in the import cell at the top of the notebook.
dataset.owner = HubOwner(slug="polaris")
dataset.name = "astex_diverse_set-v1"
dataset.tags = ["docking"]
dataset.description = "The Astex Diverse set is a well-established and commonly-used benchmark for evaluating docking methods."

In [13]:
# Display the dataset again to check the name, owner, tags, description and annotation descriptions set above.
dataset

0,1
name,astex_diverse_set-v1
description,The Astex Diverse set is a well-established and commonly-used benchmark for evaluating docking methods.
tags,docking
user_attributes,
owner,polaris
polaris_version,0.8.0
default_adapters,ligandsBYTES_TO_MOLproteinARRAY_TO_PDBligandBYTES_TO_MOLligand_start_confBYTES_TO_MOL
zarr_root_path,/Users/lu.zhu/Library/Caches/polaris-recipes/posebusters/astex_diverse_set.zarr
readme,
annotations,ligands_smilesis_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectcontent_typeSMILESprotein_groupis_pointerFalsemodalityUNKNOWNdescriptionNoneuser_attributesdtypeobjectcontent_typeNoneligandsis_pointerTruemodalityMOLECULE_3DdescriptionAll instances of the ligand of interest.user_attributesdtypeobjectcontent_typeNoneproteinis_pointerTruemodalityPROTEIN_3DdescriptionThe protein structure without the ligand of interest without solvents and with all cofactors.user_attributesdtypeobjectcontent_typePDBligand_smilesis_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectcontent_typeSMILESligandis_pointerTruemodalityMOLECULE_3DdescriptionOne of the instances of the ligand of interest. This crystal pose marks the binding site for those docking methods that require a binding site.user_attributesdtypeobjectcontent_typeNoneligand_start_conf_smilesis_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectcontent_typeSMILESligand_start_confis_pointerTruemodalityMOLECULE_3DdescriptionOne generated molecule conformation for the ligand of interest generated with RDKit's ETKDGv3 followed by an energy minimization with the UFF.user_attributesdtypeobjectcontent_typeNone

0,1
ligands,BYTES_TO_MOL
protein,ARRAY_TO_PDB
ligand,BYTES_TO_MOL
ligand_start_conf,BYTES_TO_MOL

0,1
ligands_smiles,is_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectcontent_typeSMILES
protein_group,is_pointerFalsemodalityUNKNOWNdescriptionNoneuser_attributesdtypeobjectcontent_typeNone
ligands,is_pointerTruemodalityMOLECULE_3DdescriptionAll instances of the ligand of interest.user_attributesdtypeobjectcontent_typeNone
protein,is_pointerTruemodalityPROTEIN_3DdescriptionThe protein structure without the ligand of interest without solvents and with all cofactors.user_attributesdtypeobjectcontent_typePDB
ligand_smiles,is_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectcontent_typeSMILES
ligand,is_pointerTruemodalityMOLECULE_3DdescriptionOne of the instances of the ligand of interest. This crystal pose marks the binding site for those docking methods that require a binding site.user_attributesdtypeobjectcontent_typeNone
ligand_start_conf_smiles,is_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectcontent_typeSMILES
ligand_start_conf,is_pointerTruemodalityMOLECULE_3DdescriptionOne generated molecule conformation for the ligand of interest generated with RDKit's ETKDGv3 followed by an energy minimization with the UFF.user_attributesdtypeobjectcontent_typeNone

0,1
is_pointer,False
modality,MOLECULE
description,
user_attributes,
dtype,object
content_type,SMILES

0,1
is_pointer,False
modality,UNKNOWN
description,
user_attributes,
dtype,object
content_type,

0,1
is_pointer,True
modality,MOLECULE_3D
description,All instances of the ligand of interest.
user_attributes,
dtype,object
content_type,

0,1
is_pointer,True
modality,PROTEIN_3D
description,The protein structure without the ligand of interest without solvents and with all cofactors.
user_attributes,
dtype,object
content_type,PDB

0,1
is_pointer,False
modality,MOLECULE
description,
user_attributes,
dtype,object
content_type,SMILES

0,1
is_pointer,True
modality,MOLECULE_3D
description,One of the instances of the ligand of interest. This crystal pose marks the binding site for those docking methods that require a binding site.
user_attributes,
dtype,object
content_type,

0,1
is_pointer,False
modality,MOLECULE
description,
user_attributes,
dtype,object
content_type,SMILES

0,1
is_pointer,True
modality,MOLECULE_3D
description,One generated molecule conformation for the ligand of interest generated with RDKit's ETKDGv3 followed by an energy minimization with the UFF.
user_attributes,
dtype,object
content_type,


In [15]:
# Write the dataset to the shared GCS bucket, in a folder named after the dataset.
# Per the log output of this call, the Zarr archive is also copied to `data.zarr`
# under that folder, which may take a while.
dataset.to_json(f"gs://polaris-public/polaris-recipes/org-polaris/posebusters/datasets/{dataset.name}")

[32m2024-08-27 11:59:38.025[0m | [1mINFO    [0m | [36mpolaris.dataset._dataset[0m:[36mto_json[0m:[36m433[0m - [1mCopying Zarr archive to gs://polaris-public/polaris-recipes/org-polaris/posebusters/datasets/astex_diverse_set-v1/data.zarr. This may take a while.[0m
