In [1]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import platformdirs

import datamol as dm
from polaris.dataset import DatasetFactory
from polaris.dataset.converters import SDFConverter, PDBConverter

In [2]:
# Location of the unpacked PoseBusters paper data. Resolved against the current
# user's home directory so the notebook does not embed one person's username.
# NOTE(review): "8278563" is presumably the name of the downloaded archive
# folder - adjust this path if the data was unpacked somewhere else.
DATA_ROOT = str(Path.home() / "Downloads" / "8278563" / "posebusters_paper_data")
dataset_name = "posebusters_benchmark_set"
# One sub-folder per protein-ligand complex; the folder name is reused as the
# file-name prefix when the per-complex paths are built. Sorted so that the
# order of the complexes (and of everything derived from it) is deterministic.
proteins = sorted(
    Path(folder).name for folder in dm.fs.glob(f"{DATA_ROOT}/{dataset_name}/*")
)
# File-name stems that exist for every complex.
file_list = ["ligand", "ligand_start_conf", "ligands", "protein"]


# Output directory (inside the user cache) for files written by this notebook.
SAVE_DIR = dm.fs.join(
    platformdirs.user_cache_dir(appname="polaris-recipes"), "posebusters"
)

In [3]:
# Converter instances built with their default arguments.
# NOTE(review): the factory cell further down constructs its own converters
# when registering them, so these two may be unused - confirm before removing.
sdf_converter, pdb_converter = SDFConverter(), PDBConverter()

In [29]:
def _complex_files(suffix):
    """Return one path per entry of ``proteins`` for the file ending in ``suffix``.

    Each path has the form ``DATA_ROOT/dataset_name/<name>/<name>_<suffix>``,
    where ``<name>`` is the complex's folder name.
    """
    return [
        dm.fs.join(DATA_ROOT, dataset_name, name, f"{name}_{suffix}")
        for name in proteins
    ]


# All four lists follow the order of `proteins`, so index i refers to the same
# complex in each of them.
ligands = _complex_files("ligands.sdf")
ligand_start_conf = _complex_files("ligand_start_conf.sdf")
ligand = _complex_files("ligand.sdf")
protein = _complex_files("protein.pdb")

In [44]:
# Preprocess the multi-ligand SDF files: tag every molecule with the name of
# the protein folder it came from and write the tagged copy under SAVE_DIR.
# The `protein_group` property is necessary so that the list of ligands and
# conformers belonging to one protein can end up in a single dataset row.
ligands_files = []
for source_sdf in ligands:
    source_path = Path(source_sdf)
    group_name = source_path.parent.name
    tagged_mols = dm.read_sdf(source_sdf)
    for tagged_mol in tagged_mols:
        tagged_mol.SetProp("protein_group", group_name)
    tagged_sdf = dm.fs.join(SAVE_DIR, group_name, source_path.name)
    dm.to_sdf(tagged_mols, tagged_sdf)
    ligands_files.append(tagged_sdf)

In [33]:
# Create a new factory that writes its Zarr archive under SAVE_DIR.

save_dst = dm.fs.join(SAVE_DIR, f"{dataset_name}.zarr")
factory = DatasetFactory(zarr_root_path=save_dst)
factory.reset(save_dst)

# Add the protein PDB files; the structures go into the "protein" column.
factory.register_converter("pdb", PDBConverter(pdb_column="protein"))
factory.add_from_files(paths=protein, axis=0)

# Add the single-ligand SDF files ("ligand" / "ligand_smiles" columns).
# Molecule properties are not expanded into extra columns.
factory.register_converter(
    "sdf",
    SDFConverter(
        smiles_column="ligand_smiles", mol_column="ligand", mol_prop_as_cols=False
    ),
)
factory.add_from_files(paths=ligand, axis=0)

# Add the ligand start conformers. Registering a converter for "sdf" again
# replaces the previous one (the factory logs an "overwriting" message), which
# is why each registration is immediately followed by its add_from_files call.
factory.register_converter(
    "sdf",
    SDFConverter(
        smiles_column="ligand_start_conf_smiles",
        mol_column="ligand_start_conf",
        mol_prop_as_cols=False,
    ),
)
factory.add_from_files(paths=ligand_start_conf, axis=0)

# Add all ligands and conformers from the preprocessed files, grouped by the
# "protein_group" property that was written onto every molecule.
factory.register_converter(
    "sdf",
    SDFConverter(
        smiles_column="ligands_smiles",
        mol_column="ligands",
        mol_prop_as_cols=True,
        groupby_key="protein_group",
    ),
)
factory.add_from_files(paths=ligands_files, axis=0)

[32m2024-08-20 17:28:37.506[0m | [1mINFO    [0m | [36mpolaris.dataset._factory[0m:[36mregister_converter[0m:[36m136[0m - [1mYou are overwriting the converter for the sdf extension.[0m
[32m2024-08-20 17:28:38.976[0m | [1mINFO    [0m | [36mpolaris.dataset._factory[0m:[36mregister_converter[0m:[36m136[0m - [1mYou are overwriting the converter for the sdf extension.[0m


In [34]:
# Build the dataset from the columns that were added to the factory.
dataset = factory.build()

In [35]:
# Show the dataset's table as the cell output.
dataset.table

Unnamed: 0,protein,ligand_smiles,ligand,ligand_start_conf_smiles,ligand_start_conf,ligands_smiles,protein_group,ligands
0,protein/5S8I_2LY_protein,CNC(=O)C1=C2OCCOC2=CS1,ligand#0,[H]C1=C2OC([H])([H])C([H])([H])OC2=C(C(=O)N([H...,ligand_start_conf#0,CNC(=O)C1=C2OCCOC2=CS1,5S8I_2LY,ligands#0
1,protein/5SAK_ZRY_protein,N=C1N/C(=N\NC2=CC=CC=C2)C2=CC=CC=C12,ligand#1,[H]/N=C1\C2=C([H])C([H])=C([H])C([H])=C2/C(=N/...,ligand_start_conf#1,N=C1N/C(=N\NC2=CC=CC=C2)C2=CC=CC=C12,5SAK_ZRY,ligands#1
2,protein/5SB2_1K2_protein,O=C(N[C@@H]1C[C@H]1C1=CC=CC=C1)C1=CC(Cl)=CC(CO...,ligand#2,[H]C1=NC2=C(C([H])=C1OC([H])([H])C1=C([H])C(C(...,ligand_start_conf#2,O=C(N[C@@H]1C[C@H]1C1=CC=CC=C1)C1=CC(Cl)=CC(CO...,5SB2_1K2,ligands#2
3,protein/5SD5_HWI_protein,CCC1=C(OCCCOC2=CC(C)=CC=C2N2CC(C(=O)O)C2)C(N)=...,ligand#3,[H]OC(=O)C1([H])C([H])([H])N(C2=C([H])C([H])=C...,ligand_start_conf#3,CCC1=C(OCCCOC2=CC(C)=CC=C2N2CC(C(=O)O)C2)C(N)=...,5SD5_HWI,ligands#3
4,protein/5SIS_JSM_protein,CNCCN(C)C(=O)C1=C(C(=O)NC2=CC3=NC(C4=CC=CC=C4)...,ligand#4,[H]C1=NN(C([H])([H])[H])C(C(=O)N([H])C2=C([H])...,ligand_start_conf#4,CNCCN(C)C(=O)C1=C(C(=O)NC2=CC3=NC(C4=CC=CC=C4)...,5SIS_JSM,ligands#4
...,...,...,...,...,...,...,...,...
423,protein/8GFD_ZHR_protein,CC(=O)N[C@H]1[C@H](OCCC2=CC=C3OC=CC3=C2)O[C@H]...,ligand#423,[H]OC([H])([H])[C@@]1([H])O[C@@]([H])(OC([H])(...,ligand_start_conf#423,CC(=O)N[C@H]1[C@H](OCCC2=CC=C3OC=CC3=C2)O[C@H]...,8GFD_ZHR,ligands#669
424,protein/8H0M_2EH_protein,CCCC[C@H](CC)CO,ligand#424,[H]OC([H])([H])[C@@]([H])(C([H])([H])C([H])([H...,ligand_start_conf#424,CCCC[C@H](CC)CO,8H0M_2EH,ligands#670
425,protein/8HFN_XGC_protein,COC1=NC=C(C2=CC(S(=O)(=O)NC(=O)[C@@H](N)CS)=CC...,ligand#425,[H]SC([H])([H])[C@@]([H])(C(=O)N([H])S(=O)(=O)...,ligand_start_conf#425,COC1=NC=C(C2=CC(S(=O)(=O)NC(=O)[C@@H](N)CS)=CC...,8HFN_XGC,ligands#671
426,protein/8HO0_3ZI_protein,O=C1N[C@@H](CC2=CNC3=C(F)C=CC=C23)C(=O)N2CCC[C...,ligand#426,[H]C1=C([H])C(F)=C2C(=C1[H])C(C([H])([H])[C@@]...,ligand_start_conf#426,O=C1N[C@@H](CC2=CNC3=C(F)C=CC=C23)C(=O)N2CCC[C...,8HO0_3ZI,ligands#672


In [36]:
# Define the annotations

In [42]:
# Human-readable description for each structure column, applied in this order.
column_descriptions = {
    "protein": "The protein structure without the ligand of interest without solvents and with all cofactors.",
    "ligand": "One of the instances of the ligand of interest. This crystal pose marks the binding site for those docking methods that require a binding site.",
    "ligands": "All instances of the ligand of interest.",
    "ligand_start_conf": "One generated molecule conformation for the ligand of interest generated with RDKit's ETKDGv3 followed by an energy minimization with the UFF.",
}

for column_name, column_description in column_descriptions.items():
    dataset.annotations[column_name].description = column_description

In [43]:
# Display the dataset summary, including the annotation descriptions.
dataset

0,1
name,
description,
tags,
user_attributes,
owner,
polaris_version,0.8.0
default_adapters,proteinARRAY_TO_PDBligandBYTES_TO_MOLligand_start_confBYTES_TO_MOLligandsBYTES_TO_MOL
zarr_root_path,/Users/lu.zhu/Library/Caches/polaris-recipes/posebusters/posebusters_benchmark_set.zarr
readme,
annotations,proteinis_pointerTruemodalityPROTEIN_3DdescriptionThe protein structure without the ligand of interest without solvents and with all cofactors.user_attributesdtypeobjectligand_smilesis_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectligandis_pointerTruemodalityMOLECULE_3DdescriptionOne of the instances of the ligand of interest. This crystal pose marks the binding site for those docking methods that require a binding site.user_attributesdtypeobjectligand_start_conf_smilesis_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectligand_start_confis_pointerTruemodalityMOLECULE_3DdescriptionOne generated molecule conformation for the ligand of interest generated with RDKit's ETKDGv3 followed by an energy minimization with the UFF.user_attributesdtypeobjectligands_smilesis_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobjectprotein_groupis_pointerFalsemodalityUNKNOWNdescriptionNoneuser_attributesdtypeobjectligandsis_pointerTruemodalityMOLECULE_3DdescriptionAll instances of the ligand of interest.user_attributesdtypeobject

0,1
protein,ARRAY_TO_PDB
ligand,BYTES_TO_MOL
ligand_start_conf,BYTES_TO_MOL
ligands,BYTES_TO_MOL

0,1
protein,is_pointerTruemodalityPROTEIN_3DdescriptionThe protein structure without the ligand of interest without solvents and with all cofactors.user_attributesdtypeobject
ligand_smiles,is_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobject
ligand,is_pointerTruemodalityMOLECULE_3DdescriptionOne of the instances of the ligand of interest. This crystal pose marks the binding site for those docking methods that require a binding site.user_attributesdtypeobject
ligand_start_conf_smiles,is_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobject
ligand_start_conf,is_pointerTruemodalityMOLECULE_3DdescriptionOne generated molecule conformation for the ligand of interest generated with RDKit's ETKDGv3 followed by an energy minimization with the UFF.user_attributesdtypeobject
ligands_smiles,is_pointerFalsemodalityMOLECULEdescriptionNoneuser_attributesdtypeobject
protein_group,is_pointerFalsemodalityUNKNOWNdescriptionNoneuser_attributesdtypeobject
ligands,is_pointerTruemodalityMOLECULE_3DdescriptionAll instances of the ligand of interest.user_attributesdtypeobject

0,1
is_pointer,True
modality,PROTEIN_3D
description,The protein structure without the ligand of interest without solvents and with all cofactors.
user_attributes,
dtype,object

0,1
is_pointer,False
modality,MOLECULE
description,
user_attributes,
dtype,object

0,1
is_pointer,True
modality,MOLECULE_3D
description,One of the instances of the ligand of interest. This crystal pose marks the binding site for those docking methods that require a binding site.
user_attributes,
dtype,object

0,1
is_pointer,False
modality,MOLECULE
description,
user_attributes,
dtype,object

0,1
is_pointer,True
modality,MOLECULE_3D
description,One generated molecule conformation for the ligand of interest generated with RDKit's ETKDGv3 followed by an energy minimization with the UFF.
user_attributes,
dtype,object

0,1
is_pointer,False
modality,MOLECULE
description,
user_attributes,
dtype,object

0,1
is_pointer,False
modality,UNKNOWN
description,
user_attributes,
dtype,object

0,1
is_pointer,True
modality,MOLECULE_3D
description,All instances of the ligand of interest.
user_attributes,
dtype,object
