In [1]:
from signaturizer3d import CCSpace

# List the 25 available spaces (short codes A1-E5) together with their descriptions
CCSpace.print_spaces()

INFO:signaturizer3d.unicore.utils:fused_multi_tensor is not installed corrected
INFO:signaturizer3d.unicore.utils:fused_rounding is not installed corrected
INFO:signaturizer3d.unicore.layer_norm:fused_layer_norm is not installed corrected
INFO:signaturizer3d.unicore.softmax_dropout:fused_softmax is not installed corrected


A1: 2D fingerprints
A2: 3D fingerprints
A3: Scaffolds
A4: Structural keys
A5: Physiochemistry
B1: Mechanism of action
B2: Metabolic genes
B3: Crystals
B4: Binding
B5: HTS bioassasy
C1: Small molecule roles
C2: Small molecule pathways
C3: Signaling pathways
C4: Biological processes
C5: Interactome
D1: Transcription
D2: Cancer cell lines
D3: Chemical genetics
D4: Morphology
D5: Cell bioassasy
E1: Theraupetic areas
E2: Indications
E3: Side effects
E4: Diseases & toxicology
E5: Drug-drug interactions


Specify a space either by its description (e.g. `"Binding"`) or by its short code as a member of the `CCSpace` enum (here `B4`).

In [2]:
# Look the space up by its description ...
space_of_interest = CCSpace("Binding")
# ... or reference the enum member directly by its short code.
space_alternative = CCSpace.B4
# Both spellings refer to the same space (the recorded output prints True).
print(space_of_interest == space_alternative)
print(f"We'll work with {space_of_interest}")

True
We'll work with B4: Binding


Instantiate a `Signaturizer` object to load the model weights for your space of interest. The weights are only downloaded the first time you use a space; after that they are cached by PyTorch.

In [3]:
from signaturizer3d import Signaturizer

# Build the signaturizer for space B4; the log output of this run shows the
# pretrained weights for that space being loaded from a release URL.
signaturizer_B4 = Signaturizer(CCSpace.B4)

INFO:signaturizer3d.unimol.unimol:Loading pretrained weights from https://github.com/aksell/test-pytorch-modelhub/releases/download/full-CC/B4_split0.pt


Signatures can be inferred from a list of SMILES strings. To do inference we need a 3D conformation for each molecule, so before inference a 3D conformation is generated for the molecules using RDKit.

In [4]:
# Two small example molecules as SMILES: methane ("C") and propane ("CCC")
smiles_list = ["C", "CCC"]
# Conformers are generated for the SMILES before inference (see the log output)
signatures = signaturizer_B4.infer_from_smiles(smiles_list)
# The recorded output is (2, 128): one 128-value signature per input SMILES
print(signatures.shape)

INFO:signaturizer3d.input.smiles:Start generating conformers...
2it [00:00, 32.65it/s]
INFO:signaturizer3d.input.smiles:Conformer generation success rate: 100.00%


(2, 128)


If you already have structures for your molecules you can do inference from SDF files.

In [5]:
import os
sdf_dir_path = "./sdf_files/"
print(f"SDF file names {sdf_dir_path}: {os.listdir(sdf_dir_path)}")
# Reuse the path variable (instead of repeating the literal) so the directory
# that is listed above and the directory used for inference cannot drift apart.
signatures = signaturizer_B4.infer_from_sdf(sdf_dir_path)
# One signature row per SDF file found in the directory: (3, 128) in the recorded run
print(signatures.shape)
print(signatures)

SDF file names ./sdf_files/: ['ethanol-3D-structure-CT1001214542.sdf', 'S-Fluoxetine-3D-structure-CT1000405130.sdf', 'tamoxifen-3D-structure-CT1001176588.sdf']
(3, 128)
[[-0.09519936  0.06829637 -0.08188163  0.1029023  -0.05649166 -0.06668392
   0.08427928 -0.1099747   0.06556955 -0.09545685  0.07614429 -0.04069559
   0.0624813  -0.06223743  0.09910508 -0.08936232  0.10336801 -0.03606369
   0.03822726 -0.06820646 -0.02681307  0.02072948 -0.00487476 -0.00069978
  -0.09776585  0.11285394  0.03235247 -0.11460033 -0.05307676  0.00650208
   0.1111129  -0.10874218  0.0735438   0.12068304  0.10218719 -0.05537481
  -0.12561333 -0.11073242 -0.00435872  0.09860958 -0.09820296  0.10806404
  -0.04810098 -0.09633937 -0.1113384   0.09649462 -0.1037758   0.10374872
   0.11269274  0.09431731 -0.10281128  0.10951822 -0.04198135  0.09022897
  -0.07675917 -0.09154199 -0.08738239 -0.10794725 -0.07321379 -0.08334978
   0.08571323  0.11039509  0.05947338  0.08808466 -0.0265783  -0.09482557
  -0.07769462  0.

## Fetching structures from ChEMBL
Below is an example of inferring signatures by first fetching structures from ChEMBL.

In [None]:
!pip install chembl_webresource_client

In [7]:
from chembl_webresource_client.new_client import new_client
import os

def fetch_sdf(compound_name, sdf_dir_path):
    """Fetch a compound's structure from ChEMBL and save it as an SDF file.

    Searches ChEMBL for ``compound_name``, takes the first search hit and
    writes that molecule's molfile to ``<sdf_dir_path>/<compound_name>.sdf``.
    Progress and failures are reported with ``print``; a compound that cannot
    be found, or that has no structure record, is skipped without raising.

    Args:
        compound_name: Name to search for; also used as the output file stem.
        sdf_dir_path: Existing directory the SDF file is written into.

    Returns:
        None.
    """
    # Search for the compound
    molecule = new_client.molecule
    res = molecule.search(compound_name)

    # The search hands back a result-set object even when nothing matches, so
    # an `is None` test alone never detects "no hits". Probe for the first hit
    # and treat an out-of-range index or an empty entry as "not found".
    try:
        first_hit = res[0] if res is not None else None
    except IndexError:
        first_hit = None
    if not first_hit:
        print(f"No results found for {compound_name}")
        return

    # Get the ChEMBL ID for the first result
    chembl_id = first_hit['molecule_chembl_id']
    print(f"Found ChEMBL ID for {compound_name}: {chembl_id}")

    # Fetch the SDF file. 'molecule_structures' can be absent or None for
    # entries without a structure, so fall back to an empty mapping instead of
    # calling .get() on None.
    record = molecule.get(chembl_id) or {}
    structures = record.get('molecule_structures') or {}
    molfile = structures.get('molfile')
    if not molfile:
        print(f"SDF content not found for {compound_name}")
        return

    sdf_file_path = os.path.join(sdf_dir_path, f"{compound_name}.sdf")

    # Write to file
    with open(sdf_file_path, 'w') as file:
        file.write(molfile)

    print(f"SDF file for {compound_name} saved as {sdf_file_path}")

# Directory to save SDF files
# NOTE: this rebinds `sdf_dir_path`, which the earlier SDF cell set to "./sdf_files/".
sdf_dir_path = "./chembl_files/"
os.makedirs(sdf_dir_path, exist_ok=True)

# Fetch SDF files
compounds = ["ethanol", "tamoxifen", "fluoxetine"]
for compound in compounds:
    fetch_sdf(compound, sdf_dir_path)

print("Done fetching files")
# Run inference on the directory the files were just written to
signatures = signaturizer_B4.infer_from_sdf(sdf_dir_path)
print("Done infering signatures")
# (3, 128) in the recorded run: one row per fetched compound
print(signatures.shape)


INFO:chembl_webresource_client.url_query:resetting chunk
INFO:chembl_webresource_client.url_query:https://www.ebi.ac.uk/chembl/api/data/molecule/search.json
INFO:chembl_webresource_client.url_query:[('q', 'ethanol'), ('limit', 20), ('offset', 0)]
INFO:chembl_webresource_client.url_query:From cache: True
INFO:chembl_webresource_client.url_query:headers:
INFO:chembl_webresource_client.url_query:{'Accept': 'application/json'}
INFO:chembl_webresource_client.url_query:https://www.ebi.ac.uk/chembl/api/data/molecule/CHEMBL545
INFO:chembl_webresource_client.url_query:From cache: True
INFO:chembl_webresource_client.url_query:resetting chunk
INFO:chembl_webresource_client.url_query:https://www.ebi.ac.uk/chembl/api/data/molecule/search.json
INFO:chembl_webresource_client.url_query:[('q', 'tamoxifen'), ('limit', 20), ('offset', 0)]
INFO:chembl_webresource_client.url_query:From cache: True
INFO:chembl_webresource_client.url_query:headers:
INFO:chembl_webresource_client.url_query:{'Accept': 'applica

Found ChEMBL ID for ethanol: CHEMBL545
SDF file for ethanol saved as ./chembl_files/ethanol.sdf
Found ChEMBL ID for tamoxifen: CHEMBL83
SDF file for tamoxifen saved as ./chembl_files/tamoxifen.sdf
Found ChEMBL ID for fluoxetine: CHEMBL41
SDF file for fluoxetine saved as ./chembl_files/fluoxetine.sdf
Done fetching files
Done infering signatures
(3, 128)


## Inference from other formats than SDF files
If you have molecular structures in a format other than SDF, you can do inference from lists of atoms and coordinates for your molecules.
This is what is used under the hood after extracting the coordinates from an SDF file.

In [8]:
from signaturizer3d import Signaturizer

# Each molecule is described by two parallel lists: its element symbols and
# one [x, y, z] position per atom.
atoms_mol1 = ["C", "C"]
coords_mol1 = [[0, 0, 0], [0, 0, 1]]
atoms_mol2 = ["C", "C", "C"]
coords_mol2 = [[0, 0, 0], [0, 0, 1], [0, 0, 2]]

# Both molecules go through a single call as a batch.
signatures = signaturizer_B4.infer_from_coordinates(
    atoms=[atoms_mol1, atoms_mol2],
    coordinates=[coords_mol1, coords_mol2],
)
print(signatures.shape)


(2, 128)


## Generating conformations manually with RDKit
If you don't have structures for your molecules and want more control over conformation generation than you get by
doing inference directly from SMILES, you can generate conformations manually and do inference with these structures.
This is convenient if you want inference to be repeatable or want to validate the conformations before running inference.
Visualization and conformation generation are adapted from [this](https://greglandrum.github.io/rdkit-blog/posts/2023-02-04-working-with-conformers.html) post on the RDKit blog.

In [9]:
!pip install py3Dmol


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
# NOTE(review): presumably switches RDKit's notebook rendering to the 3D viewer — confirm against the RDKit docs
IPythonConsole.ipython_3d = True
import py3Dmol
import rdkit

from rdkit.Chem import rdDepictor
from rdkit.Chem import rdDistGeom
# Record the RDKit version this notebook was run with
print(rdkit.__version__)

# Parse esomeprazole from SMILES; hydrogens and a 3D conformer are added in the next cell
esomeprazole = Chem.MolFromSmiles('COc1ccc2[n-]c([S@@+]([O-])Cc3ncc(C)c(OC)c3C)nc2c1')

2022.09.5


In [11]:
esomeprazole = Chem.AddHs(esomeprazole) # Add hydrogens to get a reasonable conformer
# Fix the embedding seed so the generated conformer (and therefore the inferred
# signature) is the same on every run; any fixed value works, 42 is arbitrary.
conformer_id = rdDistGeom.EmbedMolecule(esomeprazole, randomSeed=42)
# EmbedMolecule signals failure with a negative conformer id; stop here with a
# clear message rather than failing later when the conformer is requested.
if conformer_id < 0:
    raise RuntimeError("RDKit could not embed a 3D conformer for esomeprazole")
IPythonConsole.drawMol3D(esomeprazole)

[12:57:43] UFFTYPER: Unrecognized charge state for atom: 8


After validating the conformer we can run inference with the structure.

In [12]:
# Pull the embedded conformer's atom positions out of the molecule
conf = esomeprazole.GetConformer()
coordinates = conf.GetPositions()
# Element symbols in atom order, matching the rows of `coordinates`
atoms_list = [atom.GetSymbol() for atom in esomeprazole.GetAtoms()]
print(atoms_list)
# (42, 3) in the recorded run: one xyz row per atom, hydrogens included
print(coordinates.shape)

['C', 'O', 'C', 'C', 'C', 'C', 'N', 'C', 'S', 'O', 'C', 'C', 'N', 'C', 'C', 'C', 'C', 'O', 'C', 'C', 'C', 'N', 'C', 'C', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']
(42, 3)


In [13]:
# Wrap the single molecule in one-element lists: the call takes a list of
# molecules, as in the two-molecule example earlier in this notebook.
signature = signaturizer_B4.infer_from_coordinates(atoms=[atoms_list], coordinates=[coordinates])
print(signature)

[[ 0.10534912  0.03060597 -0.05647036  0.05861403 -0.07488568  0.03211444
   0.05733145 -0.05699329  0.09645466 -0.03772514 -0.05896286  0.02901443
  -0.04864609 -0.08854273  0.09540114 -0.09725082  0.01925274 -0.04269111
  -0.02906142  0.10497911 -0.06381205 -0.02077713  0.07920726  0.1271902
  -0.12991667 -0.01890666 -0.12155195 -0.08795581 -0.0193915  -0.06440806
  -0.11076576  0.00368134  0.12010095  0.04269152 -0.05725695 -0.01661298
  -0.06057845 -0.07216614  0.05187513  0.10588477 -0.11565472  0.09062179
  -0.00262216 -0.1035703  -0.11354019  0.03810996 -0.03999119  0.01191132
   0.10989758 -0.02274036  0.12255322  0.07563224  0.08814507  0.08935983
  -0.01557146 -0.06688122  0.02031335 -0.02494345 -0.00259073 -0.09571204
  -0.05424452  0.09222589 -0.05004374  0.08363793 -0.06270695 -0.02128205
   0.10725331  0.09683667 -0.08759648  0.12773815 -0.10872212 -0.0687696
   0.00382547 -0.03729359 -0.12032747 -0.10417423 -0.01984096  0.00937093
  -0.01222997  0.02807178 -0.13168024 -0