# Evaluation objects and pipelines

In [1]:
%load_ext autoreload
%autoreload 2

import open3d # open3d can occasionally cause issues during imports; importing it first can help alleviate that
import numpy as np
from rdkit import Chem
from shepherd_score.conformer_generation import embed_conformer_from_smiles

from shepherd_score.evaluations.evaluate import ConfEval, UnconditionalEvalPipeline
from shepherd_score.evaluations.evaluate import ConsistencyEvalPipeline, ConditionalEvalPipeline

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


## Conformer evaluation base class

The base class used to evaluate conformer validity and get 2D graph properties is `ConfEval`. All other evaluation classes (except docking) inherit from `ConfEval`, and the related pipelines use these objects.

Let's run a small experiment in which an MMFF94-relaxed molecule plays the role of the "generated" molecule, represented as an atomic point cloud.

In [2]:
# Embed a 3D conformer from SMILES with MMFF optimization turned on; this
# relaxed structure stands in for a "generated" molecule.
example_smiles = 'c1Cc2ccc(Cl)cc2C(=O)c1c3cc(N1nnc2cc(C)c(Cl)cc2c1=O)ccc3'
rdkit_mol = embed_conformer_from_smiles(example_smiles, MMFF_optimize=True)

# Reduce the molecule to an atomic point cloud: one atomic number per atom
# plus the atom positions taken from the embedded conformer.
atoms = np.array([atom.GetAtomicNum() for atom in rdkit_mol.GetAtoms()])
conformer = rdkit_mol.GetConformer()
positions = conformer.GetPositions()

In [3]:
# Evaluate the point cloud built above (atomic numbers + positions).
# Use solvent=None for the gas phase instead of 'water'.
conf_eval = ConfEval(atoms, positions, solvent='water') # solvent = None if gas phase



In [4]:
# Show the evaluation attributes as a pandas Series; the output lists each field
# both as given and with a `_post_opt` suffix (e.g. `smiles` / `smiles_post_opt`).
conf_eval.to_pandas() # show the attributes as a pandas Series

xyz_block                   46\n\nC -3.529 -1.43 0.988\nC -4.785 -0.771 1....
mol                          <rdkit.Chem.rdchem.Mol object at 0x7fcfa2efd430>
smiles                      Cc1cc2nnn(-c3cccc(C4=CCc5ccc(Cl)cc5C4=O)c3)c(=...
molblock                    \n     RDKit          3D\n\n 46 50  0  0  0  0...
energy                                                             -85.047853
partial_charges             [-0.01934405, -0.08768726, 0.02357351, -0.0417...
solvent                                                                 water
charge                                                                      0
xyz_block_post_opt          46\n\nC           -3.74553476583238       -1.5...
mol_post_opt                 <rdkit.Chem.rdchem.Mol object at 0x7fcfa2efd820>
smiles_post_opt             Cc1cc2nnn(-c3cccc(C4=CCc5ccc(Cl)cc5C4=O)c3)c(=...
molblock_post_opt           \n     RDKit          3D\n\n 46 50  0  0  0  0...
energy_post_opt                                                 

## Conformer evaluation pipelines

Since multiple molecules are typically generated and all of them need to be evaluated, several pipeline classes are provided.

### Unconditional evaluation
The `UnconditionalEvalPipeline` simply iterates over all the generated molecules with `ConfEval` and stores the full evaluation.

Let's generate a few test molecules and embed them with RDKit ETKDG. We then convert them into the input format the pipeline expects: a list of tuples, one per molecule, each containing the atomic numbers and the atom positions as numpy arrays.

In [5]:
# Small alkanes embedded without MMFF optimization serve as stand-in "generated" molecules.
smiles_ls = ['CC', 'CCC', 'CCCC']
test_mols = [embed_conformer_from_smiles(smi, MMFF_optimize=False) for smi in smiles_ls]

# One (atomic_numbers, positions) tuple per molecule, in the same order as `test_mols`.
generated_mols = [
    (
        np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()]),
        mol.GetConformer().GetPositions(),
    )
    for mol in test_mols
]

Initialize and run the pipeline.

In [6]:
# Build the pipeline from the list of (atomic_numbers, positions) tuples.
uncond_pipe = UnconditionalEvalPipeline(
    generated_mols=generated_mols,
    solvent='water',
)

# Evaluate every molecule; verbose=True displays a progress bar.
uncond_pipe.evaluate(verbose=True)

Unconditional Eval: 100%|██████████| 3/3 [00:00<00:00,  5.35it/s]


In [7]:
# to_pandas() returns two objects.
# NOTE(review): judging by the outputs displayed in the following cells, the first
# object (bound to `properties_df`) is the Series of set-level attributes and the
# second (bound to `global_attr`) is the per-molecule table, so these two names look
# swapped -- confirm against `to_pandas` before relying on them.
properties_df, global_attr = uncond_pipe.to_pandas()

In [8]:
# Rendered below as a table with one row per generated molecule.
global_attr

Unnamed: 0,generated_mols,molblocks,molblocks_post_opt,strain_energies,rmsds,SA_scores,logPs,QEDs,fsp3s,strain_energies_post_opt,rmsds_post_opt,SA_scores_post_opt,logPs_post_opt,QEDs_post_opt,fsp3s_post_opt
0,"([6, 6, 1, 1, 1, 1, 1, 1], [[-0.76367466358819...",\n RDKit 3D\n\n 8 7 0 0 0 0...,\n RDKit 3D\n\n 8 7 0 0 0 0...,0.006719,0.001154,2.747568,1.0262,0.372786,1.0,4.661066e-310,4.661067e-310,2.747568,1.0262,0.372786,1.0
1,"([6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1], [[-1.11574...",\n RDKit 3D\n\n 11 10 0 0 0 0...,\n RDKit 3D\n\n 11 10 0 0 0 0...,0.016948,0.014228,1.754957,1.4163,0.385471,1.0,0.0,0.0,1.754957,1.4163,0.385471,1.0
2,"([6, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [...",\n RDKit 3D\n\n 14 13 0 0 0 0...,\n RDKit 3D\n\n 14 13 0 0 0 0...,0.012491,0.062795,1.605723,1.8064,0.431024,1.0,1.6e-322,1.6e-322,1.605723,1.8064,0.431024,1.0


In [9]:
# Rendered below as a Series of set-level summary values (counts, fractions, diversity).
properties_df

num_generated_mols                                                         3
solvent                                                                water
num_valid                                                                  3
num_valid_post_opt                                                         3
num_consistent_graph                                                       3
frac_valid                                                               1.0
frac_valid_post_opt                                                      1.0
frac_consistent                                                          1.0
frac_unique                                                              1.0
frac_unique_post_opt                                                     1.0
avg_graph_diversity                                                 0.711111
graph_similarity_matrix    [[1.0, 0.2, 0.16666666666666666], [0.2, 1.0, 0...
dtype: object

### Consistency evaluation

This is used to evaluate whether the jointly generated interaction profiles correspond to the true interaction profile of the generated molecule. The `ConsistencyEvalPipeline` simply iterates over all the generated molecules with the `ConsistencyEval` class and stores the full evaluation. In addition to the properties calculated by `ConfEval`, it also performs score-based alignment, so it is a slower operation.

In [10]:
from shepherd_score.container import Molecule

Prepare the inputs. We pretend that the test SMILES are "generated" molecules with their corresponding interaction profiles. `ConsistencyEvalPipeline` expects the inputs in this format.

In [11]:
# MMFF-optimized alkanes play the role of "generated" molecules in this section.
smiles_ls = ['CC', 'CCC', 'CCCC']
test_mols = [embed_conformer_from_smiles(smi, MMFF_optimize=True) for smi in smiles_ls]

# Parallel lists: entry i of each list belongs to test_mols[i].
generated_mols, generated_surf_points = [], []
generated_surf_esp, generated_pharm_feats = [], []

for m in test_mols:
    # Atomic point cloud as an (atomic_numbers, positions) tuple.
    atomic_nums = np.array([a.GetAtomicNum() for a in m.GetAtoms()])
    generated_mols.append((atomic_nums, m.GetConformer().GetPositions()))

    # Compute each interaction profile and store it as if it had been generated.
    # Notably, MMFF94 partial charges are used here, whereas ConsistencyEvalPipeline
    #  will compare the ESP against xTB-generated partial charges.
    molec = Molecule(
        m,
        num_surf_points=200,
        probe_radius=1.2,
        partial_charges=None,
        pharm_multi_vector=False,
    )
    generated_surf_points.append(molec.surf_pos)
    generated_surf_esp.append(molec.surf_esp)
    generated_pharm_feats.append((molec.pharm_types, molec.pharm_ancs, molec.pharm_vecs))

Initialize and run the pipeline

In [12]:
# The pipeline takes the point clouds together with their interaction profiles.
# probe_radius and pharm_multi_vector repeat the values passed to `Molecule` when
# the profiles were built.
consis_eval = ConsistencyEvalPipeline(
    generated_mols=generated_mols,
    generated_surf_points=generated_surf_points,
    generated_surf_esp=generated_surf_esp,
    generated_pharm_feats=generated_pharm_feats,
    probe_radius=1.2,
    pharm_multi_vector=False,
    solvent=None,
)

In [13]:
# Run the consistency evaluation with num_processes=4; verbose=True displays a progress bar.
consis_eval.evaluate(num_processes=4, verbose=True)

Consistency Eval: 100%|██████████| 3/3 [00:04<00:00,  1.40s/it]


You can view the saved attributes and properties as a pandas Series for the global (whole set) attributes and a DataFrame for per-sample properties.

In [14]:
# to_pandas() returns two objects.
# NOTE(review): judging by the outputs displayed in the following cells, the first
# object (bound to `properties_df_consis`) is the Series of set-level attributes and
# the second (bound to `global_attr_consis`) is the per-molecule table, so these two
# names look swapped -- confirm against `to_pandas` before relying on them.
properties_df_consis, global_attr_consis = consis_eval.to_pandas()

In [15]:
# Rendered below as a table with one row per generated molecule.
global_attr_consis

Unnamed: 0,generated_mols,generated_surf_points,generated_surf_esp,generated_pharm_feats,molblocks,molblocks_post_opt,strain_energies,rmsds,SA_scores,logPs,...,sims_esp_upper_bound,sims_surf_lower_bound,sims_esp_lower_bound,sims_pharm_lower_bound,sims_surf_consistent_relax,sims_esp_consistent_relax,sims_pharm_consistent_relax,sims_surf_consistent_relax_optimal,sims_esp_consistent_relax_optimal,sims_pharm_consistent_relax_optimal
0,"([6, 6, 1, 1, 1, 1, 1, 1], [[-0.75347790992488...","[[-1.1622853, -2.9194512, -0.3687372], [-1.664...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([3], [[4.124822294837571e-08, -2.122918381312...",\n RDKit 3D\n\n 8 7 0 0 0 0...,\n RDKit 3D\n\n 8 7 0 0 0 0...,0.000204,0.005329,2.747568,1.0262,...,0.970873,,,,0.97992,0.979918,1.0,0.980252,0.98025,0.999987
1,"([6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1], [[-1.21663...","[[-1.9376649, -2.890272, -0.05321455], [-1.638...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([3], [[-0.010918935404662933, -0.052514005803...",\n RDKit 3D\n\n 11 10 0 0 0 0...,\n RDKit 3D\n\n 11 10 0 0 0 0...,0.000215,0.003058,1.754957,1.4163,...,0.968017,,,,0.972534,0.972532,0.999997,0.972822,0.972821,1.0
2,"([6, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [...","[[3.4988637, -1.722003, -0.20727295], [3.85821...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([3, 3], [[1.2636719511656584, 0.4148633070650...",\n RDKit 3D\n\n 14 13 0 0 0 0...,\n RDKit 3D\n\n 14 13 0 0 0 0...,0.000294,0.003009,1.605723,1.8064,...,0.965348,,,,0.958321,0.958319,0.999997,0.958929,0.958927,0.999996


In [16]:
# Rendered below as a Series of set-level settings and summary values.
properties_df_consis

num_generated_mols                                                                  3
solvent                                                                          None
probe_radius                                                                      1.2
pharm_multi_vector                                                              False
num_valid                                                                           3
num_valid_post_opt                                                                  3
num_consistent_graph                                                                3
frac_valid                                                                        1.0
frac_valid_post_opt                                                               1.0
frac_consistent                                                                   1.0
frac_unique                                                                       1.0
frac_unique_post_opt                                  

### Conditional evaluation

This is used to evaluate whether each generated molecule is similar (based on `shepherd_score` 3D scoring functions) to the target/reference molecule. The `ConditionalEvalPipeline` simply iterates over all the generated molecules with the `ConditionalEval` class and stores the full evaluation. In addition to the properties calculated by `ConfEval`, it also performs score-based alignment, so it is a slower operation.

In [17]:
# Embed the reference (target) molecule from SMILES with MMFF optimization turned on.
ref_smiles = 'c1Cc2ccc(Cl)cc2C(=O)c1c3cc(N1nnc2cc(C)c(Cl)cc2c1=O)ccc3'
rdkit_mol = embed_conformer_from_smiles(ref_smiles, MMFF_optimize=True)

# Again using MMFF94 partial charges
ref_molec = Molecule(
    rdkit_mol,
    num_surf_points=200,
    probe_radius=1.2,
    pharm_multi_vector=False,
)

In [18]:
# Compare every generated molecule against the reference molecule.
# num_surf_points and pharm_multi_vector repeat the values used to build `ref_molec`.
cond_pipe = ConditionalEvalPipeline(
    ref_molec,
    generated_mols=generated_mols,
    condition='all',
    num_surf_points=200,
    pharm_multi_vector=False,
    solvent=None,
)

# verbose=True displays a progress bar.
cond_pipe.evaluate(verbose=True)

Conditional Eval: 100%|██████████| 3/3 [00:05<00:00,  1.78s/it]


In [19]:
# to_pandas() returns two objects.
# NOTE(review): judging by the outputs displayed in the following cells, the first
# object (bound to `properties_df_cond`) is the Series of set-level attributes and
# the second (bound to `global_attr_cond`) is the per-molecule table, so these two
# names look swapped -- confirm against `to_pandas` before relying on them.
properties_df_cond, global_attr_cond = cond_pipe.to_pandas()

In [20]:
# Rendered below as a table with one row per generated molecule.
global_attr_cond

Unnamed: 0,generated_mols,molblocks,molblocks_post_opt,strain_energies,rmsds,SA_scores,logPs,QEDs,fsp3s,SA_scores_post_opt,...,sims_surf_target_relax,sims_esp_target_relax,sims_pharm_target_relax,sims_surf_target_relax_optimal,sims_esp_target_relax_optimal,sims_pharm_target_relax_optimal,sims_surf_target_relax_esp_aligned,sims_pharm_target_relax_esp_aligned,graph_similarities,graph_similarities_post_opt
0,"([6, 6, 1, 1, 1, 1, 1, 1], [[-0.75347790992488...",\n RDKit 3D\n\n 8 7 0 0 0 0...,\n RDKit 3D\n\n 8 7 0 0 0 0...,0.000204,0.005329,2.747568,1.0262,0.372786,1.0,2.747568,...,0.136737,0.136443,0.015734,0.215801,0.212543,0.07535,0.21279,0.074503,0.011905,0.011905
1,"([6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1], [[-1.21663...",\n RDKit 3D\n\n 11 10 0 0 0 0...,\n RDKit 3D\n\n 11 10 0 0 0 0...,0.000215,0.003058,1.754957,1.4163,0.385471,1.0,1.754957,...,0.153875,0.153496,0.012852,0.199899,0.199468,0.075338,0.199899,0.003614,0.011628,0.011628
2,"([6, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [...",\n RDKit 3D\n\n 14 13 0 0 0 0...,\n RDKit 3D\n\n 14 13 0 0 0 0...,0.000294,0.003009,1.605723,1.8064,0.431024,1.0,1.605723,...,0.168214,0.167741,0.022982,0.241039,0.240074,0.077281,0.241034,0.043792,0.011494,0.011494


In [21]:
# Rendered below as a Series of set-level settings and reference-molecule properties.
properties_df_cond

num_generated_mols                                                                3
solvent                                                                        None
pharm_multi_vector                                                            False
condition                                                                       all
num_surf_points                                                                 200
lam                                                                             0.3
lam_scaled                                                                62.206048
ref_molblock                      \n     RDKit          3D\n\n 46 50  0  0  0  0...
ref_mol_SA_score                                                           2.746132
ref_mol_QED                                                                0.422032
ref_mol_logP                                                                6.78332
ref_mol_fsp3                                                               0