In [2]:
import datamol as dm
import polaris as po

from time import time

# SMILES to 3D representations of molecules

Let's find out how to go from SMILES to 3D, using `datamol`package. Then run some timing tests

In [3]:
# Get the data
competition = po.load_competition(f"asap-discovery/antiviral-potency-2025")
competition.cache()
train, test = competition.get_train_test_split()

Output()

In [7]:

# TODO: think of a realistic rms cutoff (ask Slav) and energy iterations
def generate_conformers(
        smiles: str, 
        num_confs: int = 5, 
        minimize_energy: bool = False, 
        rms_cutoff: int = None,
        energy_iterations: int = 1000
    ) -> dm.Mol:
    """
    Generate 3D conformers using datamol.
    """
    dm.disable_rdkit_log() # stop logging a lot of info

    mol = dm.to_mol(smiles, add_hs=True)
    mol = dm.fix_mol(mol)
    mol = dm.sanitize_mol(mol)
    mol = dm.standardize_mol(
        mol,
        disconnect_metals=True,
        normalize=True,
        reionize=True,
        uncharge=False,
        stereo=True,
    )

    mol_confs = dm.conformers.generate(
        mol, 
        n_confs=num_confs,
        num_threads=8,
        minimize_energy=minimize_energy, 
        rms_cutoff=rms_cutoff,
        energy_iterations=energy_iterations
    )
    return mol_confs

In [8]:
def run_conformer_test(n_examples, minimize_energy=True, energy_iterations=100):
    start = time()
    for num, (x, _) in enumerate(train):
        if num < n_examples:
            mol_confs = generate_conformers(
                x, num_confs=10, 
                minimize_energy=minimize_energy,
                energy_iterations=energy_iterations
        )
        else:
            break

    if minimize_energy:
        energy_minimization_str = 'with energy minimization for ' + str(energy_iterations) + ' steps:\n' 
    else:
        energy_minimization_str = 'without energy minimization:\n'

    message = (
        f"Generating 10 conformers for {n_examples} examples "
        f"{energy_minimization_str}"
        f"time spent: {(time() - start):.4} seconds\n"
    )
    return message

In [9]:
# Run some tests on 10, 50 and 100 data points and time it.
# WARNING: this will run for about 3 mins
for n in [10, 50, 100]:
    print(run_conformer_test(n, False))
    print(run_conformer_test(n, True, 100))
    print(run_conformer_test(n, True, 1000))
    print(80 * "*")

Generating 10 conformers for 10 examples without energy minimization:
time spent: 2.304 seconds

Generating 10 conformers for 10 examples with energy minimization for 100 steps:
time spent: 2.88 seconds

Generating 10 conformers for 10 examples with energy minimization for 1000 steps:
time spent: 3.766 seconds

********************************************************************************
Generating 10 conformers for 50 examples without energy minimization:
time spent: 12.5 seconds

Generating 10 conformers for 50 examples with energy minimization for 100 steps:
time spent: 16.6 seconds

Generating 10 conformers for 50 examples with energy minimization for 1000 steps:
time spent: 22.73 seconds

********************************************************************************
Generating 10 conformers for 100 examples without energy minimization:
time spent: 26.42 seconds

Generating 10 conformers for 100 examples with energy minimization for 100 steps:
time spent: 34.2 seconds

Generat

## Conclusions

- Time scales roughly linearly as expected
- It takes almost twice as long to generate conformers with 1000 minimization steps in comparison to no energy 
minimization approach.
- It's around 40% slower to generate conformers with 1000 steps of energy minimization VS 100 steps.
- **We can easily either preprocess all molecules and store their various conformers or we can cache the call to the 
generation function directly in the data pipeline to avoid further recomputation** I would go for the 1st option, since
it'll be faster to iterate over training attempts, however, we would need to include that preprocessing step into the 
test data pipeline. (Let's discuss it)

## Check how a 3D conformation looks like

In [10]:
# Print 3D coords of the 1st conformer
mol_confs = generate_conformers(next(train)[0])
conf_coords = mol_confs.GetConformer(0).GetPositions()

print(f"Shape: {conf_coords.shape}\n")
print(f"Coords: \n {conf_coords}")

Shape: (34, 3)

Coords: 
 [[ 2.6323168   0.6054825  -5.26036319]
 [ 1.55613292  0.46559828 -4.27137778]
 [ 0.70949033  1.57649872 -4.08417693]
 [ 0.93371378  2.61463654 -4.77849485]
 [-0.42578617  1.61326483 -3.13011377]
 [-0.57399667  0.37355257 -2.40317728]
 [ 0.08080771  0.07489942 -1.15809112]
 [-0.82630558 -0.53425015 -0.16178747]
 [-0.0967829  -1.23012324  0.9367751 ]
 [ 0.71890213 -2.17729257  0.72974089]
 [-0.43552543 -0.69688957  2.2089517 ]
 [ 0.32053209 -0.73579863  3.4036591 ]
 [ 1.23442289  0.27773974  3.68401511]
 [ 1.97343362  0.2667547   4.8248514 ]
 [ 1.86448287 -0.71107368  5.73463395]
 [ 0.97425736 -1.74033419  5.50789148]
 [ 0.87000733 -2.74632272  6.45835017]
 [-0.01611678 -3.78349397  6.24939467]
 [-0.77226924 -3.78387918  5.09536118]
 [-0.6539544  -2.77504117  4.16317374]
 [ 0.22341337 -1.73120263  4.35111531]
 [-1.75671827 -0.09134367  1.97888762]
 [-1.6912545   0.43315651  0.56755177]
 [-1.03251757  1.78503019  0.68330124]
 [-1.92975251  2.62007128  1.39049769]