# 2D feature calculator with molfeat, from Pat Walters:
https://github.com/PatWalters/practical_cheminformatics_tutorials?tab=readme-ov-file
Lesson 7 covers using datamol and molfeat.

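See also https://docs.datamol.io/0.9.1/tutorials/Descriptors.html
for a list of available descriptors. Note the use of `batch_compute_many_descriptors` below, which computes several descriptors for every molecule in a single call.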


In [9]:
import pandas as pd
import datamol as dm
from molfeat.calc import FPCalculator
from molfeat.calc import RDKitDescriptors2D
from molfeat.trans import MoleculeTransformer

In [2]:
# Read the tab-separated .smi file directly from GitHub into a dataframe.
# The column names are supplied explicitly through `names`.
df = pd.read_csv(
    'https://raw.githubusercontent.com/cptlab/ASCEPT_2023_comptox_workshop/main/W109_Machine_learning_QSAR_for_toxicity_prediction/data/smiles_cas_N6512_corrected.smi',
    sep='\t',
    names=['SMILES', 'CASRN', 'Ames'],
)

In [3]:
# Build molecule objects from the SMILES column and keep them alongside the raw data.
# NOTE(review): confirm how unparsable SMILES are represented in the result.
df["mol"] = dm.from_df(df, smiles_column="SMILES")

In [5]:
def max_ring_size(mol):
    """Return the size of the largest ring in a molecule.

    :param mol: input molecule; it must provide ``GetRingInfo()`` whose result
        exposes ``AtomRings()`` (an iterable with one atom-index sequence per ring)
    :return: number of atoms in the largest ring, or 0 for an acyclic molecule
    """
    # Query the ring information once and reuse it for the size computation.
    atom_rings = mol.GetRingInfo().AtomRings()
    # ``default=0`` covers the acyclic case (no rings) without a separate branch.
    return max((len(ring) for ring in atom_rings), default=0)

In [6]:
# Map each output column name to the callable that computes that descriptor
# for a single molecule.
my_prop_dict = dict(
    mw=dm.descriptors.mw,
    logp=dm.descriptors.clogp,
    hbd=dm.descriptors.n_lipinski_hbd,
    hba=dm.descriptors.n_lipinski_hba,
    max_ring_size=max_ring_size,
)

In [7]:
# Compute the descriptors listed in my_prop_dict for every molecule, with a
# progress bar. The result has one column per key of my_prop_dict.
prop_df = dm.descriptors.batch_compute_many_descriptors(
    df["mol"],
    properties_fn=my_prop_dict,
    add_properties=False,
    progress=True,
)

100%|██████████| 6512/6512 [00:02<00:00, 2211.45it/s]


In [8]:
# Append the descriptor columns to the main dataframe. Descriptor columns left
# over from an earlier run of this cell are dropped first, so re-running the
# cell replaces them instead of adding duplicate columns.
df = pd.concat(
    [df.drop(columns=prop_df.columns, errors="ignore"), prop_df],
    axis=1,
)
df.head(3)

Unnamed: 0,SMILES,CASRN,Ames,mol,mw,logp,hbd,hba,max_ring_size
0,O=C1c2ccccc2C(=O)c3c1ccc4c3[nH]c5c6C(=O)c7cccc...,2475-33-4,0,"<img data-content=""rdkit/molecule"" src=""data:i...",646.116486,6.3494,2,8,6
1,NNC(=O)CNC(=O)\C=N#N,820-75-7,1,"<img data-content=""rdkit/molecule"" src=""data:i...",157.059974,-2.6069,4,7,0
2,O=C1NC(=O)\C(=N#N)\C=N1,2435-76-9,1,"<img data-content=""rdkit/molecule"" src=""data:i...",138.017775,-1.0222,1,6,6
