## Applying drug-likeness filters (Lipinski's rule of five, Ro5)

In [24]:
import datamol as dm
from rdkit import Chem
import pandas as pd
from rdkit.Chem import rdMolDescriptors #molecular descriptors
import numpy as np

In [19]:
from tqdm.auto import tqdm

# Register tqdm with pandas so Series/DataFrame gain `progress_apply`,
# which is used further down to show a progress bar during descriptor computation.
tqdm.pandas()

In [10]:
# Names of every descriptor RDKit's Properties interface can compute; the same
# list is reused later as the column names for the computed values.
property_names = list(rdMolDescriptors.Properties.GetAvailableProperties())
# Calculator object built from that list of names; used by smi2props.
property_getter = rdMolDescriptors.Properties(property_names)

In [11]:
def smi2props(smi):
    """Compute the RDKit descriptors in ``property_names`` for one SMILES string.

    Parameters
    ----------
    smi : str
        SMILES representation of the molecule.

    Returns
    -------
    numpy.ndarray or None
        1-D array with one value per entry of ``property_names`` (as produced
        by ``property_getter``), or ``None`` when RDKit cannot parse ``smi``.

    Raises
    ------
    Exception
        Any error raised by RDKit while editing, sanitizing, or describing the
        molecule is propagated; use ``safe_smi2props`` for a non-raising variant.
    """
    mol = Chem.MolFromSmiles(smi)
    # MolFromSmiles signals an unparsable SMILES by returning None.
    if mol is None:
        return None

    # Unbonded hydrogen atoms (no connections) should not contribute to the descriptors.
    lone_hydrogen = Chem.MolFromSmarts("[#1X0]")
    if mol.HasSubstructMatch(lone_hydrogen):
        # DeleteSubstructs returns a *new* molecule and leaves its argument
        # untouched, so the result has to be re-bound for the removal to take effect.
        mol = Chem.DeleteSubstructs(mol, lone_hydrogen)
        # Re-sanitize after the edit so cached ring/valence information
        # matches the modified molecule before descriptors are computed.
        Chem.SanitizeMol(mol)

    return np.array(property_getter.ComputeProperties(mol))

In [25]:
def safe_smi2props(smi):
    """Best-effort wrapper around ``smi2props`` that never raises.

    Returns whatever ``smi2props(smi)`` returns, or ``None`` if the call
    raises any ``Exception``.
    """
    result = None
    try:
        result = smi2props(smi)
    except Exception:
        # Deliberately swallowed: a failing molecule is reported as None so a
        # bulk apply over many SMILES is not aborted by a single bad entry.
        pass
    return result

In [42]:
# Load the input table with the pyarrow CSV parser and pyarrow-backed dtypes.
# NOTE(review): the path is relative to the notebook's working directory.
df = pd.read_csv("lgbm_active.csv", engine='pyarrow', dtype_backend='pyarrow')

In [47]:
# Preview the first rows of the loaded table.
# NOTE(review): the execution counts show this cell was last run *after* the
# column-drop cell that follows it, so on a fresh top-to-bottom run the preview
# will likely still contain the '' column — confirm with Restart & Run All.
df.head()

Unnamed: 0,name,SMILES,groups
0,Methotrexate,CN(CC1=CN=C2N=C(N)N=C(N)C2=N1)C1=CC=C(C=C1)C(=...,['approved']
1,Midazolam,CC1=NC=C2CN=C(C3=CC=CC=C3F)C3=C(C=CC(Cl)=C3)N12,"['approved', 'illicit']"
2,Rabeprazole,COCCCOC1=C(C)C(CS(=O)C2=NC3=CC=CC=C3N2)=NC=C1,"['approved', 'investigational']"
3,Pramlintide,[H]N[C@@H](CCCCN)C(=O)N[C@H]1CSSC[C@H](NC(=O)[...,"['approved', 'investigational']"
4,Roflumilast,FC(F)OC1=C(OCC2CC2)C=C(C=C1)C(=O)NC1=C(Cl)C=NC...,['approved']


In [46]:
# Remove the column whose name is the empty string (presumably a row index
# that was saved into the CSV without a header — verify against the file).
# errors='ignore' keeps the cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been removed.
df = df.drop(columns='', errors='ignore')

In [48]:
# Compute the descriptor array for every SMILES, with a tqdm progress bar.
# safe_smi2props yields None for molecules that cannot be processed, so failed
# rows end up as missing values in 'props' instead of aborting the run.
df['props'] = df.SMILES.progress_apply(safe_smi2props)

  0%|          | 0/233 [00:00<?, ?it/s]

In [49]:
# Show the rows for which no descriptors could be computed.
df[df['props'].isna()]

Unnamed: 0,name,SMILES,groups,props


In [50]:
# Keep only the rows that have descriptors and renumber the index from 0.
df = df.loc[df['props'].notna()].reset_index(drop=True)

In [51]:
# Preview the table with the new 'props' column (one descriptor array per row).
df.head()

Unnamed: 0,name,SMILES,groups,props
0,Methotrexate,CN(CC1=CN=C2N=C(N)N=C(N)C2=N1)C1=CC=C(C=C1)C(=...,['approved'],"[454.1713158039999, 454.44700000000023, 13.0, ..."
1,Midazolam,CC1=NC=C2CN=C(C3=CC=CC=C3F)C3=C(C=CC(Cl)=C3)N12,"['approved', 'illicit']","[325.07820331600004, 325.77400000000006, 3.0, ..."
2,Rabeprazole,COCCCOC1=C(C)C(CS(=O)C2=NC3=CC=CC=C3N2)=NC=C1,"['approved', 'investigational']","[359.13036253200005, 359.451, 6.0, 1.0, 8.0, 1..."
3,Pramlintide,[H]N[C@@H](CCCCN)C(=O)N[C@H]1CSSC[C@H](NC(=O)[...,"['approved', 'investigational']","[3946.920674404006, 3949.4549999999854, 104.0,..."
4,Roflumilast,FC(F)OC1=C(OCC2CC2)C=C(C=C1)C(=O)NC1=C(Cl)C=NC...,['approved'],"[402.034954108, 403.212, 5.0, 1.0, 7.0, 1.0, 4..."


In [52]:
# Spread each row's descriptor array across one column per descriptor name.
# Relies on every array having exactly len(property_names) values, in the same order.
df[property_names] = df['props'].to_list()

In [53]:
# The packed arrays are redundant now that each descriptor has its own column.
df = df.drop(columns='props')

In [54]:
# List the resulting columns: the original fields followed by the descriptor names.
df.columns

Index(['name', 'SMILES', 'groups', 'exactmw', 'amw', 'lipinskiHBA',
       'lipinskiHBD', 'NumRotatableBonds', 'NumHBD', 'NumHBA', 'NumHeavyAtoms',
       'NumAtoms', 'NumHeteroatoms', 'NumAmideBonds', 'FractionCSP3',
       'NumRings', 'NumAromaticRings', 'NumAliphaticRings',
       'NumSaturatedRings', 'NumHeterocycles', 'NumAromaticHeterocycles',
       'NumSaturatedHeterocycles', 'NumAliphaticHeterocycles', 'NumSpiroAtoms',
       'NumBridgeheadAtoms', 'NumAtomStereoCenters',
       'NumUnspecifiedAtomStereoCenters', 'labuteASA', 'tpsa', 'CrippenClogP',
       'CrippenMR', 'chi0v', 'chi1v', 'chi2v', 'chi3v', 'chi4v', 'chi0n',
       'chi1n', 'chi2n', 'chi3n', 'chi4n', 'hallKierAlpha', 'kappa1', 'kappa2',
       'kappa3', 'Phi'],
      dtype='object')

In [55]:
# Drug-likeness criteria, each an upper bound (descriptor <= limit).
# The mass, H-bond donor, H-bond acceptor and logP limits are the rule-of-five
# criteria; the rotatable-bond limit is an extra criterion used by this notebook.
# NOTE(review): the mass criterion uses 'exactmw' rather than 'amw' — confirm
# that this is the intended molecular-weight column.
RO5_LIMITS = {
    'exactmw': 500,
    'lipinskiHBD': 5,
    'lipinskiHBA': 10,
    'CrippenClogP': 5,
    'NumRotatableBonds': 10,
}
# Minimum number of satisfied criteria for a molecule to be labelled 'OK'
# (4 of 5, i.e. a single violation is tolerated).
RO5_MIN_PASSED = 4

# Vectorized check: compare every descriptor column with its own limit
# (aligned by column name) and count the satisfied criteria per row.
ro5_passed = (
    df[list(RO5_LIMITS)]
    .le(pd.Series(RO5_LIMITS), axis='columns')
    .sum(axis=1)
)
# 'OK' = enough criteria satisfied, 'NK' = not enough.
df['ro5'] = ro5_passed.ge(RO5_MIN_PASSED).map({True: 'OK', False: 'NK'})


In [56]:
# Preview the table with the new 'ro5' label in the last column.
df.head()

Unnamed: 0,name,SMILES,groups,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,NumHBD,NumHBA,...,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi,ro5
0,Methotrexate,CN(CC1=CN=C2N=C(N)N=C(N)C2=N1)C1=CC=C(C=C1)C(=...,['approved'],454.171316,454.447,13.0,7.0,9.0,5.0,10.0,...,9.750179,4.720877,4.720877,3.027299,-4.55,23.120242,9.71964,5.782489,6.80971,NK
1,Midazolam,CC1=NC=C2CN=C(C3=CC=CC=C3F)C3=C(C=CC(Cl)=C3)N12,"['approved', 'illicit']",325.078203,325.774,3.0,0.0,1.0,0.0,3.0,...,7.278038,3.993996,3.993996,2.953456,-2.46,14.152613,5.31752,2.263054,3.272035,OK
2,Rabeprazole,COCCCOC1=C(C)C(CS(=O)C2=NC3=CC=CC=C3N2)=NC=C1,"['approved', 'investigational']",359.130363,359.451,6.0,1.0,8.0,1.0,5.0,...,8.08112,3.86023,3.86023,2.476,-2.25,17.5691,8.22144,4.236502,5.777732,OK
3,Pramlintide,[H]N[C@@H](CCCCN)C(=O)N[C@H]1CSSC[C@H](NC(=O)[...,"['approved', 'investigational']",3946.920674,3949.455,104.0,67.0,109.0,56.0,59.0,...,90.142798,44.459954,44.459954,29.13555,-27.08,234.598251,116.161207,79.137581,98.379841,NK
4,Roflumilast,FC(F)OC1=C(OCC2CC2)C=C(C=C1)C(=O)NC1=C(Cl)C=NC...,['approved'],402.034954,403.212,5.0,1.0,7.0,1.0,4.0,...,7.757677,3.685367,3.685367,2.164033,-2.12,18.664563,8.053206,4.735901,5.781138,OK


In [57]:
# Summary statistics for the molecules labelled 'OK'.
# Because a molecule only needs to satisfy 4 of the 5 criteria, a single limit
# can be exceeded (the recorded output shows e.g. max exactmw above 500).
df.query('ro5 == "OK"').describe()

Unnamed: 0,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,NumHBD,NumHBA,NumHeavyAtoms,NumAtoms,NumHeteroatoms,...,chi0n,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi
count,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,...,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0
mean,433.909114,434.337785,7.473988,2.00578,4.809249,1.768786,6.67052,31.156069,51.260116,9.32948,...,16.91494,9.824713,5.307062,5.307062,3.692088,-3.510867,20.224528,8.016984,3.922828,5.235445
std,63.148932,63.231848,1.682905,1.374591,2.133216,1.231258,1.739118,4.42429,9.420196,2.267336,...,2.657076,1.673183,1.164871,1.164871,0.941663,0.624767,3.421886,1.654153,1.099611,1.30561
min,264.081125,264.263,3.0,0.0,0.0,0.0,2.0,19.0,28.0,5.0,...,10.144742,5.744119,2.857394,2.857394,1.887662,-4.99,11.044292,3.611607,1.257169,2.024223
25%,388.201159,388.475,7.0,1.0,3.0,1.0,6.0,28.0,45.0,8.0,...,15.135516,8.701039,4.465619,4.465619,2.964949,-3.95,18.024348,7.073402,3.290841,4.507177
50%,441.081698,441.534,8.0,2.0,5.0,2.0,7.0,32.0,51.0,9.0,...,17.179683,9.984283,5.351468,5.351468,3.696351,-3.55,20.409988,8.154281,3.941153,5.261226
75%,478.125275,478.509,8.0,3.0,6.0,3.0,8.0,34.0,58.0,11.0,...,18.84357,11.0826,5.988295,5.988295,4.25211,-3.16,22.706388,9.088406,4.545485,6.079685
max,586.141022,586.571,12.0,6.0,10.0,6.0,11.0,41.0,70.0,16.0,...,21.954468,13.177132,8.64689,8.64689,7.378615,-1.48,28.10706,12.608795,7.749814,9.854364


In [59]:
# Write the name and SMILES of every molecule labelled 'OK' to a CSV file,
# without the index column.
df.loc[df['ro5'].eq('OK'), ['name', 'SMILES']].to_csv(
    './cleaner_to_dock_lgbm.csv', index=False
)