## Applying drug-likeness filters (rule of five, RO5)

In [1]:
import datamol as dm
from rdkit import Chem
import pandas as pd

In [2]:
# Load the docking candidates; pyarrow is used both as the CSV parser and as
# the backing storage for the resulting column dtypes.
df = pd.read_csv(
    "./savolotinib_6SDE/drugs_to_dock.csv",
    engine="pyarrow",
    dtype_backend="pyarrow",
)

In [3]:
# Preview the first rows to confirm the CSV was parsed into the expected columns.
df.head()

Unnamed: 0,molecule_chembl_id,SMILES,method
0,CHEMBL1257015,COc1ccc(C(=O)NCc2cccnc2)cc1C(=O)NCc1cccnc1,swiss
1,CHEMBL473417,CS(=O)(=O)c1ccc(C(=O)Nc2ccc(Cl)c(-c3ccccn3)c2)...,swiss
2,CHEMBL2368925,O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c...,swiss
3,CHEMBL3301607,O=C(Nc1nc2cccc(-c3ccc(CN4CCS(=O)(=O)CC4)cc3)n2...,swiss
4,CHEMBL1521,CCN(C(C)=O)c1cccc(-c2ccnc3c(C#N)cnn23)c1,swiss


In [4]:
# Build one molecule object per row from the SMILES column, then store the
# molecules next to the source data.
mols = dm.from_df(df, smiles_column="SMILES")
df["mol"] = mols

In [5]:
# Preview the table again to check that the new `mol` column is populated.
df.head()

Unnamed: 0,molecule_chembl_id,SMILES,method,mol
0,CHEMBL1257015,COc1ccc(C(=O)NCc2cccnc2)cc1C(=O)NCc1cccnc1,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117ae0>
1,CHEMBL473417,CS(=O)(=O)c1ccc(C(=O)Nc2ccc(Cl)c(-c3ccccn3)c2)...,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117b50>
2,CHEMBL2368925,O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c...,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117c30>
3,CHEMBL3301607,O=C(Nc1nc2cccc(-c3ccc(CN4CCS(=O)(=O)CC4)cc3)n2...,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117bc0>
4,CHEMBL1521,CCN(C(C)=O)c1cccc(-c2ccnc3c(C#N)cnn23)c1,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117d10>


In [6]:
# Output column name -> datamol descriptor function used to fill that column.
my_prop = dict(
    mw=dm.descriptors.mw,
    nhd=dm.descriptors.n_hbd,
    nha=dm.descriptors.n_hba,
    clogp=dm.descriptors.clogp,
    rb=dm.descriptors.n_rotatable_bonds,
)

In [7]:
# Evaluate the descriptors listed in `my_prop` for every molecule.
# `add_properties=False` is passed so that the result holds just the
# `my_prop` columns (see the summary table in the next cell).
prop_df = dm.descriptors.batch_compute_many_descriptors(
    df["mol"].values,
    properties_fn=my_prop,
    add_properties=False,
)

In [8]:
# Summary statistics of the computed descriptors, shown before any filtering
# so the spread and the extreme values of each descriptor are visible.
prop_df.describe()

Unnamed: 0,mw,nhd,nha,clogp,rb
count,1097.0,1097.0,1097.0,1097.0,1097.0
mean,415.416292,1.961714,5.860529,2.767186,5.905196
std,316.181909,3.512022,6.603587,2.28022,8.218622
min,82.053098,0.0,0.0,-20.60433,0.0
25%,285.184112,1.0,3.0,1.4922,3.0
50%,367.242376,1.0,5.0,2.8818,5.0
75%,463.283492,2.0,7.0,4.1711,7.0
max,7172.091672,58.0,180.0,10.0563,156.0


In [10]:
# Attach the descriptor columns to the molecule table.
#
# * Descriptor columns left over from an earlier run of this cell are dropped
#   first, so re-running the cell never produces duplicated column names
#   (duplicates would make `df['mw']` a DataFrame and break the filter below).
# * `prop_df` was computed from `df['mol']` in row order, so it is matched to
#   `df` by position: its index is replaced with `df.index`, which also raises
#   immediately if the two tables do not have the same number of rows.
df = pd.concat(
    [
        df.drop(columns=prop_df.columns, errors="ignore"),
        prop_df.set_axis(df.index, axis=0),
    ],
    axis=1,
)

In [11]:
# Preview the merged table: source columns, molecule object and descriptors.
df.head()

Unnamed: 0,molecule_chembl_id,SMILES,method,mol,mw,nhd,nha,clogp,rb
0,CHEMBL1257015,COc1ccc(C(=O)NCc2cccnc2)cc1C(=O)NCc1cccnc1,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117ae0>,376.153541,2,5,2.3452,7
1,CHEMBL473417,CS(=O)(=O)c1ccc(C(=O)Nc2ccc(Cl)c(-c3ccccn3)c2)...,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117b50>,420.010219,1,4,4.7112,4
2,CHEMBL2368925,O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c...,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117c30>,324.147393,1,4,2.519,2
3,CHEMBL3301607,O=C(Nc1nc2cccc(-c3ccc(CN4CCS(=O)(=O)CC4)cc3)n2...,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117bc0>,425.152161,1,7,1.9752,5
4,CHEMBL1521,CCN(C(C)=O)c1cccc(-c2ccnc3c(C#N)cnn23)c1,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117d10>,305.12766,0,5,2.64078,3


In [15]:
# Upper bound (inclusive) allowed for each descriptor.
# NOTE: the classic rule of five uses the first four criteria; the rotatable
# bond cut-off (`rb`) is an additional, fifth criterion applied here.
ro5_limits = {"mw": 500, "nhd": 5, "nha": 10, "clogp": 5, "rb": 10}

# A molecule is accepted when it satisfies at least this many of the criteria
# above, i.e. at most one violation is tolerated.
ro5_min_passed = 4

# One boolean column per criterion, evaluated for all rows at once instead of
# calling a Python lambda per row. Missing values compare as False (criterion
# not met), exactly as they did in the row-wise version.
ro5_checks = pd.DataFrame(
    {name: df[name] <= limit for name, limit in ro5_limits.items()}
)

# 'OK' = passes the filter, 'NK' = does not pass.
df["ro5"] = (ro5_checks.sum(axis=1) >= ro5_min_passed).map({True: "OK", False: "NK"})


In [16]:
# Preview the table with the new `ro5` label ('OK' / 'NK') for each molecule.
df.head()

Unnamed: 0,molecule_chembl_id,SMILES,method,mol,mw,nhd,nha,clogp,rb,ro5
0,CHEMBL1257015,COc1ccc(C(=O)NCc2cccnc2)cc1C(=O)NCc1cccnc1,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117ae0>,376.153541,2,5,2.3452,7,OK
1,CHEMBL473417,CS(=O)(=O)c1ccc(C(=O)Nc2ccc(Cl)c(-c3ccccn3)c2)...,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117b50>,420.010219,1,4,4.7112,4,OK
2,CHEMBL2368925,O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c...,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117c30>,324.147393,1,4,2.519,2,OK
3,CHEMBL3301607,O=C(Nc1nc2cccc(-c3ccc(CN4CCS(=O)(=O)CC4)cc3)n2...,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117bc0>,425.152161,1,7,1.9752,5,OK
4,CHEMBL1521,CCN(C(C)=O)c1cccc(-c2ccnc3c(C#N)cnn23)c1,swiss,<rdkit.Chem.rdchem.Mol object at 0x78258a117d10>,305.12766,0,5,2.64078,3,OK


In [18]:
# Descriptor statistics restricted to the molecules labelled 'OK'.
# Individual maxima can still exceed a single cut-off because the filter only
# requires 4 of the 5 criteria to be met.
df.query('ro5 == "OK"').describe()

Unnamed: 0,mw,nhd,nha,clogp,rb
count,938.0,938.0,938.0,938.0,938.0
mean,346.8139,1.376333,4.766525,2.675653,4.39339
std,102.176323,1.24319,2.27601,1.777248,2.551882
min,82.053098,0.0,0.0,-3.0115,0.0
25%,276.167931,0.0,3.0,1.549205,3.0
50%,342.567449,1.0,4.0,2.79415,4.0
75%,416.886003,2.0,6.0,3.92028,6.0
max,765.294468,7.0,12.0,7.4898,13.0


In [21]:
# Write the molecules that passed the filter to disk, keeping only the
# identifying columns (no molecule objects or descriptor values).
df.query("ro5 == 'OK'").to_csv(
    "./cleaner_to_dock.csv",
    columns=["molecule_chembl_id", "SMILES", "method"],
    index=False,
)