In [5]:
import pandas as pd
from chembl_structure_pipeline import standardizer
from chembl_structure_pipeline import checker
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import MolStandardize

In [15]:
# Read the three top-40 compound tables (CDRP, JUMP and lincs files) in one pass.
df1, df2, df3 = map(
    pd.read_csv,
    ("data/CDRP_top40.txt", "data/JUMP_top40.txt", "data/lincs_top40.txt"),
)

In [16]:
# Reduce the first table to its "Smiles" column only, then display it.
df1 = df1.loc[:, ["Smiles"]]
df1

Unnamed: 0,Smiles
0,COc1ccc(NC(=O)N[C@H]2CC[C@@H](CC(=O)NCc3ccccn3...
1,OC[C@@H]1O[C@H](CCNC(=O)Cc2ccccn2)CC[C@H]1NC(=...
2,CC(C)C#Cc1ccc2c(O[C@H](CN(C)C(=O)c3ccncc3)[C@H...
3,[O-][N+](=O)c1ccc2oc(nc2c1)-c1ccccc1
4,C[C@@H](CO)N1C[C@H](C)[C@@H](CN(C)C(=O)Nc2cccc...
5,COC(=O)C[C@@H]1C[C@H]2[C@H](Oc3ccc(NC(=O)C4CCO...
6,CCSc1nn2c(nc3ccccc3c2=O)s1
7,Cc1ccccc1S(=O)(=O)NCC[C@@H]1CC[C@@H](NC(=O)Nc2...
8,Oc1ccc(C(=O)Cc2c(F)cccc2Cl)c(O)c1
9,CC(C)NC(=O)N[C@H]1CC[C@@H](CC(=O)NC2Cc3ccccc3C...


In [18]:
# Turn off RDKit log output for the 'rdApp.*' channels before the conversions below.
RDLogger.DisableLog('rdApp.*')  
def inchitosmiles(smiles):
    """Convert an InChI string to a SMILES string with RDKit.

    Args:
        smiles: The InChI string to convert. The parameter name is kept for
            backward compatibility; the value is parsed with
            ``Chem.MolFromInchi``, not as SMILES.

    Returns:
        The SMILES written by ``Chem.MolToSmiles``, or the sentinel string
        ``"Cannot_do"`` when the input cannot be parsed or converted.
    """
    try:
        mol = Chem.MolFromInchi(smiles)
        if mol is None:
            # No molecule was produced for this input: report the sentinel
            # explicitly instead of relying on MolToSmiles(None) raising.
            return "Cannot_do"
        return Chem.MolToSmiles(mol)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit can still stop a long-running apply.
        return "Cannot_do"
    
# Convert each InChI in the "Metadata_InChI" column to SMILES and keep only
# the resulting "Smiles" column, then display it.
df2 = df2["Metadata_InChI"].apply(inchitosmiles).to_frame(name="Smiles")
df2

Unnamed: 0,Smiles
0,Cc1cc(C)cc(N(CC(=O)O)S(=O)(=O)c2ccc3[nH]c(=O)c...
1,COc1ccc(C)cc1N=C(O)c1cncn1-c1ccccc1
2,N#CC(C#N)=NNc1ccc(OC(F)(F)F)cc1
3,CN(C)S(=O)(=O)N1CCN(c2ccc([N+](=O)[O-])cc2)CC1
4,COc1ccc(CCS(=O)(=O)NCC2(O)CCOC2)cc1
5,CC(C)n1cc(C(=N)NO)c(=O)n(Cc2ccccc2)c1=O
6,Cc1cc(C(=O)N2CCN=C(O)C2(C)C)nn1C
7,COc1ccc(Cn2cnc3c(NS(=O)(=O)c4c(Cl)cc(Cl)cc4Cl)...
8,Cc1nn(CC=NC2NCN=C3NC=C(O)N=C32)c(C)c1Cl
9,COc1cc2c(cc1OC)C(=NC(C)C(=O)O)NC(C)(C)C2


In [20]:
# Reduce the third table to its "Smiles" column only, then display it.
df3 = df3.loc[:, ["Smiles"]]
df3

Unnamed: 0,Smiles
0,CCCCCCCCCCC(C)(C)C(=O)Nc1c(OC)cc(OC)cc1OC
1,CC(N)C12CC3CC(CC(C3)C1)C2
2,CCOC(=O)Nc1c[n+](no1)N1CCOCC1
3,Nc1ccc(cc1)S(=O)(=O)c1ccc(N)cc1
4,COc1ccc2C[C@H]3N(C)CCc4cc(OC)c(Oc5c6OCOc6cc6CC...
5,COc1cc2nccc(Oc3ccc(NC(=O)Nc4cc(C)on4)c(Cl)c3)c...
6,COC([C@H](Oc1nc(C)cc(C)n1)C(O)=O)(c1ccccc1)c1c...
7,COc1ccc(cc1OC1CCCC1)C1(CCC(CC1)C(O)=O)C#N
8,CCOC(=O)c1ncc2[nH]c3ccc(OCc4ccccc4)cc3c2c1COC
9,CN(C)CCCC1(OCc2cc(ccc12)C#N)c1ccc(F)cc1


In [24]:
# Stack the three single-column tables, drop repeated rows (first occurrence
# wins) and renumber the index from zero.
df = (
    pd.concat((df1, df2, df3))
    .drop_duplicates()  # keep="first" is the pandas default
    .reset_index(drop=True)
)
df

Unnamed: 0,Smiles
0,COc1ccc(NC(=O)N[C@H]2CC[C@@H](CC(=O)NCc3ccccn3...
1,OC[C@@H]1O[C@H](CCNC(=O)Cc2ccccn2)CC[C@H]1NC(=...
2,CC(C)C#Cc1ccc2c(O[C@H](CN(C)C(=O)c3ccncc3)[C@H...
3,[O-][N+](=O)c1ccc2oc(nc2c1)-c1ccccc1
4,C[C@@H](CO)N1C[C@H](C)[C@@H](CN(C)C(=O)Nc2cccc...
...,...
112,CC(=O)C[C@@H](c1ccc(cc1)[N+]([O-])=O)c1c(O)c2c...
113,CCOC(=O)C(C)(C)Oc1ccc(Cl)cc1
114,OC(=O)c1ccccc1OC(=O)c1ccccc1O
115,Oc1c(I)cc(Cl)c2cccnc12


In [25]:
def standardize_chembl(smiles):
    """Standardize a SMILES string with the ChEMBL structure pipeline.

    The parsed molecule is standardized, reduced to its largest fragment,
    standardized a second time and written back out as SMILES.

    Args:
        smiles: Input SMILES string.

    Returns:
        The standardized SMILES, or the sentinel string ``"Cannot_do"`` when
        parsing or any standardization step fails.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # No molecule was produced for this input: return the sentinel
            # explicitly instead of relying on a later call to raise.
            return "Cannot_do"

        desalter = MolStandardize.fragment.LargestFragmentChooser()
        std1_mol = standardizer.standardize_mol(mol)
        desalt_mol = desalter.choose(std1_mol)
        # Standardize again because the fragment choice changed the molecule.
        std2_mol = standardizer.standardize_mol(desalt_mol)
        return Chem.MolToSmiles(std2_mol)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit can still stop a long-running apply.
        return "Cannot_do"

In [26]:
from pandarallel import pandarallel
# Start pandarallel with a progress bar and a fixed count of 80 workers;
# later cells call `parallel_apply` on pandas objects.
pandarallel.initialize(progress_bar=True, nb_workers=80)

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [27]:
import pandas as pd 
from tqdm import tqdm 
import time 

# Turn off the RDKit 'rdApp.info' log channel.
RDLogger.DisableLog('rdApp.info')  
# Register tqdm's pandas integration.
# NOTE(review): no visible cell calls `progress_apply`, so this may be unused — confirm.
tqdm.pandas()

In [28]:
# Add the ChEMBL-standardized SMILES for every raw SMILES, computed in parallel.
df = df.assign(
    Standardized_SMILES_chembl=df["Smiles"].parallel_apply(standardize_chembl)
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

In [29]:
# Turn off RDKit log output for the 'rdApp.*' channels before the conversions below.
RDLogger.DisableLog('rdApp.*')  
def smiles_to_inchi(smiles):
    """Convert a SMILES string to an InChI string with RDKit.

    Args:
        smiles: SMILES string to convert.

    Returns:
        The InChI produced by ``Chem.MolToInchi``, or the sentinel string
        ``"Cannot_do"`` when the SMILES cannot be parsed or converted.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # No molecule was produced for this input: return the sentinel
            # explicitly instead of relying on MolToInchi(None) raising.
            return "Cannot_do"
        return Chem.MolToInchi(mol)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit can still stop a long-running apply.
        return "Cannot_do"

In [30]:
# Add the InChI derived from each standardized SMILES, computed in parallel.
df = df.assign(
    Standardized_InChI_chembl=df["Standardized_SMILES_chembl"].parallel_apply(smiles_to_inchi)
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

In [31]:
# Display the frame to inspect the standardized SMILES and InChI columns.
df

Unnamed: 0,Smiles,Standardized_SMILES_chembl,Standardized_InChI_chembl
0,COc1ccc(NC(=O)N[C@H]2CC[C@@H](CC(=O)NCc3ccccn3...,COc1ccc(NC(=O)N[C@H]2CC[C@@H](CC(=O)NCc3ccccn3...,InChI=1S/C22H28N4O5/c1-30-17-7-5-15(6-8-17)25-...
1,OC[C@@H]1O[C@H](CCNC(=O)Cc2ccccn2)CC[C@H]1NC(=...,O=C(Cc1ccccn1)NCC[C@@H]1CC[C@@H](NC(=O)c2ccccc...,InChI=1S/C22H26FN3O4/c23-18-7-2-1-6-17(18)22(2...
2,CC(C)C#Cc1ccc2c(O[C@H](CN(C)C(=O)c3ccncc3)[C@H...,CC(C)C#Cc1ccc2c(c1)O[C@H](CN(C)C(=O)c1ccncc1)[...,InChI=1S/C26H33N3O5S/c1-18(2)6-7-21-8-9-25-23(...
3,[O-][N+](=O)c1ccc2oc(nc2c1)-c1ccccc1,O=[N+]([O-])c1ccc2oc(-c3ccccc3)nc2c1,InChI=1S/C13H8N2O3/c16-15(17)10-6-7-12-11(8-10...
4,C[C@@H](CO)N1C[C@H](C)[C@@H](CN(C)C(=O)Nc2cccc...,C[C@H]1CN([C@@H](C)CO)C(=O)CCCn2cc(nn2)CO[C@@H...,InChI=1S/C23H33FN6O4/c1-16-11-30(17(2)14-31)22...
...,...,...,...
112,CC(=O)C[C@@H](c1ccc(cc1)[N+]([O-])=O)c1c(O)c2c...,CC(=O)C[C@@H](c1ccc([N+](=O)[O-])cc1)c1c(O)c2c...,InChI=1S/C19H15NO6/c1-11(21)10-15(12-6-8-13(9-...
113,CCOC(=O)C(C)(C)Oc1ccc(Cl)cc1,CCOC(=O)C(C)(C)Oc1ccc(Cl)cc1,"InChI=1S/C12H15ClO3/c1-4-15-11(14)12(2,3)16-10..."
114,OC(=O)c1ccccc1OC(=O)c1ccccc1O,O=C(Oc1ccccc1C(=O)O)c1ccccc1O,InChI=1S/C14H10O5/c15-11-7-3-1-5-9(11)14(18)19...
115,Oc1c(I)cc(Cl)c2cccnc12,Oc1c(I)cc(Cl)c2cccnc12,InChI=1S/C9H5ClINO/c10-6-4-7(11)9(13)8-5(6)2-1...


In [32]:
# Turn off RDKit log output for the 'rdApp.*' channels before the conversions below.
RDLogger.DisableLog('rdApp.*')  
def smiles_to_inchikey(smiles):
    """Convert a SMILES string to an InChIKey with RDKit.

    Args:
        smiles: SMILES string to convert.

    Returns:
        The InChIKey produced by ``Chem.MolToInchiKey``, or the sentinel
        string ``"Cannot_do"`` when the SMILES cannot be parsed or converted.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # No molecule was produced for this input: return the sentinel
            # explicitly instead of relying on MolToInchiKey(None) raising.
            return "Cannot_do"
        return Chem.MolToInchiKey(mol)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit can still stop a long-running apply.
        return "Cannot_do"

In [33]:
# Add the InChIKey derived from each standardized SMILES, computed in parallel.
df = df.assign(
    Standardized_InChIKey_chembl=df["Standardized_SMILES_chembl"].parallel_apply(smiles_to_inchikey)
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

In [35]:
# Row-wise check: is the raw SMILES string identical to its standardized form?
df["Smiles"].eq(df["Standardized_SMILES_chembl"])

0       True
1      False
2      False
3      False
4      False
       ...  
112    False
113     True
114    False
115     True
116    False
Length: 117, dtype: bool

In [36]:
# Write the table to "mitotox.csv.gz" as gzip-compressed CSV, without the index column.
df.to_csv("mitotox.csv.gz", index=False, compression="gzip")

In [37]:
import pandas as pd
# Reload the gzip-compressed CSV "mitotox.csv.gz" into `df` and display it.
df = pd.read_csv("mitotox.csv.gz", compression="gzip")
df

Unnamed: 0,Smiles,Standardized_SMILES_chembl,Standardized_InChI_chembl,Standardized_InChIKey_chembl
0,COc1ccc(NC(=O)N[C@H]2CC[C@@H](CC(=O)NCc3ccccn3...,COc1ccc(NC(=O)N[C@H]2CC[C@@H](CC(=O)NCc3ccccn3...,InChI=1S/C22H28N4O5/c1-30-17-7-5-15(6-8-17)25-...,GEVYJHFWGQLEQA-SLFFLAALSA-N
1,OC[C@@H]1O[C@H](CCNC(=O)Cc2ccccn2)CC[C@H]1NC(=...,O=C(Cc1ccccn1)NCC[C@@H]1CC[C@@H](NC(=O)c2ccccc...,InChI=1S/C22H26FN3O4/c23-18-7-2-1-6-17(18)22(2...,CUFNDCCMTGJSBI-DBVUQKKJSA-N
2,CC(C)C#Cc1ccc2c(O[C@H](CN(C)C(=O)c3ccncc3)[C@H...,CC(C)C#Cc1ccc2c(c1)O[C@H](CN(C)C(=O)c1ccncc1)[...,InChI=1S/C26H33N3O5S/c1-18(2)6-7-21-8-9-25-23(...,YTUJKVHLLCOJEP-JXALWOEJSA-N
3,[O-][N+](=O)c1ccc2oc(nc2c1)-c1ccccc1,O=[N+]([O-])c1ccc2oc(-c3ccccc3)nc2c1,InChI=1S/C13H8N2O3/c16-15(17)10-6-7-12-11(8-10...,PBRISAFILDFQFS-UHFFFAOYSA-N
4,C[C@@H](CO)N1C[C@H](C)[C@@H](CN(C)C(=O)Nc2cccc...,C[C@H]1CN([C@@H](C)CO)C(=O)CCCn2cc(nn2)CO[C@@H...,InChI=1S/C23H33FN6O4/c1-16-11-30(17(2)14-31)22...,VTXUSJLMEZRIOV-XGHQBKJUSA-N
...,...,...,...,...
112,CC(=O)C[C@@H](c1ccc(cc1)[N+]([O-])=O)c1c(O)c2c...,CC(=O)C[C@@H](c1ccc([N+](=O)[O-])cc1)c1c(O)c2c...,InChI=1S/C19H15NO6/c1-11(21)10-15(12-6-8-13(9-...,VABCILAOYCMVPS-HNNXBMFYSA-N
113,CCOC(=O)C(C)(C)Oc1ccc(Cl)cc1,CCOC(=O)C(C)(C)Oc1ccc(Cl)cc1,"InChI=1S/C12H15ClO3/c1-4-15-11(14)12(2,3)16-10...",KNHUKKLJHYUCFP-UHFFFAOYSA-N
114,OC(=O)c1ccccc1OC(=O)c1ccccc1O,O=C(Oc1ccccc1C(=O)O)c1ccccc1O,InChI=1S/C14H10O5/c15-11-7-3-1-5-9(11)14(18)19...,WVYADZUPLLSGPU-UHFFFAOYSA-N
115,Oc1c(I)cc(Cl)c2cccnc12,Oc1c(I)cc(Cl)c2cccnc12,InChI=1S/C9H5ClINO/c10-6-4-7(11)9(13)8-5(6)2-1...,QCDFBFJGMNKBDO-UHFFFAOYSA-N


In [38]:
from chembl_webresource_client.new_client import new_client

# Set up the ChEMBL client: alias `new_client` as the global `chembl_client`
# that the lookup helpers defined below read.
chembl_client = new_client

In [39]:
from pandarallel import pandarallel
# Initialize pandarallel again (progress bar, fixed count of 80 workers)
# ahead of the `parallel_apply` call on the ChEMBL lookup.
pandarallel.initialize(progress_bar=True, nb_workers=80)

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [40]:
def smiles_to_chembl_id(smiles):
    """Look up a ChEMBL molecule ID for a SMILES string.

    Filters the global ``chembl_client`` molecule resource with
    ``smiles=<value>`` and requests only the ``molecule_chembl_id`` field.

    Returns:
        The ``molecule_chembl_id`` of the first hit, ``None`` when the query
        yields no hits, or ``"Cannot_do"`` when the lookup raises (the error
        is printed).
    """
    try:
        hits = chembl_client.molecule.filter(smiles=smiles).only("molecule_chembl_id")
        if not hits:
            return None
        return hits[0]["molecule_chembl_id"]
    except Exception as err:
        print(f"Error converting SMILES to ChEMBL ID for {smiles}: {err}")
        return "Cannot_do"

In [41]:
def inchikey_to_chembl_id(inchikey):
    """Look up a ChEMBL molecule ID for an InChIKey.

    Filters the global ``chembl_client`` molecule resource on
    ``molecule_structures__standard_inchi_key`` and requests only the
    ``molecule_chembl_id`` field.

    Returns:
        The ``molecule_chembl_id`` of the first hit, ``None`` when the query
        yields no hits, or ``"Cannot_do"`` when the lookup raises (the error
        is printed).
    """
    try:
        hits = chembl_client.molecule.filter(
            molecule_structures__standard_inchi_key=inchikey
        ).only("molecule_chembl_id")
        if not hits:
            return None
        return hits[0]["molecule_chembl_id"]
    except Exception as err:
        print(f"Error converting InChI key to ChEMBL ID for {inchikey}: {err}")
        return "Cannot_do"


In [42]:
# Look up a ChEMBL ID for every standardized InChIKey (in parallel) and store
# the result in a new 'ChEMBL_ID' column, then display the frame.
df = df.assign(
    ChEMBL_ID=df["Standardized_InChIKey_chembl"].parallel_apply(inchikey_to_chembl_id)
)
df

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

Unnamed: 0,Smiles,Standardized_SMILES_chembl,Standardized_InChI_chembl,Standardized_InChIKey_chembl,ChEMBL_ID
0,COc1ccc(NC(=O)N[C@H]2CC[C@@H](CC(=O)NCc3ccccn3...,COc1ccc(NC(=O)N[C@H]2CC[C@@H](CC(=O)NCc3ccccn3...,InChI=1S/C22H28N4O5/c1-30-17-7-5-15(6-8-17)25-...,GEVYJHFWGQLEQA-SLFFLAALSA-N,
1,OC[C@@H]1O[C@H](CCNC(=O)Cc2ccccn2)CC[C@H]1NC(=...,O=C(Cc1ccccn1)NCC[C@@H]1CC[C@@H](NC(=O)c2ccccc...,InChI=1S/C22H26FN3O4/c23-18-7-2-1-6-17(18)22(2...,CUFNDCCMTGJSBI-DBVUQKKJSA-N,
2,CC(C)C#Cc1ccc2c(O[C@H](CN(C)C(=O)c3ccncc3)[C@H...,CC(C)C#Cc1ccc2c(c1)O[C@H](CN(C)C(=O)c1ccncc1)[...,InChI=1S/C26H33N3O5S/c1-18(2)6-7-21-8-9-25-23(...,YTUJKVHLLCOJEP-JXALWOEJSA-N,
3,[O-][N+](=O)c1ccc2oc(nc2c1)-c1ccccc1,O=[N+]([O-])c1ccc2oc(-c3ccccc3)nc2c1,InChI=1S/C13H8N2O3/c16-15(17)10-6-7-12-11(8-10...,PBRISAFILDFQFS-UHFFFAOYSA-N,CHEMBL391351
4,C[C@@H](CO)N1C[C@H](C)[C@@H](CN(C)C(=O)Nc2cccc...,C[C@H]1CN([C@@H](C)CO)C(=O)CCCn2cc(nn2)CO[C@@H...,InChI=1S/C23H33FN6O4/c1-16-11-30(17(2)14-31)22...,VTXUSJLMEZRIOV-XGHQBKJUSA-N,CHEMBL2131431
...,...,...,...,...,...
112,CC(=O)C[C@@H](c1ccc(cc1)[N+]([O-])=O)c1c(O)c2c...,CC(=O)C[C@@H](c1ccc([N+](=O)[O-])cc1)c1c(O)c2c...,InChI=1S/C19H15NO6/c1-11(21)10-15(12-6-8-13(9-...,VABCILAOYCMVPS-HNNXBMFYSA-N,
113,CCOC(=O)C(C)(C)Oc1ccc(Cl)cc1,CCOC(=O)C(C)(C)Oc1ccc(Cl)cc1,"InChI=1S/C12H15ClO3/c1-4-15-11(14)12(2,3)16-10...",KNHUKKLJHYUCFP-UHFFFAOYSA-N,CHEMBL565
114,OC(=O)c1ccccc1OC(=O)c1ccccc1O,O=C(Oc1ccccc1C(=O)O)c1ccccc1O,InChI=1S/C14H10O5/c15-11-7-3-1-5-9(11)14(18)19...,WVYADZUPLLSGPU-UHFFFAOYSA-N,CHEMBL154111
115,Oc1c(I)cc(Cl)c2cccnc12,Oc1c(I)cc(Cl)c2cccnc12,InChI=1S/C9H5ClINO/c10-6-4-7(11)9(13)8-5(6)2-1...,QCDFBFJGMNKBDO-UHFFFAOYSA-N,CHEMBL497


In [43]:
# Count distinct ChEMBL IDs. The lookup helper returns the string "Cannot_do"
# when a query raises, so that sentinel is excluded to avoid counting it as an
# ID; missing values are already ignored by nunique().
df.loc[df["ChEMBL_ID"] != "Cannot_do", "ChEMBL_ID"].nunique()

53

In [45]:
# Write the table to "mitotox_chemblidentifiers.csv.gz" as gzip-compressed CSV,
# without the index column.
df.to_csv("mitotox_chemblidentifiers.csv.gz", index=False, compression="gzip")