In [1]:
import pandas as pd
from chembl_structure_pipeline import standardizer
from chembl_structure_pipeline import checker
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import MolStandardize

[21:09:51] Initializing Normalizer


In [2]:
# Load the compound table from a gzip-compressed CSV. The path is relative,
# so it resolves against the notebook's current working directory.
df = pd.read_csv("JUMPCP_compounds_processed.csv.gz", compression="gzip")

In [4]:
!pwd

/dgx1nas1/storage/data/seal/Jump_target_data_extraction/data/JUMPCP


In [3]:
df.Standardized_SMILES.nunique()

115771

In [4]:
def standardize_chembl(smiles):
    """Standardize a SMILES string with the ChEMBL structure pipeline.

    The parsed molecule is standardized, reduced to its largest fragment
    (desalting), standardized a second time, and written back out as SMILES.

    Parameters
    ----------
    smiles : str
        Input SMILES. Non-string values (``None``, or the float NaN pandas
        uses for missing cells) are treated as unparsable.

    Returns
    -------
    str
        The standardized SMILES, or the sentinel ``"Cannot_do"`` when the
        input is missing, cannot be parsed, or any processing step fails.
    """
    # Missing cells arrive as None / NaN; there is nothing to parse.
    if not isinstance(smiles, str):
        return "Cannot_do"

    try:
        # Read SMILES and convert it to an RDKit mol object
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit reports an unparsable SMILES by returning None.
            return "Cannot_do"

        desalter = MolStandardize.fragment.LargestFragmentChooser()
        std1_mol = standardizer.standardize_mol(mol)
        desalt_mol = desalter.choose(std1_mol)
        std2_mol = standardizer.standardize_mol(desalt_mol)
        return Chem.MolToSmiles(std2_mol)

    except Exception:
        # Deliberately best-effort: one bad structure must not abort the whole
        # column. `Exception` (rather than a bare `except:`) lets
        # KeyboardInterrupt and SystemExit propagate so the run can be stopped.
        return "Cannot_do"

In [5]:
# pandarallel provides the `parallel_apply` method used by the cells below.
from pandarallel import pandarallel
# NOTE(review): nb_workers=80 is hard-coded for the machine this was run on;
# confirm the core count before running elsewhere.
pandarallel.initialize(progress_bar=True, nb_workers=80)

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
# NOTE(review): pandas is already imported in the first cell, and `time` is
# not referenced in any of the visible cells.
import pandas as pd 
from tqdm import tqdm 
import time 

# Silence RDKit's info-level log channel.
RDLogger.DisableLog('rdApp.info')  
# Registers tqdm's `progress_apply` on pandas objects. The visible cells use
# pandarallel's `parallel_apply` instead, so this looks unused — TODO confirm.
tqdm.pandas()

In [7]:
# Standardize every raw SMILES in parallel. Rows that cannot be processed
# receive the "Cannot_do" sentinel returned by standardize_chembl.
df["Standardized_SMILES_chembl"] = df["smiles"].parallel_apply(standardize_chembl)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1448), Label(value='0 / 1448'))), …

In [8]:
# Silence every RDKit log channel ('rdApp.*' is a wildcard over all of them).
RDLogger.DisableLog('rdApp.*')  
def smiles_to_inchi(smiles):
    """Convert a SMILES string to an InChI string with RDKit.

    Parameters
    ----------
    smiles : str
        Input SMILES. Non-string values (``None``, or the float NaN pandas
        uses for missing cells) are treated as unparsable.

    Returns
    -------
    str
        The value returned by ``Chem.MolToInchi``, or the sentinel
        ``"Cannot_do"`` when the input is missing, cannot be parsed, or the
        conversion raises.
    """
    # Missing cells arrive as None / NaN; there is nothing to parse.
    if not isinstance(smiles, str):
        return "Cannot_do"

    try:
        # Read SMILES and convert it to an RDKit mol object
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit reports an unparsable SMILES by returning None. This also
            # covers a "Cannot_do" sentinel produced by an earlier step.
            return "Cannot_do"

        return Chem.MolToInchi(mol)

    except Exception:
        # Best-effort per row. `Exception` (rather than a bare `except:`) lets
        # KeyboardInterrupt and SystemExit propagate so the run can be stopped.
        return "Cannot_do"

In [9]:
# Derive an InChI from each ChEMBL-standardized SMILES, in parallel. The input
# column may contain the "Cannot_do" sentinel from the standardization step;
# it is passed to smiles_to_inchi like any other value.
df["Standardized_InChI_chembl"] = df["Standardized_SMILES_chembl"].parallel_apply(smiles_to_inchi)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1448), Label(value='0 / 1448'))), …

In [10]:
df

Unnamed: 0,Metadata_JCP2022,smiles,Standardized_SMILES,Standardized_InChI,Standardized_SMILES_chembl,Standardized_InChI_chembl
0,JCP2022_000001,CCc1nccn1-c1cccc(C2CCCN2C(=O)c2ccc(OCCN(C)C)cc...,CCc1nccn1-c1cccc(C2CCC[NH+]2C(=O)c2ccc(OCC[NH+...,InChI=1S/C25H31N5O2/c1-4-23-26-14-16-30(23)24-...,CCc1nccn1-c1cccc(C2CCCN2C(=O)c2ccc(OCCN(C)C)cc...,InChI=1S/C25H31N5O2/c1-4-23-26-14-16-30(23)24-...
1,JCP2022_000002,OC1=NCCCN1Cc1ccc(Cl)cc1,O=C1NCCC[NH+]1Cc1ccc(Cl)cc1,InChI=1S/C11H13ClN2O/c12-10-4-2-9(3-5-10)8-14-...,OC1=NCCCN1Cc1ccc(Cl)cc1,InChI=1S/C11H13ClN2O/c12-10-4-2-9(3-5-10)8-14-...
2,JCP2022_000004,CCCn1nccc1S(=O)(=O)N1CC2CCC1CNC2,CCCn1nccc1S(=O)(=O)[NH+]1CC2CCC1C[NH2+]C2,InChI=1S/C13H22N4O2S/c1-2-7-16-13(5-6-15-16)20...,CCCn1nccc1S(=O)(=O)N1CC2CCC1CNC2,InChI=1S/C13H22N4O2S/c1-2-7-16-13(5-6-15-16)20...
3,JCP2022_000005,CCN=C(O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...,CCNC(=O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...,InChI=1S/C22H22ClN5O2/c1-4-24-20(29)12-18-22-2...,CCNC(=O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...,InChI=1S/C22H22ClN5O2/c1-4-24-20(29)12-18-22-2...
4,JCP2022_000006,Cn1cc(-c2noc(-c3cc(C4CC4)n(C(C)(C)C)n3)n2)cn1,Cn1cc(-c2noc(-c3cc(C4CC4)n(C(C)(C)C)n3)n2)cn1,"InChI=1S/C16H20N6O/c1-16(2,3)22-13(10-5-6-10)7...",Cn1cc(-c2noc(-c3cc(C4CC4)n(C(C)(C)C)n3)n2)cn1,"InChI=1S/C16H20N6O/c1-16(2,3)22-13(10-5-6-10)7..."
...,...,...,...,...,...,...
115779,JCP2022_116749,CN(C)c1ccc(O)c2c1CC1CC3C(N(C)C)C(=O)C(C(=N)O)C...,CN(C)c1ccc([O-])c2c1CC1CC3C([NH+](C)C)C(=O)C(C...,InChI=1S/C23H27N3O7/c1-25(2)12-5-6-13(27)15-10...,CN(C)c1ccc(O)c2c1CC1CC3C(N(C)C)C(=O)C(C(N)=O)C...,InChI=1S/C23H27N3O7/c1-25(2)12-5-6-13(27)15-10...
115780,JCP2022_116750,Cc1nc(C)c(CCC(=O)N(C2CC2)C2CCCc3ccccc32)c(O)n1,Cc1nc(=O)c(CCC(=O)[NH+](C2CC2)C2CCCc3ccccc32)c...,InChI=1S/C22H27N3O2/c1-14-18(22(27)24-15(2)23-...,Cc1nc(C)c(CCC(=O)N(C2CC2)C2CCCc3ccccc32)c(O)n1,InChI=1S/C22H27N3O2/c1-14-18(22(27)24-15(2)23-...
115781,JCP2022_116751,Cn1ccc2ccn(CCN=C(O)c3cnc4ccccc4n3)c(=O)c21,Cn1ccc2ccn(CCNC(=O)c3cnc4ccccc4n3)c(=O)c21,InChI=1S/C19H17N5O2/c1-23-9-6-13-7-10-24(19(26...,Cn1ccc2ccn(CCNC(=O)c3cnc4ccccc4n3)c(=O)c21,InChI=1S/C19H17N5O2/c1-23-9-6-13-7-10-24(19(26...
115782,JCP2022_116752,Cc1nccc(-c2cccc(NS(=O)(=O)c3cc(C(F)(F)F)cc(C(F...,Cc1nccc(-c2cccc(NS(=O)(=O)c3cc(C(F)(F)F)cc(C(F...,InChI=1S/C19H13F6N3O2S/c1-11-26-6-5-17(27-11)1...,Cc1nccc(-c2cccc(NS(=O)(=O)c3cc(C(F)(F)F)cc(C(F...,InChI=1S/C19H13F6N3O2S/c1-11-26-6-5-17(27-11)1...


In [11]:
# Silence every RDKit log channel. This repeats the call made in the cell that
# defines smiles_to_inchi.
RDLogger.DisableLog('rdApp.*')  
def smiles_to_inchikey(smiles):
    """Convert a SMILES string to an InChIKey with RDKit.

    Parameters
    ----------
    smiles : str
        Input SMILES. Non-string values (``None``, or the float NaN pandas
        uses for missing cells) are treated as unparsable.

    Returns
    -------
    str
        The value returned by ``Chem.MolToInchiKey``, or the sentinel
        ``"Cannot_do"`` when the input is missing, cannot be parsed, or the
        conversion raises.
    """
    # Missing cells arrive as None / NaN; there is nothing to parse.
    if not isinstance(smiles, str):
        return "Cannot_do"

    try:
        # Read SMILES and convert it to an RDKit mol object
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit reports an unparsable SMILES by returning None. This also
            # covers a "Cannot_do" sentinel produced by an earlier step.
            return "Cannot_do"

        inchikey = Chem.MolToInchiKey(mol)
        return inchikey

    except Exception:
        # Best-effort per row. `Exception` (rather than a bare `except:`) lets
        # KeyboardInterrupt and SystemExit propagate so the run can be stopped.
        return "Cannot_do"

In [12]:
# Derive an InChIKey from each ChEMBL-standardized SMILES, in parallel. This
# column is the lookup key used for the ChEMBL ID query further down.
df["Standardized_InChIKey_chembl"] = df["Standardized_SMILES_chembl"].parallel_apply(smiles_to_inchikey)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1448), Label(value='0 / 1448'))), …

In [13]:
df.smiles == df.Standardized_SMILES_chembl

0          True
1          True
2          True
3         False
4          True
          ...  
115779    False
115780     True
115781    False
115782     True
115783    False
Length: 115784, dtype: bool

In [14]:
# Save the table with the added ChEMBL-standardized SMILES / InChI / InChIKey
# columns as a gzip-compressed CSV (the DataFrame index is not written).
df.to_csv("JUMPCP_compounds_processed_chembl_identifiers.csv.gz", index=False, compression="gzip")

In [32]:
# Reload the identifier table from disk, presumably so the ChEMBL lookup below
# can be run without repeating the standardization cells.
# NOTE(review): the output displayed for this cell already has a ChEMBL_ID
# column, so the file had evidently been overwritten by the final to_csv of an
# earlier run before this cell was executed.
import pandas as pd
df = pd.read_csv("JUMPCP_compounds_processed_chembl_identifiers.csv.gz", compression="gzip")
df

Unnamed: 0,Metadata_JCP2022,smiles,Standardized_SMILES,Standardized_InChI,Standardized_SMILES_chembl,Standardized_InChI_chembl,Standardized_InChIKey_chembl,ChEMBL_ID
0,JCP2022_000001,CCc1nccn1-c1cccc(C2CCCN2C(=O)c2ccc(OCCN(C)C)cc...,CCc1nccn1-c1cccc(C2CCC[NH+]2C(=O)c2ccc(OCC[NH+...,InChI=1S/C25H31N5O2/c1-4-23-26-14-16-30(23)24-...,CCc1nccn1-c1cccc(C2CCCN2C(=O)c2ccc(OCCN(C)C)cc...,InChI=1S/C25H31N5O2/c1-4-23-26-14-16-30(23)24-...,AAAHWCWPZPSPIW-UHFFFAOYSA-N,
1,JCP2022_000002,OC1=NCCCN1Cc1ccc(Cl)cc1,O=C1NCCC[NH+]1Cc1ccc(Cl)cc1,InChI=1S/C11H13ClN2O/c12-10-4-2-9(3-5-10)8-14-...,OC1=NCCCN1Cc1ccc(Cl)cc1,InChI=1S/C11H13ClN2O/c12-10-4-2-9(3-5-10)8-14-...,AAAJHRMBUHXWLD-UHFFFAOYSA-N,
2,JCP2022_000004,CCCn1nccc1S(=O)(=O)N1CC2CCC1CNC2,CCCn1nccc1S(=O)(=O)[NH+]1CC2CCC1C[NH2+]C2,InChI=1S/C13H22N4O2S/c1-2-7-16-13(5-6-15-16)20...,CCCn1nccc1S(=O)(=O)N1CC2CCC1CNC2,InChI=1S/C13H22N4O2S/c1-2-7-16-13(5-6-15-16)20...,AAANUZMCJQUYNX-UHFFFAOYSA-N,
3,JCP2022_000005,CCN=C(O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...,CCNC(=O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...,InChI=1S/C22H22ClN5O2/c1-4-24-20(29)12-18-22-2...,CCNC(=O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...,InChI=1S/C22H22ClN5O2/c1-4-24-20(29)12-18-22-2...,AAAQFGUYHFJNHI-UHFFFAOYSA-N,
4,JCP2022_000006,Cn1cc(-c2noc(-c3cc(C4CC4)n(C(C)(C)C)n3)n2)cn1,Cn1cc(-c2noc(-c3cc(C4CC4)n(C(C)(C)C)n3)n2)cn1,"InChI=1S/C16H20N6O/c1-16(2,3)22-13(10-5-6-10)7...",Cn1cc(-c2noc(-c3cc(C4CC4)n(C(C)(C)C)n3)n2)cn1,"InChI=1S/C16H20N6O/c1-16(2,3)22-13(10-5-6-10)7...",AAAROXVLYNJINN-UHFFFAOYSA-N,
...,...,...,...,...,...,...,...,...
115779,JCP2022_116749,CN(C)c1ccc(O)c2c1CC1CC3C(N(C)C)C(=O)C(C(=N)O)C...,CN(C)c1ccc([O-])c2c1CC1CC3C([NH+](C)C)C(=O)C(C...,InChI=1S/C23H27N3O7/c1-25(2)12-5-6-13(27)15-10...,CN(C)c1ccc(O)c2c1CC1CC3C(N(C)C)C(=O)C(C(N)=O)C...,InChI=1S/C23H27N3O7/c1-25(2)12-5-6-13(27)15-10...,ZZZRUAITSXLWBH-UHFFFAOYSA-N,
115780,JCP2022_116750,Cc1nc(C)c(CCC(=O)N(C2CC2)C2CCCc3ccccc32)c(O)n1,Cc1nc(=O)c(CCC(=O)[NH+](C2CC2)C2CCCc3ccccc32)c...,InChI=1S/C22H27N3O2/c1-14-18(22(27)24-15(2)23-...,Cc1nc(C)c(CCC(=O)N(C2CC2)C2CCCc3ccccc32)c(O)n1,InChI=1S/C22H27N3O2/c1-14-18(22(27)24-15(2)23-...,ZZZTXDPKNAOZPM-UHFFFAOYSA-N,
115781,JCP2022_116751,Cn1ccc2ccn(CCN=C(O)c3cnc4ccccc4n3)c(=O)c21,Cn1ccc2ccn(CCNC(=O)c3cnc4ccccc4n3)c(=O)c21,InChI=1S/C19H17N5O2/c1-23-9-6-13-7-10-24(19(26...,Cn1ccc2ccn(CCNC(=O)c3cnc4ccccc4n3)c(=O)c21,InChI=1S/C19H17N5O2/c1-23-9-6-13-7-10-24(19(26...,ZZZUOLMMTJKOGE-UHFFFAOYSA-N,
115782,JCP2022_116752,Cc1nccc(-c2cccc(NS(=O)(=O)c3cc(C(F)(F)F)cc(C(F...,Cc1nccc(-c2cccc(NS(=O)(=O)c3cc(C(F)(F)F)cc(C(F...,InChI=1S/C19H13F6N3O2S/c1-11-26-6-5-17(27-11)1...,Cc1nccc(-c2cccc(NS(=O)(=O)c3cc(C(F)(F)F)cc(C(F...,InChI=1S/C19H13F6N3O2S/c1-11-26-6-5-17(27-11)1...,ZZZZIBSVRUABIA-UHFFFAOYSA-N,


In [33]:
from chembl_webresource_client.new_client import new_client

# Set up the ChEMBL client. `chembl_client` is a module-level alias of
# `new_client`; the lookup functions below read it as a global.
chembl_client = new_client

In [34]:
# Same pandarallel initialization as in the standardization part of the
# notebook. NOTE(review): presumably repeated because this part was run in a
# fresh session — confirm.
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=80)

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [35]:
def smiles_to_chembl_id(smiles):
    """Look up a ChEMBL molecule ID by SMILES through ``chembl_client``.

    Returns the ``molecule_chembl_id`` of the first hit, ``None`` when the
    query yields nothing, and the sentinel ``"Cannot_do"`` (after printing the
    error) when the lookup raises.
    """
    try:
        hits = chembl_client.molecule.filter(smiles=smiles).only("molecule_chembl_id")
        if hits:
            return hits[0]["molecule_chembl_id"]
        return None
    except Exception as e:
        print(f"Error converting SMILES to ChEMBL ID for {smiles}: {e}")
        return "Cannot_do"

In [18]:
def inchikey_to_chembl_id(inchikey):
    """Look up a ChEMBL molecule ID by standard InChIKey through ``chembl_client``.

    Parameters
    ----------
    inchikey : str
        Standard InChIKey to search for. Missing values (``None``, NaN), empty
        strings and the upstream ``"Cannot_do"`` sentinel are not real keys
        and are answered with ``None`` without contacting the service.

    Returns
    -------
    str or None
        The ``molecule_chembl_id`` of the first hit, ``None`` when there is no
        hit (or no usable key), or the sentinel ``"Cannot_do"`` when the
        lookup raises; the error is printed in that case.
    """
    # A failed upstream conversion or a missing cell can never match a ChEMBL
    # record, so skip the lookup for those rows.
    if not isinstance(inchikey, str) or not inchikey.strip() or inchikey == "Cannot_do":
        return None

    try:
        matches = chembl_client.molecule.filter(
            molecule_structures__standard_inchi_key=inchikey
        ).only("molecule_chembl_id")
        return matches[0]["molecule_chembl_id"] if matches else None
    except Exception as e:
        print(f"Error converting InChI key to ChEMBL ID for {inchikey}: {e}")
        return "Cannot_do"


In [36]:
# Apply the function to each row and create a new 'ChEMBL_ID' column.
# Each row triggers a lookup through chembl_client inside
# inchikey_to_chembl_id, spread over the pandarallel workers.
df['ChEMBL_ID'] = df['Standardized_InChIKey_chembl'].parallel_apply(inchikey_to_chembl_id)
df

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1448), Label(value='0 / 1448'))), …

Unnamed: 0,Metadata_JCP2022,smiles,Standardized_SMILES,Standardized_InChI,Standardized_SMILES_chembl,Standardized_InChI_chembl,Standardized_InChIKey_chembl,ChEMBL_ID
0,JCP2022_000001,CCc1nccn1-c1cccc(C2CCCN2C(=O)c2ccc(OCCN(C)C)cc...,CCc1nccn1-c1cccc(C2CCC[NH+]2C(=O)c2ccc(OCC[NH+...,InChI=1S/C25H31N5O2/c1-4-23-26-14-16-30(23)24-...,CCc1nccn1-c1cccc(C2CCCN2C(=O)c2ccc(OCCN(C)C)cc...,InChI=1S/C25H31N5O2/c1-4-23-26-14-16-30(23)24-...,AAAHWCWPZPSPIW-UHFFFAOYSA-N,
1,JCP2022_000002,OC1=NCCCN1Cc1ccc(Cl)cc1,O=C1NCCC[NH+]1Cc1ccc(Cl)cc1,InChI=1S/C11H13ClN2O/c12-10-4-2-9(3-5-10)8-14-...,OC1=NCCCN1Cc1ccc(Cl)cc1,InChI=1S/C11H13ClN2O/c12-10-4-2-9(3-5-10)8-14-...,AAAJHRMBUHXWLD-UHFFFAOYSA-N,CHEMBL592894
2,JCP2022_000004,CCCn1nccc1S(=O)(=O)N1CC2CCC1CNC2,CCCn1nccc1S(=O)(=O)[NH+]1CC2CCC1C[NH2+]C2,InChI=1S/C13H22N4O2S/c1-2-7-16-13(5-6-15-16)20...,CCCn1nccc1S(=O)(=O)N1CC2CCC1CNC2,InChI=1S/C13H22N4O2S/c1-2-7-16-13(5-6-15-16)20...,AAANUZMCJQUYNX-UHFFFAOYSA-N,
3,JCP2022_000005,CCN=C(O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...,CCNC(=O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...,InChI=1S/C22H22ClN5O2/c1-4-24-20(29)12-18-22-2...,CCNC(=O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...,InChI=1S/C22H22ClN5O2/c1-4-24-20(29)12-18-22-2...,AAAQFGUYHFJNHI-UHFFFAOYSA-N,
4,JCP2022_000006,Cn1cc(-c2noc(-c3cc(C4CC4)n(C(C)(C)C)n3)n2)cn1,Cn1cc(-c2noc(-c3cc(C4CC4)n(C(C)(C)C)n3)n2)cn1,"InChI=1S/C16H20N6O/c1-16(2,3)22-13(10-5-6-10)7...",Cn1cc(-c2noc(-c3cc(C4CC4)n(C(C)(C)C)n3)n2)cn1,"InChI=1S/C16H20N6O/c1-16(2,3)22-13(10-5-6-10)7...",AAAROXVLYNJINN-UHFFFAOYSA-N,
...,...,...,...,...,...,...,...,...
115779,JCP2022_116749,CN(C)c1ccc(O)c2c1CC1CC3C(N(C)C)C(=O)C(C(=N)O)C...,CN(C)c1ccc([O-])c2c1CC1CC3C([NH+](C)C)C(=O)C(C...,InChI=1S/C23H27N3O7/c1-25(2)12-5-6-13(27)15-10...,CN(C)c1ccc(O)c2c1CC1CC3C(N(C)C)C(=O)C(C(N)=O)C...,InChI=1S/C23H27N3O7/c1-25(2)12-5-6-13(27)15-10...,ZZZRUAITSXLWBH-UHFFFAOYSA-N,
115780,JCP2022_116750,Cc1nc(C)c(CCC(=O)N(C2CC2)C2CCCc3ccccc32)c(O)n1,Cc1nc(=O)c(CCC(=O)[NH+](C2CC2)C2CCCc3ccccc32)c...,InChI=1S/C22H27N3O2/c1-14-18(22(27)24-15(2)23-...,Cc1nc(C)c(CCC(=O)N(C2CC2)C2CCCc3ccccc32)c(O)n1,InChI=1S/C22H27N3O2/c1-14-18(22(27)24-15(2)23-...,ZZZTXDPKNAOZPM-UHFFFAOYSA-N,
115781,JCP2022_116751,Cn1ccc2ccn(CCN=C(O)c3cnc4ccccc4n3)c(=O)c21,Cn1ccc2ccn(CCNC(=O)c3cnc4ccccc4n3)c(=O)c21,InChI=1S/C19H17N5O2/c1-23-9-6-13-7-10-24(19(26...,Cn1ccc2ccn(CCNC(=O)c3cnc4ccccc4n3)c(=O)c21,InChI=1S/C19H17N5O2/c1-23-9-6-13-7-10-24(19(26...,ZZZUOLMMTJKOGE-UHFFFAOYSA-N,
115782,JCP2022_116752,Cc1nccc(-c2cccc(NS(=O)(=O)c3cc(C(F)(F)F)cc(C(F...,Cc1nccc(-c2cccc(NS(=O)(=O)c3cc(C(F)(F)F)cc(C(F...,InChI=1S/C19H13F6N3O2S/c1-11-26-6-5-17(27-11)1...,Cc1nccc(-c2cccc(NS(=O)(=O)c3cc(C(F)(F)F)cc(C(F...,InChI=1S/C19H13F6N3O2S/c1-11-26-6-5-17(27-11)1...,ZZZZIBSVRUABIA-UHFFFAOYSA-N,


In [37]:
df.ChEMBL_ID.nunique()

29831

In [38]:
# Write the table, now including ChEMBL_ID, back to the same file that the
# reload cell above reads, overwriting the earlier version in place.
df.to_csv("JUMPCP_compounds_processed_chembl_identifiers.csv.gz", index=False, compression="gzip")