In [39]:
import pandas as pd
from chembl_structure_pipeline import standardizer
from chembl_structure_pipeline import checker
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import MolStandardize

In [40]:
# Load the compound table from a path relative to the kernel's working
# directory; the rendered preview shows `perturbation` and `SMILES` columns.
df = pd.read_csv("../../../OpenCellCounter/Profiles/Compound_Names.csv")
# Bare last expression so the notebook renders the (truncated) frame.
df

Unnamed: 0,perturbation,SMILES
0,(+)-Cedrol,C[C@@H]1CC[C@H]2C(C)(C)[C@H]3C[C@@]12CC[C@@]3(C)O
1,(+)-cloprostenol,O[C@@H](COC1=CC=CC(Cl)=C1)\C=C\[C@H]1[C@H](O)C...
2,(-)-Huperzine A (HupA),[H][C@@]12CC3=C(C=CC(=O)N3)[C@@](N)(CC(C)=C1)C...
3,1-Dexoymannojirimycin HCl,OC[C@H]1NC[C@@H](O)[C@@H](O)[C@@H]1O
4,1-Octanol,CCCCCCCCO
...,...,...
1669,vorapaxar,CCOC(=O)N[C@@H]1CC[C@@H]2[C@H](C[C@@H]3[C@@H](...
1670,warfarin,CC(=O)C[C@@H](C1=CC=CC=C1)C1=C(O)C2=C(OC1=O)C=...
1671,zofenopril-calcium,C[C@H](CSC(=O)C1=CC=CC=C1)C(=O)N1C[C@H](C[C@H]...
1672,zolimidine,"CS(=O)(=O)C1=CC=C(C=C1)C1=CN2C=CC=CC2=N1 |c:6,..."


In [41]:
# Show one raw SMILES untruncated; the printed value carries a trailing
# CXSMILES annotation block (`|c:6,9,t:4|`) after the structure itself.
df["SMILES"].iloc[1]

'O[C@@H](COC1=CC=CC(Cl)=C1)\\C=C\\[C@H]1[C@H](O)C[C@H](O)[C@@H]1C\\C=C/CCCC([O-])=O |c:6,9,t:4|'

In [42]:
# Print the kernel's working directory: the relative paths passed to
# read_csv / to_csv elsewhere in this notebook resolve against it.
!pwd

/dgx1nas1/storage/data/seal/The_Seal_Files/BAK_The_Seal_dataset/Generate_data


In [43]:
# Number of distinct raw SMILES strings before any standardization.
df["SMILES"].nunique()

1674

In [44]:
from rdkit import Chem
from rdkit.Chem import MolStandardize

def standardize_SMILES(SMILES):
    """Return a desalted, normalized, neutralized, stereo-free canonical SMILES.

    Steps, in order: parse the input, keep the largest fragment, apply
    RDKit's normalization transforms, neutralize charges, strip all
    stereochemistry, and write the result back out as SMILES.

    Args:
        SMILES: Input SMILES string. Inputs in this notebook may carry a
            trailing CXSMILES block such as ``|c:6,9,t:4|``.

    Returns:
        The standardized SMILES string, ``"Invalid_SMILES"`` when RDKit
        cannot parse the input, or ``"Cannot_do"`` when any step raises.

    Raises:
        ImportError: If ``rdMolStandardize`` is unavailable. This is raised
            on purpose instead of being folded into ``"Cannot_do"``, so an
            environment problem cannot silently blank out every row.
    """
    # Use the C++-backed rdMolStandardize API. The pure-Python submodules
    # (MolStandardize.fragment / .normalize / .charge) are deprecated and
    # missing from recent RDKit releases; with them, the AttributeError would
    # be swallowed by the handler below and every row would become
    # "Cannot_do". The import is local so the function stays self-contained
    # when it is shipped to pandarallel workers.
    from rdkit.Chem.MolStandardize import rdMolStandardize

    try:
        # Convert SMILES string to RDKit Mol object
        mol = Chem.MolFromSmiles(SMILES)
        if mol is None:
            return "Invalid_SMILES"

        # Step 1: Choose the largest fragment (drops counter-ions / solvents)
        largest_fragment = rdMolStandardize.LargestFragmentChooser().choose(mol)

        # Step 2: Normalize functional-group representations. This does NOT
        # canonicalize tautomers; that would need a separate enumerator.
        normalized_mol = rdMolStandardize.Normalizer().normalize(largest_fragment)

        # Step 3: Neutralize charges where possible
        uncharged_mol = rdMolStandardize.Uncharger().uncharge(normalized_mol)

        # Step 4: Remove stereochemistry in place, so stereoisomers collapse
        # onto the same output string.
        Chem.RemoveStereochemistry(uncharged_mol)

        # Convert back to SMILES
        return Chem.MolToSmiles(uncharged_mol)

    except Exception as e:
        # Best-effort: report the failure and return a sentinel so a single
        # bad row does not abort the whole apply.
        print(f"Error standardizing SMILES: {e}")
        return "Cannot_do"


In [45]:
from pandarallel import pandarallel
# Start the pandarallel worker pool that backs `parallel_apply`.
# The worker count is hard-coded to 80 processes.
pandarallel.initialize(nb_workers=80, progress_bar=True)

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [46]:
import pandas as pd 
from tqdm import tqdm 
import time 

# Register tqdm's `progress_apply` accessors on pandas objects.
tqdm.pandas()
# Silence RDKit's info-level log channel.
RDLogger.DisableLog('rdApp.info')

In [47]:
# Standardize every raw SMILES across the pandarallel workers.
# NOTE(review): the column is suffixed `_chembl`, but the values come from
# standardize_SMILES (RDKit MolStandardize); the chembl_structure_pipeline
# imports are not used in the cells visible here.
df["Standardized_SMILES_chembl"] = df["SMILES"].parallel_apply(standardize_SMILES)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=21), Label(value='0 / 21'))), HBox…

In [48]:
# Disable every `rdApp.*` RDKit log channel before the InChI conversion.
RDLogger.DisableLog('rdApp.*')  
def smiles_to_inchi(smiles):
    """Convert a SMILES string to an InChI string.

    Args:
        smiles: SMILES string to convert.

    Returns:
        Whatever ``Chem.MolToInchi`` returns for the parsed molecule, or the
        sentinel ``"Cannot_do"`` when the SMILES cannot be parsed or the
        conversion raises.
    """
    try:
        # Read SMILES and convert it to RDKit mol object
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparseable input (including the failure sentinels produced by
            # the standardization step) maps straight to the sentinel rather
            # than relying on MolToInchi(None) to raise.
            return "Cannot_do"
        return Chem.MolToInchi(mol)

    except Exception:
        # Not a bare `except:`: KeyboardInterrupt / SystemExit must still be
        # able to stop a long-running apply.
        return "Cannot_do"

In [49]:
# Derive an InChI from each standardized SMILES across the pandarallel
# workers. Failure sentinels in the input column ("Invalid_SMILES",
# "Cannot_do") are passed to smiles_to_inchi like any other string.
df["Standardized_InChI_chembl"] = df["Standardized_SMILES_chembl"].parallel_apply(smiles_to_inchi)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=21), Label(value='0 / 21'))), HBox…

In [50]:
# Render the frame, now carrying the standardized SMILES and InChI columns
# (pandas elides the middle rows in the preview).
df

Unnamed: 0,perturbation,SMILES,Standardized_SMILES_chembl,Standardized_InChI_chembl
0,(+)-Cedrol,C[C@@H]1CC[C@H]2C(C)(C)[C@H]3C[C@@]12CC[C@@]3(C)O,CC1CCC2C(C)(C)C3CC12CCC3(C)O,"InChI=1S/C15H26O/c1-10-5-6-11-13(2,3)12-9-15(1..."
1,(+)-cloprostenol,O[C@@H](COC1=CC=CC(Cl)=C1)\C=C\[C@H]1[C@H](O)C...,O=C(O)CCCC=CCC1C(O)CC(O)C1C=CC(O)COc1cccc(Cl)c1,InChI=1S/C22H29ClO6/c23-15-6-5-7-17(12-15)29-1...
2,(-)-Huperzine A (HupA),[H][C@@]12CC3=C(C=CC(=O)N3)[C@@](N)(CC(C)=C1)C...,CC=C1C2C=C(C)CC1(N)c1ccc(=O)[nH]c1C2,"InChI=1S/C15H18N2O/c1-3-11-10-6-9(2)8-15(11,16..."
3,1-Dexoymannojirimycin HCl,OC[C@H]1NC[C@@H](O)[C@@H](O)[C@@H]1O,OCC1NCC(O)C(O)C1O,InChI=1S/C6H13NO4/c8-2-3-5(10)6(11)4(9)1-7-3/h...
4,1-Octanol,CCCCCCCCO,CCCCCCCCO,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3"
...,...,...,...,...
1669,vorapaxar,CCOC(=O)N[C@@H]1CC[C@@H]2[C@H](C[C@@H]3[C@@H](...,CCOC(=O)NC1CCC2C(C1)CC1C(=O)OC(C)C1C2C=Cc1ccc(...,InChI=1S/C29H33FN2O4/c1-3-35-29(34)32-23-10-11...
1670,warfarin,CC(=O)C[C@@H](C1=CC=CC=C1)C1=C(O)C2=C(OC1=O)C=...,CC(=O)CC(c1ccccc1)c1c(O)c2ccccc2oc1=O,InChI=1S/C19H16O4/c1-12(20)11-15(13-7-3-2-4-8-...
1671,zofenopril-calcium,C[C@H](CSC(=O)C1=CC=CC=C1)C(=O)N1C[C@H](C[C@H]...,CC(CSC(=O)c1ccccc1)C(=O)N1CC(Sc2ccccc2)CC1C(=O)O,InChI=1S/C22H23NO4S2/c1-15(14-28-22(27)16-8-4-...
1672,zolimidine,"CS(=O)(=O)C1=CC=C(C=C1)C1=CN2C=CC=CC2=N1 |c:6,...",CS(=O)(=O)c1ccc(-c2cn3ccccc3n2)cc1,"InChI=1S/C14H12N2O2S/c1-19(17,18)12-7-5-11(6-8..."


In [51]:
# Disable every `rdApp.*` RDKit log channel before the InChIKey conversion
# (repeats the call made in the InChI cell, so this cell can run on its own).
RDLogger.DisableLog('rdApp.*')  
def smiles_to_inchikey(smiles):
    """Convert a SMILES string to an InChIKey.

    Args:
        smiles: SMILES string to convert.

    Returns:
        Whatever ``Chem.MolToInchiKey`` returns for the parsed molecule, or
        the sentinel ``"Cannot_do"`` when the SMILES cannot be parsed or the
        conversion raises.
    """
    try:
        # Read SMILES and convert it to RDKit mol object
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparseable input (including the failure sentinels produced by
            # the standardization step) maps straight to the sentinel rather
            # than relying on MolToInchiKey(None) to raise.
            return "Cannot_do"
        return Chem.MolToInchiKey(mol)

    except Exception:
        # Not a bare `except:`: KeyboardInterrupt / SystemExit must still be
        # able to stop a long-running apply.
        return "Cannot_do"

In [52]:
# Derive an InChIKey from each standardized SMILES across the pandarallel
# workers. The input SMILES had stereochemistry removed by
# standardize_SMILES, so these keys describe the stereo-free structure.
df["Standardized_InChIKey_chembl"] = df["Standardized_SMILES_chembl"].parallel_apply(smiles_to_inchikey)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=21), Label(value='0 / 21'))), HBox…

In [53]:
# Keep only the raw SMILES and the three derived identifier columns (other
# columns such as `perturbation` are dropped), then render the result.
df = df.loc[
    :,
    [
        "SMILES",
        "Standardized_SMILES_chembl",
        "Standardized_InChI_chembl",
        "Standardized_InChIKey_chembl",
    ],
]
df

Unnamed: 0,SMILES,Standardized_SMILES_chembl,Standardized_InChI_chembl,Standardized_InChIKey_chembl
0,C[C@@H]1CC[C@H]2C(C)(C)[C@H]3C[C@@]12CC[C@@]3(C)O,CC1CCC2C(C)(C)C3CC12CCC3(C)O,"InChI=1S/C15H26O/c1-10-5-6-11-13(2,3)12-9-15(1...",SVURIXNDRWRAFU-UHFFFAOYSA-N
1,O[C@@H](COC1=CC=CC(Cl)=C1)\C=C\[C@H]1[C@H](O)C...,O=C(O)CCCC=CCC1C(O)CC(O)C1C=CC(O)COc1cccc(Cl)c1,InChI=1S/C22H29ClO6/c23-15-6-5-7-17(12-15)29-1...,VJGGHXVGBSZVMZ-UHFFFAOYSA-N
2,[H][C@@]12CC3=C(C=CC(=O)N3)[C@@](N)(CC(C)=C1)C...,CC=C1C2C=C(C)CC1(N)c1ccc(=O)[nH]c1C2,"InChI=1S/C15H18N2O/c1-3-11-10-6-9(2)8-15(11,16...",ZRJBHWIHUMBLCN-UHFFFAOYSA-N
3,OC[C@H]1NC[C@@H](O)[C@@H](O)[C@@H]1O,OCC1NCC(O)C(O)C1O,InChI=1S/C6H13NO4/c8-2-3-5(10)6(11)4(9)1-7-3/h...,LXBIFEVIBLOUGU-UHFFFAOYSA-N
4,CCCCCCCCO,CCCCCCCCO,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3",KBPLFHHGFOOTCA-UHFFFAOYSA-N
...,...,...,...,...
1669,CCOC(=O)N[C@@H]1CC[C@@H]2[C@H](C[C@@H]3[C@@H](...,CCOC(=O)NC1CCC2C(C1)CC1C(=O)OC(C)C1C2C=Cc1ccc(...,InChI=1S/C29H33FN2O4/c1-3-35-29(34)32-23-10-11...,ZBGXUVOIWDMMJE-UHFFFAOYSA-N
1670,CC(=O)C[C@@H](C1=CC=CC=C1)C1=C(O)C2=C(OC1=O)C=...,CC(=O)CC(c1ccccc1)c1c(O)c2ccccc2oc1=O,InChI=1S/C19H16O4/c1-12(20)11-15(13-7-3-2-4-8-...,PJVWKTKQMONHTI-UHFFFAOYSA-N
1671,C[C@H](CSC(=O)C1=CC=CC=C1)C(=O)N1C[C@H](C[C@H]...,CC(CSC(=O)c1ccccc1)C(=O)N1CC(Sc2ccccc2)CC1C(=O)O,InChI=1S/C22H23NO4S2/c1-15(14-28-22(27)16-8-4-...,IAIDUHCBNLFXEF-UHFFFAOYSA-N
1672,"CS(=O)(=O)C1=CC=C(C=C1)C1=CN2C=CC=CC2=N1 |c:6,...",CS(=O)(=O)c1ccc(-c2cn3ccccc3n2)cc1,"InChI=1S/C14H12N2O2S/c1-19(17,18)12-7-5-11(6-8...",VSLIUWLPFRVCDL-UHFFFAOYSA-N


In [54]:
# Number of distinct InChIKeys after standardization; compare against the
# raw SMILES count to see how many inputs collapsed together.
df["Standardized_InChIKey_chembl"].nunique()

1638

In [55]:
# Persist the identifier table as a gzip-compressed CSV, without the index.
df.to_csv("rxrx3_compounds_processed_chembl_identifiers.csv.gz", compression="gzip", index=False)

In [56]:
import pandas as pd
# Re-read the gzip CSV written by the preceding to_csv cell; `df` is rebound
# to the reloaded copy, which the ChEMBL lookup cells below operate on.
df = pd.read_csv("rxrx3_compounds_processed_chembl_identifiers.csv.gz", compression="gzip")
# Bare last expression so the notebook renders the reloaded frame.
df

Unnamed: 0,SMILES,Standardized_SMILES_chembl,Standardized_InChI_chembl,Standardized_InChIKey_chembl
0,C[C@@H]1CC[C@H]2C(C)(C)[C@H]3C[C@@]12CC[C@@]3(C)O,CC1CCC2C(C)(C)C3CC12CCC3(C)O,"InChI=1S/C15H26O/c1-10-5-6-11-13(2,3)12-9-15(1...",SVURIXNDRWRAFU-UHFFFAOYSA-N
1,O[C@@H](COC1=CC=CC(Cl)=C1)\C=C\[C@H]1[C@H](O)C...,O=C(O)CCCC=CCC1C(O)CC(O)C1C=CC(O)COc1cccc(Cl)c1,InChI=1S/C22H29ClO6/c23-15-6-5-7-17(12-15)29-1...,VJGGHXVGBSZVMZ-UHFFFAOYSA-N
2,[H][C@@]12CC3=C(C=CC(=O)N3)[C@@](N)(CC(C)=C1)C...,CC=C1C2C=C(C)CC1(N)c1ccc(=O)[nH]c1C2,"InChI=1S/C15H18N2O/c1-3-11-10-6-9(2)8-15(11,16...",ZRJBHWIHUMBLCN-UHFFFAOYSA-N
3,OC[C@H]1NC[C@@H](O)[C@@H](O)[C@@H]1O,OCC1NCC(O)C(O)C1O,InChI=1S/C6H13NO4/c8-2-3-5(10)6(11)4(9)1-7-3/h...,LXBIFEVIBLOUGU-UHFFFAOYSA-N
4,CCCCCCCCO,CCCCCCCCO,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3",KBPLFHHGFOOTCA-UHFFFAOYSA-N
...,...,...,...,...
1669,CCOC(=O)N[C@@H]1CC[C@@H]2[C@H](C[C@@H]3[C@@H](...,CCOC(=O)NC1CCC2C(C1)CC1C(=O)OC(C)C1C2C=Cc1ccc(...,InChI=1S/C29H33FN2O4/c1-3-35-29(34)32-23-10-11...,ZBGXUVOIWDMMJE-UHFFFAOYSA-N
1670,CC(=O)C[C@@H](C1=CC=CC=C1)C1=C(O)C2=C(OC1=O)C=...,CC(=O)CC(c1ccccc1)c1c(O)c2ccccc2oc1=O,InChI=1S/C19H16O4/c1-12(20)11-15(13-7-3-2-4-8-...,PJVWKTKQMONHTI-UHFFFAOYSA-N
1671,C[C@H](CSC(=O)C1=CC=CC=C1)C(=O)N1C[C@H](C[C@H]...,CC(CSC(=O)c1ccccc1)C(=O)N1CC(Sc2ccccc2)CC1C(=O)O,InChI=1S/C22H23NO4S2/c1-15(14-28-22(27)16-8-4-...,IAIDUHCBNLFXEF-UHFFFAOYSA-N
1672,"CS(=O)(=O)C1=CC=C(C=C1)C1=CN2C=CC=CC2=N1 |c:6,...",CS(=O)(=O)c1ccc(-c2cn3ccccc3n2)cc1,"InChI=1S/C14H12N2O2S/c1-19(17,18)12-7-5-11(6-8...",VSLIUWLPFRVCDL-UHFFFAOYSA-N


In [57]:
from chembl_webresource_client.new_client import new_client

# Set up the ChEMBL client: alias the imported `new_client` object so the
# lookup helper defined below can reach resources via `chembl_client.molecule`.
chembl_client = new_client

In [58]:
from pandarallel import pandarallel
# Initialize pandarallel again for this part of the notebook (same settings
# as the earlier initialization: 80 workers, progress bar on).
pandarallel.initialize(nb_workers=80, progress_bar=True)

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [59]:
def smiles_to_chembl_id(smiles):
    """Look up the ChEMBL molecule ID for a SMILES string.

    Queries ``chembl_client.molecule`` filtered by ``smiles`` and restricted
    to the ``molecule_chembl_id`` field.

    Returns:
        The ``molecule_chembl_id`` of the first hit, ``None`` when the query
        result is empty, or ``"Cannot_do"`` when the lookup raises (the
        error is printed).
    """
    try:
        hits = chembl_client.molecule.filter(smiles=smiles).only("molecule_chembl_id")
        if not hits:
            return None
        return hits[0]["molecule_chembl_id"]
    except Exception as err:
        print(f"Error converting SMILES to ChEMBL ID for {smiles}: {err}")
        return "Cannot_do"

In [62]:

from tqdm import tqdm
import pandas as pd

# Initialize tqdm for pandas
tqdm.pandas()


def inchikey_to_chembl_id(inchikey):
    """Look up the ChEMBL molecule ID for a standard InChIKey.

    Defined in this cell because no visible cell defines it (the execution
    counts jump from 59 to 62), so the apply below would raise NameError on
    a fresh kernel. It mirrors smiles_to_chembl_id's return contract.

    Args:
        inchikey: InChIKey string, or a missing value / the "Cannot_do"
            sentinel left by the conversion step.

    Returns:
        The ``molecule_chembl_id`` of the first hit, ``None`` when there is
        no hit or nothing to look up, or ``"Cannot_do"`` when the lookup
        raises (the error is printed).
    """
    # Skip missing values and the failure sentinel: no network call needed.
    if not isinstance(inchikey, str) or inchikey == "Cannot_do":
        return None
    try:
        hits = chembl_client.molecule.filter(
            molecule_structures__standard_inchi_key=inchikey
        ).only(["molecule_chembl_id"])
        return hits[0]["molecule_chembl_id"] if hits else None
    except Exception as e:
        print(f"Error converting InChIKey to ChEMBL ID for {inchikey}: {e}")
        return "Cannot_do"


# Apply the function to each row with progress bar (sequential: one ChEMBL
# web request per row).
# NOTE(review): these InChIKeys were computed after stereochemistry was
# stripped, so exact-key lookups may miss stereo-defined ChEMBL entries.
# Confirm this is intended.
df['ChEMBL_ID'] = df['Standardized_InChIKey_chembl'].progress_apply(inchikey_to_chembl_id)

# Display the updated DataFrame
df

 16%|█▌        | 271/1674 [00:06<00:34, 40.91it/s] 

KeyboardInterrupt



In [None]:
# Number of distinct values in the ChEMBL_ID column (requires the lookup
# cell above to have run to completion).
df["ChEMBL_ID"].nunique()

In [None]:
# Write the current frame to the same path as the earlier export, replacing
# that file.
df.to_csv("rxrx3_compounds_processed_chembl_identifiers.csv.gz", compression="gzip", index=False)