In [1]:
import pandas as pd
import pubchempy as pcp
import re

In [2]:
# Project data directory and the supplementary-information workbook inside it.
dir_DATA = "/mnt/w/Projects/Kinome/data/"
data_SI = "SI_info.xlsx"

# Original ligands from the SI: read the 'ligands' sheet and drop every row
# that has a missing value, then tidy the two text columns.
lig_CAS = pd.read_excel(f"{dir_DATA}{data_SI}", sheet_name="ligands", header=0)
lig_CAS = lig_CAS.dropna(axis="index")
lig_CAS["CAS"] = lig_CAS["CAS"].str.replace(" ", "")  # remove spaces inside CAS numbers
lig_CAS["ligand_name"] = lig_CAS["ligand_name"].str.strip()  # trim surrounding whitespace

In [3]:
def process_name(row):
    """Split a ligand label into a base name and its parenthesised alias.

    A label such as ``"AEE788 (NVP-AEE788)"`` becomes
    ``("AEE788", "NVP-AEE788")``. The alias is the text between the first
    ``" ("`` (space + opening parenthesis) and the next ``")"`` after it, so a
    name that itself starts with a parenthesis, e.g.
    ``"(R)-Roscovitine (Seliciclib)"``, is split at the alias and not at the
    leading ``"(R)"``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"ligand_name"`` (e.g. a DataFrame row).

    Returns
    -------
    tuple of (str, str)
        ``(ligand, commercial_name)``. When the label contains no ``" ("``,
        both elements are the unchanged label.
    """
    ligname = row["ligand_name"]

    start = ligname.find(" (")
    if start == -1:
        # No alias present: the label serves as both names.
        return (ligname, ligname)

    # Search for the closing parenthesis only after the alias opener.
    close = ligname.find(")", start)
    if close == -1:
        # Unbalanced "(": treat everything after it as the alias.
        close = len(ligname)

    commercial_name = ligname[start + 2:close]
    # Remove " (alias)" and keep whatever surrounds it.
    ligand = ligname[:start] + ligname[close + 1:]
    return (ligand, commercial_name)
    

# Row-wise split of ligand_name; the returned pair is expanded into two columns.
lig_CAS[["ligand", "commercial_name"]] = lig_CAS.apply(
    process_name,
    axis=1,
    result_type="expand",
)

In [4]:
# Display the table to inspect the new "ligand" / "commercial_name" columns.
lig_CAS

Unnamed: 0,ligand_name,CAS,ligand,commercial_name
0,3-Methyladenine,5142-23-4,3-Methyladenine,3-Methyladenine
1,A66,1166227-08-2,A66,A66
2,A-674563,552325-73-2,A-674563,A-674563
3,A-769662,844499-71-4,A-769662,A-769662
4,AEE788 (NVP-AEE788),497839-62-0,AEE788,NVP-AEE788
...,...,...,...,...
265,Y-27632 2HCl,129830-38-2,Y-27632 2HCl,Y-27632 2HCl
266,YM201636,371942-69-7,YM201636,YM201636
267,ZM 336372,208260-29-1,ZM 336372,ZM 336372
268,ZM-447439,331771-20-1,ZM-447439,ZM-447439


In [5]:
# test = pcp.get_compounds(lig_CAS.loc[267,"ligand"], "name")

def get_substructure_cas(smiles):
    """Collect CAS-registry-number-like synonyms PubChem lists for a SMILES.

    Adapted from
    https://github.com/mcs07/PubChemPy/blob/master/examples/CAS%20registry%20numbers.ipynb

    Parameters
    ----------
    smiles : str
        SMILES string passed to ``pcp.get_synonyms`` with namespace ``'smiles'``.

    Returns
    -------
    list of str
        Every synonym prefix of the form ``<2-7 digits>-<2 digits>-<1 digit>``,
        in the order encountered. Duplicates are kept.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    cas_pattern = re.compile(r'(\d{2,7}-\d\d-\d)')

    cas_rns = []
    for result in pcp.get_synonyms(smiles, 'smiles'):
        # A result without a 'Synonym' entry contributes nothing.
        for syn in result.get('Synonym', []):
            # match() anchors at the start of the synonym only.
            match = cas_pattern.match(syn)
            if match:
                cas_rns.append(match.group(1))
    return cas_rns

def get_iso_smi(row):
    """Look up an isomeric SMILES for one ligand row and cross-check its CAS.

    PubChem is queried by name with the full label, the base ligand name, the
    commercial name and the CAS number. Each candidate's SMILES is validated
    by checking that the row's CAS appears among the CAS numbers returned by
    ``get_substructure_cas`` for that SMILES.

    Parameters
    ----------
    row : mapping
        Must provide ``"ligand_name"``, ``"ligand"``, ``"commercial_name"``
        and ``"CAS"``.

    Returns
    -------
    str
        The SMILES of the first candidate whose CAS matches. If none match, a
        warning is printed and the SMILES of the last candidate is returned.
        If PubChem returns no candidates, a message is printed and ``""`` is
        returned.
    """
    lignm0 = row["ligand_name"]
    cas = row["CAS"]

    # The three name fields are often identical; dict.fromkeys removes repeats
    # while preserving order, so each distinct query hits PubChem once.
    queries = list(dict.fromkeys([lignm0, row["ligand"], row["commercial_name"], cas]))

    pubchem_compounds = []
    for nm in queries:
        pubchem_compounds.extend(pcp.get_compounds(nm, "name"))

    if not pubchem_compounds:
        # The original referenced an undefined name here and raised NameError.
        print(f"{lignm0} not found")
        return ""

    checked = set()  # SMILES already validated, to avoid repeated lookups
    iso_smi = ""
    for rec in pubchem_compounds:
        iso_smi = rec.isomeric_smiles
        if iso_smi in checked:
            continue
        checked.add(iso_smi)
        if cas in get_substructure_cas(iso_smi):
            return iso_smi

    # No candidate's CAS numbers matched: flag it and return the last candidate.
    print(f"{lignm0} may contain error")
    return iso_smi

# Resolve one SMILES string per ligand row (row-wise apply; queries PubChem).
lig_CAS["SMILES"] = lig_CAS.apply(
    get_iso_smi,
    axis=1,
)

BMS-265246 may contain error
CP-724714 may contain error
Raf265 derivative may contain error


In [8]:
# Manual check of a flagged entry: compare it with external records
# (publication / PubChem / ChEMBL / RCSB) and edit the spreadsheet if needed.
lig_CAS.loc[lig_CAS["ligand_name"].eq("Raf265 derivative")]

Unnamed: 0,ligand_name,CAS,ligand,commercial_name,SMILES
199,Raf265 derivative,n.a.;SelleckchemCat.No.S2200,Raf265 derivative,Raf265 derivative,C1=CC(=CC=C1C(F)(F)F)NC2=NC3=C(N2)C=C(C=C3)OC4...


In [75]:
# Save the annotated ligand table next to the input data.
lig_CAS.to_csv(f"{dir_DATA}ligands_pubchem.csv")