# Map compound names to SMILES and InChiKeys

This notebook serves to map compound names of microbial datasets to unique smiles ids.

In [1]:
import numpy as np
import pandas as pd
import pubchempy as pcp
import re

from chemicalchecker.util.parser import Converter

In [2]:
compoundNameInputFile = "../data/compound_name_df.csv"

df = pd.read_csv(compoundNameInputFile, index_col = 0)

In [3]:
df

Unnamed: 0_level_0,prestwick_id,compound_name
compound_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
Prestw-1109_Rifabutin,Prestw-1109,Rifabutin
Prestw-1399_Doxycycline hydrochloride,Prestw-1399,Doxycycline hydrochloride
Prestw-145_Chlortetracycline hydrochloride,Prestw-145,Chlortetracycline hydrochloride
Prestw-1464_Tosufloxacin hydrochloride,Prestw-1464,Tosufloxacin hydrochloride
Prestw-31_Chloramphenicol,Prestw-31,Chloramphenicol
...,...,...
Prestw-540_Clorsulon,Prestw-540,Clorsulon
Prestw-61_Morantel tartrate,Prestw-61,Morantel tartrate
Prestw-731_Sulfaquinoxaline sodium salt,Prestw-731,Sulfaquinoxaline sodium salt
Prestw-8_Amprolium hydrochloride,Prestw-8,Amprolium hydrochloride


In [4]:
cmpName2smiles = {}

In [5]:
for cmp in df['compound_name'].values:
    try:
        cmp_smiles = Converter().chemical_name_to_smiles(cmp)
        
    except:
        try:
            cmp_smiles = Converter().chemical_name_to_smiles(re.sub('[L,R,S,+,-,\s]*\(.+\)-*', '', re.sub('\\,*\s.+', '', cmp)))
        
        except:
            try:
                cmp_smiles = Converter().chemical_name_to_smiles(re.sub('\s+\\(R,S\\)', '', re.sub('\s+', ' ', re.sub('\\(n \\)', '', re.sub('-', ' ', cmp)))))
                
            except:
                try: 
                    pcp_compound = pcp.get_compounds(cmp, 'name')
                    cmp_smiles = [pcp_cmp.isomeric_smiles for pcp_cmp in pcp_compound][0]
                    
                except:
                    if re.match('Morpholinoethylamino-3-benzocyclohepta-\\(5,6-c\\)-pyridazine dihydrochloride', cmp):
                        cmp_smiles = 'CC(CCCN)NC1=C2C(=CC(=C1)OC)C=CC=N2.OP(=O)(O)O.OP(=O)(O)O'
                    elif re.match('Oxibendazol', cmp):
                        cmp_smiles = 'CCCOC1=CC2=C(C=C1)N=C(N2)NC(=O)OC'
                    else:
                        cmp_smiles = ''
                        print(cmp)
            
    cmpName2smiles[cmp] = cmp_smiles

Morpholinoethylamino-3-benzocyclohepta-(5,6-c)-pyridazine dihydrochloride


In [None]:
cmpName2smiles

In [9]:
# add column with smiles ids to df
df['SMILES_id'] = df['compound_name'].map(cmpName2smiles)

In [54]:
df.loc[df['compound_name'] == 'Morpholinoethylamino-3-benzocyclohepta-(5,6-c)-pyridazine dihydrochloride']

Unnamed: 0_level_0,prestwick_id,compound_name
compound_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
Prestw-1460_Oxibendazol,Prestw-1460,Oxibendazol


In [15]:
df.loc[df.compound_name == 'Morpholinoethylamino-3-benzocyclohepta-(5,6-c)-pyridazine dihydrochloride', 'SMILES_id'] = 'CC(CCCN)NC1=C2C(=CC(=C1)OC)C=CC=N2.OP(=O)(O)O.OP(=O)(O)O'

In [17]:
df.loc[df['compound_name'] == 'Morpholinoethylamino-3-benzocyclohepta-(5,6-c)-pyridazine dihydrochloride']

Unnamed: 0_level_0,prestwick_id,compound_name,SMILES_id
compound_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Prestw-1109_Rifabutin,Prestw-1109,Rifabutin,CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C)c(O)c4C(=O)C(=C5...
Prestw-1399_Doxycycline hydrochloride,Prestw-1399,Doxycycline hydrochloride,[Cl-].C[C@@H]1[C@H]2[C@H](O)[C@H]3[C@H]([NH+](...
Prestw-145_Chlortetracycline hydrochloride,Prestw-145,Chlortetracycline hydrochloride,[Cl-].CN(C)[C@H]1[C@@H]2C[C@H]3C(=C(O)c4c(O)cc...
Prestw-1464_Tosufloxacin hydrochloride,Prestw-1464,Tosufloxacin hydrochloride,[H+].O.[Cl-].NC1CCN(C1)c2nc3N(C=C(C(O)=O)C(=O)...
Prestw-31_Chloramphenicol,Prestw-31,Chloramphenicol,OCC(NC(=O)C(Cl)Cl)C(O)c1ccc(cc1)[N+]([O-])=O
...,...,...,...
Prestw-540_Clorsulon,Prestw-540,Clorsulon,Nc1cc(C(Cl)=C(Cl)Cl)c(cc1[S](N)(=O)=O)[S](N)(=...
Prestw-61_Morantel tartrate,Prestw-61,Morantel tartrate,CN1CCCN=C1/C=C/c2sccc2C.O[C@H]([C@@H](O)C(O)=O...
Prestw-731_Sulfaquinoxaline sodium salt,Prestw-731,Sulfaquinoxaline sodium salt,[Na+].Nc1ccc(cc1)[S]([O-])(=O)=Nc2ccc3nccnc3c2
Prestw-8_Amprolium hydrochloride,Prestw-8,Amprolium hydrochloride,[H+].[Cl-].[Cl-].CCCc1ncc(C[n+]2ccccc2C)c(N)n1


In [None]:
df

In [18]:
df.to_csv('../data/compound_name_smiles_df.csv')

## Now map SMILES to InChiKeys

In [19]:
smiles2inchikeys = {}

In [24]:
for cmp in df['SMILES_id'].values:
    try:
        pcp_compound = pcp.get_compounds(cmp, 'smiles')
        cmp_inchi = [pcp_cmp.inchikey for pcp_cmp in pcp_compound][0]
        
    except:
        cmp_inchi = ''
        print(cmp)
            
    smiles2inchikeys[cmp] = cmp_inchi

<!DOCTYPE html>

<html>

<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	<!--<meta http-equiv="X-UA-Compatible" content="chrome=1">-->
	<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7, IE=9" >
	<title>NCI/CADD Chemical Identifier Resolver</title>
	<meta name="robots" content="index, follow" />
	<meta name="author" content="NCICADD Group, National Cancer Institute" />
	<meta name="author-personal" content="Markus Sitzmann, Igor Filippov, Marc Nicklaus" />
	<meta name="author-mail" content="webmaster@https://cactus.nci.nih.gov"/>
	<meta name="keywords" content="chemical identifier, resolver, InChI, InChIKey, SMILES, GIF, database, chemical names, inchikey resolver, InChIKey resolver" />
	<meta name="description" content="Chemical Identifier Resolver" />
	
	<link rel="stylesheet" href="/style/carbon.css" type="text/css" media="screen,projection" />
	<!--<link rel="stylesheet" href="/style/browser.css" type="text/css" media="screen,projection">-->
 

In [27]:
# add column with inchikeys ids to df
df['InChiKey'] = df['SMILES_id'].map(smiles2inchikeys)

In [28]:
df

Unnamed: 0_level_0,prestwick_id,compound_name,SMILES_id,InChiKey
compound_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Prestw-1109_Rifabutin,Prestw-1109,Rifabutin,CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C)c(O)c4C(=O)C(=C5...,ZWBTYMGEBZUQTK-PVLSIAFMSA-N
Prestw-1399_Doxycycline hydrochloride,Prestw-1399,Doxycycline hydrochloride,[Cl-].C[C@@H]1[C@H]2[C@H](O)[C@H]3[C@H]([NH+](...,PTNZGHXUZDHMIQ-CVHRZJFOSA-N
Prestw-145_Chlortetracycline hydrochloride,Prestw-145,Chlortetracycline hydrochloride,[Cl-].CN(C)[C@H]1[C@@H]2C[C@H]3C(=C(O)c4c(O)cc...,QYAPHLRPFNSDNH-MRFRVZCGSA-N
Prestw-1464_Tosufloxacin hydrochloride,Prestw-1464,Tosufloxacin hydrochloride,[H+].O.[Cl-].NC1CCN(C1)c2nc3N(C=C(C(O)=O)C(=O)...,HIUQPJJYKJHGMT-UHFFFAOYSA-N
Prestw-31_Chloramphenicol,Prestw-31,Chloramphenicol,OCC(NC(=O)C(Cl)Cl)C(O)c1ccc(cc1)[N+]([O-])=O,WIIZWVCIJKGZOK-UHFFFAOYSA-N
...,...,...,...,...
Prestw-540_Clorsulon,Prestw-540,Clorsulon,Nc1cc(C(Cl)=C(Cl)Cl)c(cc1[S](N)(=O)=O)[S](N)(=...,QOVTVIYTBRHADL-UHFFFAOYSA-N
Prestw-61_Morantel tartrate,Prestw-61,Morantel tartrate,CN1CCCN=C1/C=C/c2sccc2C.O[C@H]([C@@H](O)C(O)=O...,GGXQONWGCAQGNA-UUSVNAAPSA-N
Prestw-731_Sulfaquinoxaline sodium salt,Prestw-731,Sulfaquinoxaline sodium salt,[Na+].Nc1ccc(cc1)[S]([O-])(=O)=Nc2ccc3nccnc3c2,
Prestw-8_Amprolium hydrochloride,Prestw-8,Amprolium hydrochloride,[H+].[Cl-].[Cl-].CCCc1ncc(C[n+]2ccccc2C)c(N)n1,PJBQYZZKGNOKNJ-UHFFFAOYSA-M


In [32]:
df.loc[df['InChiKey'] is None]

KeyError: False

In [33]:
df.to_csv('../data/compound_name_smiles_inchi_df.csv')

## Map compounds used by Stokes et al. to SMILES / InChiKeys

In [31]:
stokesCompoundNameInputFile = "/aloy/home/nkurzawa/projects/01_microbial_CC_signatures/data/stokes_mmc1-5.xlsx"

stokes_df = pd.read_excel(stokesCompoundNameInputFile, index_col = False, sheet_name = 'S1B', header = 1)

In [32]:
stokes_df['']

Unnamed: 0,Mean_Inhibition,SMILES,Name,Activity,Unnamed: 4,Unnamed: 5
0,0.041572,Cc1cc(O)c(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(C...,CEFPIRAMIDE,Active,,
1,0.041876,CON=C1CN(c2nc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)CC...,GEMIFLOXACIN MESYLATE,Active,,
2,0.041916,CCC(C)CCCCC(=O)NC(CCN)C(=O)NC(C(=O)NC(CCN)C(=O...,POLYMYXIN B SULFATE,Active,,
3,0.041964,Cl.N=C(N)n1cccn1,PRAXADINE HYDROCHLORIDE,Active,,
4,0.042295,Cl.Cl.N=C(NCCCCCCNC(=N)NC(=N)Nc1ccc(Cl)cc1)NC(...,CHLORHEXIDINE DIHYDROCHLORIDE,Active,,
...,...,...,...,...,...,...
2330,1.235350,C=CC1(C)CC(=O)C2(O)C(C)(O1)C(OC(C)=O)C(O)C1C(C...,COLFORSIN,Inactive,,
2331,1.251650,O=S(=O)([O-])c1cc(O)c2c(N=Nc3ccc(Nc4ccccc4)c4c...,ANAZOLENE SODIUM,Inactive,,
2332,1.273150,Nc1ccc(S(=O)(=O)c2ccc(N)cc2)cc1,DAPSONE,Inactive,,
2333,2.263200,Cc1cc(-c2ccc(N=Nc3ccc4c(S(=O)(=O)[O-])cc(S(=O)...,EVANS BLUE,Inactive,,


In [8]:
stokesCmpName2smiles = {}

In [9]:
for cmp in stokes_df['Name'].values:
    try:
        cmp_smiles = Converter().chemical_name_to_smiles(cmp)
        
    except:
        try:
            cmp_smiles = Converter().chemical_name_to_smiles(re.sub('[L,R,S,+,-,\s]*\(.+\)-*', '', re.sub('\\,*\s.+', '', cmp)))
        
        except:
            try:
                cmp_smiles = Converter().chemical_name_to_smiles(re.sub('\s+\\(R,S\\)', '', re.sub('\s+', ' ', re.sub('\\(n \\)', '', re.sub('-', ' ', cmp)))))
                
            except:
                try: 
                    pcp_compound = pcp.get_compounds(cmp.capitalize(), 'name')
                    cmp_smiles = [pcp_cmp.isomeric_smiles for pcp_cmp in pcp_compound][0]
                    
                except:
                    if re.match('Morpholinoethylamino-3-benzocyclohepta-\\(5,6-c\\)-pyridazine dihydrochloride', cmp):
                        cmp_smiles = 'CC(CCCN)NC1=C2C(=CC(=C1)OC)C=CC=N2.OP(=O)(O)O.OP(=O)(O)O'
                    elif re.match('Oxibendazol', cmp):
                        cmp_smiles = 'CCCOC1=CC2=C(C=C1)N=C(N2)NC(=O)OC'
                    else:
                        cmp_smiles = ''
                        print(cmp)
            
    stokesCmpName2smiles[cmp] = cmp_smiles

BENZALKONIUM CHLORIDE HYDRATE
KITASAMYCINS [A1 shown]
LARIXOL ACETATE
2-CARENE-3-ONE
ALGINIC ACID [Mol Wt ~200,000; monomers shown]
7,4'-DIMETHOXY-2'-HYDROXY-ISOFLAVONE
HETEROPEUCENIN METHYL ETHER
11(12)-EPOXY-URSOLOLACTONE ACETATE
3-beta-ACETOXYALLOPREG-16-EN-20-ONE
N-METHYL (-)EPHEDRINE [1R,2S]
16beta-METHYL-16(17)-EPOXY-5-PREGNENOLONE
TERPENE HYDRATE
METHYL-2-HYDROXY-PHENYLPROPRIONATE
5,4'-DIMETHOXY-7-HYDROXYISOFLAVONE
ASPARTIC ACID (L)
LEOIDIN [10mg]
GLUCITOL-4-GLUCOPYANOSIDE
PRAZOSIN HYDROCHLORIDE [10mg]
CETILISAT
URSOLOLACTONE ACETATE
LINACLOTIDE (1 mg/ml)
ARTEPAULIN
SEDANOIC ACID
MAGNIFERITIN 3,6,7-TRIMETHYL ETHER
CAPRYILIDENE
(-)E-CARYOPHYLLENE hydrate
GLAUCONIC ACID ACETATE
6-(3,4-METHYLENEDIOXYSTYRYL)-alpha-PYRONE
CASANTHRANOL [cascaroside A shown]
AGELASINE
alpha-TOCHOPHERYL ACETATE [4mM]
DIHYDROICTHYONE
ACETYL-DIHYDRO-7-EPIKHIVORIN
DICHLOROEVERNIC ACID
AMBROXIDE (-)
3,4-DEHYDRO-1,2-DIHYDRO-3-DESOXO-GEDUNIN
SUCRALFATE [5mM]


In [33]:
stokes_df['SMILES_cc'] = stokes_df['Name'].map(stokesCmpName2smiles)

In [72]:
stokes_df.loc[stokes_df['SMILES_cc'] == '']

Unnamed: 0,Mean_Inhibition,SMILES,Name,Activity,Unnamed: 4,Unnamed: 5,SMILES_cc


In [71]:
stokes_df['SMILES_cc'] = np.where((stokes_df.SMILES_cc == ''), stokes_df.SMILES, stokes_df.SMILES_cc)

In [3]:
stokes_df

Unnamed: 0.1,Unnamed: 0,Mean_Inhibition,SMILES,Name,Activity,Unnamed: 4,Unnamed: 5,SMILES_cc
0,0,0.041572,Cc1cc(O)c(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(C...,CEFPIRAMIDE,Active,,,Cn1nnnc1SCC2=C(N3[C@H](SC2)C(NC(=O)[C@H](NC(=O...
1,1,0.041876,CON=C1CN(c2nc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)CC...,GEMIFLOXACIN MESYLATE,Active,,,CO\N=C1/CN(CC1CN)c2nc3N(C=C(C(O)=O)C(=O)c3cc2F...
2,2,0.041916,CCC(C)CCCCC(=O)NC(CCN)C(=O)NC(C(=O)NC(CCN)C(=O...,POLYMYXIN B SULFATE,Active,,,CCC(C)CCCCC(=O)NC(CCN)C(=O)NC(C(C)O)C(=O)NC(CC...
3,3,0.041964,Cl.N=C(N)n1cccn1,PRAXADINE HYDROCHLORIDE,Active,,,[H+].[Cl-].NC(=N)n1cccn1
4,4,0.042295,Cl.Cl.N=C(NCCCCCCNC(=N)NC(=N)Nc1ccc(Cl)cc1)NC(...,CHLORHEXIDINE DIHYDROCHLORIDE,Active,,,[H+].[H+].[Cl-].[Cl-].NC(Nc1ccc(Cl)cc1)=NC(N)=...
...,...,...,...,...,...,...,...,...
2330,2330,1.235350,C=CC1(C)CC(=O)C2(O)C(C)(O1)C(OC(C)=O)C(O)C1C(C...,COLFORSIN,Inactive,,,CC(=O)O[C@H]1[C@@H](O)[C@H]2C(C)(C)CC[C@H](O)[...
2331,2331,1.251650,O=S(=O)([O-])c1cc(O)c2c(N=Nc3ccc(Nc4ccccc4)c4c...,ANAZOLENE SODIUM,Inactive,,,[Na+].[Na+].[Na+].Oc1cc(cc2cc(cc(N=Nc3ccc(Nc4c...
2332,2332,1.273150,Nc1ccc(S(=O)(=O)c2ccc(N)cc2)cc1,DAPSONE,Inactive,,,Nc1ccc(cc1)[S](=O)(=O)c2ccc(N)cc2
2333,2333,2.263200,Cc1cc(-c2ccc(N=Nc3ccc4c(S(=O)(=O)[O-])cc(S(=O)...,EVANS BLUE,Inactive,,,[Na+].[Na+].[Na+].[Na+].Cc1cc(ccc1N\N=C\2C=Cc3...


In [2]:
stokes_df = pd.read_csv('../data/stokes_et_al_smiles_cc_df.csv', index_col = False)

In [74]:
stokes_df.to_csv('../data/stokes_et_al_smiles_cc_df.csv')

### Now map SMILES to InChiKey

In [4]:
stokesSmiles2inchikeys = {}

In [5]:
for cmp in stokes_df['SMILES_cc'].values:
    try:
        pcp_compound = pcp.get_compounds(cmp, 'smiles')
        cmp_inchi = [pcp_cmp.inchikey for pcp_cmp in pcp_compound][0]
        
    except:
        cmp_inchi = ''
        print(cmp)
            
    stokesSmiles2inchikeys[cmp] = cmp_inchi

C.NO[F]O[I]N[IH]
O.O.O.[K+].[SbH6+3].OC(C(O)C([O-])=O)C([O-])=O.OC(C(O)C([O-])=O)C([O-])=O
OC(=O)C1=C[NH++]([O-])[CH-]C=C1
OCC1=C[NH++]([O-])C=C[CH-]1
<!DOCTYPE html>

<html>

<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	<!--<meta http-equiv="X-UA-Compatible" content="chrome=1">-->
	<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7, IE=9" >
	<title>NCI/CADD Chemical Identifier Resolver</title>
	<meta name="robots" content="index, follow" />
	<meta name="author" content="NCICADD Group, National Cancer Institute" />
	<meta name="author-personal" content="Markus Sitzmann, Igor Filippov, Marc Nicklaus" />
	<meta name="author-mail" content="webmaster@https://cactus.nci.nih.gov"/>
	<meta name="keywords" content="chemical identifier, resolver, InChI, InChIKey, SMILES, GIF, database, chemical names, inchikey resolver, InChIKey resolver" />
	<meta name="description" content="Chemical Identifier Resolver" />
	
	<link rel="stylesheet" href="/style/carbon.

In [6]:
stokes_df['InChiKey'] = stokes_df['SMILES_cc'].map(stokesSmiles2inchikeys)

In [7]:
stokes_df

Unnamed: 0.1,Unnamed: 0,Mean_Inhibition,SMILES,Name,Activity,Unnamed: 4,Unnamed: 5,SMILES_cc,InChiKey
0,0,0.041572,Cc1cc(O)c(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(C...,CEFPIRAMIDE,Active,,,Cn1nnnc1SCC2=C(N3[C@H](SC2)C(NC(=O)[C@H](NC(=O...,PWAUCHMQEXVFJR-JIUDAOPXSA-N
1,1,0.041876,CON=C1CN(c2nc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)CC...,GEMIFLOXACIN MESYLATE,Active,,,CO\N=C1/CN(CC1CN)c2nc3N(C=C(C(O)=O)C(=O)c3cc2F...,JIYMVSQRGZEYAX-YDHFHHHVSA-N
2,2,0.041916,CCC(C)CCCCC(=O)NC(CCN)C(=O)NC(C(=O)NC(CCN)C(=O...,POLYMYXIN B SULFATE,Active,,,CCC(C)CCCCC(=O)NC(CCN)C(=O)NC(C(C)O)C(=O)NC(CC...,HFMDLUQUEXNBOP-UHFFFAOYSA-N
3,3,0.041964,Cl.N=C(N)n1cccn1,PRAXADINE HYDROCHLORIDE,Active,,,[H+].[Cl-].NC(=N)n1cccn1,RBZRMBCLZMEYEH-UHFFFAOYSA-N
4,4,0.042295,Cl.Cl.N=C(NCCCCCCNC(=N)NC(=N)Nc1ccc(Cl)cc1)NC(...,CHLORHEXIDINE DIHYDROCHLORIDE,Active,,,[H+].[H+].[Cl-].[Cl-].NC(Nc1ccc(Cl)cc1)=NC(N)=...,WJLVQTJZDCGNJN-UHFFFAOYSA-N
...,...,...,...,...,...,...,...,...,...
2330,2330,1.235350,C=CC1(C)CC(=O)C2(O)C(C)(O1)C(OC(C)=O)C(O)C1C(C...,COLFORSIN,Inactive,,,CC(=O)O[C@H]1[C@@H](O)[C@H]2C(C)(C)CC[C@H](O)[...,OHCQJHSOBUTRHG-KGGHGJDLSA-N
2331,2331,1.251650,O=S(=O)([O-])c1cc(O)c2c(N=Nc3ccc(Nc4ccccc4)c4c...,ANAZOLENE SODIUM,Inactive,,,[Na+].[Na+].[Na+].Oc1cc(cc2cc(cc(N=Nc3ccc(Nc4c...,ADGGJQPKBDIZMT-UHFFFAOYSA-K
2332,2332,1.273150,Nc1ccc(S(=O)(=O)c2ccc(N)cc2)cc1,DAPSONE,Inactive,,,Nc1ccc(cc1)[S](=O)(=O)c2ccc(N)cc2,MQJKPEGWNLWLTK-UHFFFAOYSA-N
2333,2333,2.263200,Cc1cc(-c2ccc(N=Nc3ccc4c(S(=O)(=O)[O-])cc(S(=O)...,EVANS BLUE,Inactive,,,[Na+].[Na+].[Na+].[Na+].Cc1cc(ccc1N\N=C\2C=Cc3...,KBNIFDASRCWYGC-KJPFXMIPSA-J


In [8]:
stokes_df.to_csv('../data/stokes_et_al_smiles_cc_inchikey_df.csv')

## Map compounds used by Lans et al. to SMILES / InChiKeys

In [3]:
gattii_df = pd.read_excel('/aloy/home/nkurzawa/projects/01_microbial_CC_signatures/data/yeast_strain_effect_msb201131-sup-0002.xls', index_col = False, sheet_name = 'C. gattii')

In [4]:
gattii_df

Unnamed: 0,Plate,Column,Row,Drug name,C. gattii \nPrestwick,C. gattii \nFLC+Prestwick,%Inhibition,Residuals,Hit?
0,4,9,G,Pimozide,171.415870,20.088300,88.28,151.327570,yes
1,2,9,F,Terfenadine,144.232078,0.739291,99.49,143.492787,yes
2,5,6,F,Methiothepin maleate,137.714787,0.566086,99.59,137.148701,yes
3,8,9,A,Rescinnamin,136.966534,0.000000,100.00,136.966534,yes
4,12,7,C,Fluspirilen,163.511757,29.533463,81.94,133.978294,yes
...,...,...,...,...,...,...,...,...,...
1115,14,4,G,Quinic acid,95.626953,158.872124,-66.14,-63.245171,
1116,4,8,H,Guanethidine sulfate,94.898866,165.031833,-73.90,-70.132966,
1117,7,11,G,Parthenolide,89.525342,160.151451,-78.89,-70.626109,
1118,12,5,B,Butylparaben,58.312655,131.241322,-125.06,-72.928667,


In [5]:
lansCmpName2smiles = {}

In [10]:
for cmp in gattii_df['Drug name'].values:
    try:
        cmp_smiles = Converter().chemical_name_to_smiles(cmp)
        
    except:
        try:
            print("No direct conversion available for", cmp)
            cmp_smiles = Converter().chemical_name_to_smiles(re.sub('[L,R,S,+,-,\s]*\(.+\)-*', '', re.sub('\\,*\s.+', '', cmp)))
        
        except:
            try:
                cmp_smiles = Converter().chemical_name_to_smiles(re.sub('\s+\\(R,S\\)', '', re.sub('\s+', ' ', re.sub('\\(n \\)', '', re.sub('-', ' ', cmp)))))
                
            except:
                try: 
                    pcp_compound = pcp.get_compounds(cmp.capitalize(), 'name')
                    cmp_smiles = [pcp_cmp.isomeric_smiles for pcp_cmp in pcp_compound][0]
                    
                except:
                    if re.match('Morpholinoethylamino-3-benzocyclohepta-\\(5,6-c\\)-pyridazine dihydrochloride', cmp):
                        cmp_smiles = 'CC(CCCN)NC1=C2C(=CC(=C1)OC)C=CC=N2.OP(=O)(O)O.OP(=O)(O)O'
                    elif re.match('Oxibendazol', cmp):
                        cmp_smiles = 'CCCOC1=CC2=C(C=C1)N=C(N2)NC(=O)OC'
                    else:
                        cmp_smiles = ''
                        print("Not any conversion found for", cmp)
            
    lansCmpName2smiles[cmp] = cmp_smiles

No direct conversion available for Fluspirilen
No direct conversion available for Zuclopenthixol hydrochloride
No direct conversion available for Pirlindole mesylate
No direct conversion available for Solasodine
No direct conversion available for Dehydroisoandosterone 3-acetate
No direct conversion available for S(-)Eticlopride hydrochloride
No direct conversion available for Tracazolate hydrochloride
No direct conversion available for Novobiocin sodium salt
No direct conversion available for Thioproperazine dimesylate
No direct conversion available for Rifampicin
No direct conversion available for S(-)-terguride hydrogen maleate
No direct conversion available for Natamycin
No direct conversion available for (S)-propranolol hydrochloride
No direct conversion available for Domperidone maleate
No direct conversion available for Dicumarol
No direct conversion available for Benzamil hydrochloride
No direct conversion available for S-(+)-ibuprofen
No direct conversion available for Chelidon

In [12]:
# add column with smiles ids to df
gattii_df['SMILES_id'] = gattii_df['Drug name'].map(lansCmpName2smiles)

In [13]:
gattii_df

Unnamed: 0,Plate,Column,Row,Drug name,C. gattii \nPrestwick,C. gattii \nFLC+Prestwick,%Inhibition,Residuals,Hit?,SMILES_id
0,4,9,G,Pimozide,171.415870,20.088300,88.28,151.327570,yes,Fc1ccc(cc1)C(CCCN2CCC(CC2)N3C(=O)Nc4ccccc34)c5...
1,2,9,F,Terfenadine,144.232078,0.739291,99.49,143.492787,yes,CC(C)(C)c1ccc(cc1)C(O)CCCN2CCC(CC2)C(O)(c3cccc...
2,5,6,F,Methiothepin maleate,137.714787,0.566086,99.59,137.148701,yes,CSc1ccc2Sc3ccccc3CC(N4CCN(C)CC4)c2c1.OC(=O)\C=...
3,8,9,A,Rescinnamin,136.966534,0.000000,100.00,136.966534,yes,CO[C@H]1[C@@H](C[C@@H]2CN3CCc4c([nH]c5cc(OC)cc...
4,12,7,C,Fluspirilen,163.511757,29.533463,81.94,133.978294,yes,C1CN(CCC12C(=O)NCN2C3=CC=CC=C3)CCCC(C4=CC=C(C=...
...,...,...,...,...,...,...,...,...,...,...
1115,14,4,G,Quinic acid,95.626953,158.872124,-66.14,-63.245171,,OC1CC(O)(CC(O)C1O)C(O)=O
1116,4,8,H,Guanethidine sulfate,94.898866,165.031833,-73.90,-70.132966,,NC(N)=NCCN1CCCCCCC1
1117,7,11,G,Parthenolide,89.525342,160.151451,-78.89,-70.626109,,C\C1=C\CC[C@@]2(C)O[C@H]2[C@H]3OC(=O)C(=C)[C@@...
1118,12,5,B,Butylparaben,58.312655,131.241322,-125.06,-72.928667,,CCCCOC(=O)c1ccc(O)cc1


In [14]:
gattii_df.to_csv('../data/lans_gattii_smiles_df.csv')

### Now map SMILES to InChiKey

In [2]:
gattii_df = pd.read_csv('../data/lans_gattii_smiles_df.csv')

In [3]:
lansSmiles2inchikeys = {}

In [4]:
for cmp in gattii_df['SMILES_id'].values:
    try:
        pcp_compound = pcp.get_compounds(cmp, 'smiles')
        cmp_inchi = [pcp_cmp.inchikey for pcp_cmp in pcp_compound][0]
        
    except:
        cmp_inchi = ''
        print(cmp)
            
    lansSmiles2inchikeys[cmp] = cmp_inchi

nan
nan
<!DOCTYPE html>

<html>

<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	<!--<meta http-equiv="X-UA-Compatible" content="chrome=1">-->
	<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7, IE=9" >
	<title>NCI/CADD Chemical Identifier Resolver</title>
	<meta name="robots" content="index, follow" />
	<meta name="author" content="NCICADD Group, National Cancer Institute" />
	<meta name="author-personal" content="Markus Sitzmann, Igor Filippov, Marc Nicklaus" />
	<meta name="author-mail" content="webmaster@https://cactus.nci.nih.gov"/>
	<meta name="keywords" content="chemical identifier, resolver, InChI, InChIKey, SMILES, GIF, database, chemical names, inchikey resolver, InChIKey resolver" />
	<meta name="description" content="Chemical Identifier Resolver" />
	
	<link rel="stylesheet" href="/style/carbon.css" type="text/css" media="screen,projection" />
	<!--<link rel="stylesheet" href="/style/browser.css" type="text/css" media="screen,projectio

In [5]:
# add column with inchikeys to df
gattii_df['InChiKeys'] = gattii_df['SMILES_id'].map(lansSmiles2inchikeys)

In [6]:
gattii_df

Unnamed: 0.1,Unnamed: 0,Plate,Column,Row,Drug name,C. gattii \nPrestwick,C. gattii \nFLC+Prestwick,%Inhibition,Residuals,Hit?,SMILES_id,InChiKeys
0,0,4,9,G,Pimozide,171.415870,20.088300,88.28,151.327570,yes,Fc1ccc(cc1)C(CCCN2CCC(CC2)N3C(=O)Nc4ccccc34)c5...,YVUQSNJEYSNKRX-UHFFFAOYSA-N
1,1,2,9,F,Terfenadine,144.232078,0.739291,99.49,143.492787,yes,CC(C)(C)c1ccc(cc1)C(O)CCCN2CCC(CC2)C(O)(c3cccc...,GUGOEEXESWIERI-UHFFFAOYSA-N
2,2,5,6,F,Methiothepin maleate,137.714787,0.566086,99.59,137.148701,yes,CSc1ccc2Sc3ccccc3CC(N4CCN(C)CC4)c2c1.OC(=O)\C=...,IWDBEHWZGDSFHR-BTJKTKAUSA-N
3,3,8,9,A,Rescinnamin,136.966534,0.000000,100.00,136.966534,yes,CO[C@H]1[C@@H](C[C@@H]2CN3CCc4c([nH]c5cc(OC)cc...,SZLZWPPUNLXJEA-QEGASFHISA-N
4,4,12,7,C,Fluspirilen,163.511757,29.533463,81.94,133.978294,yes,C1CN(CCC12C(=O)NCN2C3=CC=CC=C3)CCCC(C4=CC=C(C=...,QOYHHIBFXOOADH-UHFFFAOYSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...
1115,1115,14,4,G,Quinic acid,95.626953,158.872124,-66.14,-63.245171,,OC1CC(O)(CC(O)C1O)C(O)=O,AAWZDTNXLSGCEK-UHFFFAOYSA-N
1116,1116,4,8,H,Guanethidine sulfate,94.898866,165.031833,-73.90,-70.132966,,NC(N)=NCCN1CCCCCCC1,ACGDKVXYNVEAGU-UHFFFAOYSA-N
1117,1117,7,11,G,Parthenolide,89.525342,160.151451,-78.89,-70.626109,,C\C1=C\CC[C@@]2(C)O[C@H]2[C@H]3OC(=O)C(=C)[C@@...,KTEXNACQROZXEV-QLIGOWBFSA-N
1118,1118,12,5,B,Butylparaben,58.312655,131.241322,-125.06,-72.928667,,CCCCOC(=O)c1ccc(O)cc1,QFOHBWFCKVYLES-UHFFFAOYSA-N


In [7]:
gattii_df.to_csv('../data/lans_gattii_smiles_inchikeys_df.csv')

## Map compound names for Zimmermann et al. dataset

In [6]:
zimCompoundNameInputFile = "../data/zimmermann_cpd_name_df.csv"

zim_df = pd.read_csv(zimCompoundNameInputFile, index_col = False)

In [7]:
zim_df

Unnamed: 0,MOLENAME,TherapeuticIndication,SMILES
0,ABACAVIR SULFATE,antiviral,Nc1nc(NC2CC2)c2ncn(C3C=CC(CO)C3)c2n1
1,ACEBUTOLOL,"antihypertensive, antianginal, antiarrhythmic",CCCC(=O)Nc1ccc(OCC(O)C[NH2+]C(C)C)c(C(C)=O)c1
2,ACECAINIDE,antiarrhythmic,CC[NH+](CC)CCNC(=O)c1ccc(NC(C)=O)cc1
3,ALFUZOSIN,alpha(1)-adrenergic blocker,COc1cc2nc(N(C)CCCNC(=O)C3CCCO3)nc(N)c2cc1OC
4,ALMOTRIPTAN,5HT 1B/2D receptor agonist,C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12
...,...,...,...
266,WARFARIN,"anticoagulant, rodenticide",CC(=O)CC(c1ccccc1)c1c([O-])c2ccccc2oc1=O
267,ZALEPLON,"sedative, hypnotic",CCN(C(C)=O)c1cccc(-c2ccnc3c(C#N)cnn23)c1
268,ZIDOVUDINE [AZT],"RT transferase inhibitor, antiviral",Cc1cn(C2CC(N=[N+]=[N-])C(CO)O2)c(=O)[nH]c1=O
269,ZIPRASIDONE MESYLATE,antipsychotic,O=C1Cc2cc(CC[NH+]3CCN(c4nsc5ccccc45)CC3)c(Cl)c...


### Map compound names to CC SMILES

In [8]:
zimCmpName2smiles = {}

In [9]:
for cmp in zim_df['MOLENAME'].values:
    try:
        cmp_smiles = Converter().chemical_name_to_smiles(cmp)
        
    except:
        try:
            cmp_smiles = Converter().chemical_name_to_smiles(re.sub('[L,R,S,+,-,\s]*\(.+\)-*', '', re.sub('\\,*\s.+', '', cmp)))
        
        except:
            try:
                cmp_smiles = Converter().chemical_name_to_smiles(re.sub('\s+\\(R,S\\)', '', re.sub('\s+', ' ', re.sub('\\(n \\)', '', re.sub('-', ' ', cmp)))))
                
            except:
                try: 
                    pcp_compound = pcp.get_compounds(cmp, 'name')
                    cmp_smiles = [pcp_cmp.isomeric_smiles for pcp_cmp in pcp_compound][0]
                    
                except:
                    cmp_smiles = ''
                    print(cmp)
            
    zimCmpName2smiles[cmp] = cmp_smiles

BETAXALOL


In [10]:
# add column with cc smiles to df
zim_df['SMILES_cc'] = zim_df['MOLENAME'].map(zimCmpName2smiles)

In [11]:
zim_df

Unnamed: 0,MOLENAME,TherapeuticIndication,SMILES,SMILES_cc
0,ABACAVIR SULFATE,antiviral,Nc1nc(NC2CC2)c2ncn(C3C=CC(CO)C3)c2n1,Nc1nc(NC2CC2)c3ncn([C@@H]4C[C@H](CO)C=C4)c3n1....
1,ACEBUTOLOL,"antihypertensive, antianginal, antiarrhythmic",CCCC(=O)Nc1ccc(OCC(O)C[NH2+]C(C)C)c(C(C)=O)c1,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(C)=O
2,ACECAINIDE,antiarrhythmic,CC[NH+](CC)CCNC(=O)c1ccc(NC(C)=O)cc1,[H+].[Cl-].CCN(CC)CCNC(=O)c1ccc(NC(C)=O)cc1
3,ALFUZOSIN,alpha(1)-adrenergic blocker,COc1cc2nc(N(C)CCCNC(=O)C3CCCO3)nc(N)c2cc1OC,COc1cc2nc(nc(N)c2cc1OC)N(C)CCCNC(=O)C3CCCO3
4,ALMOTRIPTAN,5HT 1B/2D receptor agonist,C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12,CN(C)CCc1c[nH]c2ccc(C[S](=O)(=O)N3CCCC3)cc12
...,...,...,...,...
266,WARFARIN,"anticoagulant, rodenticide",CC(=O)CC(c1ccccc1)c1c([O-])c2ccccc2oc1=O,CC(=O)CC(c1ccccc1)C2=C(O)Oc3ccccc3C2=O
267,ZALEPLON,"sedative, hypnotic",CCN(C(C)=O)c1cccc(-c2ccnc3c(C#N)cnn23)c1,CCN(C(C)=O)c1cccc(c1)c2ccnc3n2ncc3C#N
268,ZIDOVUDINE [AZT],"RT transferase inhibitor, antiviral",Cc1cn(C2CC(N=[N+]=[N-])C(CO)O2)c(=O)[nH]c1=O,CC1=CN([C@H]2C[C@H](N=[N+]=[N-])[C@@H](CO)O2)C...
269,ZIPRASIDONE MESYLATE,antipsychotic,O=C1Cc2cc(CC[NH+]3CCN(c4nsc5ccccc45)CC3)c(Cl)c...,O.O.O.C[S](O)(=O)=O.Clc1cc2NC(=O)Cc2cc1CCN3CCN...


In [12]:
zim_df.loc[zim_df['SMILES_cc'] == '']

Unnamed: 0,MOLENAME,TherapeuticIndication,SMILES,SMILES_cc
23,BETAXALOL,"antihypertensive, beta-blocker, antianginal",CC(C)[NH2+]CC(O)COc1ccc(CCOCC2CC2)cc1,


In [13]:
zim_df['SMILES_cc'] = np.where((zim_df.SMILES_cc == ''), zim_df.SMILES, zim_df.SMILES_cc)

In [14]:
zim_df.loc[zim_df['SMILES_cc'] == '']

Unnamed: 0,MOLENAME,TherapeuticIndication,SMILES,SMILES_cc


### Now map SMILES to InChiKeys

In [15]:
zimSmiles2inchikeys = {}

In [16]:
for cmp in zim_df['SMILES_cc'].values:
    try:
        pcp_compound = pcp.get_compounds(cmp, 'smiles')
        cmp_inchi = [pcp_cmp.inchikey for pcp_cmp in pcp_compound][0]
        
    except:
        cmp_inchi = ''
        print(cmp)
            
    zimSmiles2inchikeys[cmp] = cmp_inchi

In [17]:
# add column with inchikeys to df
zim_df['InChiKey'] = zim_df['SMILES_cc'].map(zimSmiles2inchikeys)

In [18]:
zim_df

Unnamed: 0,MOLENAME,TherapeuticIndication,SMILES,SMILES_cc,InChiKey
0,ABACAVIR SULFATE,antiviral,Nc1nc(NC2CC2)c2ncn(C3C=CC(CO)C3)c2n1,Nc1nc(NC2CC2)c3ncn([C@@H]4C[C@H](CO)C=C4)c3n1....,WMHSRBZIJNQHKT-FFKFEZPRSA-N
1,ACEBUTOLOL,"antihypertensive, antianginal, antiarrhythmic",CCCC(=O)Nc1ccc(OCC(O)C[NH2+]C(C)C)c(C(C)=O)c1,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(C)=O,GOEMGAFJFRBGGG-UHFFFAOYSA-N
2,ACECAINIDE,antiarrhythmic,CC[NH+](CC)CCNC(=O)c1ccc(NC(C)=O)cc1,[H+].[Cl-].CCN(CC)CCNC(=O)c1ccc(NC(C)=O)cc1,IYEWBJUCJHKLHD-UHFFFAOYSA-N
3,ALFUZOSIN,alpha(1)-adrenergic blocker,COc1cc2nc(N(C)CCCNC(=O)C3CCCO3)nc(N)c2cc1OC,COc1cc2nc(nc(N)c2cc1OC)N(C)CCCNC(=O)C3CCCO3,WNMJYKCGWZFFKR-UHFFFAOYSA-N
4,ALMOTRIPTAN,5HT 1B/2D receptor agonist,C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12,CN(C)CCc1c[nH]c2ccc(C[S](=O)(=O)N3CCCC3)cc12,WKEMJKQOLOHJLZ-UHFFFAOYSA-N
...,...,...,...,...,...
266,WARFARIN,"anticoagulant, rodenticide",CC(=O)CC(c1ccccc1)c1c([O-])c2ccccc2oc1=O,CC(=O)CC(c1ccccc1)C2=C(O)Oc3ccccc3C2=O,PJVWKTKQMONHTI-UHFFFAOYSA-N
267,ZALEPLON,"sedative, hypnotic",CCN(C(C)=O)c1cccc(-c2ccnc3c(C#N)cnn23)c1,CCN(C(C)=O)c1cccc(c1)c2ccnc3n2ncc3C#N,HUNXMJYCHXQEGX-UHFFFAOYSA-N
268,ZIDOVUDINE [AZT],"RT transferase inhibitor, antiviral",Cc1cn(C2CC(N=[N+]=[N-])C(CO)O2)c(=O)[nH]c1=O,CC1=CN([C@H]2C[C@H](N=[N+]=[N-])[C@@H](CO)O2)C...,HBOMLICNUCNMMY-XLPZGREQSA-N
269,ZIPRASIDONE MESYLATE,antipsychotic,O=C1Cc2cc(CC[NH+]3CCN(c4nsc5ccccc45)CC3)c(Cl)c...,O.O.O.C[S](O)(=O)=O.Clc1cc2NC(=O)Cc2cc1CCN3CCN...,WLQZEFFFIUHSJB-UHFFFAOYSA-N


In [19]:
zim_df.to_csv('../data/zimmermann_smiles_inchikeys_df.csv')

## Map compound names for Campos et al. dataset

In [3]:
camCompoundNameInputFile = "../data/campos_cpd_name_df.csv"

cam_df = pd.read_csv(camCompoundNameInputFile, index_col = False)

In [4]:
cam_df

Unnamed: 0,compound_name
0,Streptozotocin
1,Deferoxamine mesylate
2,Lithocholic acid
3,Sulfamethazine sodium salt
4,"Mevalonic-D, L acid lactone"
...,...
1274,Tigecycline
1275,Butylscopolammonium (n-) bromide
1276,Sumatriptan succinate
1277,Opipramol dihydrochloride


In [5]:
cmpCmpName2smiles = {}

In [6]:
for cmp in cam_df['compound_name'].values:
    try:
        cmp_smiles = Converter().chemical_name_to_smiles(cmp)
        
    except:
        try:
            cmp_smiles = Converter().chemical_name_to_smiles(re.sub('[L,R,S,+,-,\s]*\(.+\)-*', '', re.sub('\\,*\s.+', '', cmp)))
        
        except:
            try:
                cmp_smiles = Converter().chemical_name_to_smiles(re.sub('\s+\\(R,S\\)', '', re.sub('\s+', ' ', re.sub('\\(n \\)', '', re.sub('-', ' ', cmp)))))
                
            except:
                try: 
                    pcp_compound = pcp.get_compounds(cmp, 'name')
                    cmp_smiles = [pcp_cmp.isomeric_smiles for pcp_cmp in pcp_compound][0]
                    
                except:
                    cmp_smiles = ''
                    print(cmp)
            
    cmpCmpName2smiles[cmp] = cmp_smiles

Oxibendazol


In [8]:
# manually add Oxibendazol
cmpCmpName2smiles['Oxibendazol'] = 'CCCOC1=CC2=C(C=C1)N=C(N2)NC(=O)OC'

In [9]:
# add column with cc smiles to df
cam_df['SMILES_cc'] = cam_df['compound_name'].map(cmpCmpName2smiles)

In [10]:
cam_df

Unnamed: 0,compound_name,SMILES_cc
0,Streptozotocin,CN(N=O)C(=O)N[C@H]1[C@@H](O)O[C@H](CO)[C@@H](O...
1,Deferoxamine mesylate,CC(=O)N(O)CCCCCNC(=O)CCC(=O)N(O)CCCCCNC(=O)CCC...
2,Lithocholic acid,C[C@H](CCC(O)=O)[C@H]1CC[C@H]2[C@@H]3CC[C@@H]4...
3,Sulfamethazine sodium salt,[Na+].Cc1cc(C)nc([N-][S](=O)(=O)c2ccc(N)cc2)n1
4,"Mevalonic-D, L acid lactone",C[C@@]1(O)CCOC(=O)C1
...,...,...
1274,Tigecycline,CN(C)[C@H]1[C@@H]2C[C@@H]3Cc4c(cc(NC(=O)CNC(C)...
1275,Butylscopolammonium (n-) bromide,[Br-].CCCC[N+]1(C)[C@H]2CC(C[C@H]1[C@@H]3O[C@H...
1276,Sumatriptan succinate,CN[S](=O)(=O)Cc1ccc2[nH]cc(CCN(C)C)c2c1.OC(=O)...
1277,Opipramol dihydrochloride,[H+].[H+].[Cl-].[Cl-].OCCN1CCN(CCCN2c3ccccc3C=...


### Now map SMILES to InChiKeys

In [11]:
camSmiles2inchikeys = {}

In [12]:
for cmp in cam_df['SMILES_cc'].values:
    try:
        pcp_compound = pcp.get_compounds(cmp, 'smiles')
        cmp_inchi = [pcp_cmp.inchikey for pcp_cmp in pcp_compound][0]
        
    except:
        cmp_inchi = ''
        print(cmp)
            
    camSmiles2inchikeys[cmp] = cmp_inchi

<!DOCTYPE html>

<html>

<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	<!--<meta http-equiv="X-UA-Compatible" content="chrome=1">-->
	<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7, IE=9" >
	<title>NCI/CADD Chemical Identifier Resolver</title>
	<meta name="robots" content="index, follow" />
	<meta name="author" content="NCICADD Group, National Cancer Institute" />
	<meta name="author-personal" content="Markus Sitzmann, Igor Filippov, Marc Nicklaus" />
	<meta name="author-mail" content="webmaster@https://cactus.nci.nih.gov"/>
	<meta name="keywords" content="chemical identifier, resolver, InChI, InChIKey, SMILES, GIF, database, chemical names, inchikey resolver, InChIKey resolver" />
	<meta name="description" content="Chemical Identifier Resolver" />
	
	<link rel="stylesheet" href="/style/carbon.css" type="text/css" media="screen,projection" />
	<!--<link rel="stylesheet" href="/style/browser.css" type="text/css" media="screen,projection">-->
 

In [13]:
cam_df['InChiKey'] = cam_df['SMILES_cc'].map(camSmiles2inchikeys)

In [14]:
cam_df.to_csv('../data/campos_smiles_inchikeys_df.csv')

## Map SMILES to InChiKeys for CHEMBL antibiotics

In [3]:
chembl_df = pd.read_csv('../data/chembl_fil_df.csv')

In [4]:
chembl_df

Unnamed: 0,ChEMBL ID,Name,Smiles
0,CHEMBL3989817,DIPROLEANDOMYCIN,CCC(=O)O[C@H]1[C@H](C)O[C@@H](O[C@@H]2[C@@H](C...
1,CHEMBL2106418,DOXYCYCLINE FOSFATEX,C[C@H]1c2cccc(O)c2C(=O)C2=C(O)[C@]3(O)C(=O)C(C...
2,CHEMBL4297677,TEBIPENEM PIVOXIL HYDROBROMIDE,Br.C[C@@H](O)[C@H]1C(=O)N2C(C(=O)OCOC(=O)C(C)(...
3,CHEMBL415689,,C[C@@H](c1ccccc1)[C@@H]1NC(=O)CNC(=O)[C@H](CO)...
4,CHEMBL3989416,GANEFROMYCIN,C/C=C\C=C\[C@@H]1O[C@](O)(C(CO[C@@H]2C[C@@H](O...
...,...,...,...
566,CHEMBL1201752,IXABEPILONE,C/C(=C\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...
567,CHEMBL1191,SULFAMETHIZOLE,Cc1nnc(NS(=O)(=O)c2ccc(N)cc2)s1
568,CHEMBL577736,TYROTHRICIN,CC(C)C[C@@H]1NC(=O)[C@H](CCCN)NC(=O)[C@H](C(C)...
569,CHEMBL99,TRICHOSTATIN,CC(/C=C/C(=O)NO)=C\[C@@H](C)C(=O)c1ccc(N(C)C)cc1


### Map SMILES to InChiKeys

In [6]:
chemblSmiles2inchikeys = {}

In [7]:
for cmp in chembl_df['Smiles'].values:
    try:
        pcp_compound = pcp.get_compounds(cmp, 'smiles')
        cmp_inchi = [pcp_cmp.inchikey for pcp_cmp in pcp_compound][0]
        
    except:
        cmp_inchi = ''
        print(cmp)
            
    chemblSmiles2inchikeys[cmp] = cmp_inchi

In [8]:
chembl_df['InChiKey'] = chembl_df['Smiles'].map(chemblSmiles2inchikeys)

In [9]:
chembl_df

Unnamed: 0,ChEMBL ID,Name,Smiles,InChiKey
0,CHEMBL3989817,DIPROLEANDOMYCIN,CCC(=O)O[C@H]1[C@H](C)O[C@@H](O[C@@H]2[C@@H](C...,HYQJLIGADHPYIR-OJBANTNJSA-N
1,CHEMBL2106418,DOXYCYCLINE FOSFATEX,C[C@H]1c2cccc(O)c2C(=O)C2=C(O)[C@]3(O)C(=O)C(C...,ROUBCDXYSWYAEB-YDLUHMIOSA-M
2,CHEMBL4297677,TEBIPENEM PIVOXIL HYDROBROMIDE,Br.C[C@@H](O)[C@H]1C(=O)N2C(C(=O)OCOC(=O)C(C)(...,MMWWBQNLJKFAIN-HXLQFWNVSA-N
3,CHEMBL415689,,C[C@@H](c1ccccc1)[C@@H]1NC(=O)CNC(=O)[C@H](CO)...,YBILMIDCJFLBHS-MVXAWGMXSA-N
4,CHEMBL3989416,GANEFROMYCIN,C/C=C\C=C\[C@@H]1O[C@](O)(C(CO[C@@H]2C[C@@H](O...,OITXQULUNSCKPV-GQHWWOMESA-N
...,...,...,...,...
566,CHEMBL1201752,IXABEPILONE,C/C(=C\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...,FABUFPQFXZVHFB-PVYNADRNSA-N
567,CHEMBL1191,SULFAMETHIZOLE,Cc1nnc(NS(=O)(=O)c2ccc(N)cc2)s1,VACCAVUAMIDAGB-UHFFFAOYSA-N
568,CHEMBL577736,TYROTHRICIN,CC(C)C[C@@H]1NC(=O)[C@H](CCCN)NC(=O)[C@H](C(C)...,NLJVXZFCYKWXLH-DXTIXLATSA-N
569,CHEMBL99,TRICHOSTATIN,CC(/C=C/C(=O)NO)=C\[C@@H](C)C(=O)c1ccc(N(C)C)cc1,RTKIYFITIVXBLE-QEQCGCAPSA-N


In [10]:
chembl_df.to_csv('../data/chembl_antobiotics_inchikeys_df.csv')

## Map spice compounds to SMILES and to InChiKeys 

In [4]:
spice_df = pd.read_csv('../data/spices.csv')

In [5]:
spice_df

Unnamed: 0,compound_name
0,Piperine
1,Capsaicin
2,Dihydrocapsaicin
3,Ferulic acid
4,Luteolin
5,Apigenin
6,Cinnamic acid
7,Cinnamaldehyde
8,6-Gingerol
9,Vanillin


In [6]:
spiceCmpName2smiles = {}

In [7]:
for cmp in spice_df['compound_name'].values:
    try:
        cmp_smiles = Converter().chemical_name_to_smiles(cmp)
        
    except:
        try:
            cmp_smiles = Converter().chemical_name_to_smiles(re.sub('[L,R,S,+,-,\s]*\(.+\)-*', '', re.sub('\\,*\s.+', '', cmp)))
        
        except:
            try:
                cmp_smiles = Converter().chemical_name_to_smiles(re.sub('\s+\\(R,S\\)', '', re.sub('\s+', ' ', re.sub('\\(n \\)', '', re.sub('-', ' ', cmp)))))
                
            except:
                try: 
                    pcp_compound = pcp.get_compounds(cmp, 'name')
                    cmp_smiles = [pcp_cmp.isomeric_smiles for pcp_cmp in pcp_compound][0]
                    
                except:
                    cmp_smiles = ''
                    print(cmp)
            
    spiceCmpName2smiles[cmp] = cmp_smiles

Luteolin-3-glucuronide


In [10]:
spiceCmpName2smiles['Luteolin-3-glucuronide'] = 'C1=CC(=C(C=C1C2=CC(=O)C3=C(C=C(C=C3O2)O)O)O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)C(=O)O)O)O)O)O'
spiceCmpName2smiles

{'Piperine': 'O=C(/C=C/C=C/c1ccc2OCOc2c1)N3CCCCC3',
 'Capsaicin': 'COc1cc(CNC(=O)CCCC\\C=C\\C(C)C)ccc1O',
 'Dihydrocapsaicin': 'COc1cc(CNC(=O)CCCCCCC(C)C)ccc1O',
 'Ferulic acid': 'COc1cc(\\C=C\\C(O)=O)ccc1O',
 'Luteolin': 'Oc1cc(O)c2C(=O)C=C(Oc2c1)c3ccc(O)c(O)c3',
 'Apigenin': 'Oc1ccc(cc1)C2=CC(=O)c3c(O)cc(O)cc3O2',
 'Cinnamic acid': 'OC(=O)C=Cc1ccccc1',
 'Cinnamaldehyde': 'O=CC=Cc1ccccc1',
 '6-Gingerol': 'CCCCCC(O)CC(=O)CCC1CCC(O)C(C1)OC',
 'Vanillin': 'COc1cc(C=O)ccc1O',
 'P-coumaric acid': 'OC(=O)/C=C/c1ccc(O)cc1',
 'Rosmarinic acid': 'OC(=O)[C@@H](Cc1ccc(O)c(O)c1)OC(=O)/C=C/c2ccc(O)c(O)c2',
 'Lithospermic acid': 'OC(=O)[C@@H](Cc1ccc(O)c(O)c1)OC(=O)\\C=C\\c2ccc(O)c3OC(C(C(O)=O)c23)c4ccc(O)c(O)c4',
 'Luteolin-3-glucuronide': 'C1=CC(=C(C=C1C2=CC(=O)C3=C(C=C(C=C3O2)O)O)O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)C(=O)O)O)O)O)O',
 'Bisdemethoxycurcumin': 'Oc1ccc(cc1)\\C=C\\C(=O)CC(=O)/C=C/c2ccc(O)cc2',
 'Demethoxycurcumin': 'COc1cc(\\C=C\\C(O)=C\\C(=O)/C=C/c2ccc(O)cc2)ccc1O',
 'Curcumin': 'COc1

In [12]:
spice_df['SMILES_cc'] = spice_df['compound_name'].map(spiceCmpName2smiles)
spice_df

Unnamed: 0,compound_name,SMILES_cc
0,Piperine,O=C(/C=C/C=C/c1ccc2OCOc2c1)N3CCCCC3
1,Capsaicin,COc1cc(CNC(=O)CCCC\C=C\C(C)C)ccc1O
2,Dihydrocapsaicin,COc1cc(CNC(=O)CCCCCCC(C)C)ccc1O
3,Ferulic acid,COc1cc(\C=C\C(O)=O)ccc1O
4,Luteolin,Oc1cc(O)c2C(=O)C=C(Oc2c1)c3ccc(O)c(O)c3
5,Apigenin,Oc1ccc(cc1)C2=CC(=O)c3c(O)cc(O)cc3O2
6,Cinnamic acid,OC(=O)C=Cc1ccccc1
7,Cinnamaldehyde,O=CC=Cc1ccccc1
8,6-Gingerol,CCCCCC(O)CC(=O)CCC1CCC(O)C(C1)OC
9,Vanillin,COc1cc(C=O)ccc1O


### Map SMILES to InChiKeys

In [13]:
spiceSmiles2inchikeys = {}

In [15]:
for cmp in spice_df['SMILES_cc'].values:
    try:
        pcp_compound = pcp.get_compounds(cmp, 'smiles')
        cmp_inchi = [pcp_cmp.inchikey for pcp_cmp in pcp_compound][0]
        
    except:
        cmp_inchi = ''
        print(cmp)
            
    spiceSmiles2inchikeys[cmp] = cmp_inchi

In [16]:
spice_df['InChiKey'] = spice_df['SMILES_cc'].map(spiceSmiles2inchikeys)
spice_df

Unnamed: 0,compound_name,SMILES_cc,InChiKey
0,Piperine,O=C(/C=C/C=C/c1ccc2OCOc2c1)N3CCCCC3,MXXWOMGUGJBKIW-YPCIICBESA-N
1,Capsaicin,COc1cc(CNC(=O)CCCC\C=C\C(C)C)ccc1O,YKPUWZUDDOIDPM-SOFGYWHQSA-N
2,Dihydrocapsaicin,COc1cc(CNC(=O)CCCCCCC(C)C)ccc1O,XJQPQKLURWNAAH-UHFFFAOYSA-N
3,Ferulic acid,COc1cc(\C=C\C(O)=O)ccc1O,KSEBMYQBYZTDHS-HWKANZROSA-N
4,Luteolin,Oc1cc(O)c2C(=O)C=C(Oc2c1)c3ccc(O)c(O)c3,IQPNAANSBPBGFQ-UHFFFAOYSA-N
5,Apigenin,Oc1ccc(cc1)C2=CC(=O)c3c(O)cc(O)cc3O2,KZNIFHPLKGYRTM-UHFFFAOYSA-N
6,Cinnamic acid,OC(=O)C=Cc1ccccc1,WBYWAXJHAXSJNI-UHFFFAOYSA-N
7,Cinnamaldehyde,O=CC=Cc1ccccc1,KJPRLNWUNMBNBZ-UHFFFAOYSA-N
8,6-Gingerol,CCCCCC(O)CC(=O)CCC1CCC(O)C(C1)OC,ONQQLFWDTJJQKU-UHFFFAOYSA-N
9,Vanillin,COc1cc(C=O)ccc1O,MWOOGOJBHIARFG-UHFFFAOYSA-N


In [17]:
spice_df.to_csv('../data/spices_inchi.csv')

### Map Compound names for new Zampieri dataset 

In [2]:
zamCompoundNameInputFile = "../data/anglada_girotto_cpd_name_df.csv"

zam_df = pd.read_csv(zamCompoundNameInputFile, index_col = False)

In [3]:
zamCmpName2smiles = {}

In [4]:
for cmp in zam_df['full_name'].values:
    try:
        cmp_smiles = Converter().chemical_name_to_smiles(cmp)
        
    except:
        try:
            cmp_smiles = Converter().chemical_name_to_smiles(re.sub('[L,R,S,+,-,\s]*\(.+\)-*', '', re.sub('\\,*\s.+', '', cmp)))
        
        except:
            try:
                cmp_smiles = Converter().chemical_name_to_smiles(re.sub('\s+\\(R,S\\)', '', re.sub('\s+', ' ', re.sub('\\(n \\)', '', re.sub('-', ' ', cmp)))))
                
            except:
                try: 
                    pcp_compound = pcp.get_compounds(cmp, 'name')
                    cmp_smiles = [pcp_cmp.isomeric_smiles for pcp_cmp in pcp_compound][0]
                    
                except:
                    cmp_smiles = ''
                    print(cmp)
            
    zamCmpName2smiles[cmp] = cmp_smiles

CCCP/Carbonyl cyanide 3-chlorophenylhydrazone


In [5]:
# manually add SMILES for CCCP
zamCmpName2smiles['CCCP/Carbonyl cyanide 3-chlorophenylhydrazone'] = 'C1=CC(=CC(=C1)Cl)NN=C(C#N)C#N'

In [6]:
# add column with cc smiles to df
zam_df['SMILES_cc'] = zam_df['full_name'].map(zamCmpName2smiles)

In [7]:
zam_df

Unnamed: 0,compound_name_abbr,full_name,SMILES_cc
0,Streptozotocin,Streptozotocin,CN(N=O)C(=O)N[C@H]1[C@@H](O)O[C@H](CO)[C@@H](O...
1,Deferoxamine mesylate,Deferoxamine mesylate,CC(=O)N(O)CCCCCNC(=O)CCC(=O)N(O)CCCCCNC(=O)CCC...
2,Lithocholic acid,Lithocholic acid,C[C@H](CCC(O)=O)[C@H]1CC[C@H]2[C@@H]3CC[C@@H]4...
3,Sulfamethazine sodium salt,Sulfamethazine sodium salt,[Na+].Cc1cc(C)nc([N-][S](=O)(=O)c2ccc(N)cc2)n1
4,"Mevalonic-D, L acid lactone","Mevalonic-D, L acid lactone",C[C@@]1(O)CCOC(=O)C1
...,...,...,...
1337,TX100,Triton X-100,CC(C)(C)CC(C)(C)c1ccc(OCCO)cc1
1338,STR,Streptomycin,CN[C@H]1[C@H](O)[C@@H](O)[C@H](CO)O[C@H]1O[C@H...
1339,NFL,Norfloxacin,CCN1C=C(C(O)=O)C(=O)c2cc(F)c(cc12)N3CCNCC3
1340,OXTb,Oxytetracycline,O.O.CN(C)[C@H]1[C@@H]2[C@@H](O)[C@H]3C(=C(O)c4...


In [8]:
zam_df.to_csv('../data/anglada_girotto_cpd_name_smiles_df.csv')

#### Map SMILES to InChiKeys

In [9]:
zamSmiles2inchikeys = {}

In [10]:
for cmp in zam_df['SMILES_cc'].values:
    try:
        pcp_compound = pcp.get_compounds(cmp, 'smiles')
        cmp_inchi = [pcp_cmp.inchikey for pcp_cmp in pcp_compound][0]
        
    except:
        cmp_inchi = ''
        print(cmp)
            
    zamSmiles2inchikeys[cmp] = cmp_inchi

<!DOCTYPE html>

<html>

<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	<!--<meta http-equiv="X-UA-Compatible" content="chrome=1">-->
	<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7, IE=9" >
	<title>NCI/CADD Chemical Identifier Resolver</title>
	<meta name="robots" content="index, follow" />
	<meta name="author" content="NCICADD Group, National Cancer Institute" />
	<meta name="author-personal" content="Markus Sitzmann, Igor Filippov, Marc Nicklaus" />
	<meta name="author-mail" content="webmaster@https://cactus.nci.nih.gov"/>
	<meta name="keywords" content="chemical identifier, resolver, InChI, InChIKey, SMILES, GIF, database, chemical names, inchikey resolver, InChIKey resolver" />
	<meta name="description" content="Chemical Identifier Resolver" />
	
	<link rel="stylesheet" href="/style/carbon.css" type="text/css" media="screen,projection" />
	<!--<link rel="stylesheet" href="/style/browser.css" type="text/css" media="screen,projection">-->
 

In [13]:
# add column with cc smiles to df
zam_df['InChiKey'] = zam_df['SMILES_cc'].map(zamSmiles2inchikeys)

In [14]:
zam_df

Unnamed: 0,compound_name_abbr,full_name,SMILES_cc,InChiKey
0,Streptozotocin,Streptozotocin,CN(N=O)C(=O)N[C@H]1[C@@H](O)O[C@H](CO)[C@@H](O...,ZSJLQEPLLKMAKR-GKHCUFPYSA-N
1,Deferoxamine mesylate,Deferoxamine mesylate,CC(=O)N(O)CCCCCNC(=O)CCC(=O)N(O)CCCCCNC(=O)CCC...,IDDIJAWJANBQLJ-UHFFFAOYSA-N
2,Lithocholic acid,Lithocholic acid,C[C@H](CCC(O)=O)[C@H]1CC[C@H]2[C@@H]3CC[C@@H]4...,SMEROWZSTRWXGI-HVATVPOCSA-N
3,Sulfamethazine sodium salt,Sulfamethazine sodium salt,[Na+].Cc1cc(C)nc([N-][S](=O)(=O)c2ccc(N)cc2)n1,NGIVTUVVBWOTNT-UHFFFAOYSA-N
4,"Mevalonic-D, L acid lactone","Mevalonic-D, L acid lactone",C[C@@]1(O)CCOC(=O)C1,JYVXNLLUYHCIIH-ZCFIWIBFSA-N
...,...,...,...,...
1337,TX100,Triton X-100,CC(C)(C)CC(C)(C)c1ccc(OCCO)cc1,JYCQQPHGFMYQCF-UHFFFAOYSA-N
1338,STR,Streptomycin,CN[C@H]1[C@H](O)[C@@H](O)[C@H](CO)O[C@H]1O[C@H...,UCSJYZPVAKXKNQ-HZYVHMACSA-N
1339,NFL,Norfloxacin,CCN1C=C(C(O)=O)C(=O)c2cc(F)c(cc12)N3CCNCC3,OGJPXUAPXNRGGI-UHFFFAOYSA-N
1340,OXTb,Oxytetracycline,O.O.CN(C)[C@H]1[C@@H]2[C@@H](O)[C@H]3C(=C(O)c4...,IMLJLCJZQLGHJS-JEKSYDDFSA-N


In [15]:
zam_df.to_csv('../data/anglada_girotto_cpd_name_smiles_inchikey_df.csv')

## Map CC universe compound InChiKeys to SMILES for signaturization

In [2]:
cc_universe_file = '../data/M1_101_tsne_values_10000_cc_compounds.csv'
cc_df = pd.read_csv(cc_universe_file, index_col=0)
cc_df

Unnamed: 0_level_0,V1,V2
index,Unnamed: 1_level_1,Unnamed: 2_level_1
AAAYSHMCBYUFIR-MFOYZWKCSA-N,6.688449,11.356979
AABLNHSZFIPOBJ-UHFFFAOYSA-N,-29.599192,14.538304
AADBEZALNPVNKC-UHFFFAOYSA-N,6.330449,24.452001
AAOUHTYFBALOPI-MKICQXMISA-N,28.826166,4.160808
AASLOEMOCBUITM-UHFFFAOYSA-N,25.562814,-6.885511
...,...,...
ZZRBRBGJCLGUGH-IVMQYODDSA-N,-11.694713,19.701623
ZZTFYTXRPLQSFM-UHFFFAOYSA-N,3.149795,-30.870100
ZZWDPPPPDLGPEB-UHFFFAOYSA-N,11.391674,-7.286328
ZZXOSCMOPBHOGQ-UHFFFAOYSA-N,-9.595184,-25.560444


In [3]:
ccInchikey2smiles = {}

In [4]:
for cmp in cc_df.index.values:
    try:
        pcp_compound = pcp.get_compounds(cmp, 'inchikey')
        cmp_smiles = [pcp_cmp.isomeric_smiles for pcp_cmp in pcp_compound][0]
        
    except:
        cmp_smiles = ''
        print(cmp)
            
    ccInchikey2smiles[cmp] = cmp_smiles

AEKGEXDIYOFBRS-FIXSFTCYSA-N
AFVVZSOHFRSNTE-IDNYSXLCSA-N
AIEHOVCPKVVGNX-PEUGIQGRSA-N
AIRZFEKQGRSBNX-ISLYRVAYSA-N
AIXDLJSJCJOIBM-XLIONFOSSA-N
AJGRXKLDPZVROS-HPJWIMSGSA-N
AJHZDTVNCDDNPL-UHFFFAOYSA-N
ALGBWVDEPXFAMU-UZIRPGAUSA-N
ALHQNZLACBUYGW-UHFFFAOYSA-N
ATEBXHFBFRCZMA-WNTWPGMYSA-N
AZIFYQDMFJNOQK-UAZDYKOFSA-N
AZUMDPMNGBWFOH-MZNYFNKISA-N
BAOIHMKOTNBRPR-HZCBDIJESA-N
BBBUKQRTMPJLSL-WDOBWYJBSA-N
BCQISXRXOQRPFE-SILNSSARSA-N
BDXSZJXLXZJDAM-BLTQDSCZSA-N
BJQKLNSJYHMULA-WOFXILAISA-N
BKIWSQUNFCJSOI-LQRHGLAMSA-E
BLVVIFDPFZHYLP-UHFFFAOYSA-N
BMPZYUNSISKDDP-FEUVXWNVSA-N
BTBLVKQZYNTQOX-NBEIKUQISA-N
BUYBRCKGSKNPEL-UHFFFAOYSA-N
BXAVZUDRELSKOC-LORLDUPWSA-N
BXGCCZDRUAOMRC-SZPZYZBQSA-N
CABIXBDCJSQXLH-KESTWPANSA-N
CAYIEOCGADLORC-UHFFFAOYSA-N
CBBLYOQJDPCTMR-SHTZXODSSA-N
CCKTYTPEQBNMIC-PNOGMODKSA-N
CCMCQXPLCVUZML-RZDIXWSQSA-N
CDKNIMJRBVGTLI-PMOQBDJRSA-N
CEWUZDSTHUFKMS-KUCRQJOESA-N
CGRVHZIOGDYGJY-UHFFFAOYSA-N
CIMSZTVVTWKOJY-UUDCSCGESA-N
CNJNAQNICBKURV-BJRWYXCZSA-N
CQBDSHCGYWLMPX-JFCQCOQKSA-N
CQOYNSMPGNUDSZ-WMEHT

In [None]:
# manually add smiles of missed cmpds
#ccInchikey2smiles['BJQKLNSJYHMULA-PHXKMMTBSA-N'] = 'C1[C@@H]2[C@@H](C2C3=CC(=NN3CC(F)(F)F)C4=CC(=C(N=C4)N)C(F)(F)F)CN1'

In [5]:
# add column with cc smiles to df
cc_df['smiles'] = cc_df.index.map(ccInchikey2smiles)
cc_df

Unnamed: 0_level_0,V1,V2,smiles
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAAYSHMCBYUFIR-MFOYZWKCSA-N,6.688449,11.356979,CC1=CC(=C(C=C1)OCC(=O)N/N=C\C2=CC=C(C=C2)O)[N+...
AABLNHSZFIPOBJ-UHFFFAOYSA-N,-29.599192,14.538304,C1CN(CCC1C2=CC=CC=C2)C3=C(C(=O)NC(=O)N3)CC4=CC...
AADBEZALNPVNKC-UHFFFAOYSA-N,6.330449,24.452001,CC(C)C1=CC2=C(CC(NC2=O)(C)C)C=C1
AAOUHTYFBALOPI-MKICQXMISA-N,28.826166,4.160808,CC(=O)OC1=C(C=C(C=C1)/C=C/C(=O)NCCOC(=O)/C=C/C...
AASLOEMOCBUITM-UHFFFAOYSA-N,25.562814,-6.885511,CCC1=NN(C(=N1)C2=NC=C(C=N2)OC)C3=CC=CC(=C3)C(=...
...,...,...,...
ZZRBRBGJCLGUGH-IVMQYODDSA-N,-11.694713,19.701623,CC1=NC2=CC=NN2C(C1C(=O)N3CCC[C@H]3C4=NC(=NO4)C...
ZZTFYTXRPLQSFM-UHFFFAOYSA-N,3.149795,-30.870100,CCN(CCC(=O)C1=CN=CC=C1)CC2=CC=CC=C2
ZZWDPPPPDLGPEB-UHFFFAOYSA-N,11.391674,-7.286328,C1=CC=C2C(=C1)C=C(C=N2)OC3=C(C=C(C=C3Cl)NS(=O)...
ZZXOSCMOPBHOGQ-UHFFFAOYSA-N,-9.595184,-25.560444,CN(C)S(=O)(=O)C1=CC(=C(C=C1)N2CCCC2)C(=O)NC3=N...


In [6]:
cc_df.to_csv('../data/M1_101_tsne_values_10000_cc_compounds_smiles.csv')

## Map InChi Keys to smiles for best M1 sign3 t-sne so far

In [4]:
m1_best_tsne_df = pd.read_csv('../data/m1_best_sign3_joint_tnse.csv')
m1_best_tsne_df

Unnamed: 0,InChiKey,V1,V2,compound_ids,prestwick_id,compound_name,SMILES_id,drug_class,n_hit
0,AACMKKWADJBYOE-UHFFFAOYSA-N,9.833360,-6.863870,,,,,,
1,AADCDMQTJNYOSS-LBPRGKRZSA-N,5.389593,-16.528657,Prestw-932_S(-)Eticlopride hydrochloride,Prestw-932,S(-)Eticlopride hydrochloride,CCN1CCC[C@H]1CNC(=O)c2c(O)c(CC)cc(Cl)c2OC,non-drugs,0.0
2,AADJJWDBCQRALD-UHFFFAOYSA-N,8.824800,-7.960148,,,,,,
3,AAENIXAZGRAAII-UHFFFAOYSA-N,-15.488416,-10.353366,,,,,,
4,AAGKMGNYUYCEPD-UHFFFAOYSA-N,29.500637,-15.697789,,,,,,
...,...,...,...,...,...,...,...,...,...
11026,ZZVLMHCBMDEQKG-QHCPKHFHSA-N,-39.716162,1.932816,,,,,,
11027,ZZWJKLGCDHYVMB-APNKWYJYSA-N,5.460662,15.963245,,,,,,
11028,ZZXPUJBAUVDSRS-XDOYNYLZSA-N,-15.089193,-11.494783,,,,,,
11029,ZZYNPFIDNXMCGK-UHFFFAOYSA-N,17.829278,-15.839339,,,,,,


In [5]:
bestM1Inchikey2smiles = {}

In [8]:
for cmp in m1_best_tsne_df['InChiKey'].values:
    try:
        pcp_compound = pcp.get_compounds(cmp, 'inchikey')
        cmp_smiles = [pcp_cmp.isomeric_smiles for pcp_cmp in pcp_compound][0]
        
    except:
        cmp_smiles = ''
        print(cmp)
            
    bestM1Inchikey2smiles[cmp] = cmp_smiles

ABRQLTJFNHCMHW-UHFFFAOYSA-N
ACBWGVGVTPIEPA-VQVSGNEDSA-N
AEHFUSPLVGWEEP-UYULQCGDSA-N
AIEZQFQSOAPLSA-GLILEQIGSA-N
AKUYTFPSUTXIOD-UHFFFAOYSA-N
ALJKTYDBOJYIPS-PNHLSOANSA-N
ATEUHFXRLAHXQF-UHFFFAOYSA-N
AWBWQBMGIDKGBV-DJARZULQSA-N
BINXOHRETRQOCW-NCYKPQTJSA-N
BIRJBNROQLNSKY-TYKILGCBSA-N
BMVSRGXHTJYIKY-UHFFFAOYSA-N
BSNJSZUDOMPYIR-CUKLWHKZSA-N
BSUXIHZNMCOAOK-NJTBNVCCSA-N
BSZJLJRYNPZJPB-UHFFFAOYSA-N
BTBLVKQZYNTQOX-NBEIKUQISA-N
BUHLNHUYJXDISF-DHAXNPMCSA-N
BWMLEHOGRQOGBL-QLPKVWCKSA-N
BWQUQNCBNOHLOO-AVGNSLFASA-N
BWVYXPYWNXXIEZ-FTZHGRJDSA-N
BXFSLJYJAZAYCW-XHLNEMQHSA-N
BXUWIRFXQNUOEB-HNSKJHPRSA-N
CBHRLZCFSLCIPK-VCHYOVAHSA-N
CBQDXQVMGQAGBR-CITOCJSESA-N
CCWMEUGJIIMLHD-VQGAUUQYSA-N
CDKRXWKJLOFKPT-HAQNSBGRSA-N
CHAYRPTYNSHELC-JCYRPKCISA-N
CHCFFXBWNOXOJJ-REPLKXPHSA-N
CLARGBYBPGPRGV-TXEJJXNPSA-N
CLYLEOXSUSFWBW-WGSAOQKQSA-N
CQKLVJYDMLDRIW-KGENOOAVSA-N
CUPCBPXWWHJIFP-SFJHNWJUSA-N
CVOSCQFZGAIWAC-UHFFFAOYSA-N
CXCNCVYTXKNZER-RVZUZXSLSA-N
DALDXVZWVAVOKE-AHLJTMJASA-N
DBTLAKRZPVHNGJ-CQUPSJMFSA-N
DCRWQJRPXZNJRD-YBEGL

In [9]:
# add column with cc smiles to df
m1_best_tsne_df['smiles'] = m1_best_tsne_df['InChiKey'].map(bestM1Inchikey2smiles)
m1_best_tsne_df

Unnamed: 0,InChiKey,V1,V2,compound_ids,prestwick_id,compound_name,SMILES_id,drug_class,n_hit,smiles
0,AACMKKWADJBYOE-UHFFFAOYSA-N,9.833360,-6.863870,,,,,,,C1=CC=C2C(=C1)C=C(O2)C3=CC=NN3
1,AADCDMQTJNYOSS-LBPRGKRZSA-N,5.389593,-16.528657,Prestw-932_S(-)Eticlopride hydrochloride,Prestw-932,S(-)Eticlopride hydrochloride,CCN1CCC[C@H]1CNC(=O)c2c(O)c(CC)cc(Cl)c2OC,non-drugs,0.0,CCC1=CC(=C(C(=C1O)C(=O)NC[C@@H]2CCCN2CC)OC)Cl
2,AADJJWDBCQRALD-UHFFFAOYSA-N,8.824800,-7.960148,,,,,,,C1CCC(CC1)(CC(=O)O)O
3,AAENIXAZGRAAII-UHFFFAOYSA-N,-15.488416,-10.353366,,,,,,,C1=CC(=CC(=C1)Cl)C2=C(C=NO2)NC(=O)C3=C4N=C(C=C...
4,AAGKMGNYUYCEPD-UHFFFAOYSA-N,29.500637,-15.697789,,,,,,,CC(=O)NCCN1C=CC2=C1C(=NC=N2)NC3=CC(=C(C=C3)OC4...
...,...,...,...,...,...,...,...,...,...,...
11026,ZZVLMHCBMDEQKG-QHCPKHFHSA-N,-39.716162,1.932816,,,,,,,C1CC1CN2C=CC(=N2)C3=C(C(=O)N[C@@](C3)(C4=CC=C(...
11027,ZZWJKLGCDHYVMB-APNKWYJYSA-N,5.460662,15.963245,,,,,,,C[C@@H]([C@H](C1=CC2=C(C=C1)OCCO2)OC3=CC4=C(C=...
11028,ZZXPUJBAUVDSRS-XDOYNYLZSA-N,-15.089193,-11.494783,,,,,,,CC1=CC=C(C=C1)S/C(=N\S(=O)(=O)C2=CC=C(C=C2)Cl)...
11029,ZZYNPFIDNXMCGK-UHFFFAOYSA-N,17.829278,-15.839339,,,,,,,CC1=C(C=CC=C1NC(=O)C2=CC=CS2)C3=NC4=C(O3)C=CC(...


In [10]:
m1_best_tsne_df.to_csv('../data/m1_best_tsne_smiles.csv')