In [71]:
import re
import requests
from urllib.request import quote
import pandas as pd
import bs4

In [38]:
df = pd.read_excel('data/vcd_database.xlsx')

In [39]:
cas_list = set(list(df.iloc[:,6].replace(0,None).dropna())[1:])
print(len(cas_list))

119


In [54]:
cas_cid_smiles = pd.DataFrame(columns=['CID', 'SMILES'])
for cas in list(cas_list):
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/%s/property/isomericSMILES/JSON" % cas
    r = requests.get(url)
    if r.status_code == 200:
        xs = r.json()['PropertyTable']['Properties']
        found_chiral_smiles = False
        for x in xs:
            if any([char in x['IsomericSMILES'] for char in ['@','@@','/','\\']]): # Is chiral
                cas_cid_smiles.loc[cas] = [x['CID'], x['IsomericSMILES']]
                found_chiral_smiles = True
                break
        if not found_chiral_smiles:
            print("No chiral SMILES found for CAS %s" % cas)
    else:
        print("No result for CAS %s" % cas)
cas_cid_smiles['Type'] = 'Original'
cas_cid_smiles['Enantiomer'] = ''
print(cas_cid_smiles.shape[0])

No result for CAS 28283-97-8
No chiral SMILES found for CAS 564-94-3
No chiral SMILES found for CAS 536-59-4
No chiral SMILES found for CAS 99-83-2
115


In [55]:
original_smiles_list = list(cas_cid_smiles['SMILES'])
original_cas_list = list(cas_cid_smiles.index)
for i,smiles in enumerate(original_smiles_list):
    smiles_m = smiles.replace('@@','a').replace('@','b').replace('/','c').replace('\\','d')\
                     .replace('a','@').replace('b','@@').replace('c','\\').replace('d','/')             
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/%s/synonyms/JSON" % quote(smiles_m)  
    r = requests.get(url)
    if r.status_code == 200:
        x = r.json()['InformationList']['Information'][0]
        CID = x['CID']
        found_cas = False
        for synonym in x['Synonym']:
            if re.match('[0-9]+-[0-9]+-[0-9]+', synonym) and ' ' not in synonym:
                found_cas = True
                cas = synonym
                original_cas = cas_cid_smiles.index[i]
                if cas == original_cas:
                    print('Rejected identical CAS %s' % cas)
                    continue
                if cas in original_cas_list:
                    print('Already have data on CAS %s' % cas)
                    continue
                cas_cid_smiles.loc[cas] = [x['CID'], smiles_m, 'Mirrored', original_cas]
                cas_cid_smiles.loc[original_cas, 'Enantiomer'] = cas
                break
        if not found_cas:
            print("No CAS found for mirror SMILES %s" % smiles_m)
    else:
        print("No result for mirror SMILES string %s" % smiles_m)
print(cas_cid_smiles.shape[0])

Already have data on CAS 302911-94-0
Already have data on CAS 17392-83-5
Already have data on CAS 1845-25-6
Already have data on CAS 33758-15-5
No result for mirror SMILES string CCCCC(=O)O[C@@H]1CC[C@]2([C@@H]3CC[C@@]4([C@@H]([C@H]3CC=C2C1)CC[C@H]4[C@@H](C)CCCC(C)C)C)C
Already have data on CAS 72657-23-9
Already have data on CAS 16404-54-9
Already have data on CAS 2216-51-5
No result for mirror SMILES string CC(C)[C@H]1CC[C@@]2([C@@H]3[C@H]1[C@@H]4[C@]2([C@@H]4C3)C)C
Already have data on CAS 103128-76-3
Already have data on CAS 2767-84-2
Already have data on CAS 464-48-2
Already have data on CAS 22144-60-1
No result for mirror SMILES string CCCCCCCC\C=C/CCCCCCCCOC(=O)O[C@@H]1CC[C@]2([C@@H]3CC[C@@]4([C@@H]([C@H]3CC=C2C1)CC[C@H]4[C@@H](C)CCCC(C)C)C)C
Already have data on CAS 2623-23-6
Already have data on CAS 83541-68-8
Already have data on CAS 80657-57-4
Already have data on CAS 89-82-7
No result for mirror SMILES string CCCCCC(=O)O[C@@H]1CC[C@]2([C@@H]3CC[C@@]4([C@@H]([C@H]3CC=C2C1)CC

In [56]:
cas_cid_smiles['Type'].value_counts()

Original    115
Mirrored     41
Name: Type, dtype: int64

In [62]:
# This should print nothing if all the SMILES strings have stereometic information
for i,s in enumerate(cas_cid_smiles['SMILES']):
    if not any([x in s for x in ['@','@@','/','\\']]):
        print('This SMILES string has no stereomeric information: CAS=%s; CID=%d; SMILES=%s' % (cas_cid_smiles['Type'][i],cas_cid_smiles.index[i],cas_cid_smiles['CID'][i],s))

In [63]:
pairs = cas_cid_smiles[cas_cid_smiles['Enantiomer'] != '']
print("There are %d pairs of enantiomers for which we have VCD spectra for at least one member of the pair." % (len(pairs)/2))

There are 42 pairs of enantiomers for which we have VCD spectra for at least one member of the pair.


In [64]:
pairs

Unnamed: 0,CID,SMILES,Type,Enantiomer
18172-67-3,440967,CC1([C@H]2CCC(=C)[C@@H]1C2)C,Original,19902-08-0
116910-11-3,7157231,CC(C)[C@H]1CO[C@]2(N1C(=O)C=C2)C,Original,222629-69-8
15356-60-2,165675,C[C@H]1CC[C@@H]([C@H](C1)O)C(C)C,Original,89-78-1
53369-17-8,104476,CC1([C@H]2CC[C@@H]([C@@H]1C2)CO)C,Original,132203-71-5
2217-02-9,6997371,C[C@@]12CC[C@@H](C1)C([C@@H]2O)(C)C,Original,512-13-0
5157-89-1,62335,C[C@H]1CC[C@@H]([C@H](C1)OC(=O)C)C(C)C,Original,89-48-5
3391-90-0,638012,C[C@H]1CCC(=C(C)C)C(=O)C1,Original,90449-51-7
27779-29-9,90350,C[C@H]1[C@@H]2C[C@@H](C2(C)C)C[C@@H]1O,Original,1196-00-5
1767-46-0,13463446,CC[C@H](C)CCCO,Original,53353-04-1
5989-27-5,440917,CC1=CC[C@@H](CC1)C(=C)C,Original,7721-11-1


In [68]:
def cas2soup(cas):
    import requests
    base_url = "http://www.thegoodscentscompany.com"
    url = "%s/search3.php?qName=%s&submit.x=0&submit.y=0" % (base_url,cas)
    response = requests.get(url)
    index = response.text.find('data/')
    soup = None
    if index > 0:
        path = response.text[index:(index+25)].split("'")[0]
        url = "%s/%s" % (base_url,path)
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text)
    return soup

In [69]:
def soup2odor(soup):
    chem_tables = soup.find_all("table",{"class":"cheminfo"})
    odor = None
    for t in chem_tables:
        if 'Odor' in t.text:
            odor = [x.text for x in t.find_all('td')]
            break
    return odor

In [72]:
cas_odors = {}
for cas in cas_cid_smiles.index:
    print(cas)
    soup = cas2soup(cas)
    if soup:
        odor = soup2odor(soup)
        if odor:
            print("Found odor info for %s" % cas)
            cas_odors[cas] = odor

18172-67-3
Found odor info for 18172-67-3
116910-11-3
27871-49-4
Found odor info for 27871-49-4
24047-72-1
33758-16-6
7726-03-6
Found odor info for 7726-03-6
80657-57-4
14898-80-7
Found odor info for 14898-80-7
15356-60-2
Found odor info for 15356-60-2
22469-52-9
Found odor info for 22469-52-9
26127-08-2
53369-17-8
Found odor info for 53369-17-8
10334-26-6
2217-02-9
Found odor info for 2217-02-9
464-49-3
Found odor info for 464-49-3
22135-49-5
17110-51-9
Found odor info for 17110-51-9
5157-89-1
Found odor info for 5157-89-1
83540-97-0
72657-23-9
3391-90-0
Found odor info for 3391-90-0
1062-96-0
89-82-7
Found odor info for 89-82-7
521-13-1
Found odor info for 521-13-1
27779-29-9
Found odor info for 27779-29-9
1767-46-0
Found odor info for 1767-46-0
70419-07-7
546-80-5
Found odor info for 546-80-5
26184-62-3
Found odor info for 26184-62-3
21210-43-5
15507-52-5
5989-27-5
Found odor info for 5989-27-5
19894-97-4
Found odor info for 19894-97-4
17392-83-5
Found odor info for 17392-83-5
31087

In [84]:
print("We have GoodScents odors information for %d molecules in the enantiomer set" % len(cas_odors))

We have GoodScents odors information for 92 molecules in the enantiomer set


In [98]:
x = cas_cid_smiles.copy()
x['Odor'] = 0
for cas in cas_odors:
    x.loc[cas,'Odor'] = 1
x2 = x[(x['Odor']==1) & (x['Enantiomer'] != '')]
x3 = x2[x2['Enantiomer'].isin(list(x2.index))]
print("We have odor info and spectra for %d enantiomers (%d pairs)" % (len(x3), len(x3)/2))

We have odor info and spectra for 36 enantiomers (18 pairs)


In [101]:
x3.shape

(36, 6)

In [100]:
x3['Strength'] = ''
for cas, odor in cas_odors.items():
    for o in odor:
        if 'Odor Strength' in o:
            strength = o.split(':')[1].split(',')[0].strip()
            if cas in x3.index:
                x3.loc[cas, 'Strength'] = strength

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [102]:
x3

Unnamed: 0,CID,SMILES,Type,Enantiomer,Strength,Odor
18172-67-3,440967,CC1([C@H]2CCC(=C)[C@@H]1C2)C,Original,19902-08-0,high,1
15356-60-2,165675,C[C@H]1CC[C@@H]([C@H](C1)O)C(C)C,Original,89-78-1,,1
2217-02-9,6997371,C[C@@]12CC[C@@H](C1)C([C@@H]2O)(C)C,Original,512-13-0,,1
5157-89-1,62335,C[C@H]1CC[C@@H]([C@H](C1)OC(=O)C)C(C)C,Original,89-48-5,,1
27779-29-9,90350,C[C@H]1[C@@H]2C[C@@H](C2(C)C)C[C@@H]1O,Original,1196-00-5,,1
19894-97-4,88301,CC1([C@H]2CC=C([C@@H]1C2)CO)C,Original,6712-78-3,medium,1
2552-91-2,75699,C[C@@H]1CC[C@H]([C@H](C1)OC(=O)C)C(C)C,Original,2230-87-7,,1
14073-97-3,26447,C[C@@H]1CC[C@H](C(=O)C1)C(C)C,Original,3391-87-5,medium,1
25465-65-0,10524983,C[C@@H]1[C@H]2C[C@H](C2(C)C)C[C@H]1O,Original,24041-60-9,,1
42072-39-9,641003,CC[C@H](C)CCO,Original,70224-28-1,,1
