In [92]:
import bs4
import os
import pandas as pd
from urllib.request import urlopen

import pyrfume
from pyrfume import odorants

In [93]:
# Scrape compound names and links from the SenseLab OdorDB browse listing.
# Each entry appended to `info` is a (name, link) tuple, where `link` is the
# href of the anchor found inside the entry's <span>.
info = []
for page in range(1, 8):  # browse pages 1 through 7 (hard-coded page count)
    url = 'https://senselab.med.yale.edu/OdorDB/Browse?db=5&cl=1&page=%d' % page
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urlopen(url) as f:
        html = f.read()
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # between machines.
    soup = bs4.BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    for span in table.find_all('span'):
        anchor = span.find('a')
        if anchor is None:
            # A span without a link cannot be followed to a compound page;
            # skip it instead of failing on None.get(...).
            continue
        name = span.text.strip()
        link = anchor.get('href')
        info.append((name, link))

In [94]:
# Tabulate the scraped (name, link) tuples: one row per compound,
# with the link stored in the 'url' column.
df = pd.DataFrame(info, columns=['name', 'url'])
# Preview the first rows
df.head()

Unnamed: 0,name,url
0,(+)-2-Phenylbutyric acid,Data/144607/?db=5
1,(+)-Camphor,Data/144622/?db=5
2,(+)-Carvone,Data/144619/?db=5
3,(+)-Dihydrocarvone,Data/144621/?db=5
4,(+)-Fenchone,Data/144612/?db=5


In [95]:
# Look up a CID for every scraped compound name.
# Names that cannot be resolved are reported in the cell output.
compound_names = df['name']
cids = odorants.get_cids(compound_names, kind='name')

[-----------------------100%---------------------] 257 out of 257 complete           
Could not find TURPENTINE
Could not find ß-Pinene
Could not find ß-Citronellol
Could not find SOAP
Could not find SHOWER GEL


Could not find (+)-2-Phenylbutyric acid
Could not find (-)-2-Phenylbutyric acid
Could not find 2,4,5-TRIMETHYLTHIAZOLINE
Could not find 2-METHYL-4-PROPYL-1,3-OXALTHIANE
Could not find 3,6,7-Trimethyl-2,6-octadienal
Could not find 3-methyl-4-(4-methyl-cyclohexyl)-propanal
Could not find ALMOND OIL
Could not find Banana aroma
Could not find BEESWAX
Could not find BETA-PHENYLETHYL ALCOHOL (PEA)
Could not find BRANDY
Could not find BROMOBUTANOIC ACID
Could not find BROMOHEPTANOIC ACID
Could not find BROMOPENTANOIC ACID
Could not find Butyl butyryllactate
Could not find CAGE AIR
Could not find CARVONE(+)
Could not find CARVONE(-)
Could not find CIGARETTE BUTTS
Could not find CINNAMON
Could not find CITRONELLAL(+)
Could not find CITRONELLAL(-)
Could not find CLOVE
Could not find COC

In [96]:
# Attach the looked-up CIDs as a 'CID' column.
# The frame is re-indexed by compound name first so that the join can align
# the CID series with the rows by name.
cid_column = pd.Series(cids, name='CID')
df = df.set_index('name')
df = df.join(cid_column)

In [97]:
# Get CAS strings for the compounds for which no CID was found based on the name
# (rows whose CID is 0), by scraping each compound's SenseLab detail page.

# Create the column up front: if no row has CID == 0 the loop below never
# runs, and any later read of df['CAS'] would otherwise raise a KeyError.
if 'CAS' not in df.columns:
    df['CAS'] = None

for name, url_suffix in df[df['CID']==0]['url'].items():
    url = 'https://senselab.med.yale.edu/OdorDB/%s' % url_suffix
    # Close the HTTP response as soon as the body has been read.
    with urlopen(url) as f:
        html = f.read()
    # Explicit parser so the parse does not depend on which optional
    # parsers are installed.
    soup = bs4.BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    # NOTE(review): this takes the sixth <tr> of the first table and the last
    # <span> in it as the CAS field -- tied to the page layout; confirm if
    # the site changes.
    cas_row = table.find_all('tr')[5]
    cas_text = cas_row.find_all('span')[-1].text
    # Drop embedded CRLF line breaks and surrounding whitespace.
    cas = cas_text.replace('\r\n','').strip()
    df.loc[name, 'CAS'] = cas

In [98]:
# Resolve CIDs by searching for the scraped CAS strings
for name, cas in df['CAS'].dropna().items():
    if not cas:
        # Empty CAS string: nothing to search for, leave the CID as it is
        continue
    cid = odorants.get_cid(cas, kind='name')
    df.loc[name, 'CID'] = cid

# Any CID that is still missing becomes 0
df.loc[:, 'CID'] = df['CID'].fillna(value=0)

In [99]:
# Manual fills: hand-assigned CIDs for entries that the automated lookups
# did not resolve, keyed by the exact name used in the index.
manual_cids = {
    '2,4,5-TRIMETHYLTHIAZOLINE': 263626,
    'METHYLSALICYLATE': 4133,
    'PHENYLETHYL ALCOHOL (PEA)': 6054,
    'Perillaalcohol': 10819,
    'Perillaaldehyde': 16441,
}
for name, cid in manual_cids.items():
    if name in df.index:
        df.loc[name, 'CID'] = cid
    else:
        # Assigning through .loc with an unknown label would silently append
        # a new row (with no url), so a misspelled name would add a bogus
        # entry to the table. Report it instead.
        print('Manual fill skipped, name not in table: %s' % name)
# Uncomment to inspect the entries that still have no CID:
#df[df['CID']==0]

Unnamed: 0,url,CID,CAS
3-methyl-4-(4-methyl-cyclohexyl)-propanal,Data/145001/?db=5,0,
ALMOND OIL,Data/83402/?db=5,0,
BEESWAX,Data/78/?db=5,0,8012-89-3
BRANDY,Data/83401/?db=5,0,
BROMOHEPTANOIC ACID,Data/1789/?db=5,0,
Banana aroma,Data/83332/?db=5,0,
CAGE AIR,Data/3608/?db=5,0,
CIGARETTE BUTTS,Data/83397/?db=5,0,
CINNAMON,Data/83393/?db=5,0,977000660
CLOVE,Data/83392/?db=5,0,977007796


In [100]:
# Write the table (indexed by compound name) to senselab.csv under the
# location given by pyrfume.DATA.
file_path = os.path.join(pyrfume.DATA, 'senselab.csv')
df.to_csv(path_or_buf=file_path)