In [7]:
import os
import re

import requests
from bs4 import BeautifulSoup


# Endpoint of the NIST Chemistry WebBook CGI script used for every request.
NIST_URL = 'http://webbook.nist.gov/cgi/cbook.cgi'
# Matches a link href carrying a ``GetInChI`` parameter; group 1 is its value,
# which is used as the NIST ID.  Raw strings keep ``\?`` and ``\.`` as regex
# escapes instead of (invalid) Python string escapes.
EXACT_RE = re.compile(r'/cgi/cbook\.cgi\?GetInChI=(.*?)$')
# Matches a link href carrying an ``ID`` parameter followed by further
# parameters; group 1 is the ID value up to the next ``&``.
ID_RE = re.compile(r'/cgi/cbook\.cgi\?ID=(.*?)&')
# Directories (relative to the working directory) where downloads are stored.
JDX_PATH = 'nist/jdx/'
MOL_PATH = 'nist/mol/'


def search_nist_inchi(inchi):
    """Search NIST using the specified InChI or InChIKey query and return the matching NIST ID.

    The query is sent as the ``InChI`` parameter (with ``Units=SI``) to
    ``NIST_URL``.  The first ``<a>`` element of the response whose ``href``
    matches ``EXACT_RE`` supplies the ID.

    :param inchi: InChI or InChIKey string to search for.
    :returns: The NIST ID captured from the matching link, or ``None`` when
        the response contains no such link.
    """
    print('Searching: %s' % inchi)
    response = requests.get(NIST_URL, params={'InChI': inchi, 'Units': 'SI'})
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid the bs4 warning).
    soup = BeautifulSoup(response.text, 'html.parser')
    idlink = soup.find('a', href=EXACT_RE)
    if not idlink:
        return None
    # BeautifulSoup filters with a regex *search*, so extract with search()
    # as well; a start-anchored match() could return None for the same href.
    nistid = EXACT_RE.search(idlink['href']).group(1)
    print('Result: %s' % nistid)
    return nistid

def search_nist_formula(formula, allow_other=False, allow_extra=False, match_isotopes=False, exclude_ions=False, has_uv=False):
    """Search NIST using the specified formula query and return the matching NIST IDs.

    The request always carries ``Formula`` and ``Units=SI``.  Each keyword
    flag that is true adds one extra query parameter with the value ``'on'``.

    :param formula: Formula string sent as the ``Formula`` parameter.
    :param allow_other: If true, send ``AllowOther=on``.
    :param allow_extra: If true, send ``AllowExtra=on``.
    :param match_isotopes: If true, send ``MatchIso=on``.
    :param exclude_ions: If true, send ``NoIon=on``.
    :param has_uv: If true, send ``cUV=on``.
    :returns: List of IDs captured from every ``<a>`` whose ``href`` matches
        ``ID_RE``, in document order (empty when nothing matches).
    """
    print('Searching: %s' % formula)
    params = {'Formula': formula, 'Units': 'SI'}
    # Query-parameter name paired with the flag that enables it.
    optional_flags = (
        ('AllowOther', allow_other),
        ('AllowExtra', allow_extra),
        ('MatchIso', match_isotopes),
        ('NoIon', exclude_ions),
        ('cUV', has_uv),
    )
    for name, enabled in optional_flags:
        if enabled:
            params[name] = 'on'
    response = requests.get(NIST_URL, params=params)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid the bs4 warning).
    soup = BeautifulSoup(response.text, 'html.parser')
    # BeautifulSoup filters with a regex *search*, so extract with search()
    # as well; a start-anchored match() could return None for the same href.
    ids = [ID_RE.search(link['href']).group(1) for link in soup.find_all('a', href=ID_RE)]
    print('Result: %s' % ids)
    return ids


def get_jdx(nistid, stype='UVVis'):
    """Download jdx file for the specified NIST ID, unless already downloaded.

    The file is stored as ``<JDX_PATH>/<nistid>-<stype>.jdx``.  Nothing is
    written when that file already exists or when the server answers with its
    "Spectrum not found" placeholder.

    :param nistid: NIST ID sent as the ``JCAMP`` request parameter.
    :param stype: Spectrum type sent as the ``Type`` request parameter.
    :returns: ``None`` in every case; progress is reported with ``print``.
    """
    filepath = os.path.join(JDX_PATH, '%s-%s.jdx' % (nistid, stype))
    if os.path.isfile(filepath):
        print('%s %s: Already exists at %s' % (nistid, stype, filepath))
        return
    print('%s %s: Downloading' % (nistid, stype))
    response = requests.get(NIST_URL, params={'JCAMP': nistid, 'Type': stype, 'Index': 0})
    if response.text == '##TITLE=Spectrum not found.\n##END=\n':
        print('%s %s: Spectrum not found' % (nistid, stype))
        return
    print('Saving %s' % filepath)
    # Make sure the target directory exists before opening the file.
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # Write the raw bytes: str(bytes) on Python 3 would store the "b'...'"
    # repr (with escaped newlines) instead of the actual file contents.
    with open(filepath, 'wb') as outfile:
        outfile.write(response.content)


def get_mol(nistid):
    """Download mol file for the specified NIST ID, unless already downloaded.

    The file is stored as ``<MOL_PATH>/<nistid>.mol``.  Nothing is written
    when that file already exists or when the server answers with the empty
    placeholder molfile compared against below.

    :param nistid: NIST ID sent as the ``Str2File`` request parameter.
    :returns: ``None`` in every case; progress is reported with ``print``.
    """
    filepath = os.path.join(MOL_PATH, '%s.mol' % nistid)
    if os.path.isfile(filepath):
        print('%s: Already exists at %s' % (nistid, filepath))
        return
    print('%s: Downloading mol' % nistid)
    response = requests.get(NIST_URL, params={'Str2File': nistid})
    # Exact text of the placeholder response (a molfile with zero atoms and
    # zero bonds) that is treated as "no structure available".
    if response.text == 'NIST    12121112142D 1   1.00000     0.00000\nCopyright by the U.S. Sec. Commerce on behalf of U.S.A. All rights reserved.\n0  0  0     0  0              1 V2000\nM  END\n':
        print('%s: MOL not found' % nistid)
        return
    print('Saving %s' % filepath)
    # Make sure the target directory exists before opening the file.
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # Write the raw bytes: str(bytes) on Python 3 would store the "b'...'"
    # repr (with escaped newlines) instead of the actual file contents.
    with open(filepath, 'wb') as outfile:
        outfile.write(response.content)

def get_all_uvvis():
    """Fetch mol and UVVis jdx files for every hit of the formulas C1 to C99.

    Each formula is searched with ``allow_other``, ``exclude_ions`` and
    ``has_uv`` enabled; for every returned ID the mol file is fetched first,
    then the UVVis jdx file.
    """
    formulas = ('C%s' % carbon_count for carbon_count in range(1, 100))
    for formula in formulas:
        matches = search_nist_formula(
            formula,
            allow_other=True,
            exclude_ions=True,
            has_uv=True,
        )
        print('%s spectra found' % len(matches))
        for match_id in matches:
            get_mol(match_id)
            get_jdx(match_id, stype='UVVis')

# Start the full download run only when executed as a script, not on import.
if __name__ == '__main__':
    get_all_uvvis()

Searching: C1
Result: ['C75616', 'C558134', 'C75638', 'C463718', 'C334996', 'C334883', 'C18588164', 'C74839', 'C115093', 'C75525', 'C26981931', 'C57136', 'C556887', 'C74931', 'C74895', 'C113008', 'C593544', 'C75252', 'C1511622']
19 spectra found
C75616: Already exists at nist/mol/C75616.mol
C75616 UVVis: Already exists at nist/jdx/C75616-UVVis.jdx
C558134: Already exists at nist/mol/C558134.mol
C558134 UVVis: Already exists at nist/jdx/C558134-UVVis.jdx
C75638: Already exists at nist/mol/C75638.mol
C75638 UVVis: Already exists at nist/jdx/C75638-UVVis.jdx
C463718: Already exists at nist/mol/C463718.mol
C463718 UVVis: Already exists at nist/jdx/C463718-UVVis.jdx
C334996: Already exists at nist/mol/C334996.mol
C334996 UVVis: Already exists at nist/jdx/C334996-UVVis.jdx
C334883: Already exists at nist/mol/C334883.mol
C334883 UVVis: Already exists at nist/jdx/C334883-UVVis.jdx
C18588164: Already exists at nist/mol/C18588164.mol
C18588164 UVVis: Already exists at nist/jdx/C18588164-UVVis.jd

Result: ['C685632', 'C7445605', 'C110850', 'C7119928', 'C352932', 'C75661', 'C513440', 'C3877154', 'C1551219', 'C628397', 'C77861', 'C64200', 'C75763', 'C460128', 'C4858859', 'C3934201', 'C1193211', 'C2802622', 'C108316', 'C1722129', 'C31575356', 'C10486610', 'C1001565', 'C28176105', 'C541593', 'C609405', 'C822844', 'C290379', 'C289952', 'C289805', 'C66228', 'C5919266', 'C1193244', 'C504176', 'C67527', 'C141902', 'C591286', 'C55770768', 'C110009', 'C328427', 'C288051', 'C126998', 'C109977', 'C109751', 'C5049616', 'C71307', 'C932525', 'C123568', 'C6339873', 'C3581871', 'C106990', 'C590192', 'C693981', 'C1603914', 'C1558232', 'C3759602', 'C4170303', 'C78944', 'C1191953', 'C109933', 'C108054', 'C79414', 'C107937', 'C625343', 'C57466645', 'C60275', 'C57716', 'C106989', 'C115117', 'C930552', 'C95454', 'C70473', 'C120796', 'C2691410', 'C123728', 'C109999', 'C123911', 'C922690', 'C14109729', 'C926670', 'C110010', 'C505293', 'C505237', 'C507197', 'C507404', 'C123751', 'C540807', 'C600248', 'C7

Result: ['C287923', 'C563462', 'C56859', 'C563804', 'C142687', 'C13005817', 'C57295882', 'C110894', 'C516063', 'C63683', 'C59518', 'C147842', 'C632224', 'C2782914', 'C110667', 'C628295', 'C1679090', 'C6163640', 'C2084186', 'C1878188', 'C1679089', 'C4548452', 'C5470188', 'C87423', 'C4556392', 'C626551', 'C109046', 'C109091', 'C626608', 'C16879020', 'C372485', 'C372474', 'C1122618', 'C2530269', 'C1124330', 'C14256996', 'C120730', 'C68940', 'C13877559', 'C69896', 'C69932', 'C1680111', 'C98011', 'C616024', 'C1003414', 'C38373443', 'C765504', 'C367577', 'C17148491', 'C110861', 'C98964', 'C4214760', 'C4214759', 'C73405', 'C70182886', 'C1003298', 'C142085', 'C626642', 'C694597', 'C109002', 'C72762006', 'C634979', 'C6890626', 'C2637345', 'C16133269', 'C4556234', 'C542927', 'C646059', 'C504245', 'C504290', 'C462088', 'C3438468', 'C2466764', 'C931635', 'C65714', 'C615770', 'C626482', 'C608344', 'C55770779', 'C534225', 'C98000', 'C498248', 'C16839977', 'C554143', 'C7559424', 'C108521', 'C15579825

Result: ['C344047', 'C319880', 'C118752', 'C344070', 'C832531', 'C13007926', 'C356423', 'C13601144', 'C75207771', 'C527219', 'C1423138', 'C880784', 'C392563', 'C110838', 'C2430004', 'C695647', 'C52682050', 'C108941', 'C141797', 'C1757422', 'C109499', 'C141979', 'C2179579', 'C57295893', 'C626620', 'C100641', 'C6050266', 'C110827', 'C592416', 'C691372', 'C625274', 'C563791', 'C7688213', 'C760203', 'C563780', 'C674760', 'C558372', 'C691383', 'C2315368', 'C120865', 'C56893', 'C16597356', 'C97745', 'C137268', 'C75978', 'C100798', 'C51690370', 'C53715472', 'C7133360', 'C685916', 'C1119499', 'C106558', 'C4164298', 'C111477', 'C625809', 'C638460', 'C14290927', 'C629196', 'C4253898', 'C4151693', 'C121448', 'C4375831', 'C13436038', 'C615930', 'C95943', 'C58902', 'C935955', 'C87876', 'C327548', 'C2367820', 'C771608', 'C618622', 'C120821', 'C97007', 'C367237', 'C700174', 'C70348', 'C99354', 'C608333', 'C460004', 'C2101884', 'C591355', 'C576249', 'C5326238', 'C619089', 'C100549', 'C100481', 'C27309

KeyboardInterrupt: 