In [59]:
# Connecting to the db
import lib.handle_db as dbh

# read and write csv files
import lib.handle_csv as csv_rw

# Parsing html 
from bs4 import BeautifulSoup

# http requests 
import requests

# url parser
from urllib.parse import urlparse  # python 3.x

# Names of <meta> tags to skip, per journal. The nature and springer entries
# hold the same names (each journal gets its own copy of the list); wiley
# skips nothing.
_shared_excluded_names = [
    'viewport', 'msapplication-TileColor', 'msapplication-config',
    'theme-color', 'application-name', 'robots', 'access', 'WT.cg_s',
    'WT.z_bandiera_abtest', 'WT.page_categorisation', 'WT.template',
    'WT.z_cg_type', 'WT.cg_n', 'dc.rights', 'prism.issn',
]
exclude_metadata = {
    'nature': list(_shared_excluded_names),
    'springer': list(_shared_excluded_names),
    "wiley": [],
}

# Attribute/value pairs selecting the <section> elements which may contain
# references to data
section_labels = {
    journal: {'aria-labelledby': 'data-availability'}
    for journal in ('nature', 'springer')
}

# Attribute/value pairs selecting the <div> elements which may contain
# references to data
div_filters = {
    journal: {'class': 'c-article-supplementary__item'}
    for journal in ('nature', 'springer')
}

#  Custom functions to get references to datasets
# returns a (beautifulsoup object, final url) pair for the given url
def get_content(url, timeout=30):
    """Download *url* and parse the response body with 'html.parser'.

    Args:
        url: address to request.
        timeout: seconds passed to ``requests.get`` so that a stalled
            server cannot block the caller indefinitely.

    Returns:
        A tuple ``(html_soup, redirected_to)``: the parsed page and
        ``response.url``. ``html_soup`` is ``None`` if the request or the
        parsing failed; ``redirected_to`` is ``None`` if the request itself
        failed. The error is printed, not raised.
    """
    html_soup = None
    # Bound before the try so the return below works even when the request
    # raises (otherwise the name would be undefined at that point).
    redirected_to = None
    # Adjacent literals keep the header value on a single line; a backslash
    # continuation inside the quotes would embed the indentation in it.
    req_head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/60.0.3112.113 Safari/537.36'}
    try:
        response = requests.get(url, headers=req_head, timeout=timeout)
        redirected_to = response.url
        html_soup = BeautifulSoup(response.text, 'html.parser')
    except Exception as e:
        # best effort: report the problem and let the caller check for None
        print(e)
    return html_soup, redirected_to

# collect the named <meta> tags of a page, minus the journal's exclusions
def get_metadata(soup, journal):
    """Return the ``meta`` elements of *soup* whose ``name`` attribute is
    present and not listed in ``exclude_metadata[journal]``.

    When *journal* has no entry in ``exclude_metadata`` the text
    'new journal' is printed and nothing is excluded. Any exception is
    printed and whatever was collected up to that point is returned.
    """
    result = []
    try:
        meta_tags = soup.find_all('meta')
        if journal in exclude_metadata:
            skipped_names = exclude_metadata[journal]
        else:
            skipped_names = []
            print('new journal')
        for tag in meta_tags:
            # tags without a name cannot be matched against the exclusions
            if not tag.has_attr("name"):
                continue
            if tag["name"] in skipped_names:
                continue
            result.append(tag)
    except Exception as e:
        print(e)
    return result

# get data ref from metadata
def get_data_from_metadata(soup, journal = 'nature', data_refs = None):
    """Collect data references announced in the page's ``meta`` tags.

    Every tag returned by ``get_metadata`` whose name contains 'data'
    (case-insensitive) becomes one entry
    ``{'type': 'metadata', 'name': <tag name>, 'data_url': <tag content>}``.

    Args:
        soup: parsed page, as returned by ``get_content``.
        journal: key used by ``get_metadata`` to pick the names to ignore.
        data_refs: list to append to; a new list is created when omitted.
            A list passed in is extended in place, as the other
            ``get_data_from_*`` functions do, instead of being discarded.

    Returns:
        The list of data references (the same object as *data_refs* when
        one was passed).
    """
    if data_refs is None:
        data_refs = []
    # check if metadata references supporting data or supplementary data
    for md_item in get_metadata(soup, journal):
        if 'data' in str(md_item["name"]).lower():
            data_refs.append({
                'type': "metadata",
                "name": md_item["name"],
                # .get(): a meta tag without a content attribute gives None
                # here rather than raising KeyError
                'data_url': md_item.get("content"),
            })
    return data_refs

def get_data_from_section(soup, journal = 'nature', data_refs = None, base_url=""):
    """Collect data references from the sections selected by ``section_labels``.

    For each attribute/value pair in ``section_labels[journal]`` the matching
    ``section`` elements are searched paragraph by paragraph:

    * a paragraph without links is recorded with its first content node as
      the name and ``data_url`` set to ``None``;
    * otherwise every link with an ``href`` is recorded, using the link's
      first content node as the name (``None`` for an empty link).

    Args:
        soup: parsed page, as returned by ``get_content``.
        journal: key into ``section_labels``; unknown journals add nothing.
        data_refs: list to append to; a new list is created when omitted.
        base_url: prefix for hrefs that start with '/'; ignored when empty.

    Returns:
        The list of data references (the same object as *data_refs* when
        one was passed).
    """
    # A fresh list per call: a list used as the default value would be shared
    # by every call that omits the argument and keep growing.
    if data_refs is None:
        data_refs = []
    for attr_name, attr_value in section_labels.get(journal, {}).items():
        for section in soup.find_all('section', {attr_name: attr_value}):
            for par in section.find_all('p'):
                references = par.find_all('a')
                if not references:
                    # empty paragraphs carry no information, skip them
                    if par.contents:
                        data_refs.append({'type': attr_value, "name": par.contents[0], 'data_url': None})
                    continue
                for a_ref in references:
                    data_url = a_ref.get('href')
                    if not data_url:
                        # anchor without a target: nothing to reference
                        continue
                    if data_url.startswith('/') and base_url != "":
                        data_url = base_url + data_url
                    name = a_ref.contents[0] if a_ref.contents else None
                    data_refs.append({'type': attr_value, "name": name, 'data_url': data_url})
    return data_refs

def get_data_from_divs(soup, journal = 'nature', data_refs = None, base_url=""):
    """Collect supplementary-data links from the divs selected by ``div_filters``.

    For each attribute/value pair in ``div_filters[journal]`` the first link
    of every matching ``div`` is recorded as
    ``{'type': 'supplementary', 'name': <first content node of the link>,
    'data_url': <href>}``. Divs without a link, or whose link has no
    ``href``, are skipped.

    Args:
        soup: parsed page, as returned by ``get_content``.
        journal: key into ``div_filters``; unknown journals add nothing.
        data_refs: list to append to; a new list is created when omitted.
        base_url: prefix for hrefs that start with '/'; ignored when empty.

    Returns:
        The list of data references (the same object as *data_refs* when
        one was passed).
    """
    # A fresh list per call: a list used as the default value would be shared
    # by every call that omits the argument and keep growing.
    if data_refs is None:
        data_refs = []
    for attr_name, attr_value in div_filters.get(journal, {}).items():
        for div in soup.find_all('div', {attr_name: attr_value}):
            a_ref = div.find('a')
            if a_ref is None:
                # div without any link
                continue
            data_url = a_ref.get('href')
            if not data_url:
                continue
            if data_url.startswith('/') and base_url != "":
                data_url = base_url + data_url
            name = a_ref.contents[0] if a_ref.contents else None
            data_refs.append({'type': "supplementary", "name": name, 'data_url': data_url})
    return data_refs


# Wiley online stores supplementary in tables on the article page.
table_filters = {'wiley': {"class": "support-info__table"}}


def get_data_from_tables(soup, journal = 'wiley', data_refs = None, base_url=""):
    """Collect supplementary-data links from the tables selected by ``table_filters``.

    For each attribute/value pair in ``table_filters[journal]`` every row of
    the matching tables is inspected: the link inside the cell with
    ``headers="article-filename"`` gives the name and url, the first content
    node of the cell with ``headers="article-description"`` gives the type
    (``None`` when that cell is missing or empty). Rows without a filename
    cell, a link or an ``href`` are skipped.

    Args:
        soup: parsed page, as returned by ``get_content``; not accessed when
            *journal* has no entry in ``table_filters``.
        journal: key into ``table_filters``; unknown journals add nothing.
        data_refs: list to append to; a new list is created when omitted.
        base_url: prefix for hrefs that start with '/'; ignored when empty.

    Returns:
        The list of data references (the same object as *data_refs* when
        one was passed).
    """
    # A fresh list per call: a list used as the default value would be shared
    # by every call that omits the argument and keep growing.
    if data_refs is None:
        data_refs = []
    for attr_name, attr_value in table_filters.get(journal, {}).items():
        for table in soup.find_all('table', {attr_name: attr_value}):
            # get the type and link from each row
            for tr in table.find_all('tr'):
                td_link = tr.find('td', {"headers": "article-filename"})
                if td_link is None:
                    # e.g. a row made of <th> cells only
                    continue
                data_link = td_link.find('a')
                if data_link is None:
                    continue
                data_url = data_link.get('href')
                if not data_url:
                    continue
                if data_url.startswith('/') and base_url != "":
                    data_url = base_url + data_url
                td_desc = tr.find('td', {"headers": "article-description"})
                has_desc = td_desc is not None and td_desc.contents
                data_refs.append({
                    'type': td_desc.contents[0] if has_desc else None,
                    "name": data_link.contents[0] if data_link.contents else None,
                    'data_url': data_url,
                })
    return data_refs

# get a list of ids, titles, dois, links, pdf_file and
# html_file names from the app database
def get_pub_app_data(db_name = "app_db.sqlite3"):
    """Read the articles with status 'Added' from the app database.

    Args:
        db_name: database file handed to ``dbh.DataBaseAdapter``.

    Returns:
        Whatever ``get_values`` returns for the fields
        ``id, title, doi, link, pdf_file, html_file`` of table ``articles``
        (the recorded notebook output shows tuples in that field order).
    """
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, title, doi, link, pdf_file, html_file"
    filter_str = "status = 'Added'"
    try:
        return db_conn.get_values(search_in, fields_required, filter_str)
    finally:
        # close the connection even when the query raises
        db_conn.close()

def get_base_url(response_url):
    """Return ``scheme://netloc`` of *response_url*.

    Returns an empty string when *response_url* is ``None``/empty or has no
    scheme or no network location (e.g. a relative path). The
    ``get_data_from_*`` functions treat an empty base url as "do not prefix
    relative links", so they never receive a meaningless "://" prefix.
    """
    if not response_url:
        return ""
    parsed_uri = urlparse(response_url)  # returns six components
    if not (parsed_uri.scheme and parsed_uri.netloc):
        return ""
    return parsed_uri.scheme + "://" + parsed_uri.netloc

In [60]:
# Harvest data references for every publication of one publisher.
ukchapp_db = "db_files/app_db.sqlite3"
db_pubs = get_pub_app_data(ukchapp_db)
# Publisher handled in this run: matched against the stored article link and
# used to pick the extraction functions ('springer' and 'nature' are the
# other values handled below).
publisher = "wiley"
# reference number (starting at 1) -> {'id', 'doi', 'type', 'name', 'data_url'}
data_records = {}
ref_count = 1
for a_pub in db_pubs:
    # row layout: id, title, doi, link, pdf_file, html_file
    pub_id = a_pub[0]
    pub_doi = a_pub[2]
    pub_url = a_pub[3]
    if publisher not in str(pub_url).lower():
        continue
    if not pub_doi:
        # without a DOI there is no landing page to look up
        continue
    # use doi reference to get landing page
    url = "http://dx.doi.org/" + pub_doi
    doc_content, response_url = get_content(url)
    if doc_content is None:
        # the page could not be fetched or parsed; nothing to inspect
        continue
    base_url = get_base_url(response_url)
    res = []
    if publisher in ['springer', 'nature']:
        res = get_data_from_metadata(doc_content, publisher, res)
        res = get_data_from_section(doc_content, publisher, res, base_url)
        res = get_data_from_divs(doc_content, publisher, res, base_url)
    if publisher in ['wiley']:
        res = get_data_from_tables(doc_content, publisher, res, base_url)
    for data_ref in res:
        data_record = {'id': pub_id, 'doi': pub_doi}
        data_record.update(data_ref)
        data_records[ref_count] = data_record
        ref_count += 1


Supplementary /action/downloadSupplement?doi=10.1002%2Fcctc.201901268&file=cctc201901268-sup-0001-misc_information.pdf cctc201901268-sup-0001-misc_information.pdf
Supplementary /action/downloadSupplement?doi=10.1002%2Fchem.201805250&file=chem201805250-sup-0001-misc_information.pdf chem201805250-sup-0001-misc_information.pdf
Supporting Information /action/downloadSupplement?doi=10.1002%2Fejoc.201800799&file=ejoc201800799-sup-0001-SupMat.pdf ejoc201800799-sup-0001-SupMat.pdf
Supplementary /action/downloadSupplement?doi=10.1002%2Fcctc.201801299&file=cctc201801299-sup-0001-misc_information.pdf cctc201801299-sup-0001-misc_information.pdf
Supplementary /action/downloadSupplement?doi=10.1002%2Fcelc.201800729&file=celc201800729-sup-0001-misc_information.pdf celc201800729-sup-0001-misc_information.pdf
Supplementary /action/downloadSupplement?doi=10.1002%2Fcelc.201800052&file=celc201800052-sup-0001-misc_information.pdf celc201800052-sup-0001-misc_information.pdf
Supplementary /action/downloadSup

In [61]:
# Show the collected records, keyed by running reference number
data_records

{1: {'id': 3,
  'doi': '10.1002/cctc.201901268',
  'type': 'Supplementary',
  'name': 'cctc201901268-sup-0001-misc_information.pdf',
  'data_url': 'https://chemistry-europe.onlinelibrary.wiley.com/action/downloadSupplement?doi=10.1002%2Fcctc.201901268&file=cctc201901268-sup-0001-misc_information.pdf'},
 2: {'id': 4,
  'doi': '10.1002/chem.201805250',
  'type': 'Supplementary',
  'name': 'chem201805250-sup-0001-misc_information.pdf',
  'data_url': 'https://chemistry-europe.onlinelibrary.wiley.com/action/downloadSupplement?doi=10.1002%2Fchem.201805250&file=chem201805250-sup-0001-misc_information.pdf'},
 3: {'id': 7,
  'doi': '10.1002/ejoc.201800799',
  'type': 'Supporting Information',
  'name': 'ejoc201800799-sup-0001-SupMat.pdf',
  'data_url': 'https://chemistry-europe.onlinelibrary.wiley.com/action/downloadSupplement?doi=10.1002%2Fejoc.201800799&file=ejoc201800799-sup-0001-SupMat.pdf'},
 4: {'id': 28,
  'doi': '10.1002/cctc.201801299',
  'type': 'Supplementary',
  'name': 'cctc2018012

In [62]:
# Hand the collected records to the project's CSV helper with the target name
# 'pub_data.csv' (presumably resolved against the working directory --
# confirm in lib.handle_csv)
csv_rw.write_csv_data(data_records, 'pub_data.csv')

In [64]:
# List the wiley publications for which no data reference was collected.
# The ids that do have data are gathered once, so each publication needs a
# single set lookup instead of a scan over all records.
pubs_with_data = {record['id'] for record in data_records.values()}
missing_pubs = []
for a_pub in db_pubs:
    # row layout: id, title, doi, link, pdf_file, html_file
    pub_id = a_pub[0]
    pub_doi = a_pub[2]
    pub_url = a_pub[3]
    if "wiley" in str(pub_url).lower() and pub_id not in pubs_with_data:
        url = "http://dx.doi.org/" + pub_doi
        print("missing: ", pub_id, pub_doi, url)
        missing_pubs.append(a_pub)
missing_pubs

missing:  64 10.1002/9783527804085.ch10 http://dx.doi.org/10.1002/9783527804085.ch10
missing:  97 10.1002/pssa.201600440 http://dx.doi.org/10.1002/pssa.201600440
missing:  227 10.1002/cctc.201601692 http://dx.doi.org/10.1002/cctc.201601692
missing:  241 10.1002/cssc.201500503 http://dx.doi.org/10.1002/cssc.201500503
missing:  248 10.1002/cssc.201403190 http://dx.doi.org/10.1002/cssc.201403190
missing:  255 10.1002/cctc.201500242 http://dx.doi.org/10.1002/cctc.201500242
missing:  274 10.1002/cctc.201900401 http://dx.doi.org/10.1002/cctc.201900401
missing:  569 10.1002/adfm.201400338 http://dx.doi.org/10.1002/adfm.201400338


[(64,
  'Hybrid Catalysts for Other C-C and C-X Bond Formation Reactions',
  '10.1002/9783527804085.ch10',
  'https://api.wiley.com/onlinelibrary/tdm/v1/articles/10.1002%2F9783527804085.ch10',
  'None',
  None),
 (97,
  'Heterostructures of GaN with SiC and ZnO enhance carrier stability and separation in framework semiconductors',
  '10.1002/pssa.201600440',
  'https://api.wiley.com/onlinelibrary/tdm/v1/articles/10.1002%2Fpssa.201600440',
  'pssa.201600440.pdf',
  None),
 (227,
  'The Effects of Secondary Oxides on Copper‚ÄêBased Catalysts for Green Methanol Synthesis',
  '10.1002/cctc.201601692',
  'https://api.wiley.com/onlinelibrary/tdm/v1/articles/10.1002%2Fcctc.201601692',
  'cctc.201601692.pdf',
  None),
 (241,
  'Oxidation of Aliphatic Alcohols by Using Precious Metals Supported on Hydrotalcite under Solvent- and Base-Free Conditions',
  '10.1002/cssc.201500503',
  'https://api.wiley.com/onlinelibrary/tdm/v1/articles/10.1002%2Fcssc.201500503',
  'cssc.201500503.pdf',
  None),
 (

In [6]:
# Record the first link of each supplementary-information div, echoing its
# text and target along the way
data_refs = []
for sup_inf_div in sup_info_divs:
    first_link = sup_inf_div.find('a')
    link_text = first_link.contents[0]
    print (link_text)
    link_target = first_link['href']
    print (link_target)
    data_refs.append({
        'type': "supplementary",
        "name": link_text,
        'data_url': link_target,
    })

NameError: name 'sup_info_divs' is not defined

In [None]:
# 'nature catalysis' is not a key of div_filters, so get_data_from_divs has no
# filter to apply and returns the list it was given
data_refs = get_data_from_divs(soup, 'nature catalysis', [])

In [10]:
# Show the references gathered so far
data_refs

[]

In [30]:
# Resolve one DOI by hand and look at where the request ends up
url = 'http://dx.doi.org/10.1002/cctc.201901268'
req_head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/60.0.3112.113 Safari/537.36',
}
response = requests.get(url, headers=req_head)
print(response)
redirected_to = response.url
print(redirected_to)

# split the final url into its components
parsed_uri = urlparse(redirected_to)
print(parsed_uri)
domain = parsed_uri.netloc
result = domain.replace('www.', '')  # host with 'www.' removed; not used in this cell
print(domain)
base_url = "{}://{}".format(parsed_uri.scheme, domain)
print(base_url)

soup = BeautifulSoup(response.text, 'html.parser')

# same selectors as the nature/springer entries of div_filters and section_labels
sup_info_divs = soup.find_all("div", attrs={"class": "c-article-supplementary__item"})
data_avl = soup.find_all('section', attrs={'aria-labelledby': 'data-availability'})



<Response [200]>
https://chemistry-europe.onlinelibrary.wiley.com/doi/full/10.1002/cctc.201901268
ParseResult(scheme='https', netloc='chemistry-europe.onlinelibrary.wiley.com', path='/doi/full/10.1002/cctc.201901268', params='', query='', fragment='')
chemistry-europe.onlinelibrary.wiley.com
https://chemistry-europe.onlinelibrary.wiley.com


In [45]:
# Print description, link target and link text for every row of the
# supporting-information tables of the page
tables = soup.find_all("table", attrs={"class": "support-info__table"})
for table in tables:
    for row in table.find_all('tr'):
        filename_cell = row.find('td', attrs={"headers": "article-filename"})
        description_cell = row.find('td', attrs={"headers": "article-description"})
        file_link = filename_cell.find('a')
        print(
            description_cell.contents[0],
            file_link['href'],
            file_link.contents[0],
        )

Supplementary /action/downloadSupplement?doi=10.1002%2Fcctc.201901268&file=cctc201901268-sup-0001-misc_information.pdf cctc201901268-sup-0001-misc_information.pdf


In [52]:
# NOTE(review): get_data_from_table (singular) is not defined in the visible
# cells; the function defined above is get_data_from_tables, which adds nothing
# for 'elsevier' because that key is not in table_filters. The recorded output
# presumably comes from an earlier definition -- confirm before re-running.
get_data_from_table(soup, 'elsevier', [], base_url)

Supplementary /action/downloadSupplement?doi=10.1002%2Fcctc.201901268&file=cctc201901268-sup-0001-misc_information.pdf cctc201901268-sup-0001-misc_information.pdf


[{'type': 'Supplementary',
  'name': 'cctc201901268-sup-0001-misc_information.pdf',
  'data_url': 'https://chemistry-europe.onlinelibrary.wiley.com/action/downloadSupplement?doi=10.1002%2Fcctc.201901268&file=cctc201901268-sup-0001-misc_information.pdf'}]