In [8]:
# Connecting to the db
import lib.handle_db as dbh

# read and write csv files
import lib.handle_csv as csv_rw

# Parsing html 
from bs4 import BeautifulSoup

# http requests 
import requests

# url parser
from urllib.parse import urlparse  # python 3.x

# Per-journal configuration used by the extraction functions below.

# names of <meta> tags to leave out when collecting metadata
exclude_metadata = {
    'nature catalysis': [
        'viewport',
        'msapplication-TileColor',
        'msapplication-config',
        'theme-color',
        'application-name',
        'robots',
        'access',
        'WT.cg_s',
        'WT.z_bandiera_abtest',
        'WT.page_categorisation',
        'WT.template',
        'WT.z_cg_type',
        'WT.cg_n',
        'dc.rights',
        'prism.issn',
    ],
}

# attribute/value identifying the <section> which may contain references to data
section_labels = {
    'nature catalysis': {'aria-labelledby': 'data-availability'},
}

# attribute/value identifying the <div> elements which may contain references to data
div_filters = {
    'nature catalysis': {'class': 'c-article-supplementary__item'},
}

#  Custom functions to get references to datasets
# returns the parsed page and the url the request ended up at
def get_content(url, timeout=30):
    """Fetch ``url`` and parse the response body as HTML.

    Parameters:
        url: address to request (redirects are followed by ``requests``).
        timeout: seconds to wait for the server before giving up, so one
            unresponsive site cannot block the whole run.

    Returns:
        A tuple ``(html_soup, redirected_to)``.  ``html_soup`` is the
        BeautifulSoup object for the response body, or ``None`` when the
        request or the parsing failed.  ``redirected_to`` is the final url
        reported by the response, or the requested ``url`` when no response
        was obtained.

    Any exception is printed and swallowed (best effort); callers must check
    ``html_soup`` for ``None``.
    """
    html_soup = None
    # Bind the second return value up front: if the request raises, the
    # assignment inside the try block never runs.
    redirected_to = url
    try:
        # Adjacent literals instead of a backslash continuation, which would
        # embed the source indentation in the header value.
        req_head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                                  '(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
        response = requests.get(url, headers=req_head, timeout=timeout)
        redirected_to = response.url
        html_soup = BeautifulSoup(response.text, 'html.parser')
    except Exception as e:
        print(e)
    return html_soup, redirected_to

# collect the named <meta> tags of a page, minus the journal's exclusions
def get_metadata(soup, journal):
    """Return the ``<meta>`` tags of ``soup`` that carry a ``name`` attribute.

    Tags whose name is listed in ``exclude_metadata[journal]`` are left out.
    When ``journal`` has no entry there, 'new journal' is printed and nothing
    is excluded.  Any exception is printed and swallowed; whatever was
    collected up to that point is returned.
    """
    kept = []
    try:
        meta_tags = soup.find_all('meta')
        if journal not in exclude_metadata:
            print('new journal')
            skipped_names = []
        else:
            skipped_names = exclude_metadata[journal]
        for tag in meta_tags:
            if not tag.has_attr("name"):
                continue
            if tag["name"] in skipped_names:
                continue
            kept.append(tag)
    except Exception as e:
        print(e)
    return kept

# get data ref from metadata
def get_data_from_metadata(soup, journal = 'nature catalysis', data_refs = None):
    """Collect data references announced in the page's ``<meta>`` tags.

    A meta tag counts as a data reference when its ``name`` contains 'data'
    (case-insensitive) and it has a ``content`` attribute.

    Parameters:
        soup: parsed page, passed on to ``get_metadata``.
        journal: key selecting the journal-specific exclusion list.
        data_refs: list to extend; a new list is created when omitted.  A
            list passed in is extended in place, like the other
            ``get_data_from_*`` functions do.

    Returns:
        ``data_refs`` with one ``{'type', 'name', 'data_url'}`` dict appended
        per matching meta tag.
    """
    if data_refs is None:
        data_refs = []
    # honour the requested journal instead of a fixed one
    for md_item in get_metadata(soup, journal):
        if 'data' not in str(md_item["name"]).lower():
            continue
        # a tag without content has no url to record
        if not md_item.has_attr("content"):
            continue
        data_refs.append({'type':"metadata", "name":md_item["name"], 'data_url':md_item["content"]})
    # TODO: author metadata (names containing 'author') is not collected yet
    return data_refs

def get_data_from_section(soup, journal = 'nature catalysis', data_refs = None, base_url=""):
    """Collect the links found in the page's data section(s).

    A ``<section>`` is inspected when one of the attribute/value pairs in
    ``section_labels[journal]`` matches it; every ``<a>`` inside its ``<p>``
    elements that has a non-empty ``href`` yields one record.

    Parameters:
        soup: parsed page.
        journal: key into ``section_labels``.  For a journal without an
            entry, 'new journal' is printed and nothing is collected.
        data_refs: list to extend in place; a new list is created when
            omitted, so separate calls never share results.
        base_url: ``scheme://host`` prefix prepended to links starting with
            '/'; links are left as found when it is empty.

    Returns:
        ``data_refs`` with one ``{'type', 'name', 'data_url'}`` dict appended
        per link.  ``name`` is the first child of the anchor, or '' for an
        anchor without children.
    """
    if data_refs is None:
        data_refs = []
    inspect_this = section_labels.get(journal)
    if inspect_this is None:
        print('new journal')
        return data_refs
    for section in soup.find_all('section'):
        # compare against the configured value, not a fixed label
        is_data_section = any(
            section.has_attr(sect_attr) and section[sect_attr] == wanted
            for sect_attr, wanted in inspect_this.items()
        )
        if not is_data_section:
            continue
        for par in section.find_all('p'):
            for a_ref in par.find_all('a'):
                # anchors without a target carry no reference to record
                if not a_ref.has_attr('href') or not a_ref['href']:
                    continue
                data_url = a_ref['href']
                if data_url.startswith('/') and base_url != "":
                    data_url = base_url + data_url
                ref_name = a_ref.contents[0] if a_ref.contents else ''
                data_refs.append({'type':"supplementary", "name":ref_name, 'data_url':data_url})
    return data_refs

def get_data_from_divs(soup, journal = 'nature catalysis', data_refs = None, base_url=""):
    """Collect the first link of every ``<div>`` selected for the journal.

    ``div_filters[journal]`` maps attribute names to values; each pair is
    passed to ``soup.find_all('div', ...)`` and the first ``<a>`` of every
    div returned yields one record.

    Parameters:
        soup: parsed page.
        journal: key into ``div_filters``.  For a journal without an entry,
            'new journal' is printed and nothing is collected.
        data_refs: list to extend in place; a new list is created when
            omitted, so separate calls never share results.
        base_url: ``scheme://host`` prefix prepended to links starting with
            '/'; links are left as found when it is empty.

    Returns:
        ``data_refs`` with one ``{'type', 'name', 'data_url'}`` dict appended
        per link.  ``name`` is the first child of the anchor, or '' for an
        anchor without children.
    """
    if data_refs is None:
        data_refs = []
    inspect_these = div_filters.get(journal)
    if inspect_these is None:
        print('new journal')
        return data_refs
    for div_attr, wanted in inspect_these.items():
        for div in soup.find_all('div', {div_attr: wanted}):
            a_ref = div.find('a')
            # find() gives None for a div without a link; a link without a
            # target has nothing to record either
            if a_ref is None or not a_ref.has_attr('href') or not a_ref['href']:
                continue
            data_url = a_ref['href']
            if data_url.startswith('/') and base_url != "":
                data_url = base_url + data_url
            ref_name = a_ref.contents[0] if a_ref.contents else ''
            data_refs.append({'type':"supplementary", "name":ref_name, 'data_url':data_url})
    return data_refs


# get a list of ids, titles, dois, links, pdf_file and
# html_file names from the app database
def get_pub_app_data(db_name = "app_db.sqlite3"):
    """Read the articles with status 'Added' from the app database.

    Parameters:
        db_name: database file handed to ``dbh.DataBaseAdapter``.

    Returns:
        Whatever ``get_values`` returns for the fields
        ``id, title, doi, link, pdf_file, html_file`` of table ``articles``
        (callers in this notebook index each row by position, 0-5).

    The connection is closed even when the query raises; the exception then
    propagates to the caller.
    """
    db_conn = dbh.DataBaseAdapter(db_name)
    try:
        search_in = 'articles'
        fields_required = "id, title, doi, link, pdf_file, html_file"
        filter_str = "status = 'Added'"
        return db_conn.get_values(search_in, fields_required, filter_str)
    finally:
        db_conn.close()

def get_base_url(response_url):
    """Return the ``scheme://netloc`` prefix of ``response_url``."""
    parts = urlparse(response_url)  # six-component parse result
    return "://".join((parts.scheme, parts.netloc))

In [9]:
# Look up every Nature publication of the app database through its DOI and
# record the data references found on the landing page.
ukchapp_db = "db_files/app_db.sqlite3"
db_pubs = get_pub_app_data(ukchapp_db)
data_records = {}   # running reference number -> record
ref_count = 1
for a_pub in db_pubs:
    # positions follow the field list requested in get_pub_app_data
    pub_id, pub_title, pub_doi, pub_url, pub_pdf, pub_html = a_pub[:6]
    if "nature" not in str(pub_url).lower():
        continue
    # without a DOI there is no landing page to look up
    if not pub_doi:
        print("skipping (no doi): ", pub_id, pub_title)
        continue
    print(pub_title)
    # use doi reference to get landing page
    url = "http://dx.doi.org/" + pub_doi
    print("Title: ", pub_title, " look up: ", url)
    doc_content, response_url = get_content(url)
    # get_content reports a failed download as None; nothing to inspect then
    if doc_content is None:
        print("skipping (no content): ", pub_id, url)
        continue
    base_url = get_base_url(response_url)
    res = []
    res = get_data_from_metadata(doc_content, 'nature catalysis', res)
    res = get_data_from_section(doc_content, 'nature catalysis', res, base_url)
    res = get_data_from_divs(doc_content, 'nature catalysis', res, base_url)
    for data_ref in res:
        print(data_ref)
        data_record = {'id':pub_id, 'doi':pub_doi}
        data_record.update(data_ref)
        data_records[ref_count] = data_record
        ref_count += 1


Tuning of catalytic sites in Pt/TiO2 catalysts for the chemoselective hydrogenation of 3-nitrostyrene
Title:  Tuning of catalytic sites in Pt/TiO2 catalysts for the chemoselective hydrogenation of 3-nitrostyrene  look up:  http://dx.doi.org/10.1038/s41929-019-0334-3
{'type': 'supplementary', 'name': 'https://doi.org/10.17035/d.2019.0079744472', 'data_url': 'https://doi.org/10.17035/d.2019.0079744472'}
{'type': 'supplementary', 'name': 'Supplementary Figs. 1–11, Tables 1–4 and references', 'data_url': 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41929-019-0334-3/MediaObjects/41929_2019_334_MOESM1_ESM.pdf'}
Structural selectivity of supported Pd nanoparticles for catalytic NH3 oxidation resolved using combined operando spectroscopy
Title:  Structural selectivity of supported Pd nanoparticles for catalytic NH3 oxidation resolved using combined operando spectroscopy  look up:  http://dx.doi.org/10.1038/s41929-018-0213-3
{'type': 'supplementary', 'name': 'https://doi.org/10.525

{'type': 'supplementary', 'name': 'Supplementary Information', 'data_url': 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41929-018-0206-2/MediaObjects/41929_2018_206_MOESM1_ESM.pdf'}
Synergistic ultraviolet and visible light photo-activation enables intensified low-temperature methanol synthesis over copper/zinc oxide/alumina
Title:  Synergistic ultraviolet and visible light photo-activation enables intensified low-temperature methanol synthesis over copper/zinc oxide/alumina  look up:  http://dx.doi.org/10.1038/s41467-020-15445-z
{'type': 'supplementary', 'name': 'Supplementary information', 'data_url': 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-020-15445-z/MediaObjects/41467_2020_15445_MOESM1_ESM.pdf'}
{'type': 'supplementary', 'name': 'Peer Review', 'data_url': 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-020-15445-z/MediaObjects/41467_2020_15445_MOESM2_ESM.pdf'}
Quantitative production of butenes from biomass-derived γ-valerolacto

In [10]:
# show the collected records; keys are the running reference numbers
# assigned while harvesting, values hold id, doi, type, name and data_url
data_records

{1: {'id': 1,
  'doi': '10.1038/s41929-019-0334-3',
  'type': 'supplementary',
  'name': 'https://doi.org/10.17035/d.2019.0079744472',
  'data_url': 'https://doi.org/10.17035/d.2019.0079744472'},
 2: {'id': 1,
  'doi': '10.1038/s41929-019-0334-3',
  'type': 'supplementary',
  'name': 'Supplementary Figs. 1–11, Tables 1–4 and references',
  'data_url': 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41929-019-0334-3/MediaObjects/41929_2019_334_MOESM1_ESM.pdf'},
 3: {'id': 17,
  'doi': '10.1038/s41929-018-0213-3',
  'type': 'supplementary',
  'name': 'https://doi.org/10.5258/SOTON/D0709',
  'data_url': 'https://doi.org/10.5258/SOTON/D0709'},
 4: {'id': 17,
  'doi': '10.1038/s41929-018-0213-3',
  'type': 'supplementary',
  'name': 'Supplementary Information',
  'data_url': 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41929-018-0213-3/MediaObjects/41929_2018_213_MOESM1_ESM.pdf'},
 5: {'id': 25,
  'doi': '10.1038/s41467-018-03138-7',
  'type': 'supplementary',
  'name

In [11]:
# hand the collected records to the project's csv helper
# NOTE(review): presumably writes one row per record to pub_data.csv in the
# working directory - confirm against lib.handle_csv
csv_rw.write_csv_data(data_records, 'pub_data.csv')

In [None]:
# List the Nature publications for which no data reference was recorded.
# data_records is keyed by a running reference number, so the publication
# ids have to be taken from the records themselves.
pubs_with_data = {record['id'] for record in data_records.values()}
missing_pubs = []
for a_pub in db_pubs:
    pub_id = a_pub[0]
    pub_doi = a_pub[2]
    pub_url = a_pub[3]
    if "nature" in str(pub_url).lower() and pub_id not in pubs_with_data:
        # str() keeps the report going for rows without a DOI
        url = "http://dx.doi.org/" + str(pub_doi)
        print("missing: ", pub_id, pub_doi, url)
        missing_pubs.append(a_pub)

In [None]:
from urllib.parse import urlparse  # python 3.x

# Scratch cell: follow one DOI by hand and keep every intermediate value as a
# notebook global (response, base_url, soup, sup_info_divs) for the cells below.
url = 'http://dx.doi.org/10.1038/s41563-019-0562-6'
# browser-like User-Agent sent with the request
req_head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
response = requests.get(url, headers = req_head)
print(response)
# final address after the DOI redirect
print(response.url)
redirected_to = response.url
parsed_uri = urlparse(redirected_to)  # returns six components
print(parsed_uri)
domain = parsed_uri.netloc
result = domain.replace('www.', '')  # host without 'www.'; not used below
print(domain)
# scheme://host prefix, the same value get_base_url computes
base_url = parsed_uri.scheme + "://" + parsed_uri.netloc

print(base_url)
soup = BeautifulSoup(response.text,'html.parser')

# divs selected by the same class that div_filters configures for 'nature catalysis'
sup_info_divs = soup.find_all("div", {"class": "c-article-supplementary__item"})


In [None]:


# Record the first link of every supplementary-information div.
data_refs = []

for sup_inf_div in sup_info_divs:
    link = sup_inf_div.find('a')
    # find() gives None for a div without a link; a link without a target
    # has nothing to record either
    if link is None or not link.has_attr('href'):
        continue
    link_name = link.contents[0] if link.contents else ''
    print (link_name)
    print (link['href'])
    data_refs.append({'type':"supplementary", "name":link_name, 'data_url':link['href']})

In [None]:
# run the div extraction through the helper on the page parsed above;
# base_url is left at its default "", so links starting with '/' stay as found
data_refs = get_data_from_divs(soup, 'nature catalysis', [])

In [None]:
# show the references collected for the single page
data_refs

In [None]:
from urllib.parse import urlparse  # python 3.x
# Scratch check of urlparse on a sample address.
parsed_uri = urlparse('http://www.stackoverflow.com/questions/41899120/whatever')  # six-component parse result
domain = parsed_uri.netloc
# host name with every 'www.' occurrence removed
result = domain.replace('www.', '')
print(domain)