In [1]:
# Connecting to the db
import lib.handle_db as dbh

# read and write csv files
import lib.handle_csv as csv_rw

# Parsing html 
from bs4 import BeautifulSoup

# http requests 
import requests

# url parser
from urllib.parse import urlparse  # python 3.x

# <meta> tag `name` values that get_metadata() skips, keyed by journal
exclude_metadata = {
    'nature': [
        'viewport',
        'msapplication-TileColor',
        'msapplication-config',
        'theme-color',
        'application-name',
        'robots',
        'access',
        'WT.cg_s',
        'WT.z_bandiera_abtest',
        'WT.page_categorisation',
        'WT.template',
        'WT.z_cg_type',
        'WT.cg_n',
        'dc.rights',
        'prism.issn',
    ],
}

# attribute filters (per journal) for <section> elements which may contain
# references to data
section_labels = {
    'nature': {'aria-labelledby': 'data-availability'},
    'springer': {'aria-labelledby': 'data-availability'},
}

# attribute filters (per journal) for <div> elements which may contain
# references to data
div_filters = {
    'nature': {'class': 'c-article-supplementary__item'},
    'springer': {'class': "c-article-supplementary__item"},
}

#  Custom functions to get references to datasets
# returns a (beautifulsoup object, final url) pair for the given url
def get_content(url, timeout = 30):
    """Download `url` and parse the response body as HTML.

    Parameters:
        url: address to request.
        timeout: seconds passed to `requests.get` so that an unresponsive
            server cannot block the notebook indefinitely.

    Returns:
        A tuple `(html_soup, redirected_to)` where `html_soup` is the
        BeautifulSoup tree built from the response text with 'html.parser'
        and `redirected_to` is `response.url`.  If anything goes wrong the
        error is printed and `(None, None)` is returned, so callers must
        check for `None` before using either value.
    """
    html_soup = None
    # Bound before the try block: previously a failed request left this name
    # undefined and the return statement raised UnboundLocalError.
    redirected_to = None
    # Adjacent string literals are used instead of a backslash continuation,
    # which embedded the next line's indentation inside the header value.
    req_head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                              '(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
    try:
        response = requests.get(url, headers = req_head, timeout = timeout)
        redirected_to = response.url
        html_soup = BeautifulSoup(response.text, 'html.parser')
    except Exception as e:
        # Best effort: report the problem and hand back an explicit "nothing".
        print(e)
        html_soup, redirected_to = None, None
    return html_soup, redirected_to

# collect the <meta> tags of a page, minus the ones excluded for the journal
def get_metadata(soup, journal):
    """Return the named <meta> tags of `soup` that are not excluded.

    Tags without a `name` attribute are dropped, as are tags whose name is
    listed in `exclude_metadata[journal]`.  When `journal` has no entry in
    `exclude_metadata`, 'new journal' is printed and nothing is excluded.
    Any exception is printed and whatever was collected so far is returned.
    """
    kept = []
    try:
        meta_tags = soup.find_all('meta')
        if journal in exclude_metadata:
            skipped_names = exclude_metadata[journal]
        else:
            skipped_names = []
            print('new journal')
        for tag in meta_tags:
            if not tag.has_attr("name"):
                continue
            if tag["name"] in skipped_names:
                continue
            kept.append(tag)
    except Exception as e:
        print(e)
    return kept

# get data ref from metadata
def get_data_from_metadata(soup, journal = 'nature catalysis', data_refs = None):
    """Collect data references advertised in the page's <meta> tags.

    Every meta tag returned by `get_metadata(soup, journal)` whose name
    contains 'data' (case-insensitive) becomes one record of the form
    `{'type': 'metadata', 'name': <meta name>, 'data_url': <meta content>}`.

    Parameters:
        soup: parsed page, passed straight to `get_metadata`.
        journal: key used by `get_metadata` to pick the exclusion list.
            Previously this argument was ignored and 'nature catalysis' was
            always used.
        data_refs: optional list to append to; a new list is created when it
            is omitted.  Previously a passed-in list was silently replaced.

    Returns:
        The list holding the existing plus the newly found records.
    """
    # None sentinel instead of a shared mutable default list.
    if data_refs is None:
        data_refs = []
    # check if metadata references supporting data or supplementary data
    for md_item in get_metadata(soup, journal):
        md_name = md_item["name"]
        if 'data' in str(md_name).lower():
            # .get() tolerates a meta tag that has a name but no content.
            data_refs.append({'type':"metadata", "name":md_name, 'data_url':md_item.get("content")})
    return data_refs

def get_data_from_section(soup, journal = 'nature catalysis', data_refs = None, base_url=""):
    """Collect data references from the data-availability style sections.

    For each attribute filter registered for `journal` in `section_labels`,
    every matching <section> is scanned paragraph by paragraph:

    * a paragraph without links yields one record with its first child as
      the name and `data_url` set to None;
    * every link with an href yields one record with the link's first child
      as the name (None if the link is empty) and the href as `data_url`.

    Records have the keys 'type' (the filter value), 'name' and 'data_url'.

    Parameters:
        soup: parsed page exposing `find_all`.
        journal: key into `section_labels`.  An unknown journal prints
            'new journal' and adds nothing instead of raising KeyError.
        data_refs: optional list to append to; a new list is created when it
            is omitted.
        base_url: prefix added to site-relative hrefs (those starting with a
            single '/'); ignored when empty.

    Returns:
        The list holding the existing plus the newly found records.
    """
    # None sentinel: a literal [] default is shared between calls and made
    # results pile up from one call to the next.
    if data_refs is None:
        data_refs = []
    inspect_these = section_labels.get(journal)
    if inspect_these is None:
        print('new journal')
        return data_refs
    for sec_filter, sec_value in inspect_these.items():
        for section in soup.find_all('section', {sec_filter: sec_value}):
            for par in section.find_all('p'):
                references = par.find_all('a')
                if len(references) == 0:
                    # Text-only statement; an empty paragraph has nothing to record.
                    if par.contents:
                        data_refs.append({'type':sec_value, "name":par.contents[0], 'data_url':None})
                    continue
                for a_ref in references:
                    data_url = a_ref.get('href')
                    if not data_url:
                        # Anchor without a target cannot point at data.
                        continue
                    # Only site-relative paths get the prefix; '//host/...'
                    # already names its own host.
                    if data_url.startswith('/') and not data_url.startswith('//') and base_url != "":
                        data_url = base_url + data_url
                    link_name = a_ref.contents[0] if a_ref.contents else None
                    data_refs.append({'type':sec_value, "name":link_name, 'data_url':data_url})
    return data_refs

def get_data_from_divs(soup, journal = 'nature catalysis', data_refs = None, base_url=""):
    """Collect supplementary-material links from the journal's <div> blocks.

    For each attribute filter registered for `journal` in `div_filters`,
    the first link of every matching <div> becomes one record
    `{'type': 'supplementary', 'name': <first child of the link>,
    'data_url': <href>}`.  Divs without a link, or whose link has no href,
    are skipped.

    Parameters:
        soup: parsed page exposing `find_all`.
        journal: key into `div_filters`.  An unknown journal prints
            'new journal' and adds nothing instead of raising KeyError.
        data_refs: optional list to append to; a new list is created when it
            is omitted.
        base_url: prefix added to site-relative hrefs (those starting with a
            single '/'); ignored when empty.

    Returns:
        The list holding the existing plus the newly found records.
    """
    # None sentinel: a literal [] default is shared between calls and made
    # results pile up from one call to the next.
    if data_refs is None:
        data_refs = []
    inspect_these = div_filters.get(journal)
    if inspect_these is None:
        print('new journal')
        return data_refs
    for div_filter, filter_value in inspect_these.items():
        for div in soup.find_all('div', {div_filter: filter_value}):
            a_ref = div.find('a')
            if a_ref is None:
                # Matching block that carries no link at all.
                continue
            data_url = a_ref.get('href')
            if not data_url:
                continue
            # Only site-relative paths get the prefix; '//host/...' already
            # names its own host.
            if data_url.startswith('/') and not data_url.startswith('//') and base_url != "":
                data_url = base_url + data_url
            link_name = a_ref.contents[0] if a_ref.contents else None
            data_refs.append({'type':"supplementary", "name":link_name, 'data_url':data_url})
    return data_refs


# get a list of ids, titles, dois, links, pdf_file and
# html_file names from the app database
def get_pub_app_data(db_name = "app_db.sqlite3"):
    """Read the publications marked as 'Added' from the app database.

    Opens `db_name` through `dbh.DataBaseAdapter`, asks the 'articles' table
    for the columns id, title, doi, link, pdf_file and html_file where
    `status = 'Added'`, and returns whatever `get_values` returns.

    The connection is closed in a `finally` clause so that it is released
    even when the query raises; the exception itself still propagates.
    """
    search_in = 'articles'
    fields_required = "id, title, doi, link, pdf_file, html_file"
    filter_str = "status = 'Added'"
    db_conn = dbh.DataBaseAdapter(db_name)
    try:
        return db_conn.get_values(search_in, fields_required, filter_str)
    finally:
        db_conn.close()

def get_base_url(response_url):
    """Return the '<scheme>://<host>' prefix of `response_url`.

    An empty string is returned when `response_url` is empty/None or lacks a
    scheme or a network location.  The link helpers treat an empty base URL
    as "do not prefix", whereas the previous result for such input ("://")
    would have been glued in front of every relative link.
    """
    if not response_url:
        return ""
    parsed_uri = urlparse(response_url)  # returns six components
    if not parsed_uri.scheme or not parsed_uri.netloc:
        return ""
    return parsed_uri.scheme + "://" + parsed_uri.netloc

In [None]:
# Visit the landing page of every matching publication and gather all the
# data references found in its metadata, data sections and supplementary divs.
ukchapp_db = "db_files/app_db.sqlite3"
db_pubs = get_pub_app_data(ukchapp_db)
data_records = {}   # running counter -> {'id', 'doi', 'type', 'name', 'data_url'}
ref_count = 1
publisher = "springer"# "Nature Catalysis"
for a_pub in db_pubs:
    # row layout: id, title, doi, link, pdf_file, html_file
    pub_id, pub_doi, pub_url = a_pub[0], a_pub[2], a_pub[3]
    if publisher not in str(pub_url).lower():
        continue
    if not pub_doi:
        # Without a DOI there is no landing page to resolve (and string
        # concatenation with None would raise).
        print("no doi for publication: ", pub_id)
        continue
    # use doi reference to get landing page
    url = "http://dx.doi.org/" + pub_doi
    doc_content, response_url = get_content(url)
    if doc_content is None:
        # Page could not be retrieved; nothing to parse for this publication.
        continue
    base_url = get_base_url(response_url)
    # A fresh list is passed explicitly so results never leak between publications.
    res = []
    res = get_data_from_metadata(doc_content, publisher, res)
    res = get_data_from_section(doc_content, publisher, res, base_url)
    res = get_data_from_divs(doc_content, publisher, res, base_url)
    for data_ref in res:
        data_record = {'id':pub_id, 'doi':pub_doi}
        data_record.update(data_ref)
        data_records[ref_count] = data_record
        ref_count += 1


In [8]:
# Display the data references gathered by the scraping loop, keyed by the
# running reference counter.
data_records

{1: {'id': 27,
  'doi': '10.1007/s11244-018-0923-4',
  'type': 'data-availability',
  'name': 'The datasets acquired during and/or analysed during the current study are available from the corresponding author on reasonable request.',
  'data_url': None},
 2: {'id': 27,
  'doi': '10.1007/s11244-018-0923-4',
  'type': 'supplementary',
  'name': 'Supplementary material 1 (PDF 294 KB)',
  'data_url': 'https://static-content.springer.com/esm/art%3A10.1007%2Fs11244-018-0923-4/MediaObjects/11244_2018_923_MOESM1_ESM.pdf'},
 3: {'id': 31,
  'doi': '10.1007/s11244-018-0888-3',
  'type': 'supplementary',
  'name': 'Supplementary material 1 (DOCX 775 KB)',
  'data_url': 'https://static-content.springer.com/esm/art%3A10.1007%2Fs11244-018-0888-3/MediaObjects/11244_2018_888_MOESM1_ESM.docx'},
 4: {'id': 32,
  'doi': '10.1007/s11244-018-0887-4',
  'type': 'supplementary',
  'name': 'Supplementary material 1 (DOCX 1439 KB)',
  'data_url': 'https://static-content.springer.com/esm/art%3A10.1007%2Fs11244-

In [9]:
# Hand the collected records to the project's CSV helper, targeting
# 'pub_data.csv' (column layout is decided by lib.handle_csv — not shown here).
csv_rw.write_csv_data(data_records, 'pub_data.csv')

In [18]:
# List the Springer publications for which no data reference was recorded.
# Ids that do have records are gathered once, so each publication is a single
# set lookup instead of a scan over all of data_records.
pubs_with_data = {record['id'] for record in data_records.values()}
missing_pubs = []
for a_pub in db_pubs:
    # row layout: id, title, doi, link, pdf_file, html_file
    pub_id, pub_doi, pub_url = a_pub[0], a_pub[2], a_pub[3]
    if "springer" not in str(pub_url).lower():
        continue
    if pub_id in pubs_with_data:
        continue
    # A publication without a DOI is still reported, just without a lookup url.
    url = "http://dx.doi.org/" + pub_doi if pub_doi else None
    print("missing: ", pub_id, pub_doi, url)
    missing_pubs.append(a_pub)
missing_pubs

missing:  58 10.1007/s12039-019-1608-7 http://dx.doi.org/10.1007/s12039-019-1608-7
missing:  75 10.1007/s12010-017-2422-7 http://dx.doi.org/10.1007/s12010-017-2422-7
missing:  120 10.1007/s10562-016-1742-5 http://dx.doi.org/10.1007/s10562-016-1742-5
missing:  229 10.1007/s11244-016-0539-5 http://dx.doi.org/10.1007/s11244-016-0539-5
missing:  318 10.1007/s11244-019-01204-y http://dx.doi.org/10.1007/s11244-019-01204-y
missing:  338 10.1007/s11244-018-0885-6 http://dx.doi.org/10.1007/s11244-018-0885-6
missing:  456 10.1007/s11244-014-0346-9 http://dx.doi.org/10.1007/s11244-014-0346-9
missing:  478 10.1007/s11244-018-1057-4 http://dx.doi.org/10.1007/s11244-018-1057-4
missing:  496 10.1007/s11244-018-0890-9 http://dx.doi.org/10.1007/s11244-018-0890-9


[(58,
  'Influence of TiO2 structural properties on photocatalytic hydrogen gas production',
  '10.1007/s12039-019-1608-7',
  'http://link.springer.com/content/pdf/10.1007/s12039-019-1608-7.pdf',
  'Bahruji2019_Article_InfluenceOfHboxTiO2TiO2Structu.pdf',
  None),
 (75,
  'Nanosilicalites as Support for β-Glucosidases Covalent Immobilization',
  '10.1007/s12010-017-2422-7',
  'http://link.springer.com/article/10.1007/s12010-017-2422-7/fulltext.html',
  '2017_Article_.pdf',
  None),
 (120,
  'Application of Inelastic Neutron Scattering to the Methanol-to-Gasoline Reaction Over a ZSM-5 Catalyst',
  '10.1007/s10562-016-1742-5',
  'http://link.springer.com/content/pdf/10.1007/s10562-016-1742-5.pdf',
  'Howe2016_Article_ApplicationOfInelasticNeutronS.pdf',
  None),
 (229,
  'On the Role of Water in Heterogeneous Catalysis: A Tribute to Professor M. Wyn Roberts',
  '10.1007/s11244-016-0539-5',
  'http://link.springer.com/content/pdf/10.1007/s11244-016-0539-5.pdf',
  'Davies2016_Article_OnThe

In [6]:
# Scratch version of get_data_from_divs() working on a single parsed page.
# NOTE: `sup_info_divs` is only created by the exploratory cell further down
# (the one that downloads a single Springer article); on a fresh kernel that
# cell has to run first, otherwise this one stops with a NameError.
data_refs = []
for sup_inf_div in sup_info_divs:
    s_i_d_r =  sup_inf_div.find('a')
    if s_i_d_r is None or not s_i_d_r.has_attr('href'):
        # Supplementary block without a usable link: nothing to record.
        continue
    link_name = s_i_d_r.contents[0] if s_i_d_r.contents else None
    print (link_name)
    print (s_i_d_r['href'])
    data_refs.append({'type':"supplementary", "name":link_name, 'data_url':s_i_d_r['href']})

NameError: name 'sup_info_divs' is not defined

In [None]:
# `soup` holds a Springer article (see the exploratory cell that downloads it)
# and div_filters has no 'nature catalysis' entry, so the 'springer' filter is
# the one to use.  base_url is passed so that site-relative links are expanded.
data_refs = get_data_from_divs(soup, 'springer', [], base_url)

In [None]:
# Display the supplementary-material references currently held in data_refs.
data_refs

In [None]:
# Step-by-step walk through the scraping of one Springer article; it leaves
# `soup`, `base_url` and `sup_info_divs` behind for the scratch cells.
url = 'http://dx.doi.org/10.1007/s11244-018-0923-4'
req_head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
# timeout keeps an unresponsive server from blocking the notebook forever
response = requests.get(url, headers = req_head, timeout = 30)
print(response)
print(response.url)
redirected_to = response.url
parsed_uri = urlparse(redirected_to)  # returns six components
print(parsed_uri)
domain = parsed_uri.netloc
result = domain.replace('www.', '')  # host name without a leading 'www.'
print(domain)
base_url = parsed_uri.scheme + "://" + parsed_uri.netloc

print(base_url)
soup = BeautifulSoup(response.text,'html.parser')

sup_info_divs = soup.find_all("div", {"class": "c-article-supplementary__item"})
data_avl = soup.find_all('section', {'aria-labelledby':'data-availability'})
# Not every article has a data-availability section, nor paragraphs in it, so
# fall back to empty lists instead of indexing into nothing.
pars = data_avl[0].find_all('p') if data_avl else []
references = pars[0].find_all('a') if pars else []
if pars and pars[0].contents:
    print(pars[0].contents[0])
# Printed explicitly: a bare expression in the middle of a cell is not shown.
print(len(references))
res = []
res = get_data_from_section(soup, 'springer', res, base_url)
res