In [1]:
# Connecting to the db
import lib.handle_db as dbh

# Parsing html 
from bs4 import BeautifulSoup

# http requests 
import requests

# Per journal: `name` values of <meta> tags that get_metadata() leaves out of its result
exclude_metadata = {
    'nature catalysis': [
        'viewport',
        'msapplication-TileColor',
        'msapplication-config',
        'theme-color',
        'application-name',
        'robots',
        'access',
        'WT.cg_s',
        'WT.z_bandiera_abtest',
        'WT.page_categorisation',
        'WT.template',
        'WT.z_cg_type',
        'WT.cg_n',
        'dc.rights',
        'prism.issn',
    ],
}

# Per journal: <section> attribute(s) checked when looking for the section
# which may contain references to data, with the label value of that section
section_labels = {
    'nature catalysis': {
        'aria-labelledby': 'data-availability',
    },
}

#  Custom functions to get references to datasets
# returns beautifulsoup object from given url
def get_content(url, timeout = 30):
    """Download ``url`` and return the page parsed with BeautifulSoup.

    The request is sent with a Chrome-on-Windows User-Agent header.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get``: seconds to wait for the server before
        giving up. Defaults to 30.

    Returns
    -------
    BeautifulSoup or None
        The parsed document, or None when the download or the parsing raised.
        The exception is printed, not re-raised.
    """
    html_soup = None
    req_head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server
        response = requests.get(url, headers = req_head, timeout = timeout)
        html_soup = BeautifulSoup(response.text,'html.parser')
    except Exception as e:
        # Best effort: report the problem and fall through to return None
        print(e)
    return html_soup

# get metadata
def get_metadata(soup, journal):
    """Collect the named <meta> tags of a page, minus the journal's excluded names.

    Parameters
    ----------
    soup : object providing ``find_all``
        Parsed page, as returned by get_content().
    journal : str
        Key into ``exclude_metadata``. When the key is missing, 'new journal'
        is printed and no name is excluded.

    Returns
    -------
    list
        The <meta> tags that have a ``name`` attribute whose value is not in
        the exclusion list. Any exception is printed and whatever was
        collected up to that point is returned.
    """
    kept = []
    try:
        meta_tags = soup.find_all('meta')
        if journal not in exclude_metadata:
            print('new journal')
            skipped_names = []
        else:
            skipped_names = exclude_metadata[journal]
        for tag in meta_tags:
            # Tags without a name (charset, http-equiv, ...) are never kept
            if not tag.has_attr("name"):
                continue
            if tag["name"] in skipped_names:
                continue
            kept.append(tag)
    except Exception as e:
        print(e)
    return kept

# get data ref from metadata
def get_data_from_metadata(soup, journal = 'nature catalysis'):
    """Return the data reference found in the page's <meta> tags, if any.

    Parameters
    ----------
    soup : object providing ``find_all``
        Parsed page, as returned by get_content().
    journal : str, optional
        Journal whose exclusion list get_metadata() should apply.

    Returns
    -------
    str
        The ``content`` of the last <meta> tag whose ``name`` contains 'data'
        (case-insensitive) and which has a ``content`` attribute, or "" when
        there is no such tag.
    """
    ret_data = ""
    # Forward the caller's journal so the matching exclusion list is used
    for md_item in get_metadata(soup, journal):
        # check if metadata references supporting data or supplementary data
        if 'data' not in str(md_item["name"]).lower():
            continue
        # A data-named tag without content has nothing to return; skip it
        if not md_item.has_attr("content"):
            continue
        ret_data = md_item["content"]
    return ret_data

def get_data_from_section(soup, journal = 'nature catalysis'):
    """Return the data reference found in the journal's labelled <section>.

    ``section_labels[journal]`` maps attribute names to the value a <section>
    must carry for that attribute to be inspected. Every <a> inside the <p>
    elements of a matching section is visited.

    Parameters
    ----------
    soup : object providing ``find_all``, or None
        Parsed page, as returned by get_content().
    journal : str, optional
        Key into ``section_labels``.

    Returns
    -------
    The last child node of the last link visited, or "" when the page is
    None, the journal has no configured label, or no link is found.
    """
    data_link = ""
    # get_content() returns None when the download failed; nothing to inspect
    if soup is None:
        return data_link
    # Looked up once; a journal without an entry cannot match any section
    inspect_this = section_labels.get(journal, {})
    if not inspect_this:
        return data_link
    for section in soup.find_all('section'):
        for sect_attr, expected_label in inspect_this.items():
            # Compare against the configured label rather than a fixed string
            if section.has_attr(sect_attr) and section[sect_attr] == expected_label:
                for par in section.find_all('p'):
                    for reference in par.find_all('a'):
                        for content in reference.contents:
                            # Later links overwrite earlier ones
                            data_link = content
    return data_link

In [2]:
url = "http://dx.doi.org/10.1038/s41563-020-0800-y"

# Download and parse the article once; both lookups below work on this page
doc_content = get_content(url)

# First lookup: a data reference in the <meta> tags
res = get_data_from_metadata(doc_content, journal = 'nature catalysis')
if res == "":
    print("Not found in metadata")
else:
    print(res)

# Second lookup: a link in the journal's labelled section.
# `res` is reassigned here, so afterwards it holds the section result.
res = get_data_from_section(doc_content, 'nature catalysis')
if res == "":
    print("Not found in section")
else:
    print("Found as section: ", res)


Not found in metadata
Found as section:  https://github.com/AlexanderHoffman/supporting-info


In [3]:
# Show the value `res` was last assigned above: the result of get_data_from_section()
res

'https://github.com/AlexanderHoffman/supporting-info'

In [4]:
# Repeat by hand the request get_content() makes (same User-Agent header), for the
# `url` set in an earlier cell. Unlike get_content(), exceptions are not caught here.
# NOTE(review): presumably kept for ad-hoc inspection of the raw response - confirm
req_head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
response = requests.get(url, headers = req_head)
# Parse the page and collect every <section> element it contains
soup = BeautifulSoup(response.text,'html.parser')  
sections = soup.find_all('section')

