# Get data references from html pages

A list of publications is obtained from the app database. This list contains titles, IDs and DOIs, which need to be explored to look for associated data (supplementary data, raw data, processed data).

The steps of the process are:

1. get a Title, DOI, and URL for each publication 
2. get the DOI landing page and see if it contains references to data 
3. add a new dataset entry each time a new ds is found 
4. link the dataset to the publication

In [None]:
# Connecting to the db
import lib.handle_db as dbh

# read and write csv files
import lib.handle_csv as csv_rw

# Parsing html 
from bs4 import BeautifulSoup

# http requests 
import requests

# url parser
from urllib.parse import urlparse  # python 3.x

# add a progress bar
from tqdm import tqdm_notebook 

# library for using regular expressions
import re

# Names of <meta> tags that carry no useful information, listed per publisher.
# Tags whose name appears here are dropped when the page metadata is collected.
exclude_metadata = {
    "nature": [
        "viewport",
        "msapplication-TileColor",
        "msapplication-config",
        "theme-color",
        "application-name",
        "robots",
        "access",
        "WT.cg_s",
        "WT.z_bandiera_abtest",
        "WT.page_categorisation",
        "WT.template",
        "WT.z_cg_type",
        "WT.cg_n",
        "dc.rights",
        "prism.issn",
    ],
    "springer": [
        "viewport",
        "msapplication-TileColor",
        "msapplication-config",
        "theme-color",
        "application-name",
        "robots",
        "access",
        "WT.cg_s",
        "WT.z_bandiera_abtest",
        "WT.page_categorisation",
        "WT.template",
        "WT.z_cg_type",
        "WT.cg_n",
        "dc.rights",
        "prism.issn",
    ],
    "wiley": [],
    "rsc": [
        "viewport",
        "format-detection",
        "msapplication-TileColor",
        "theme-color",
        "dc.domain",
        "twitter:card",
        "twitter:site",
    ],
    "acs": [
        "pbContext",
        "viewport",
        "robots",
        "twitter:description",
        "pb-robots-disabled",
        "twitter:card",
        "twitter:site",
        "twitter:image",
        "twitter:title",
        "google-site-verification",
    ],
}

# Attribute/value pairs identifying the <section> elements which may contain
# references to data (each publisher gets its own dictionary)
section_labels = {
    publisher: {"aria-labelledby": "data-availability"}
    for publisher in ("nature", "springer")
}

# Attribute/value pairs identifying the <div> elements which may contain
# references to data (each publisher gets its own dictionary)
div_filters = {
    publisher: {"class": "c-article-supplementary__item"}
    for publisher in ("nature", "springer")
}

#  Custom functions to get references to datasets
# returns beautifulsoup object and final address for a given url
def get_content(url, timeout=30):
    """Download *url* and parse the response body as HTML.

    Args:
        url: address to request.
        timeout: seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the caller forever.

    Returns:
        A tuple ``(html_soup, redirected_to)``: the parsed document and the
        URL of the response that was finally received. When the request or
        the parsing fails the error is printed and whatever could not be
        obtained is returned as ``None``.
    """
    html_soup = None
    # Bound before the request so that a failure inside the try block cannot
    # leave the name undefined when the function returns.
    redirected_to = None
    try:
        # Adjacent literals are joined by the parser; a backslash continuation
        # inside the string would embed the next line's indentation in the
        # header value.
        req_head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                                  '(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
        response = requests.get(url, headers=req_head, timeout=timeout)
        redirected_to = response.url
        html_soup = BeautifulSoup(response.text, 'html.parser')
    except Exception as e:
        # best effort: report the problem and let the caller check for None
        print(e)
    return html_soup, redirected_to

# get metadata
def get_metadata(soup, journal):
    """Return the named <meta> tags of a page, minus the excluded ones.

    ``journal`` selects the list of names to skip from ``exclude_metadata``;
    for a journal without an entry the message 'new journal' is printed and
    nothing is skipped. Tags without a ``name`` attribute are never returned.
    Any exception is printed and the tags gathered up to that point are
    returned.
    """
    kept = []
    try:
        meta_tags = soup.find_all('meta')
        if journal in exclude_metadata:
            skipped_names = exclude_metadata[journal]
        else:
            skipped_names = []
            print('new journal')
        for tag in meta_tags:
            if not tag.has_attr("name"):
                continue
            if tag["name"] in skipped_names:
                continue
            kept.append(tag)
    except Exception as e:
        print(e)
    return kept

# get data ref from metadata
def get_data_from_metadata(soup, journal='nature', data_refs=None):
    """Collect data references announced in the page's <meta> tags.

    A tag is treated as a data reference when its ``name`` contains the text
    "data" (case-insensitive).

    Args:
        soup: parsed page to inspect.
        journal: publisher key handed to ``get_metadata``.
        data_refs: list to extend; a new list is created when omitted. The
            argument used to be discarded, which silently dropped references
            the caller had already gathered.

    Returns:
        ``data_refs`` with one ``{'type', 'name', 'data_url'}`` dictionary
        appended per matching tag. ``data_url`` is ``None`` for a tag that has
        no ``content`` attribute.
    """
    if data_refs is None:
        data_refs = []
    for md_item in get_metadata(soup, journal):
        # check if metadata references supporting data or supplementary data
        if 'data' not in str(md_item["name"]).lower():
            continue
        data_refs.append({'type': "metadata", "name": md_item["name"],
                          'data_url': md_item.get("content")})
    return data_refs

def get_data_from_section(soup, journal='nature', data_refs=None, base_url=""):
    """Collect data references from the page's data-availability sections.

    The sections to inspect are selected with the attribute/value pairs stored
    for ``journal`` in ``section_labels``. Every link found in a paragraph of
    such a section becomes a reference; a paragraph without links is recorded
    as a plain statement with ``data_url`` set to ``None``.

    Args:
        soup: parsed page to inspect.
        journal: publisher key; an unknown key yields no references.
        data_refs: list to extend; a new list is created when omitted (a
            shared ``[]`` default would leak results between calls).
        base_url: scheme and host prepended to links starting with "/".

    Returns:
        ``data_refs`` with the references found appended.
    """
    if data_refs is None:
        data_refs = []
    inspect_these = section_labels.get(journal, {})
    for sec_filter, sec_value in inspect_these.items():
        for section in soup.find_all('section', {sec_filter: sec_value}):
            for par in section.find_all('p'):
                references = par.find_all('a')
                if not references:
                    # an empty <p> has nothing worth recording
                    if par.contents:
                        data_refs.append({'type': sec_value, "name": par.contents[0], 'data_url': None})
                    continue
                for a_ref in references:
                    data_url = a_ref.get('href')
                    # skip anchors that have no target or no visible content
                    if not data_url or not a_ref.contents:
                        continue
                    if data_url.startswith('/') and base_url != "":
                        data_url = base_url + data_url
                    data_refs.append({'type': sec_value, "name": a_ref.contents[0], 'data_url': data_url})
    return data_refs

def get_data_from_divs(soup, journal='nature', data_refs=None, base_url=""):
    """Collect supplementary-file links from the page's <div> elements.

    The divs to inspect are selected with the attribute/value pairs stored for
    ``journal`` in ``div_filters``; the first link inside each matching div is
    recorded with type "supplementary".

    Args:
        soup: parsed page to inspect.
        journal: publisher key; an unknown key yields no references.
        data_refs: list to extend; a new list is created when omitted (a
            shared ``[]`` default would leak results between calls).
        base_url: scheme and host prepended to links starting with "/".

    Returns:
        ``data_refs`` with the references found appended.
    """
    if data_refs is None:
        data_refs = []
    inspect_these = div_filters.get(journal, {})
    for div_filter, div_value in inspect_these.items():
        for div in soup.find_all('div', {div_filter: div_value}):
            a_ref = div.find('a')
            # a div without a usable link cannot point to a file
            if a_ref is None or not a_ref.contents:
                continue
            data_url = a_ref.get('href')
            if not data_url:
                continue
            if data_url.startswith('/') and base_url != "":
                data_url = base_url + data_url
            data_refs.append({'type': "supplementary", "name": a_ref.contents[0], 'data_url': data_url})
    return data_refs


# Wiley online stores supplementary in tables on the article page.
table_filters = {'wiley': {"class": "support-info__table"}}


def get_data_from_tables(soup, journal='wiley', data_refs=None, base_url=""):
    """Collect supplementary-file links from support-information tables.

    The tables to inspect are selected with the attribute/value pairs stored
    for ``journal`` in ``table_filters``. In each row the cell with headers
    "article-filename" supplies the link and the cell with headers
    "article-description" supplies the reference type.

    Args:
        soup: parsed page to inspect.
        journal: publisher key; an unknown key yields no references.
        data_refs: list to extend; a new list is created when omitted (a
            shared ``[]`` default would leak results between calls).
        base_url: scheme and host prepended to links starting with "/".

    Returns:
        ``data_refs`` with one dictionary appended per row that has a link.
        Rows without a file link (for instance heading rows) are skipped, and
        a row without a description is typed "supplementary".
    """
    if data_refs is None:
        data_refs = []
    inspect_these = table_filters.get(journal, {})
    for tbl_filter, tbl_value in inspect_these.items():
        for table in soup.find_all('table', {tbl_filter: tbl_value}):
            # get the type and link from each row
            for tr in table.find_all('tr'):
                td_link = tr.find('td', {"headers": "article-filename"})
                data_link = td_link.find('a') if td_link is not None else None
                if data_link is None:
                    continue
                data_url = data_link.get('href')
                if not data_url or not data_link.contents:
                    continue
                if data_url.startswith('/') and base_url != "":
                    data_url = base_url + data_url
                td_desc = tr.find('td', {"headers": "article-description"})
                if td_desc is not None and td_desc.contents:
                    data_type = td_desc.contents[0]
                else:
                    data_type = "supplementary"
                data_refs.append({'type': data_type, "name": data_link.contents[0], 'data_url': data_url})
    return data_refs

# extract from anchor in text publications
a_filters = {'rsc': {"class": "list__item-link"}, 'acs': {"class": "suppl-anchor"}}


def get_data_from_anchor(soup, journal='rsc', data_refs=None, base_url=""):
    """Collect supplementary-file links from anchors of the page.

    The anchors to inspect are selected with the attribute/value pairs stored
    for ``journal`` in ``a_filters``. Only anchors located after the last
    <h2> heading mentioning "supplementary" are considered; without such a
    heading every matching anchor is considered.

    Args:
        soup: parsed page to inspect.
        journal: publisher key; a key missing from ``a_filters`` yields no
            references (it used to raise UnboundLocalError).
        data_refs: list to extend; a new list is created when omitted (a
            shared ``[]`` default would leak results between calls).
        base_url: scheme and host prepended to links starting with "/".

    Returns:
        ``data_refs`` with one "supplementary" dictionary appended per link
        that has both a target and a name.
    """
    if data_refs is None:
        data_refs = []
    inspect_these = a_filters.get(journal, {})
    if not inspect_these:
        return data_refs

    # find line for supplementary; the heading scan does not depend on the
    # anchor filter, so it is done once
    supp_h2_line = -1
    for a_head in soup.find_all("h2"):
        for content in a_head.contents:
            if content is not None and "supplementary" in str(content).lower():
                # a parser that records no positions reports None; keep the
                # previous offset in that case
                if a_head.sourceline is not None:
                    supp_h2_line = a_head.sourceline

    for a_filter, a_value in inspect_these.items():
        # Use the position of "header line" as offset to look for data links
        for link in soup.find_all("a", {a_filter: a_value}):
            link_line = link.sourceline
            if link_line is not None and not link_line > supp_h2_line:
                continue
            dt_link = dt_name = ""
            if journal == 'rsc':
                for a_span in link.find_all("span", {"class": "list__item-label"}):
                    for contnt in a_span.contents:
                        if 'supplementary' in str(contnt).lower():
                            dt_link = link.get('href') or ""
                            dt_name = str(contnt).strip()
                        # bs4's Tag class is not imported here, so nested
                        # tags are recognised by the name of their type
                        if str(type(contnt)) == "<class 'bs4.element.Tag'>" and contnt.contents:
                            dt_name += str(contnt.contents[0])
            elif journal == 'acs':
                dt_link = link.get('href') or ""
                dt_name = str(link.contents[0]).strip() if link.contents else ""
            if dt_link != "" and dt_name != "":
                if dt_link.startswith('/') and base_url != "":
                    dt_link = base_url + dt_link
                data_refs.append({'type': 'supplementary', "name": dt_name, 'data_url': dt_link})
    return data_refs

# get full doc from rsc landing page
def get_full_html_doc(soup):
    """Fetch the full-text HTML version announced in the page metadata.

    Looks for <meta name="citation_fulltext_html_url"> tags and, when at least
    one is present, downloads the address held in the first tag's ``content``
    attribute with ``get_content``.

    Returns:
        The ``(soup, url)`` pair produced by ``get_content``, or
        ``(None, None)`` when the page has no such tag.
    """
    fulltext_tags = soup.find_all("meta", {"name": "citation_fulltext_html_url"})
    if not fulltext_tags:
        return None, None
    full_soup, full_url = get_content(fulltext_tags[0]['content'])
    return full_soup, full_url

# verify if statement refers to supporting data
def is_data_stmt(statement=""):
    """Tell whether *statement* reads like a data-availability statement.

    The statement is compared, ignoring case, against a list of keywords; it
    qualifies when more than two different keywords occur in it.

    Args:
        statement: text fragment to examine; empty or ``None`` never
            qualifies.

    Returns:
        ``True`` when at least three keywords are present, else ``False``.
    """
    # "information" and "provide" are separate keywords (a missing comma used
    # to fuse them into one that never matched)
    support_keys = ("data", "underpin", "support", "result", "found", "find", "obtain", "doi", "raw",
                    "information", "provide", "available", "online")
    if not statement:
        return False
    text = statement.lower()
    hits = sum(1 for a_word in support_keys if a_word in text)
    return hits > 2

# get data references from full html doc
def get_data_from_html_doc(soup, journal='rsc', data_refs=None, base_url=""):
    """Collect links that appear next to a data statement in the article text.

    Every <p> and <span> that contains links is scanned; when one of its
    direct children mentions "data" and the surrounding sentence passes
    ``is_data_stmt``, all links of that element are recorded with type
    "supporting". For journal 'rsc' the full-text page advertised in the
    metadata is fetched and scanned instead of ``soup`` when it is available.

    Args:
        soup: parsed landing page.
        journal: publisher key.
        data_refs: list to extend; a new list is created when omitted (a
            shared ``[]`` default would leak results between calls).
        base_url: scheme and host prepended to links starting with "/".

    Returns:
        ``data_refs`` with the references found appended. In-page links
        (starting with "#") and links without a target or name are ignored.
    """
    if data_refs is None:
        data_refs = []
    if journal == 'rsc':
        # rsc lists the link to full html document in metadata
        more_soup, another_url = get_full_html_doc(soup)
        if more_soup is not None and another_url is not None:
            base_url = get_base_url(another_url)
            soup = more_soup
    for tag_name in ('p', 'span'):
        for para in soup.find_all(tag_name):
            anchor_refs = para.find_all('a')
            if not anchor_refs:
                continue
            for cont_para in para.contents:
                content = str(cont_para).lower()
                if 'data' not in content:
                    continue
                # judge the text after the last full stop when it is the part
                # mentioning data, otherwise the text before that stop
                last_stop = content.rfind(".")
                tail = content[last_stop + 2:]
                intresting = tail if 'data' in tail else content[:last_stop]
                if not is_data_stmt(intresting):
                    continue
                for a_ref in anchor_refs:
                    dt_link = a_ref.get('href') or ""
                    dt_name = str(a_ref.contents[0]) if a_ref.contents else ""
                    if dt_link != "" and dt_name != "" and dt_link[0] != "#":
                        if dt_link.startswith('/') and base_url != "":
                            dt_link = base_url + dt_link
                        data_refs.append({'type': 'supporting', "name": dt_name, 'data_url': dt_link})
                # the element's links are now recorded; further matching
                # fragments would only append the same links again
                break
    return data_refs

# get a list of ids, titles, dois, links, pdf_file and
# html_file names from the app database
def get_pub_app_data(db_name="app_db.sqlite3"):
    """Read the publications with status 'Added' from the app database.

    Args:
        db_name: path of the database file handed to ``dbh.DataBaseAdapter``.

    Returns:
        Whatever ``get_values`` yields for the fields id, title, doi, link,
        pdf_file and html_file of the 'articles' table.
    """
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, title, doi, link, pdf_file, html_file"
    filter_str = "status = 'Added'"
    try:
        return db_conn.get_values(search_in, fields_required, filter_str)
    finally:
        # release the connection even when the query fails
        db_conn.close()

def get_base_url(response_url):
    """Return the scheme and network location of *response_url*.

    The result has the form ``scheme://netloc``; path, parameters, query and
    fragment are dropped.
    """
    scheme, netloc, *_ = urlparse(response_url)
    return "://".join((scheme, netloc))

# use regular expression to check if a given string
# is a valid DOI, using pattern from CR
def valid_doi(cr_doi):
    """Tell whether *cr_doi* has the shape of a Crossref DOI.

    The whole string must consist of "10.", a registrant code of four to nine
    digits, "/" and a suffix built from letters, digits and ``-._;()/:``
    (case-insensitive).

    Args:
        cr_doi: value to check; anything that is not a string is rejected.

    Returns:
        ``True`` for a well-formed DOI, otherwise ``False``.
    """
    # CR DOIS: https://www.crossref.org/blog/dois-and-matching-regular-expressions/
    # CR DOIs re1
    # /^10.\d{4,9}/[-._;()/:A-Z0-9]+$/i
    if not isinstance(cr_doi, str):
        return False
    # raw string keeps \d intact; the dot after "10" is escaped so that only a
    # literal full stop is accepted there
    cr_re_01 = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+'
    return re.fullmatch(cr_re_01, cr_doi, re.IGNORECASE) is not None

In [None]:
# get the publications list from the app database
ukchapp_db = "db_files/app_db2.sqlite3"
db_pubs = get_pub_app_data(ukchapp_db)

# get the list of dois already mined for data
input_file = 'pub_data_add202012.csv'
id_field = 'num'
processed, headings = csv_rw.get_csv_data(input_file, id_field)
# processed_dois stays a list in first-seen order; the companion set makes
# the membership tests below constant time
processed_dois = []
known_dois = set()
for entry in processed:
    entry_doi = processed[entry]['doi']
    if entry_doi not in known_dois:
        known_dois.add(entry_doi)
        processed_dois.append(entry_doi)

# only publications with an id above this value are checked
last_checked_id = 616
# name fragments looked for in the address of the landing page
publishers = ['acs', "wiley", "springer", "rsc", 'nature', 'elsevier']

data_records = {}
ref_count = 1
for a_pub in tqdm_notebook(db_pubs):
    if not a_pub[0] > last_checked_id:
        continue
    pub_id = a_pub[0]
    pub_title = a_pub[1]
    pub_doi = a_pub[2]
    pub_url = a_pub[3]
    pub_pdf = a_pub[4]
    pub_html = a_pub[5]
    if pub_doi in known_dois or not valid_doi(pub_doi):
        continue
    # use doi reference to get landing page
    url = "http://dx.doi.org/" + pub_doi
    doc_content, response_url = get_content(url)
    if doc_content is None or response_url is None:
        # without a parsed page there is nothing to look through
        print(pub_id, "could not retrieve landing page for", pub_doi)
        continue
    base_url = get_base_url(response_url)
    publisher = 'another_pub'
    # when several fragments occur in the address the last one listed wins
    for pb_name in publishers:
        if pb_name in base_url:
            publisher = pb_name
    print(pub_id, "Title: ", pub_title, " look up: ", base_url, " publisher:", publisher)
    res = []
    if publisher in ['springer', 'nature']:
        res = get_data_from_metadata(doc_content, publisher, res)
        res = get_data_from_section(doc_content, publisher, res, base_url)
        res = get_data_from_divs(doc_content, publisher, res, base_url)
    if publisher in ['wiley']:
        res = get_data_from_tables(doc_content, publisher, res, base_url)
    if publisher in ['rsc', 'acs']:
        res = get_data_from_anchor(doc_content, publisher, res, base_url)
        res = get_data_from_html_doc(doc_content, publisher, res, base_url)
    # one record per reference, keyed by a running number and tagged with
    # the publication it was found for
    for data_ref in res:
        data_record = {'id': pub_id, 'doi': pub_doi}
        data_record.update(data_ref)
        data_records[ref_count] = data_record
        ref_count += 1

In [None]:
# number of distinct DOIs read from the csv file of already mined publications
len(processed_dois)

In [None]:
# Save the references gathered above; nothing is written when none were found.
# NOTE(review): the target is the same file the processed DOIs were read
# from - confirm that write_csv_data does not discard the earlier rows.
if data_records:
    csv_rw.write_csv_data(data_records, 'pub_data_add202012.csv')

In [None]:
# List the acs publications for which no data reference was recorded
missing_pubs = []
# ids of the publications that own at least one data record; built once so
# the records are not scanned again for every publication
pubs_with_data = {data_records[dr]['id'] for dr in data_records}
for a_pub in db_pubs:
    if a_pub[0] > 616: # only check new publications added after 616
        pub_id = a_pub[0]
        pub_title = a_pub[1]
        pub_doi = a_pub[2]
        pub_url = a_pub[3]
        pub_pdf = a_pub[4]
        pub_html = a_pub[5]
        if "acs" in str(pub_url).lower() and pub_id not in pubs_with_data:
            # a publication without DOI has no resolver address
            url = "http://dx.doi.org/" + pub_doi if pub_doi else None
            print("missing: ", pub_id, pub_doi, url)
            missing_pubs.append(a_pub)
print(missing_pubs)

In [None]:
# Exploratory cell: fetch one DOI landing page, print where the request ended
# up and list every <meta> tag of the page.
url = 'https://doi.org/10.1016/j.apcata.2018.10.010'
req_head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
response = requests.get(url, headers = req_head)

# request again the address the first response ended on
# NOTE(review): requests normally follows redirects by itself, so this second
# request may be redundant - confirm before removing
response = requests.get(response.url, headers = req_head)
print(response)
print(response.url)
redirected_to = response.url
parsed_uri = urlparse(redirected_to)  # returns six components
print(parsed_uri)
domain = parsed_uri.netloc
result = domain.replace('www.', '')  # host without 'www.'; not used again in this cell
print(domain)
# scheme and host of the final address
base_url = parsed_uri.scheme + "://" + parsed_uri.netloc

print(base_url)
soup = BeautifulSoup(response.text,'html.parser')

# list all the meta tags of the page (the commented-out filter would keep only
# the tag named citation_fulltext_html_url)
metadata = soup.find_all("meta")#,{"name":"citation_fulltext_html_url"})
for meta in metadata:
  print(meta)

In [None]:
# number of elements currently held in metadata
len(metadata)

In [None]:
# Redefine the per-publisher lists of <meta> names to ignore, now with an
# (empty) entry for elsevier
exclude_metadata = {
    "nature": [
        "viewport",
        "msapplication-TileColor",
        "msapplication-config",
        "theme-color",
        "application-name",
        "robots",
        "access",
        "WT.cg_s",
        "WT.z_bandiera_abtest",
        "WT.page_categorisation",
        "WT.template",
        "WT.z_cg_type",
        "WT.cg_n",
        "dc.rights",
        "prism.issn",
    ],
    "springer": [
        "viewport",
        "msapplication-TileColor",
        "msapplication-config",
        "theme-color",
        "application-name",
        "robots",
        "access",
        "WT.cg_s",
        "WT.z_bandiera_abtest",
        "WT.page_categorisation",
        "WT.template",
        "WT.z_cg_type",
        "WT.cg_n",
        "dc.rights",
        "prism.issn",
    ],
    "wiley": [],
    "rsc": [
        "viewport",
        "format-detection",
        "msapplication-TileColor",
        "theme-color",
        "dc.domain",
        "twitter:card",
        "twitter:site",
    ],
    "acs": [
        "pbContext",
        "viewport",
        "robots",
        "twitter:description",
        "pb-robots-disabled",
        "twitter:card",
        "twitter:site",
        "twitter:image",
        "twitter:title",
        "google-site-verification",
    ],
    "elsevier": [],
}
# only the meta tags that carry a name attribute
metadata = soup.find_all("meta", {"name": True})

publisher = 'elsevier'
# print the name and the full tag of every meta tag that is not excluded
for md in metadata:
    if md['name'] in exclude_metadata[publisher]:
        continue
    print("X:", md['name'])
    print(md)