# Add datasets to UK Catalysis Hub app db
A list of publications is obtained from the app database. This list contains the titles, IDs and DOIs, which need to be explored to look for associated data (supplementary data, raw data, processed data). 

The steps of the process are: 

 -a) get a Title, DOI, and URL for each publication
 -b) get the DOI landing page and see if it contains references to data
 -c) add a new dataset entry each time a new ds is found
 -d) link the dataset to the publication.

In [1]:
# Libraries
# library containing functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searchs
import lib.handle_urls as urlh
# managing files and file paths
from pathlib import Path
# add a progress bar
from tqdm import tqdm_notebook 
# library for getting data from crossref
import lib.crossref_api as cr_api
#library for handling json files
import json
# library for using regular expressions
import re


In [2]:
# Custom Functions
# get the crossreference json page from doi
def get_cr_json_object(cr_doi):
  crjd = None
  doi_file = 'json_files/' + cr_doi.replace('/','_').lower() + '.json'
  if not Path(doi_file).is_file():
    crjd = cr_api.getBibData(cr_doi)
    with open(doi_file, 'w', encoding='utf-8') as f:
                json.dump(crjd, f, ensure_ascii=False, indent=4)
  else:
    jf = open(doi_file, 'r')
    crjd = json.load(jf)
  # return the content and the file name 
  return crjd, doi_file

# get the landing page for the publication from uri
def get_pub_html_doi(cr_doi):
    html_file = 'html_files/' + cr_doi.replace('/','_').lower() + '.html'
    if not Path(html_file).is_file():
        page_content = urlh.getPageFromDOI(doi_text)
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(page_content.decode("utf-8") )
    else:
        f = open(html_file, "r")
        page_content = f.read()
    return page_content, html_file
             
def get_titles(str_pub_title, db_name = "prev_search.sqlite3"):
    print(db_name)
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'prev_pop_searches'
    fields_required = "Num, Title"
    filter_str = "Title like '"+str_pub_title[0]+"%';"

    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

def get_titles_and_dois(str_pub_title, db_name = "app_db.sqlite3"):
    print(db_name)
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, title, doi"
    filter_str = "Title like '"+str_pub_title[0]+"%';"
    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

def get_pub_app_data(db_name = "app_db.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, title, doi, link"
    filter_str = "status = 'Added'"
    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

# get the current csv working file
def get_working_file(nr_wf):
    working_file = wf_fields = None
    current_pass = 0
    if Path(nr_wf).is_file():
        working_file, wf_fields = csvh.get_csv_data(nr_wf,'Num')
        for art_num in tqdm_notebook(working_file):
            if 'ignore' in working_file[art_num].keys():
                if current_pass < int(working_file[art_num]['ignore']):
                    current_pass = int(working_file[art_num]['ignore'])
            else:
                break
    print("Current pass:", current_pass)
    return working_file, wf_fields, current_pass

def get_pub_html_url(text_url, entry_id):
    html_file = 'html_files/' +  entry_id + '.html'
    if not Path(html_file).is_file():
        print("")
        page_content = urlh.getPageFromURL(text_url)
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(page_content)
    else:
        f = open(html_file, "r")
        page_content = f.read()
    return page_content, html_file

def valid_doi(cr_doi):
    # CR DOIS: https://www.crossref.org/blog/dois-and-matching-regular-expressions/
    # CR DOIs re1
    # /^10.\d{4,9}/[-._;()/:A-Z0-9]+$/i
    cr_re_01 = '^10.\d{4,9}/[-._;()/:A-Z0-9]+'
    compare = re.match(cr_re_01, cr_doi, re.IGNORECASE)
    if compare != None and cr_doi == compare.group():
        return True
    else:
        return False
    
# get a semicolon separated list of authors from CR json data
def get_cr_author_list(article_data):
    authors = []
    if 'author' in article_data.keys():
        for author in article_data['author']:
            new_author=""
            new_author = author['family']
            if 'given' in author.keys():
                new_author += ", " + author['given']
            authors.append(new_author)
    return ("; ").join(authors)

# get the publication date from CR json data
def get_cr_year_published(article_data):
    year_print = 0
    if 'published-print' in article_data.keys() \
        and article_data['published-print'] != None \
        and article_data['published-print']['date-parts'][0] != None:
        year_print = int(article_data['published-print']['date-parts'][0][0])    
    elif 'journal-issue' in article_data.keys() \
        and article_data['journal-issue'] != None \
        and 'published-print' in article_data['journal-issue'].keys() \
        and article_data['journal-issue']['published-print'] != None \
        and article_data['journal-issue']['published-print']['date-parts'][0] != None:
        year_print = int(article_data['journal-issue']['published-print']['date-parts'][0][0])

    year_online = 0
    if 'published-online' in article_data.keys() \
        and article_data['published-online'] != None \
        and article_data['published-online']['date-parts'][0] != None:
        year_online = int(article_data['published-online']['date-parts'][0][0])    
    elif 'journal-issue' in article_data.keys() \
        and article_data['journal-issue'] != None \
        and 'published-online' in article_data['journal-issue'].keys() \
        and article_data['journal-issue']['published-online'] != None \
        and article_data['journal-issue']['published-online']['date-parts'][0] != None:
        year_print = int(article_data['journal-issue']['published-online']['date-parts'][0][0])
    
    if year_print != 0 and year_online != 0:
        return year_print if year_print < year_online else year_online
    else:
        return year_print if year_online == 0 else year_online
    return 0


Get the name of the current app db file:

In [3]:
# app db file with path: db_files/app_db.sqlite3
ukchapp_db = "db_files/app_db.sqlite3"
while not Path(ukchapp_db).is_file():
    print('Please enter the name of app db file:')
    ukchapp_db = input()

## Get pdf files for publications

Read database and try to recover pdf files

In [21]:
import requests
# get publication data from the ukch app
db_pubs = get_pub_app_data(ukchapp_db)

for a_pub in tqdm_notebook(db_pubs):
    pub_id = a_pub[0]
    pub_title = a_pub[1]
    pub_doi = a_pub[2]
    pub_url = a_pub[3]
   
    if "pdf" in pub_url:
        print ("try to get the pdf")
        try:
            response = requests.get(pub_url)
            content_type = response.headers['content-type']
            if not 'text' in content_type:
                #print(response.headers)
                cd= response.headers['content-disposition']
                #print(cd)
                fname = re.findall("filename=(.+)", cd)[0]
                #print(fname)
                with open('pdf_files/'+ fname +'.pdf', 'wb') as f:
                    f.write(response.content)
        except:
            print("ID: ", pub_id, "\nPublication: ",pub_title, 
                   "\nDOI: ", pub_doi, "\nDOI: ", pub_url)
    
    

HBox(children=(IntProgress(value=0, max=343), HTML(value='')))

try to get the pdf
try to get the pdf
try to get the pdf
try to get the pdf
ID:  8 
Publication:  QM/MM simulations identify the determinants of catalytic activity differences between type II dehydroquinase enzymes 
DOI:  10.1039/c8ob00066b 
DOI:  http://pubs.rsc.org/en/content/articlepdf/2018/OB/C8OB00066B
try to get the pdf
try to get the pdf
try to get the pdf
try to get the pdf
try to get the pdf
try to get the pdf
try to get the pdf
ID:  16 
Publication:  Waste not, want not: CO2 (re)cycling into block polymers 
DOI:  10.1039/c9cc02459j 
DOI:  http://pubs.rsc.org/en/content/articlepdf/2019/CC/C9CC02459J
try to get the pdf
try to get the pdf
ID:  22 
Publication:  Room temperature methoxylation in zeolite H-ZSM-5: an operando DRIFTS/mass spectrometric study 
DOI:  10.1039/c8cc07444e 
DOI:  http://pubs.rsc.org/en/content/articlepdf/2018/CC/C8CC07444E
try to get the pdf
try to get the pdf
ID:  24 
Publication:  The effects of MTG catalysis on methanol mobility in ZSM-5 
DOI:  10.1039

try to get the pdf
try to get the pdf
try to get the pdf
try to get the pdf
try to get the pdf
try to get the pdf
try to get the pdf
ID:  140 
Publication:  Light alkane oxidation using catalysts prepared by chemical vapour impregnation: tuning alcohol selectivity through catalyst pre-treatment 
DOI:  10.1039/c4sc00545g 
DOI:  http://pubs.rsc.org/en/content/articlepdf/2014/SC/C4SC00545G
try to get the pdf
ID:  142 
Publication:  Selective photocatalytic oxidation of benzene for the synthesis of phenol using engineered Au–Pd alloy nanoparticles supported on titanium dioxide 
DOI:  10.1039/c4cc04024d 
DOI:  http://pubs.rsc.org/en/content/articlepdf/2014/CC/C4CC04024D
try to get the pdf
ID:  145 
Publication:  Optimised photocatalytic hydrogen production using core–shell AuPd promoters with controlled shell thickness 
DOI:  10.1039/c4cp04693e 
DOI:  http://pubs.rsc.org/en/content/articlepdf/2014/CP/C4CP04693E
try to get the pdf
ID:  146 
Publication:  Well-controlled metal co-catalysts sy

try to get the pdf
try to get the pdf
try to get the pdf
try to get the pdf
try to get the pdf
try to get the pdf
ID:  263 
Publication:  Catalytic and biophysical investigation of rhodium hydroformylase 
DOI:  10.1039/c9cy01679a 
DOI:  http://pubs.rsc.org/en/content/articlepdf/2019/CY/C9CY01679A
try to get the pdf
ID:  267 
Publication:  Extracting structural information of Au colloids at ultra-dilute concentrations: identification of growth during nanoparticle immobilization 
DOI:  10.1039/c9na00159j 
DOI:  http://pubs.rsc.org/en/content/articlepdf/2019/NA/C9NA00159J
try to get the pdf
ID:  268 
Publication:  Low-temperature studies of propene oligomerization in ZSM-5 by inelastic neutron scattering spectroscopy 
DOI:  10.1039/c9ra03568k 
DOI:  http://pubs.rsc.org/en/content/articlepdf/2019/RA/C9RA03568K
try to get the pdf
ID:  269 
Publication:  In situ synthesized low-PtCo@porous carbon catalyst for highly efficient hydrogen evolution 
DOI:  10.1039/c8ta12263f 
DOI:  http://pubs.rs