# Add datasets to UK Catalysis Hub app db
A list of publications is obtained from the app database. This list contains titles, IDs and DOIs, which need to be explored to look for associated data (supplementary data, raw data, processed data). 

The steps of the process are: 

 -a) get a Title, DOI, and URL for each publication
 -b) get the DOI landing page and see if it contains references to data
 -c) add a new dataset entry each time a new ds is found
 -d) link the dataset to the publication.

In [8]:
# Libraries
# library containing functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searchs
import lib.handle_urls as urlh
# managing files and file paths
from pathlib import Path
# add a progress bar
from tqdm import tqdm_notebook 
# library for getting data from crossref
import lib.crossref_api as cr_api
#library for handling json files
import json
# library for using regular expressions
import re
# library for handling http requests
import requests

In [9]:
# Custom Functions
# get the crossreference json page from doi
def get_cr_json_object(cr_doi):
    """Return the Crossref record for *cr_doi* together with its cache file name.

    The record is cached as ``json_files/<doi>.json`` where ``/`` in the DOI
    is replaced by ``_`` and the name is lower-cased. When the cache file
    exists it is loaded; otherwise the record is requested through
    ``cr_api.getBibData`` and written to the cache before being returned.

    Returns:
        A ``(record, cache_file_name)`` tuple.
    """
    doi_file = 'json_files/' + cr_doi.replace('/', '_').lower() + '.json'

    # cached copy available: no request needed
    if Path(doi_file).is_file():
        with open(doi_file, 'r', encoding='utf-8-sig') as cached:
            return json.load(cached), doi_file

    # first time this DOI is seen: fetch it and store it for later runs
    crjd = cr_api.getBibData(cr_doi)
    with open(doi_file, 'w', encoding='utf-8-sig', errors='ignore') as out:
        json.dump(crjd, out, ensure_ascii=False, indent=4)
    return crjd, doi_file

# get the landing page for the publication from uri
def get_pub_html_doi(cr_doi):
    """Return the landing page for *cr_doi* and the name of its cache file.

    The page is cached as ``html_files/<doi>.html`` (``/`` replaced by ``_``,
    lower-cased). If the cache file is missing, the page is requested with
    ``urlh.getPageFromDOI`` and stored as UTF-8 text.

    Returns:
        A ``(page_content, html_file)`` tuple. ``page_content`` is the text of
        the cache file when it already exists; on a fresh download it is the
        object returned by ``urlh.getPageFromDOI`` unchanged.
        NOTE(review): that object is decoded before writing, so it is
        presumably ``bytes`` - confirm against ``lib.handle_urls``.
    """
    html_file = 'html_files/' + cr_doi.replace('/', '_').lower() + '.html'
    if not Path(html_file).is_file():
        # use the function argument; the previous code referenced an
        # undefined name (doi_text) here
        page_content = urlh.getPageFromDOI(cr_doi)
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(page_content.decode("utf-8"))
    else:
        # read with the same encoding used for writing and close the handle
        with open(html_file, 'r', encoding='utf-8') as f:
            page_content = f.read()
    return page_content, html_file
             
def get_titles(str_pub_title, db_name = "prev_search.sqlite3"):
    """Return ``Num, Title`` rows from ``prev_pop_searches`` filtered by title.

    The filter is ``Title like '<x>%'`` where ``<x>`` is ``str_pub_title[0]``,
    i.e. the first character when a string is passed.

    Args:
        str_pub_title: indexable value whose first item starts the filter.
        db_name: database file handed to ``dbh.DataBaseAdapter``.

    Returns:
        Whatever ``get_values`` of the adapter returns for the query.
    """
    print(db_name)
    adapter = dbh.DataBaseAdapter(db_name)
    title_filter = "Title like '" + str_pub_title[0] + "%';"
    rows = adapter.get_values('prev_pop_searches', "Num, Title", title_filter)
    adapter.close()
    return rows

def get_titles_and_dois(str_pub_title, db_name = "app_db.sqlite3"):
    """Return ``id, title, doi`` rows from ``articles`` filtered by title.

    The filter is ``Title like '<x>%'`` where ``<x>`` is ``str_pub_title[0]``,
    i.e. the first character when a string is passed.

    Args:
        str_pub_title: indexable value whose first item starts the filter.
        db_name: database file handed to ``dbh.DataBaseAdapter``.

    Returns:
        Whatever ``get_values`` of the adapter returns for the query.
    """
    print(db_name)
    adapter = dbh.DataBaseAdapter(db_name)
    title_filter = "Title like '" + str_pub_title[0] + "%';"
    rows = adapter.get_values('articles', "id, title, doi", title_filter)
    adapter.close()
    return rows

def get_pub_app_data(db_name = "app_db.sqlite3"):
    """Return the publication rows of the app database marked as added.

    Queries table ``articles`` for the columns
    ``id, title, doi, link, pdf_file, html_file`` with the filter
    ``status = 'Added'``.

    Args:
        db_name: database file handed to ``dbh.DataBaseAdapter``.

    Returns:
        Whatever ``get_values`` of the adapter returns for the query.
    """
    columns = "id, title, doi, link, pdf_file, html_file"
    adapter = dbh.DataBaseAdapter(db_name)
    rows = adapter.get_values('articles', columns, "status = 'Added'")
    adapter.close()
    return rows

# get the current csv working file
def get_working_file(nr_wf):
    """Load the csv working file and work out the current pass number.

    When *nr_wf* is not an existing file nothing is loaded. Otherwise the file
    is read with ``csvh.get_csv_data(nr_wf, 'Num')`` and the entries are
    scanned in order: the pass is the largest ``ignore`` value seen, and the
    scan stops at the first entry without an ``ignore`` key.

    Returns:
        ``(working_file, wf_fields, current_pass)``; the first two are ``None``
        and the pass is ``0`` when the file does not exist.
    """
    working_file = None
    wf_fields = None
    current_pass = 0
    if Path(nr_wf).is_file():
        working_file, wf_fields = csvh.get_csv_data(nr_wf, 'Num')
        for art_num in tqdm_notebook(working_file):
            entry = working_file[art_num]
            if 'ignore' not in entry.keys():
                # entries after this one are not inspected
                break
            current_pass = max(current_pass, int(entry['ignore']))
    print("Current pass:", current_pass)
    return working_file, wf_fields, current_pass

def get_pub_html_url(text_url, entry_id):
    """Return the page at *text_url* and the name of its cache file.

    The page is cached as ``html_files/<entry_id>.html``. If the cache file is
    missing, the page is requested with ``urlh.getPageFromURL`` and written as
    UTF-8 text; otherwise the cached text is read back.

    Args:
        text_url: address of the page to fetch.
        entry_id: string used to build the cache file name.

    Returns:
        A ``(page_content, html_file)`` tuple.
    """
    html_file = 'html_files/' + entry_id + '.html'
    if not Path(html_file).is_file():
        print("")
        page_content = urlh.getPageFromURL(text_url)
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(page_content)
    else:
        # read with the encoding used for writing; the context manager
        # closes the handle, which was left open before
        with open(html_file, 'r', encoding='utf-8') as f:
            page_content = f.read()
    return page_content, html_file

def valid_doi(cr_doi):
    """Return True when *cr_doi* is, in full, a DOI of the Crossref form.

    The whole string must match: the literal prefix "10.", four to nine
    digits, a slash, and one or more characters from ``-._;()/:A-Z0-9``
    (case-insensitive). ``None`` and the empty string are not valid.
    """
    # CR DOIS: https://www.crossref.org/blog/dois-and-matching-regular-expressions/
    # CR DOIs re1
    # /^10.\d{4,9}/[-._;()/:A-Z0-9]+$/i
    if cr_doi is None:
        return False
    # raw string so \d is a regex escape, and \. so that only a literal dot
    # is accepted after "10" (an unescaped dot matches any character)
    cr_re_01 = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+'
    return re.fullmatch(cr_re_01, cr_doi, re.IGNORECASE) is not None
    
# get a semicolon separated list of authors from CR json data
def get_cr_author_list(article_data):
    """Return the authors of a Crossref record as one ``"; "``-joined string.

    Each author is rendered as ``family, given`` (or just ``family`` when no
    given name is present). Entries without a ``family`` key fall back to
    their ``name`` key, and entries with neither are skipped instead of
    raising ``KeyError``.

    Args:
        article_data: dict with an optional ``author`` list of dicts.

    Returns:
        The joined author string, empty when there are no authors.
    """
    authors = []
    # `or []` also covers an 'author' key that is present but None
    for author in article_data.get('author') or []:
        if 'family' in author:
            new_author = author['family']
            if 'given' in author:
                new_author += ", " + author['given']
        elif 'name' in author:
            # NOTE(review): presumably organisation-type contributors
            new_author = author['name']
        else:
            continue
        authors.append(new_author)
    return "; ".join(authors)

# read the year stored under container[key]['date-parts'][0][0]
def _cr_year_from(container, key):
    """Return the year found under *key* in *container*, or 0 if unavailable.

    Missing keys, ``None`` values, empty ``date-parts`` and a ``None`` year
    all yield 0.
    """
    if not container:
        return 0
    date_info = container.get(key)
    if not date_info:
        return 0
    date_parts = date_info.get('date-parts')
    if not date_parts or not date_parts[0] or date_parts[0][0] is None:
        return 0
    return int(date_parts[0][0])


# get the publication date from CR json data
def get_cr_year_published(article_data):
    """Return the earliest publication year found in a Crossref record.

    The print year is read from ``published-print`` and, failing that, from
    ``journal-issue.published-print``; the online year is read the same way
    from ``published-online``. When both are known the smaller one is
    returned, when only one is known that one is returned, otherwise 0.

    Args:
        article_data: dict holding the Crossref record of a publication.

    Returns:
        The year as ``int``, or 0 when no date is available.
    """
    issue = article_data.get('journal-issue')

    year_print = (_cr_year_from(article_data, 'published-print')
                  or _cr_year_from(issue, 'published-print'))
    # the journal-issue online date belongs to the online year; it used to
    # overwrite the print year
    year_online = (_cr_year_from(article_data, 'published-online')
                   or _cr_year_from(issue, 'published-online'))

    if year_print != 0 and year_online != 0:
        return min(year_print, year_online)
    return year_print or year_online

def get_pdf_from_url(pdf_url):
    """Download the file behind *pdf_url* into ``pdf_files/``.

    Nothing is saved when the response content type contains ``text``.
    Otherwise the file name is taken from the ``content-disposition`` header
    and the content is written to ``pdf_files/<name>.pdf``. The ``.pdf``
    suffix is always appended, as before, so the naming of files saved by
    earlier runs is kept.

    Args:
        pdf_url: address to download from.

    Returns:
        The name of the file written inside ``pdf_files/`` (including the
        appended ``.pdf``), so that ``'pdf_files/' + result`` points at the
        saved file; an empty string when nothing was saved.
    """
    fname = ""
    try:
        response = requests.get(pdf_url)
        content_type = response.headers['content-type']
        if 'text' not in content_type:
            cd = response.headers['content-disposition']
            header_name = re.findall("filename=(.+)", cd)[0]
            # keep only the bare file name: drop trailing header parameters,
            # surrounding quotes and any directory part sent by the server
            header_name = header_name.split(';')[0].strip().strip('"\'')
            header_name = Path(header_name).name
            if header_name:
                saved_name = header_name + '.pdf'
                with open('pdf_files/' + saved_name, 'wb') as f:
                    f.write(response.content)
                fname = saved_name
    except Exception as err:
        # best effort: report the failure and carry on with an empty name;
        # unlike a bare except this lets KeyboardInterrupt through
        print("Error getting file from: ", pdf_url, err)
    return fname
    
def set_pdf_file_value(file_name, pub_id, db_name = "app_db.sqlite3"):
    """Store *file_name* in the ``pdf_file`` column of one ``articles`` row.

    Args:
        file_name: value written to the ``pdf_file`` column.
        pub_id: identifier of the row, passed to ``set_value_table``.
        db_name: database file handed to ``dbh.DataBaseAdapter``.

    Returns:
        The result of the adapter's ``set_value_table`` call.
    """
    adapter = dbh.DataBaseAdapter(db_name)
    outcome = adapter.set_value_table('articles', pub_id, "pdf_file", file_name)
    adapter.close()
    return outcome

Get the name of the current app db file:

In [3]:
# app db file with path: db_files/app_db.sqlite3
ukchapp_db = "db_files/app_db.sqlite3"
# keep asking until the given path points to an existing file
while True:
    if Path(ukchapp_db).is_file():
        break
    print('Please enter the name of app db file:')
    ukchapp_db = input()

## Get pdf files for publications

Read database and try to recover pdf files

In [None]:


# get publication data from the ukch app
db_pubs = get_pub_app_data(ukchapp_db)

# For every publication: if its link mentions "pdf", try to download it
# directly; otherwise, when no pdf is recorded yet, look for a pdf link in the
# Crossref record of its DOI.
for a_pub in tqdm_notebook(db_pubs):
    pub_id = a_pub[0]
    pub_title = a_pub[1]
    pub_doi = a_pub[2]
    pub_url = a_pub[3]
    pub_pdf = a_pub[4]
    pub_html = a_pub[5]
    # guard against records without a link
    if pub_url and "pdf" in pub_url:
        print ("try to get the pdf from url")
        try:
            response = requests.get(pub_url)
            content_type = response.headers['content-type']
            if 'text' not in content_type:
                cd = response.headers['content-disposition']
                fname = re.findall("filename=(.+)", cd)[0]
                # name under which the file is stored on disk; the check below
                # used to read `pdf_file`, which is not set in this branch
                pdf_file = fname + '.pdf'
                if not Path('pdf_files/' + pdf_file).is_file():
                    with open('pdf_files/' + pdf_file, 'wb') as f:
                        f.write(response.content)
                else:
                    # NOTE(review): as before, a file downloaded just now is
                    # only linked to the record on a later run - confirm this
                    # is intended
                    set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
        except Exception as err:
            # best effort: report the publication and the cause, then continue
            print("ID: ", pub_id, "\nPublication: ",pub_title,
                   "\nDOI: ", pub_doi, "\nURL: ", pub_url, "\nError: ", err)
    elif pub_pdf is None:
        print("try to see if json file has link to pdf", pub_doi)
        if valid_doi(pub_doi):
            crjd, doi_file = get_cr_json_object(pub_doi)
            # crjd may be empty when no record could be retrieved
            if crjd and "link" in crjd.keys():
                for a_link in crjd["link"]:
                    # .get avoids a KeyError for links without a content type
                    if "URL" in a_link.keys() and ("pdf" in a_link["URL"] or "pdf" in a_link.get("content-type", "")):
                        cr_url = a_link["URL"]
                        print("URL: ", cr_url)
                        pdf_file = get_pdf_from_url(cr_url)
                        print("File name:", pdf_file)
                        # if the name corresponds to a existing file, assign value to db_record
                        if Path('pdf_files/' + pdf_file).is_file():
                            set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                    else:
                        print("no pdf links in json", pub_doi)
            else: 
                print("no links in json", pub_doi)
                

In [5]:
from chemdataextractor import Document

# A function for getting a list of files from the directory
# This will be modified to get the list from a csv file
def get_files_list (source_dir):
    """Return the ``*.pdf`` entries directly inside *source_dir*, sorted.

    Args:
        source_dir: a ``pathlib.Path`` (anything offering ``glob``).

    Returns:
        A list with the matching paths in sorted order.
    """
    return sorted(source_dir.glob('*.pdf'))

def cde_read_pdfs(a_file):
    """Read a pdf file into a ChemDataExtractor ``Document``.

    Args:
        a_file: path of the pdf file to open.

    Returns:
        The object returned by ``Document.from_file``.
    """
    # the context manager closes the handle once the document is built;
    # previously it stayed open for every file processed
    # NOTE(review): assumes from_file reads the whole file before returning
    with open(a_file, 'rb') as pdf_f:
        doc = Document.from_file(pdf_f)
    return doc

def find_doi(element_text):
    """Return the first DOI-like substring of *element_text*, or ``""``.

    A DOI is recognised as the literal prefix "10.", four to nine digits, a
    slash, and one or more characters from ``-._;()/:A-Z0-9``
    (case-insensitive). Because ``.`` is among the allowed characters, a
    sentence-ending period directly after the DOI is part of the result.
    """
    # raw string so \d is a regex escape, and \. so that only a literal dot
    # is accepted after "10" (an unescaped dot matches any character)
    cr_re_01 = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+'
    compare = re.search(cr_re_01, element_text, re.IGNORECASE)
    if compare is not None:
        return compare.group()
    return ""

def get_db_id(doi_value, db_name = "app_db.sqlite3"):
    """Return the ``id`` of the ``articles`` row whose ``doi`` is *doi_value*.

    Args:
        doi_value: DOI to look up.
        db_name: database file handed to ``dbh.DataBaseAdapter``.

    Returns:
        The first item of the adapter's ``get_value`` result, or ``0`` when
        that result is ``None``.
    """
    adapter = dbh.DataBaseAdapter(db_name)
    found = adapter.get_value('articles', "id", "doi", doi_value)
    adapter.close()
    return 0 if found is None else found[0]

def get_db_title(doi_value, db_name = "app_db.sqlite3"):
    """Return the ``title`` of the ``articles`` row whose ``doi`` is *doi_value*.

    Args:
        doi_value: DOI to look up.
        db_name: database file handed to ``dbh.DataBaseAdapter``.

    Returns:
        The first item of the adapter's ``get_value`` result, or ``0`` when
        that result is ``None``.
    """
    adapter = dbh.DataBaseAdapter(db_name)
    found = adapter.get_value('articles', "title", "doi", doi_value)
    adapter.close()
    return 0 if found is None else found[0]

def get_close_dois(str_name, db_name = "prev_search.sqlite3"):
    """Return ``articles`` rows whose ``doi`` contains *str_name*.

    The query selects ``id, doi, title, pdf_file`` with the filter
    ``doi like '%<str_name>%'``.

    Args:
        str_name: text to look for inside the stored DOIs.
        db_name: database file handed to ``dbh.DataBaseAdapter``.

    Returns:
        Whatever ``get_values`` of the adapter returns for the query.
    """
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, doi, title, pdf_file"
    # double single quotes so a name containing ' cannot end the SQL string
    # literal early (file names are used as search text)
    # NOTE(review): % and _ inside str_name still act as LIKE wildcards
    escaped_name = str_name.replace("'", "''")
    filter_str = "doi like '%" + escaped_name + "%';"

    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

# Match every pdf in pdf_files/ to a DB record by looking for the lower-cased
# file name (without ".pdf") inside the stored DOIs. Files without exactly one
# match are collected in not_assigned.
files_list = get_files_list(Path("pdf_files"))
not_assigned = []
for a_file in tqdm_notebook(files_list):
    search_this = a_file.name.replace(".pdf", "").lower()
    print(a_file.name,"\t",search_this)
    close_dois = get_close_dois(search_this, ukchapp_db)
    print(len(close_dois))

    # zero or several candidates: leave the file for the next step
    if len(close_dois) != 1:
        not_assigned.append(a_file)
        continue

    doi_dat = close_dois[0]
    selected = False
    # the record already has a pdf file: just report it
    if doi_dat[3] is not None:
        print("Assigned in db: ",  doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
        continue

    # ask until the user answers 'a' (assign) or 'b' (skip)
    while not selected:
        print("Assign file: ", a_file.name, " to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
        print('***************************************************************')
        print("Options:\n\ta) assign\n\tb)go to next")
        print("selection:")
        usr_select = input()
        if usr_select == 'a':
            selected = True
            set_pdf_file_value(a_file.name, doi_dat[0], ukchapp_db)
            print("assing and go to next")
        elif usr_select == 'b':
            selected = True
            print("going to next")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=230.0), HTML(value='')))

1-s2.0-S0021951719301459-main.pdf 	 1-s2.0-s0021951719301459-main
0
1-s2.0-S0021951720302542-main.pdf 	 1-s2.0-s0021951720302542-main
0
1-s2.0-S0920586118300415-main.pdf 	 1-s2.0-s0920586118300415-main
0
1-s2.0-S0920586118303055-main.pdf 	 1-s2.0-s0920586118303055-main
0
1-s2.0-S0920586118316456-main.pdf 	 1-s2.0-s0920586118316456-main
0
1-s2.0-S0920586120303370-main.pdf 	 1-s2.0-s0920586120303370-main
0
1-s2.0-S0926860X18301686-main.pdf 	 1-s2.0-s0926860x18301686-main
0
1-s2.0-S0926860X18305003-main.pdf 	 1-s2.0-s0926860x18305003-main
0
1-s2.0-S1010603017313138-main.pdf 	 1-s2.0-s1010603017313138-main
0
1-s2.0-S1385894719325884-main.pdf 	 1-s2.0-s1385894719325884-main
0
1.5039294.pdf.pdf 	 1.5039294
1
Assigned in db:  29 10.1063/1.5039294 QENS study of methane diffusion in Mo/H-ZSM-5 used for the methane dehydroaromatisation reaction 1.5039294.pdf.pdf
1143844.1143874.pdf 	 1143844.1143874
0
157800467.pdf 	 157800467
0
2015_Article_.pdf.pdf 	 2015_article_
0
2017_Article_.pdf.pdf 	 201

1
Assigned in db:  148 10.1039/c3cp52653d Molecular dynamics simulations of longer n-alkanes in silicalite: a comparison of framework and hydrocarbon models C3CP52653D.pdf
C4CC04024D.pdf 	 c4cc04024d
1
Assigned in db:  142 10.1039/c4cc04024d Selective photocatalytic oxidation of benzene for the synthesis of phenol using engineered Au–Pd alloy nanoparticles supported on titanium dioxide C4CC04024D.pdf
C4CP00753K.pdf 	 c4cp00753k
1
Assigned in db:  550 10.1039/c4cp00753k Segregation effects on the properties of (AuAg)147 C4CP00753K.pdf
C4CP04693E.pdf 	 c4cp04693e
1
Assigned in db:  145 10.1039/c4cp04693e Optimised photocatalytic hydrogen production using core–shell AuPd promoters with controlled shell thickness C4CP04693E.pdf
C4DT01309C.pdf 	 c4dt01309c
1
Assigned in db:  146 10.1039/c4dt01309c Well-controlled metal co-catalysts synthesised by chemical vapour impregnation for photocatalytic hydrogen production and water purification C4DT01309C.pdf
C4GC00087K.pdf 	 c4gc00087k
1
Assigned i

1
Assigned in db:  195 10.1039/c7dt02167d Ring opening polymerisation of lactide with uranium(iv) and cerium(iv) phosphinoaryloxide complexes C7DT02167D.pdf
C7DT03395H.pdf 	 c7dt03395h
1
Assigned in db:  67 10.1039/c7dt03395h Catalytic applications of small bite-angle diphosphorus ligands with single-atom linkers C7DT03395H.pdf
c7dt04805j.pdf 	 c7dt04805j
1
Assigned in db:  452 10.1039/c7dt04805j Destruction of chemical warfare agent simulants by air and moisture stable metal NHC complexes c7dt04805j.pdf
C7FD00159B.pdf 	 c7fd00159b
1
Assigned in db:  42 10.1039/c7fd00159b Impact of SCILL catalysts for the S–S coupling of thiols to disulfides C7FD00159B.pdf
C7FD00210F.pdf 	 c7fd00210f
1
Assigned in db:  61 10.1039/c7fd00210f The deposition of metal nanoparticles on carbon surfaces: the role of specific functional groups C7FD00210F.pdf
C7FD00216E.pdf 	 c7fd00216e
1
Assigned in db:  570 10.1039/c7fd00216e Supported metal nanoparticles with tailored catalytic properties through sol-immobil

1
Assigned in db:  598 10.1039/d0sc01317j Direct synthesis of amides from nonactivated carboxylic acids using urea as nitrogen source and Mg(NO3)2 or imidazole as catalysts d0sc01317j.pdf
D0SC01924K.pdf 	 d0sc01924k
1
Assigned in db:  585 10.1039/d0sc01924k Methanol loading dependent methoxylation in zeolite H-ZSM-5 D0SC01924K.pdf
D0SC02152K.pdf 	 d0sc02152k
1
Assigned in db:  610 10.1039/d0sc02152k In situ K-edge X-ray absorption spectroscopy of the ligand environment of single-site Au/C catalysts during acetylene hydrochlorination D0SC02152K.pdf
d0sc02253e.pdf 	 d0sc02253e
1
Assigned in db:  608 10.1039/d0sc02253e  Asymmetric synthesis of primary amines catalyzed by thermotolerant fungal reductive aminases d0sc02253e.pdf
d0sc02253e1.pdf 	 d0sc02253e1
0
daniel_dervin_thesis_final.pdf 	 daniel_dervin_thesis_final
0
Davies2016_Article_OnTheRoleOfWaterInHeterogeneou.pdf.pdf 	 davies2016_article_ontheroleofwaterinheterogeneou
0
Decarolis2018_Article_EffectOfParticleSizeAndSupport.pdf.pdf 

In [6]:
# number of pdf files that did not get exactly one DOI match by file name
len(not_assigned)

56

In [19]:
# For the files that could not be matched by name, read the pdf text, collect
# the distinct DOIs mentioned in it and, when exactly one DOI is found and it
# matches exactly one DB record, offer to assign the file to that record.
for a_file in tqdm_notebook(not_assigned):
    pdf_doc = cde_read_pdfs(a_file)
    print(a_file.name)
    dois_list = []
    for element in pdf_doc.elements:
        element_text = str(element)
        if 'doi' not in element_text:
            continue
        found_doi = find_doi(element_text)
        # a sentence-ending period is captured by the DOI pattern
        if found_doi.endswith("."):
            found_doi = found_doi[:-1]
        # find_doi returns "" when the text mentions "doi" without a DOI;
        # an empty entry would hide a real DOI or match every record
        if found_doi and found_doi not in dois_list:
            dois_list.append(found_doi)

    # only unambiguous files are handled here
    if len(dois_list) != 1:
        continue

    a_doi = dois_list[0]
    close_dois = get_close_dois(a_doi, ukchapp_db)
    selected = False
    if len(close_dois) != 1:
        continue

    doi_dat = close_dois[0]
    if doi_dat[3] is not None:
        print("Already assingned to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
        continue

    # ask until the user answers 'a' (assign) or 'b' (skip)
    while not selected:
        print("Assign file: ",a_file.name, " to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
        print('***************************************************************')
        print("Options:\n\ta) assign\n\tb)go to next")
        print("selection:")
        usr_select = input()
        if usr_select == 'a':
            selected = True
            set_pdf_file_value(a_file.name, doi_dat[0], ukchapp_db)
            print("assing and go to next")
        elif usr_select == 'b':
            selected = True
            print("going to next")
                

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=56.0), HTML(value='')))

1-s2.0-S0021951719301459-main.pdf
Already assingned to:
	 365 10.1016/j.jcat.2019.03.037 Combined spatially resolved operando spectroscopy: New insights into kinetic oscillations of CO oxidation on Pd/γ-Al2O3 1-s2.0-S0021951719301459-main.pdf
1-s2.0-S0021951720302542-main.pdf
1-s2.0-S0920586118300415-main.pdf
Already assingned to:
	 445 10.1016/j.cattod.2018.01.033 Preparation of bifunctional Au-Pd/TiO2 catalysts and research on methanol liquid phase one-step oxidation to methyl formate 1-s2.0-S0920586118300415-main.pdf
1-s2.0-S0920586118303055-main.pdf
1-s2.0-S0920586118316456-main.pdf
Already assingned to:
	 282 10.1016/j.cattod.2019.01.065 In-depth characterisation of metal-support compounds in spent Co/SiO2 Fischer-Tropsch model catalysts 1-s2.0-S0920586118316456-main.pdf
1-s2.0-S0920586120303370-main.pdf
1-s2.0-S0926860X18301686-main.pdf
1-s2.0-S0926860X18305003-main.pdf
Already assingned to:
	 18 10.1016/j.apcata.2018.10.010 Investigation of ZSM-5 catalysts for dimethylether conv

In [24]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

# Print the "Subject" entry of the pdf Info metadata for every unassigned
# file; files without that entry are printed between asterisks.
for a_file in tqdm_notebook(not_assigned):
    # the context manager closes each pdf; previously every handle stayed open
    with open(a_file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        #print(doc.info)  # The "Info" metadata
        # doc.info may be an empty list when the pdf carries no Info metadata
        if doc.info and 'Subject' in doc.info[0].keys():
            print("File name: ", a_file.name, " Subject ", doc.info[0]['Subject'])
        else:
            print("********", a_file.name, "*******")


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=56.0), HTML(value='')))

File name:  1-s2.0-S0021951719301459-main.pdf  Subject  b'Journal of Catalysis, 373 (2019) 201-208. doi:10.1016/j.jcat.2019.03.037'
File name:  1-s2.0-S0021951720302542-main.pdf  Subject  b'Journal of Catalysis, 389 (2020) 483-491. doi:10.1016/j.jcat.2020.06.021'
File name:  1-s2.0-S0920586118300415-main.pdf  Subject  b'Catalysis Today, 316 (2018) 206-213. doi:10.1016/j.cattod.2018.01.033'
File name:  1-s2.0-S0920586118303055-main.pdf  Subject  b'Catalysis Today, 317 (2018) 12-20. doi:10.1016/j.cattod.2018.03.046'
File name:  1-s2.0-S0920586118316456-main.pdf  Subject  b'Catalysis Today,Corrected proof,doi:10.1016/j.cattod.2019.01.065'
File name:  1-s2.0-S0920586120303370-main.pdf  Subject  b'Catalysis Today,Corrected proof,doi:10.1016/j.cattod.2020.05.045'
File name:  1-s2.0-S0926860X18301686-main.pdf  Subject  b'Applied Catalysis A, General, 559 (2018) 1-9. doi:10.1016/j.apcata.2018.04.006'
File name:  1-s2.0-S0926860X18305003-main.pdf  Subject  b'Applied Catalysis A, General, 569 (2

In [None]:
# scratch expression: evaluates `doi` without its last item
# NOTE(review): `doi` is not defined in the cells visible here - confirm or remove
doi[:-1]