# Get PDF Files for publications in the UK Catalysis Hub app db
A list of publications is obtainded from the app database. This list will contain a titles, IDs and DOIs which need to be explored to look for the corresponding pdf files. 
The steps of the process are: 
 1. get a Title, DOI, and URL for each publication
 2. convert the DOI to a pdf file name and try to open de file
 3. use pdfMiner and/or CDE to get the reference to data
 4. add a new dataset entry each time a new data object is found

In [1]:
# Libraries
# library containign functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searchs
import lib.handle_urls as urlh
# managing files and file paths
from pathlib import Path
# add aprogress bar
from tqdm import tqdm_notebook 
#library for handling json files
import json
# library for using regular expressions
import re
# library for handling http requests
import requests

# import custom functions (common to various notebooks)
import processing_functions as pr_fns



In [2]:
# functions for ChemDataExtractor
# not used for mining data references (suplementary/raw) or to get pdf metadata
from chemdataextractor import Document

# A function for getting a list of files from the directory
# This will be modified to get the list from a csv file
def get_files_list (source_dir):
    i_counter = 0
    files_list = []
    for filepath in sorted(source_dir.glob('*.pdf')):
        i_counter += 1
        files_list.append(filepath)
    return files_list

def cde_read_pdfs(a_file):
    pdf_f = open(a_file, 'rb')
    doc = Document.from_file(pdf_f)
    return doc

def find_doi(element_text):
    cr_re_01 = '10.\d{4,9}/[-._;()/:A-Z0-9]+'
    compare = re.search(cr_re_01, element_text, re.IGNORECASE)
    if compare != None:
        return compare.group()
    return ""

def get_db_id(doi_value, db_name = "app_db.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    table = 'articles'   
    id_val = db_conn.get_value(table, "id", "doi", doi_value)
    db_conn.close()
    if id_val != None:
        return id_val[0]
    else:
        return 0

def get_db_title(doi_value, db_name = "app_db.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    table = 'articles'   
    id_val = db_conn.get_value(table, "title", "doi", doi_value)
    db_conn.close()
    if id_val != None:
        return id_val[0]
    else:
        return 0

def get_close_dois(str_name, db_name = "prev_search.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, doi, title, pdf_file"
    filter_str = "doi like '%"+str_name+"%';"

    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

Get the name of the current app db file:

In [3]:
# app db file with path: db_files/app_db.sqlite3
ukchapp_db = "db_files/app_db.sqlite3"
while not Path(ukchapp_db).is_file():
    print('Please enter the name of app db file:')
    ukchapp_db = input()

## Get pdf files for publications

Read database and try to recover pdf files

In [4]:
# get publication data from the ukch app
db_pubs = pr_fns.get_pub_app_data(ukchapp_db)

for a_pub in tqdm_notebook(db_pubs):
    pub_id = a_pub[0]
    pub_title = a_pub[1]
    pub_doi = a_pub[2]
    pub_url = a_pub[3]
    pub_pdf = a_pub[4]
    pub_html = a_pub[5]
    if pub_pdf == 'None':
        print("*************************")
        print("Missing PDF for:", pub_doi)
        print("*************************")
    else:
        pdf_file = "pdf_files/" + pub_pdf
        if not Path(pdf_file).is_file():
            print("*************************")
            print("Missing file for:", pdf_file, "for", pub_doi)
            print("*************************")
        else: 
            print("PDF filename", pub_pdf)
            

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=342.0), HTML(value='')))

PDF filename s41929-019-0334-3.pdf
PDF filename acscatal.9b00685.pdf
PDF filename cctc.201901268.pdf
PDF filename chem.201805250.pdf
PDF filename 1-s2.0-S0968089618313233-main.pdf
PDF filename acssuschemeng.8b03568.pdf
PDF filename Smart anime donors.pdf
PDF filename C8OB00066B.pdf
PDF filename 1-s2.0-S0968089617301268-main.pdf
PDF filename acs.biochem.8b00169.pdf
PDF filename acscatal.8b00389.pdf
PDF filename acscatal.8b00624.pdf
PDF filename jacs.7b12621.pdf
PDF filename acscatal.8b03169.pdf
PDF filename acscatal.9b01820(1).pdf
PDF filename C9CC02459J.pdf
PDF filename s41929-018-0213-3.pdf
PDF filename 1-s2.0-S0926860X18305003-main.pdf
PDF filename 1-s2.0-S0926860X18305817-main.pdf
PDF filename acs.jpcc.8b08420.pdf
PDF filename 1-s2.0-S0926337318306167-main.pdf
PDF filename C8CC07444E.pdf
PDF filename acscatal.8b02232.pdf
PDF filename C8CY00422F.pdf
PDF filename s41467-018-03138-7.pdf
PDF filename C8CP01022F.pdf
PDF filename Locke2018_Article_CatalysisOfTheOxygenEvolutionR.pdf
PDF fi

PDF filename C8NJ03632B.pdf
PDF filename aic.16687.pdf
PDF filename s11244-018-0890-9.pdf
PDF filename acs.jpcc.6b11186.pdf
PDF filename acscatal.6b00982.pdf
PDF filename C9SC03374B.pdf
PDF filename ja512868a.pdf
PDF filename C8DT05051A.pdf
PDF filename C8CY01483C.pdf
PDF filename C4CP00753K.pdf
PDF filename C7CY00875A.pdf
PDF filename cssc.201501264.pdf
PDF filename acscatal.7b03805.pdf
PDF filename 1-s2.0-S0021951718302124-main.pdf
PDF filename C6DT03565E.pdf
PDF filename 1-s2.0-S092633731930400X-main.pdf
PDF filename cctc.201701840.pdf
PDF filename 1-s2.0-S1572665717306963-main.pdf
PDF filename acscatal.6b00589.pdf
PDF filename chem.201700496.pdf
PDF filename acscatal.5b01936.pdf
PDF filename adfm.201400338.pdf
PDF filename C7FD00216E.pdf
PDF filename C6TA00293E.pdf
PDF filename Scott_Rogers_thesis.pdf
PDF filename Al-Nayili_A_final_PhD_thesis.pdf
PDF filename C9SE01103J.pdf
PDF filename C9CP02968K.pdf
PDF filename C9DT03590G.pdf
PDF filename cctc.201901955.pdf
PDF filename C9SC0490

## Use pdfminer to get metadata from pdf file

In [5]:
pdf_file = "pdf_files/" + 's41929-019-0334-3.pdf'
import pdfminer
from pdfminer.high_level import extract_text
text = extract_text(pdf_file)

In [12]:
sentences = text.split("\n")
for sentence in sentences:
    if pr_fns.is_data_stmt(sentence.lower()):
        
        print(sentences.index(sentence), ":" , len(sentence), sentence )
        
#for sentence in sentences:
#    if 'data' in sentence.lower():
#        print (len(sentence), sentence, sentences.index(sentence))


610 : 69 single Pt atoms that are distributed over the support (Supplementary 
812 : 65 STEM data shown in Fig. 4 and Supplementary Fig. 6, respectively.
1048 : 65 EXAFS data (see Supplementary Fig. 9 and Supplementary Table 3)  
1551 : 67 microscopy and spectroscopic data obtained from all of the Pt/TiO2 
1664 : 80 Information on the data supporting the results presented here, including how to 
1665 : 83 access them, can be found in the Cardiff University data catalogue at https://doi. 
1929 : 81 Supplementary information is available for this paper at https://doi.org/10.1038/


In [7]:
for sentence in sentences:
    print(sentences.index(sentence), ":" , len(sentence), sentence ) 

0 : 34 There are amendments to this paper
1 : 0 
2 : 47 Tuning of catalytic sites in Pt/TiO2 catalysts 
3 : 40 for the chemoselective hydrogenation of 
4 : 14 3-nitrostyrene
1 : 0 
6 : 61  1, Alexandra J. Barnes1, Sultan M. Althahban2,3, Ruiyang Qu 
1 : 0 
8 : 7  4,5,  
1 : 0 
10 : 18 Margherita Macino 
11 : 19  6, David J Morgan 
12 : 15 Emma K. Gibson 
13 : 55 Christopher J. Kiely1,2, Xiang Gao4,5, Andrew M. Beale 
14 : 51 Meenakshisundaram Sankar1* and Graham J. Hutchings 
1 : 0 
16 : 3  1*
1 : 0 
18 : 48  1, Simon J. Freakley1, Nikolaos Dimitratos1,7, 
1 : 0 
20 : 34  8,9, Donald Bethell10, Qian He1, 
1 : 0 
22 : 128 The catalytic activities of supported metal nanoparticles can be tuned by appropriate design of synthesis strategies. Each step 
23 : 124 in a catalyst synthesis method can play an important role in preparing the most efficient catalyst. Here we report the care-
24 : 122 ful manipulation of the post-synthetic heat treatment procedure—together with control over the meta

1 : 0 
487 : 14 Mean = 1.0 nm,
420 : 10 σ = 0.2 nm
1 : 0 
419 : 14 Mean = 1.2 nm,
491 : 10 σ = 0.4 nm
1 : 0 
493 : 14 Mean = 1.6 nm,
494 : 10 σ = 0.6 nm
1 : 0 
493 : 14 Mean = 1.6 nm,
494 : 10 σ = 0.6 nm
1 : 0 
499 : 1 g
1 : 0 
160 : 1 h
1 : 0 
439 : 4 5 nm
1 : 0 
439 : 4 5 nm
1 : 0 
439 : 4 5 nm
1 : 0 
439 : 4 5 nm
1 : 0 
121 : 1 1
1 : 0 
202 : 1 2
1 : 0 
139 : 1 3
1 : 0 
200 : 1 4
1 : 0 
210 : 1 5
1 : 0 
121 : 1 1
1 : 0 
202 : 1 2
1 : 0 
139 : 1 3
1 : 0 
200 : 1 4
1 : 0 
210 : 1 5
1 : 0 
121 : 1 1
1 : 0 
202 : 1 2
1 : 0 
139 : 1 3
1 : 0 
200 : 1 4
1 : 0 
210 : 1 5
1 : 0 
121 : 1 1
1 : 0 
202 : 1 2
1 : 0 
139 : 1 3
1 : 0 
483 : 2 45
1 : 0 
485 : 19 Particle size (nm) 
1 : 0 
551 : 143 Fig. 3 | Representative HAADF–STEM images and the derived particle size distributions of the unused Pt/TiO2 catalysts, binned according to the 
552 : 155 Mackay model. a–h, 0.05 wt% Pt/TiO2 ‘calc. + red.’ (a) and ‘red.’ (b); 0.08 wt% Pt/TiO2 ‘calc. + red.’ (c) and ‘red.’ (d); 0.2 wt% Pt/TiO2 ‘calc. + red

1 : 0 
200 : 1 4
1 : 0 
210 : 1 5
1 : 0 
198 : 1 6
1 : 0 
139 : 1 3
1 : 0 
1037 : 5 R (Å)
1 : 0 
139 : 1 3
1 : 0 
1037 : 5 R (Å)
1 : 0 
1043 : 150 Fig. 5 | XANES spectra. a–d, Pt L3-edge XANES (a,b) and the magnitude component of the k3 weighted Fourier Transform of the EXAFS data (c,d) recorded 
1044 : 155 on Pt/TiO2 samples after ‘red.’ (a,c) and ‘calc. + red.’ (b,d). The dashed arrows indicate the decrease in edge position with increased loading. The solid 
1045 : 143 arrows indicate the increased rising absorption edge. Labels above the peaks indicate the scattering pairs that give rise to that contribution.
1 : 0 
1047 : 79 intensity  at  ~2.73 Å,  which  is  typical  of  Pt  metal.  Analysis  of  the 
1048 : 65 EXAFS data (see Supplementary Fig. 9 and Supplementary Table 3)  
1049 : 78 for  this  sample  suggests  the  existence  of  Pt  nanoparticles  that  are 
1050 : 70 ~1 nm in diameter40. For the 0.05 and 0.2 wt% Pt/TiO2 ‘red.’ samples, 
1051 : 65 weak peaks at 2.87 and 2.81

1535 : 4 0.40
1 : 0 
1537 : 26 Total surface Pt (wt% cat)
1 : 0 
1539 : 77 Fig. 7 | The correlation between peripheral Pt sites and catalytic activity. 
1540 : 74 a, The correlation between the 3-NS initial conversion rate and the total 
1541 : 74 amount of Pt present in peripheral sites; R2 = 0.89 and p < 0.001. b, The 
1542 : 71 correlation between the 3-NS initial conversion rate and total exposed 
1543 : 71 surface Pt atoms for the different Pt/TiO2 catalysts as estimated from 
1544 : 71 analysis of HAADF–STEM images; R2 = 0.71 and p < 0.02. The s.d. values 
1545 : 78 for the initial rates were calculated from three sets of catalytic data under 
1546 : 70 identical reaction conditions and were used to produce the error bars.
1 : 0 
1548 : 74 treatment,  there  is  little  agglomeration  of  the  Pt  nanoparticles  
1549 : 68 in spite of Pt mobility. Hence, the calcination pre-step is not nec-
1550 : 74 essary  for  samples  with  low  Pt  loadings.  The  collective  electron 
1551 

In [11]:
sentence = "Supplementary information is available for this paper at https://doi.org/10.1038/"
pr_fns.is_data_stmt(sentence.lower())

True

## File name match

In [9]:
# check if file name matches some part of a doi
files_list = get_not_matched_files(ukchapp_db)

not_assigned = []
for a_file in tqdm_notebook(files_list):
    search_this = a_file.name.replace(".pdf", "").lower()
    print(a_file.name,"\t",search_this)
    close_dois = get_close_dois(search_this, ukchapp_db)
    print(len(close_dois))
    
    if len(close_dois) == 1 :
        doi_dat = close_dois[0]
        selected = False
        if doi_dat[3] == None:
            while not selected:
                print("Assign file: ", a_file.name, " to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
                print('***************************************************************')
                print("Options:\n\ta) assign\n\tb)go to next")
                print("selection:")
                usr_select = input()
                if usr_select == 'a':
                    selected = True
                    set_pdf_file_value(a_file.name, doi_dat[0], ukchapp_db)
                    print("assing and go to next")
                elif usr_select == 'b':
                    #working_file[art_num]['ignore']=3 # visual inspection
                    selected = True
                    print("going to next")
        else:
            print("Assigned in db: ",  doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
    else:
        not_assigned.append(a_file)        

NameError: name 'get_not_matched_files' is not defined

In [None]:
files_list = get_files_list(Path("pdf_files"))
db_pubs = get_pub_app_data(ukchapp_db)
missing=[]
# check which files are really missing linking
for file in files_list:
    found_in_db = False
    for db_pub in db_pubs:
        if file.name == db_pub[4]:
            found_in_db = True
            break
    if not found_in_db:
        missing.append(file)

# check if all linked files are in the folder
missing2=[]
for db_pub in db_pubs:
    found_in_system = False
    for file in files_list:
        if file.name == db_pub[4] or db_pub[4] == None:
            found_in_system = True
            break
    if not found_in_system:
        missing2.append(db_pub)


In [None]:
# use ChemDataExtractor to read pdf and get DOIs in document
for a_file in tqdm_notebook(not_assigned):
    pdf_doc = cde_read_pdfs(a_file)
    print(a_file.name)
    dois_list = []
    for element in pdf_doc.elements:
        if 'doi' in str(element):
            found_doi = find_doi(str(element))
            if found_doi[-1:] == ".":
                found_doi = found_doi[:-1]
            if not found_doi in dois_list:
                dois_list.append(found_doi)       
    
    if dois_list != [] and len(dois_list) == 1:
        for a_doi in dois_list:
            close_dois = get_close_dois(a_doi, ukchapp_db)
            selected = False
            if len(close_dois) == 1:
                doi_dat = close_dois[0]
                if doi_dat[3] == None:
                    while not selected:
                        print("Assign file: ",a_file.name, " to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
                        print('***************************************************************')
                        print("Options:\n\ta) assign\n\tb)go to next")
                        print("selection:")
                        usr_select = input()
                        if usr_select == 'a':
                            selected = True
                            set_pdf_file_value(a_file.name, doi_dat[0], ukchapp_db)
                            print("assing and go to next")
                        elif usr_select == 'b':
                            #working_file[art_num]['ignore']=3 # visual inspection
                            selected = True
                            print("going to next")
                else: 
                    print("Already assingned to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
                