# Get PDF Files for publications in the UK Catalysis Hub app db
A list of publications is obtained from the app database. This list contains titles, IDs and DOIs, which need to be explored to look for the corresponding PDF files. 
The steps of the process are: 
 1. get a Title, DOI, and URL for each publication
 2. convert the DOI to a pdf file name and try to open the file
 3. use pdfMiner and/or CDE to get the reference to data
 4. add a new dataset entry each time a new data object is found

In [4]:
# Libraries
# library containing functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searches
import lib.handle_urls as urlh
# managing files and file paths
from pathlib import Path
# add a progress bar
from tqdm import tqdm_notebook 
#library for handling json files
import json
# library for using regular expressions
import re
# library for handling http requests
import requests

# import custom functions (common to various notebooks)
import processing_functions as pr_fns



In [None]:
# functions for ChemDataExtractor
# not used for mining data references (supplementary/raw) or to get pdf metadata
from chemdataextractor import Document

# A function for getting a list of files from the directory
# This will be modified to get the list from a csv file
def get_files_list(source_dir):
    """Return the PDF files found directly inside a directory, sorted by path.

    Args:
        source_dir: directory to scan, given as a ``pathlib.Path`` or as a
            path string.

    Returns:
        A list of ``pathlib.Path`` objects, one for every entry of
        ``source_dir`` matching ``*.pdf`` (sub-directories are not
        searched), in sorted order.
    """
    # Path() leaves an existing Path untouched, so plain strings work as well
    return sorted(Path(source_dir).glob('*.pdf'))

def cde_read_pdfs(a_file):
    """Read a PDF file into a ChemDataExtractor ``Document``.

    Args:
        a_file: path of the PDF file to open.

    Returns:
        The object returned by ``Document.from_file`` for that file.
    """
    # the context manager guarantees the handle is released, even when the
    # parser raises; previously the file was left open
    with open(a_file, 'rb') as pdf_f:
        return Document.from_file(pdf_f)

def find_doi(element_text):
    """Return the first DOI-like string found in a piece of text.

    The pattern is the literal prefix ``10.``, a registrant code of 4 to 9
    digits, a slash and a suffix made of letters, digits and ``-._;()/:``.
    Matching is case-insensitive.

    Args:
        element_text: the text to search.

    Returns:
        The matched text, or an empty string when nothing matches. A
        trailing ``.`` may be part of the match because ``.`` is a valid
        suffix character.
    """
    # raw string so that \d reaches the regex engine untouched; the dot after
    # "10" is escaped so it only matches a literal dot (it used to match any
    # character, e.g. "1071234/abc")
    cr_re_01 = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+'
    compare = re.search(cr_re_01, element_text, re.IGNORECASE)
    if compare is not None:
        return compare.group()
    return ""

def get_db_id(doi_value, db_name = "app_db.sqlite3"):
    """Look up the ``id`` recorded for a DOI in the ``articles`` table.

    Args:
        doi_value: DOI used for the lookup.
        db_name: database file handed to ``dbh.DataBaseAdapter``.

    Returns:
        The first element of the result of ``get_value``, or ``0`` when
        ``get_value`` returns ``None``.
    """
    db_conn = dbh.DataBaseAdapter(db_name)
    try:
        # presumably reads column "id" of the row whose "doi" equals
        # doi_value -- confirm against lib.handle_db
        id_val = db_conn.get_value('articles', "id", "doi", doi_value)
    finally:
        # close the connection even when the query fails
        db_conn.close()
    if id_val is not None:
        return id_val[0]
    return 0

def get_db_title(doi_value, db_name = "app_db.sqlite3"):
    """Look up the ``title`` recorded for a DOI in the ``articles`` table.

    Args:
        doi_value: DOI used for the lookup.
        db_name: database file handed to ``dbh.DataBaseAdapter``.

    Returns:
        The first element of the result of ``get_value``, or ``0`` when
        ``get_value`` returns ``None`` (callers must therefore expect the
        integer ``0`` rather than an empty string for a missing title).
    """
    db_conn = dbh.DataBaseAdapter(db_name)
    try:
        # presumably reads column "title" of the row whose "doi" equals
        # doi_value -- confirm against lib.handle_db
        title_val = db_conn.get_value('articles', "title", "doi", doi_value)
    finally:
        # close the connection even when the query fails
        db_conn.close()
    if title_val is not None:
        return title_val[0]
    return 0

def get_close_dois(str_name, db_name = "prev_search.sqlite3"):
    """Return the ``articles`` rows whose DOI contains a text fragment.

    Args:
        str_name: fragment looked for inside the ``doi`` field; it is placed
            between ``%`` wildcards of a SQL ``like`` filter.
        db_name: database file handed to ``dbh.DataBaseAdapter``.

    Returns:
        Whatever ``get_values`` returns for the fields
        ``id, doi, title, pdf_file`` (in that order) of the matching rows.
    """
    search_in = 'articles'
    fields_required = "id, doi, title, pdf_file"
    # The filter is assembled as text, so a single quote inside the fragment
    # would terminate the SQL string literal; doubling it is the standard SQL
    # escape. NOTE(review): '%' and '_' in the fragment still act as LIKE
    # wildcards; a parameterised query in lib.handle_db would be safer.
    safe_name = str_name.replace("'", "''")
    filter_str = "doi like '%" + safe_name + "%';"

    db_conn = dbh.DataBaseAdapter(db_name)
    try:
        db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    finally:
        # close the connection even when the query fails
        db_conn.close()
    return db_titles

Get the name of the current app db file:

In [None]:
# default location of the app db file: db_files/app_db.sqlite3
# keep asking for another name until the value points to an existing file
ukchapp_db = "db_files/app_db.sqlite3"
while True:
    if Path(ukchapp_db).is_file():
        break
    print('Please enter the name of app db file:')
    ukchapp_db = input()

## Get pdf files for publications

Read database and try to recover pdf files

In [None]:
# Try to recover a pdf file for every publication that has none recorded.
# Each row is read as (id, title, doi, url, pdf_file, html) and the sources
# are tried in this order:
#   1. the publication URL itself, when it mentions "pdf"
#   2. the links listed in the crossref json record of the DOI
#   3. the publisher specific download (elsevier or wiley)
# NOTE(review): get_pub_app_data, set_pdf_file_value, valid_doi,
# get_cr_json_object, get_pdf_from_url, get_elsevier_pdf and get_wiley_pdf are
# not defined in the visible part of this notebook; presumably they come from
# processing_functions -- confirm they are in scope.

# get publication data from the ukch app
db_pubs = get_pub_app_data(ukchapp_db)

for a_pub in tqdm_notebook(db_pubs):
    pub_id = a_pub[0]
    pub_title = a_pub[1]
    pub_doi = a_pub[2]
    pub_url = a_pub[3]
    pub_pdf = a_pub[4]
    pub_html = a_pub[5]
    if pub_pdf is None:
        not_in_url = True
        # reset for every publication: the flag used to be created only when
        # the DOI was valid, so it could be undefined or carry over the value
        # of the previous publication
        got_pdf = False
        # a missing URL must not break the substring tests below
        url_text = pub_url or ""
        print("ID: ", pub_id, "Publication: ",pub_title,
              "\n\tDOI: ", pub_doi, " URL: ", pub_url)
        if "pdf" in url_text:
            print ("\tTry to get the pdf from URL: ", pub_url)
            try:
                response = requests.get(pub_url)
                content_type = response.headers['content-type']
                if 'text' not in content_type:
                    cd = response.headers['content-disposition']
                    fname = re.findall("filename=(.+)", cd)[0]
                    # use one name for the check, the download and the db
                    # record; pdf_file used to be undefined (or left over
                    # from another publication) at this point
                    # NOTE(review): '.pdf' is appended as before, confirm the
                    # header value does not already carry the extension
                    pdf_file = fname + '.pdf'
                    if not Path('pdf_files/' + pdf_file).is_file():
                        with open('pdf_files/' + pdf_file, 'wb') as f:
                            f.write(response.content)
                    else:
                        set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                    not_in_url = False
            except Exception as err:
                # best effort: report the failure and fall through to the
                # other sources; unlike a bare except this no longer swallows
                # KeyboardInterrupt
                print("ID: ", pub_id, "\nPublication: ",pub_title,
                       "\nDOI: ", pub_doi, "\nURL: ", pub_url,
                       "\nError: ", err)
        if not_in_url:
            print("\tTry to see if json file has link to pdf: ")
            if valid_doi(pub_doi):
                crjd, doi_file = get_cr_json_object(pub_doi)
                if "link" in crjd.keys():
                    for a_link in crjd["link"]:
                        # the key is "URL"; the previous test for "\tURL"
                        # could never succeed, so no link was ever tried
                        if "URL" in a_link.keys() and (
                                "pdf" in a_link["URL"]
                                or "pdf" in a_link.get("content-type", "")):
                            cr_url = a_link["URL"]
                            pdf_file = get_pdf_from_url(cr_url)
                            # if the name corresponds to a existing file, assign value to db_record
                            if Path('pdf_files/' + pdf_file).is_file():
                                print("\tFile name:", pdf_file)
                                set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                                got_pdf = True
                            else:
                                print("\tcould not get file from", cr_url)
                else:
                    print("\tno links in json", pub_doi)
            if not got_pdf and "elsevier" in url_text:
                print("\tTrying elsevier doi:" )
                pdf_file = get_elsevier_pdf(pub_doi)
                if Path('pdf_files/' + pdf_file).is_file():
                    print("\tFile name:", pdf_file)
                    set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                    got_pdf = True
            elif not got_pdf and "wiley" in url_text:
                print("\tTrying wiley doi:" )
                pdf_file = get_wiley_pdf(pub_doi)
                if Path('pdf_files/' + pdf_file).is_file():
                    print("\tFile name:", pdf_file)
                    set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                    got_pdf = True

            if not got_pdf:
                # nothing worked: show the DOI link so it can be checked by hand
                print("\tTry doi:  https://doi.org/" + pub_doi)
                

## File name match

In [None]:
# Match every unlinked pdf against the DOIs stored in the db, using the
# lower-cased file name (without ".pdf") as the fragment to look for
files_list = get_not_matched_files(ukchapp_db)

not_assigned = []
for a_file in tqdm_notebook(files_list):
    search_this = a_file.name.replace(".pdf", "").lower()
    print(a_file.name,"\t",search_this)
    close_dois = get_close_dois(search_this, ukchapp_db)
    print(len(close_dois))

    # only an unambiguous hit is offered for assignment; everything else is
    # collected for the later steps
    if len(close_dois) != 1:
        not_assigned.append(a_file)
        continue

    doi_dat = close_dois[0]
    selected = False
    # position 3 of the row is the pdf file already recorded, if any
    if doi_dat[3] is not None:
        print("Assigned in db: ",  doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
        continue

    # repeat the question until one of the two options is typed
    while not selected:
        print("Assign file: ", a_file.name, " to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
        print('***************************************************************')
        print("Options:\n\ta) assign\n\tb)go to next")
        print("selection:")
        usr_select = input()
        if usr_select not in ('a', 'b'):
            continue
        selected = True
        if usr_select == 'a':
            set_pdf_file_value(a_file.name, doi_dat[0], ukchapp_db)
            print("assing and go to next")
        else:
            print("going to next")

## Use pdfminer to get metadata from pdf file

In [None]:
# Cross-check the pdf folder against the pdf file recorded for each
# publication (position 4 of every db row)
files_list = get_files_list(Path("pdf_files"))
db_pubs = get_pub_app_data(ukchapp_db)

# build both name sets once so every check below is a constant time lookup
# instead of a scan over the other list
linked_names = {db_pub[4] for db_pub in db_pubs if db_pub[4] is not None}
stored_names = {file.name for file in files_list}

# check which files are really missing linking
missing = [file for file in files_list if file.name not in linked_names]

# check if all linked files are in the folder
# publications without a recorded pdf are never reported; they used to be
# listed here whenever the folder held no pdf files at all
missing2 = [db_pub for db_pub in db_pubs
            if db_pub[4] is not None and db_pub[4] not in stored_names]


In [2]:
# use ChemDataExtractor to read pdf and get DOIs in document
# Files left unassigned by the file name match are read, the DOIs mentioned
# in their text are collected, and when exactly one distinct DOI is found it
# is offered for assignment to the matching db record.
for a_file in tqdm_notebook(not_assigned):
    pdf_doc = cde_read_pdfs(a_file)
    print(a_file.name)
    dois_list = []
    for element in pdf_doc.elements:
        element_text = str(element)
        if 'doi' in element_text:
            found_doi = find_doi(element_text)
            # a DOI at the end of a sentence picks up the full stop
            if found_doi[-1:] == ".":
                found_doi = found_doi[:-1]
            # find_doi returns "" when the element mentions "doi" without an
            # actual identifier; an empty entry would either hide a single
            # real DOI or be used as a match-everything search fragment
            if found_doi and found_doi not in dois_list:
                dois_list.append(found_doi)

    # only act when the document points to exactly one DOI
    if len(dois_list) == 1:
        a_doi = dois_list[0]
        close_dois = get_close_dois(a_doi, ukchapp_db)
        selected = False
        if len(close_dois) == 1:
            doi_dat = close_dois[0]
            # position 3 of the row is the pdf file already recorded, if any
            if doi_dat[3] is None:
                # repeat the question until one of the two options is typed
                while not selected:
                    print("Assign file: ",a_file.name, " to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
                    print('***************************************************************')
                    print("Options:\n\ta) assign\n\tb)go to next")
                    print("selection:")
                    usr_select = input()
                    if usr_select == 'a':
                        selected = True
                        set_pdf_file_value(a_file.name, doi_dat[0], ukchapp_db)
                        print("assing and go to next")
                    elif usr_select == 'b':
                        selected = True
                        print("going to next")
            else:
                print("Already assingned to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
                

NameError: name 'tqdm_notebook' is not defined