# Get PDF Files for publications in the UK Catalysis Hub app db
A list of publications is obtainded from the app database. This list will contain a titles, IDs and DOIs which need to be explored to look for asociated data (suplementary data, raw data, processed data). 

The steps of the process are: 

 -a) get a Title, DOI, and URL for each publication
 -b) get the DOI landing page and see if it contains references to data
 -c) add a new dataset entry each time a new ds is found
 -d) link the dataset to the publication.

In [1]:
# Libraries
# library containign functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searchs
import lib.handle_urls as urlh
# managing files and file paths
from pathlib import Path
# add aprogress bar
from tqdm import tqdm_notebook 
# library for getting data from crossref
import lib.crossref_api as cr_api
#library for handling json files
import json
# library for using regular expressions
import re
# library for handling http requests
import requests

In [7]:
# Custom Functions
# get the crossreference json page from doi
def get_cr_json_object(cr_doi):
  crjd = None
  doi_file = 'json_files/' + cr_doi.replace('/','_').lower() + '.json'
  if not Path(doi_file).is_file():
    crjd = cr_api.getBibData(cr_doi)
    with open(doi_file, 'w', encoding='utf-8-sig', errors='ignore') as f:
                json.dump(crjd, f, ensure_ascii=False, indent=4)
  else:
    with open(doi_file, 'r', encoding='utf-8-sig') as jf:
        crjd = json.load(jf)
  # return the content and the file name 
  return crjd, doi_file

# get the landing page for the publication from uri
def get_pub_html_doi(cr_doi):
    html_file = 'html_files/' + cr_doi.replace('/','_').lower() + '.html'
    if not Path(html_file).is_file():
        page_content = urlh.getPageFromDOI(doi_text)
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(page_content.decode("utf-8") )
    else:
        f = open(html_file, "r")
        page_content = f.read()
    return page_content, html_file
             
def get_titles(str_pub_title, db_name = "prev_search.sqlite3"):
    print(db_name)
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'prev_pop_searches'
    fields_required = "Num, Title"
    filter_str = "Title like '"+str_pub_title[0]+"%';"

    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

def get_titles_and_dois(str_pub_title, db_name = "app_db.sqlite3"):
    print(db_name)
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, title, doi"
    filter_str = "Title like '"+str_pub_title[0]+"%';"
    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

def get_pub_app_data(db_name = "app_db.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, title, doi, link, pdf_file, html_file"
    filter_str = "status = 'Added'"
    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

# get the current csv working file
def get_working_file(nr_wf):
    working_file = wf_fields = None
    current_pass = 0
    if Path(nr_wf).is_file():
        working_file, wf_fields = csvh.get_csv_data(nr_wf,'Num')
        for art_num in tqdm_notebook(working_file):
            if 'ignore' in working_file[art_num].keys():
                if current_pass < int(working_file[art_num]['ignore']):
                    current_pass = int(working_file[art_num]['ignore'])
            else:
                break
    print("Current pass:", current_pass)
    return working_file, wf_fields, current_pass

def get_pub_html_url(text_url, entry_id):
    html_file = 'html_files/' +  entry_id + '.html'
    if not Path(html_file).is_file():
        print("")
        page_content = urlh.getPageFromURL(text_url)
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(page_content)
    else:
        f = open(html_file, "r")
        page_content = f.read()
    return page_content, html_file

def valid_doi(cr_doi):
    # CR DOIS: https://www.crossref.org/blog/dois-and-matching-regular-expressions/
    # CR DOIs re1
    # /^10.\d{4,9}/[-._;()/:A-Z0-9]+$/i
    if cr_doi == None:
        return False
    cr_re_01 = '^10.\d{4,9}/[-._;()/:A-Z0-9]+'
    compare = re.match(cr_re_01, cr_doi, re.IGNORECASE)
    if compare != None and cr_doi == compare.group():
        return True
    else:
        return False
    
# get a semicolon separated list of authors from CR json data
def get_cr_author_list(article_data):
    authors = []
    if 'author' in article_data.keys():
        for author in article_data['author']:
            new_author=""
            new_author = author['family']
            if 'given' in author.keys():
                new_author += ", " + author['given']
            authors.append(new_author)
    return ("; ").join(authors)

# get the publication date from CR json data
def get_cr_year_published(article_data):
    year_print = 0
    if 'published-print' in article_data.keys() \
        and article_data['published-print'] != None \
        and article_data['published-print']['date-parts'][0] != None:
        year_print = int(article_data['published-print']['date-parts'][0][0])    
    elif 'journal-issue' in article_data.keys() \
        and article_data['journal-issue'] != None \
        and 'published-print' in article_data['journal-issue'].keys() \
        and article_data['journal-issue']['published-print'] != None \
        and article_data['journal-issue']['published-print']['date-parts'][0] != None:
        year_print = int(article_data['journal-issue']['published-print']['date-parts'][0][0])

    year_online = 0
    if 'published-online' in article_data.keys() \
        and article_data['published-online'] != None \
        and article_data['published-online']['date-parts'][0] != None:
        year_online = int(article_data['published-online']['date-parts'][0][0])    
    elif 'journal-issue' in article_data.keys() \
        and article_data['journal-issue'] != None \
        and 'published-online' in article_data['journal-issue'].keys() \
        and article_data['journal-issue']['published-online'] != None \
        and article_data['journal-issue']['published-online']['date-parts'][0] != None:
        year_print = int(article_data['journal-issue']['published-online']['date-parts'][0][0])
    
    if year_print != 0 and year_online != 0:
        return year_print if year_print < year_online else year_online
    else:
        return year_print if year_online == 0 else year_online
    return 0

def get_pdf_from_url(pdf_url):
    fname = ""
    try:
        response = requests.get(pdf_url)
        content_type = response.headers['content-type']
        if not 'text' in content_type:
            #print(response.headers)
            cd= response.headers['content-disposition']
            #print(cd)
            fname = re.findall("filename=(.+)", cd)[0]
            #print(fname)
            with open('pdf_files/'+ fname +'.pdf', 'wb') as f:
                f.write(response.content)
    except:
        print("Error getting file from: ", pdf_url)
    finally:
        return fname
    
def set_pdf_file_value(file_name, pub_id, db_name = "app_db.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    table = 'articles'   
    done = db_conn.set_value_table(table, pub_id, "pdf_file", file_name)
    db_conn.close()
    return done

def get_elsevier_pdf(doi):
    pdf_url = f'http://api.elsevier.com/content/article/doi:{doi}?view=FULL'
    print("\t", pdf_url) 
    return get_pdf_from_url(pdf_url)

def get_wiley_pdf(doi):
    pdf_url = f'https://onlinelibrary.wiley.com/doi/pdf/{doi}'
    print("\t", pdf_url) 
    return get_pdf_from_url(pdf_url)

Get the name of the current app db file:

In [8]:
# app db file with path: db_files/app_db.sqlite3
ukchapp_db = "db_files/app_db.sqlite3"
while not Path(ukchapp_db).is_file():
    print('Please enter the name of app db file:')
    ukchapp_db = input()

## Get pdf files for publications

Read database and try to recover pdf files

In [10]:
# get publication data from the ukch app
db_pubs = get_pub_app_data(ukchapp_db)

for a_pub in tqdm_notebook(db_pubs):
    pub_id = a_pub[0]
    pub_title = a_pub[1]
    pub_doi = a_pub[2]
    pub_url = a_pub[3]
    pub_pdf = a_pub[4]
    pub_html = a_pub[5]
    if pub_pdf == None:
        not_in_url = True
        print("ID: ", pub_id, "Publication: ",pub_title,
              "\n\tDOI: ", pub_doi, " URL: ", pub_url)
        if "pdf" in pub_url:
            print ("\tTry to get the pdf from URL: ", pub_url)
            try:
                response = requests.get(pub_url)
                content_type = response.headers['content-type']
                if not 'text' in content_type:
                    #print(response.headers)
                    cd= response.headers['content-disposition']
                    #print(cd)
                    fname = re.findall("filename=(.+)", cd)[0]
                    #print(fname)
                    if not Path('pdf_files/' + pdf_file).is_file():
                        with open('pdf_files/'+ fname +'.pdf', 'wb') as f:
                            f.write(response.content)
                    else:
                        set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                    not_in_url = False
            except:
                print("ID: ", pub_id, "\nPublication: ",pub_title, 
                       "\nDOI: ", pub_doi, "\nDOI: ", pub_url) 
        if not_in_url:
            print("\tTry to see if json file has link to pdf: ")
            if valid_doi(pub_doi):
                crjd, doi_file = get_cr_json_object(pub_doi)
                got_pdf_link = False
                if "link" in crjd.keys():
                    for a_link in crjd["link"]:
                        if "\tURL" in a_link.keys() and ("pdf" in a_link["URL"] or "pdf" in a_link["content-type"]):
                            cr_url = a_link["URL"]
                            #print("URL: ", cr_url)
                            pdf_file = get_pdf_from_url(cr_url)
                            # if the name corresponds to a existing file, assign value to db_record
                            if Path('pdf_files/' + pdf_file).is_file():
                                print("\tFile name:", pdf_file)
                                set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                                got_pdf_link = True
                            else:
                                print("\tcould not get file from", cr_url)
                    if not got_pdf_link and "elsevier" in pub_url:
                        print("\tTrying elsevier doi:" )
                        pdf_file = get_elsevier_pdf(pub_doi)
                    elif not got_pdf_link and "wiley" in pub_url:
                        print("\tTrying elsevier doi:" )
                        pdf_file = get_wiley_pdf(pub_doi)
                else: 
                    print("\tno links in json", pub_doi)
            
                

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=342.0), HTML(value='')))

ID:  5 Publication:  Simplified lipid II-binding antimicrobial peptides: Design, synthesis and antimicrobial activity of bioconjugates of nisin rings A and B with pore-forming peptides 
	DOI:  10.1016/j.bmc.2018.10.015  URL:  https://api.elsevier.com/content/article/PII:S0968089618313233?httpAccept=text/xml
	Try to see if json file has link to pdf: 
	Trying elsevier doi:
	 http://api.elsevier.com/content/article/doi:10.1016/j.bmc.2018.10.015?view=FULL
ID:  9 Publication:  Concise synthesis of artemisinin from a farnesyl diphosphate analogue 
	DOI:  10.1016/j.bmc.2017.03.068  URL:  https://api.elsevier.com/content/article/PII:S0968089617301268?httpAccept=text/xml
	Try to see if json file has link to pdf: 
	Trying elsevier doi:
	 http://api.elsevier.com/content/article/doi:10.1016/j.bmc.2017.03.068?view=FULL
ID:  19 Publication:  Operando HERFD-XANES/XES studies reveal differences in the activity of Fe-species in MFI and CHA structures for the standard selective catalytic reduction of NO

ID:  151 Publication:  Nb2O5/SBA-15 catalyzed propanoic acid esterification 
	DOI:  10.1016/j.apcatb.2016.12.066  URL:  https://api.elsevier.com/content/article/PII:S0926337316310025?httpAccept=text/xml
	Try to see if json file has link to pdf: 
	Trying elsevier doi:
	 http://api.elsevier.com/content/article/doi:10.1016/j.apcatb.2016.12.066?view=FULL
ID:  153 Publication:  Tellurium-doped lanthanum manganite as catalysts for the oxygen reduction reaction 
	DOI:  10.1557/mrc.2017.22  URL:  https://www.cambridge.org/core/services/aop-cambridge-core/content/view/S2159685917000222
	Try to see if json file has link to pdf: 
ID:  154 Publication:  Photoelectrochemical properties of BiOCl microplatelets 
	DOI:  10.1016/j.jelechem.2017.10.024  URL:  https://api.elsevier.com/content/article/PII:S1572665717307269?httpAccept=text/xml
	Try to see if json file has link to pdf: 
	Trying elsevier doi:
	 http://api.elsevier.com/content/article/doi:10.1016/j.jelechem.2017.10.024?view=FULL
ID:  155 Publ

ID:  252 Publication:  Supercritical antisolvent precipitation of TiO2 with tailored anatase/rutile composition for applications in redox catalysis and photocatalysis 
	DOI:  10.1016/j.apcata.2015.02.023  URL:  https://api.elsevier.com/content/article/PII:S0926860X15001155?httpAccept=text/xml
	Try to see if json file has link to pdf: 
	Trying elsevier doi:
	 http://api.elsevier.com/content/article/doi:10.1016/j.apcata.2015.02.023?view=FULL
ID:  255 Publication:  An Investigation of the Effect of the Addition of Tin to 5\u2009%Pd/TiO2for the Hydrogenation of Furfuryl Alcohol 
	DOI:  10.1002/cctc.201500242  URL:  https://api.wiley.com/onlinelibrary/tdm/v1/articles/10.1002%2Fcctc.201500242
	Try to see if json file has link to pdf: 
	Trying elsevier doi:
	 https://onlinelibrary.wiley.com/doi/pdf/10.1002/cctc.201500242
ID:  259 Publication:  Molecular Basis of Class A β-Lactamase Inhibition by Relebactam 
	DOI:  10.1128/aac.00564-19  URL:  https://syndication.highwire.org/content/doi/10.112

In [11]:
from chemdataextractor import Document

# A function for getting a list of files from the directory
# This will be modified to get the list from a csv file
def get_files_list (source_dir):
    i_counter = 0
    files_list = []
    for filepath in sorted(source_dir.glob('*.pdf')):
        i_counter += 1
        files_list.append(filepath)
    return files_list

def cde_read_pdfs(a_file):
    pdf_f = open(a_file, 'rb')
    doc = Document.from_file(pdf_f)
    return doc

def find_doi(element_text):
    cr_re_01 = '10.\d{4,9}/[-._;()/:A-Z0-9]+'
    compare = re.search(cr_re_01, element_text, re.IGNORECASE)
    if compare != None:
        return compare.group()
    return ""

def get_db_id(doi_value, db_name = "app_db.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    table = 'articles'   
    id_val = db_conn.get_value(table, "id", "doi", doi_value)
    db_conn.close()
    if id_val != None:
        return id_val[0]
    else:
        return 0

def get_db_title(doi_value, db_name = "app_db.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    table = 'articles'   
    id_val = db_conn.get_value(table, "title", "doi", doi_value)
    db_conn.close()
    if id_val != None:
        return id_val[0]
    else:
        return 0

def get_close_dois(str_name, db_name = "prev_search.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, doi, title, pdf_file"
    filter_str = "doi like '%"+str_name+"%';"

    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

In [16]:


files_list = get_files_list(Path("pdf_files"))
not_assigned = []
for a_file in tqdm_notebook(files_list):
    search_this = a_file.name.replace(".pdf", "").lower()
    print(a_file.name,"\t",search_this)
    close_dois = get_close_dois(search_this, ukchapp_db)
    print(len(close_dois))
    
    if len(close_dois) == 1 :
        doi_dat = close_dois[0]
        selected = False
        if doi_dat[3] == None:
            while not selected:
                print("Assign file: ", a_file.name, " to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
                print('***************************************************************')
                print("Options:\n\ta) assign\n\tb)go to next")
                print("selection:")
                usr_select = input()
                if usr_select == 'a':
                    selected = True
                    set_pdf_file_value(a_file.name, doi_dat[0], ukchapp_db)
                    print("assing and go to next")
                elif usr_select == 'b':
                    #working_file[art_num]['ignore']=3 # visual inspection
                    selected = True
                    print("going to next")
        else:
            print("Assigned in db: ",  doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
    else:
        not_assigned.append(a_file)        

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=287.0), HTML(value='')))

1-s2.0-S0021951719301459-main.pdf 	 1-s2.0-s0021951719301459-main
0
1-s2.0-S0920586118300415-main.pdf 	 1-s2.0-s0920586118300415-main
0
1-s2.0-S0920586118316456-main.pdf 	 1-s2.0-s0920586118316456-main
0
1-s2.0-S0926860X18305003-main.pdf 	 1-s2.0-s0926860x18305003-main
0
1-s2.0-S1010603017313138-main.pdf 	 1-s2.0-s1010603017313138-main
0
1.5039294.pdf 	 1.5039294
1
Assigned in db:  29 10.1063/1.5039294 QENS study of methane diffusion in Mo/H-ZSM-5 used for the methane dehydroaromatisation reaction 1.5039294.pdf
2015_Article_.pdf 	 2015_article_
0
2017_Article_.pdf 	 2017_article_
0
2190-4286-10-191.pdf 	 2190-4286-10-191
0
acs.biochem.8b00169.pdf 	 acs.biochem.8b00169
1
Assigned in db:  10 10.1021/acs.biochem.8b00169 Biocatalytic Routes to Lactone Monomers for Polymer Production acs.biochem.8b00169.pdf
acs.chemmater.5b00866.pdf 	 acs.chemmater.5b00866
1
Assigned in db:  129 10.1021/acs.chemmater.5b00866 Restructuring of AuPd Nanoparticles Studied by a Combined XAFS/DRIFTS Approach acs.

a
UPDATE articles SET pdf_file = 'adfm.201400338.pdf' WHERE id = 569
assing and go to next
aic.15095.pdf 	 aic.15095
1
Assigned in db:  124 10.1002/aic.15095 Probing pore blocking effects on multiphase reactions within porous catalyst particles using a discrete model aic.15095.pdf
aic.15415.pdf 	 aic.15415
1
Assigned in db:  125 10.1002/aic.15415 Influence of catalyst pore network structure on the hysteresis of multiphase reactions aic.15415.pdf
aic.16687.pdf 	 aic.16687
1
Assigned in db:  494 10.1002/aic.16687 Optimizing catalyst pore network structure in the presence of deactivation by coking aic.16687.pdf
ange.201703550.pdf 	 ange.201703550
1
Assigned in db:  93 10.1002/ange.201703550 Probing the Role of a Non-Thermal Plasma (NTP) in the Hybrid NTP Catalytic Oxidation of Methane ange.201703550.pdf
ange.201713115.pdf 	 ange.201713115
0
anie.201602930.pdf 	 anie.201602930
1
Assigned in db:  201 10.1002/anie.201602930 Dizinc Lactide Polymerization Catalysts: Hyperactivity by Control of

1
Assigned in db:  8 10.1039/c8ob00066b QM/MM simulations identify the determinants of catalytic activity differences between type II dehydroquinase enzymes C8OB00066B.pdf
C8SC03312A.pdf 	 c8sc03312a
1
Assigned in db:  482 10.1039/c8sc03312a Selective and catalytic carbon dioxide and heteroallene activation mediated by cerium N-heterocyclic carbene complexes C8SC03312A.pdf
C8TA02908C.pdf 	 c8ta02908c
1
Assigned in db:  399 10.1039/c8ta02908c Determining the importance of the electrode support and fabrication method during the initial screening process of an active catalyst for the oxygen evolution reaction C8TA02908C.pdf
C8TA12263F.pdf 	 c8ta12263f
1
Assigned in db:  269 10.1039/c8ta12263f In situ synthesized low-PtCo@porous carbon catalyst for highly efficient hydrogen evolution C8TA12263F.pdf
C9CC02088H.pdf 	 c9cc02088h
1
Assigned in db:  270 10.1039/c9cc02088h Evidence for tetranuclear bis-μ-oxo cubane species in molecular iridium-based water oxidation catalysts from XAS analysis C9

a
UPDATE articles SET pdf_file = 'cphc.201600149.pdf' WHERE id = 168
assing and go to next
cplu.201500195.pdf 	 cplu.201500195
1
Assign file:  cplu.201500195.pdf  to:
	 135 10.1002/cplu.201500195 Utilizing Benign Oxidants for Selective Aerobic Oxidations Using Heterogenized Platinum Nanoparticle Catalysts None
***************************************************************
Options:
	a) assign
	b)go to next
selection:
a
UPDATE articles SET pdf_file = 'cplu.201500195.pdf' WHERE id = 135
assing and go to next
cs400683e.pdf 	 cs400683e
1
Assigned in db:  358 10.1021/cs400683e Molybdenum Oxide on Fe2O3 Core–Shell Catalysts: Probing the Nature of the Structural Motifs Responsible for Methanol Oxidation Catalysis cs400683e.pdf
cs502038y.pdf 	 cs502038y
1
Assigned in db:  212 10.1021/cs502038y From Organometallic Zinc and Copper Complexes to Highly Active Colloidal Catalysts for the Conversion of CO2 to Methanol cs502038y.pdf
cssc.201403190.pdf 	 cssc.201403190
1
Assign file:  cssc.201403190.p

pssa.201600440.pdf 	 pssa.201600440
1
Assigned in db:  97 10.1002/pssa.201600440 Heterostructures of GaN with SiC and ZnO enhance carrier stability and separation in framework semiconductors pssa.201600440.pdf
rspa.2016.0054.pdf 	 rspa.2016.0054
1
Assigned in db:  178 10.1098/rspa.2016.0054 H2 production by the photocatalytic reforming of cellulose and raw biomass using Ni, Pd, Pt and Au on titania rspa.2016.0054.pdf
rspa.2016.0078.pdf 	 rspa.2016.0078
1
Assigned in db:  204 10.1098/rspa.2016.0078 Proteins as templates for complex synthetic metalloclusters: towards biologically programmed heterogeneous catalysis rspa.2016.0078.pdf
rspa.2016.0095.pdf 	 rspa.2016.0095
1
Assigned in db:  127 10.1098/rspa.2016.0095 Influence of dopant substitution mechanism on catalytic properties within hierarchical architectures rspa.2016.0095.pdf
rspa.2016.0126.pdf 	 rspa.2016.0126
1
Assigned in db:  110 10.1098/rspa.2016.0126 The reaction of formic acid with Raney  TM copper rspa.2016.0126.pdf
rsta.201

In [17]:
len(not_assigned)

34

In [18]:
for a_file in tqdm_notebook(not_assigned):
    pdf_doc = cde_read_pdfs(a_file)
    print(a_file.name)
    dois_list = []
    for element in pdf_doc.elements:
        if 'doi' in str(element):
            found_doi = find_doi(str(element))
            if found_doi[-1:] == ".":
                found_doi = found_doi[:-1]
            if not found_doi in dois_list:
                dois_list.append(found_doi)       
    
    if dois_list != [] and len(dois_list) == 1:
        for a_doi in dois_list:
            close_dois = get_close_dois(a_doi, ukchapp_db)
            selected = False
            if len(close_dois) == 1:
                doi_dat = close_dois[0]
                if doi_dat[3] == None:
                    while not selected:
                        print("Assign file: ",a_file.name, " to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
                        print('***************************************************************')
                        print("Options:\n\ta) assign\n\tb)go to next")
                        print("selection:")
                        usr_select = input()
                        if usr_select == 'a':
                            selected = True
                            set_pdf_file_value(a_file.name, doi_dat[0], ukchapp_db)
                            print("assing and go to next")
                        elif usr_select == 'b':
                            #working_file[art_num]['ignore']=3 # visual inspection
                            selected = True
                            print("going to next")
                else: 
                    print("Already assingned to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
                

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=34.0), HTML(value='')))

1-s2.0-S0021951719301459-main.pdf
Already assingned to:
	 365 10.1016/j.jcat.2019.03.037 Combined spatially resolved operando spectroscopy: New insights into kinetic oscillations of CO oxidation on Pd/γ-Al2O3 1-s2.0-S0021951719301459-main.pdf
1-s2.0-S0920586118300415-main.pdf
Already assingned to:
	 445 10.1016/j.cattod.2018.01.033 Preparation of bifunctional Au-Pd/TiO2 catalysts and research on methanol liquid phase one-step oxidation to methyl formate 1-s2.0-S0920586118300415-main.pdf
1-s2.0-S0920586118316456-main.pdf
Already assingned to:
	 282 10.1016/j.cattod.2019.01.065 In-depth characterisation of metal-support compounds in spent Co/SiO2 Fischer-Tropsch model catalysts 1-s2.0-S0920586118316456-main.pdf
1-s2.0-S0926860X18305003-main.pdf
Already assingned to:
	 18 10.1016/j.apcata.2018.10.010 Investigation of ZSM-5 catalysts for dimethylether conversion using inelastic neutron scattering 1-s2.0-S0926860X18305003-main.pdf
1-s2.0-S1010603017313138-main.pdf
Already assingned to:
	 44

In [19]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

files_list = get_files_list(Path("pdf_files"))
db_pubs = get_pub_app_data(ukchapp_db)
missing=[]
# check which files are really missing linking
for file in files_list:
    found_in_db = False
    for db_pub in db_pubs:
        if file.name == db_pub[4]:
            found_in_db = True
            break
    if not found_in_db:
       missing.append(file) 

second_round = missing
not_assigned2=[]
for a_file in tqdm_notebook(second_round):
    fp = open(a_file, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    #print(doc.info)  # The "Info" metadata
    if 'Subject' in doc.info[0].keys():
        print("File name: ", a_file.name, " Subject ", doc.info[0]['Subject'])
        found_doi = find_doi(str(doc.info[0]['Subject']))
        if found_doi != None:
            print("DOI in metadata: ",found_doi)
            close_dois = get_close_dois(found_doi, ukchapp_db)
            if len(close_dois) == 1:
                doi_dat = close_dois[0]
                if doi_dat[3] == None:
                    selected = False
                    while not selected:
                        print("Assign file: ",a_file.name, " to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
                        print('***************************************************************')
                        print("Options:\n\ta) assign\n\tb)go to next")
                        print("selection:")
                        usr_select = input()
                        if usr_select == 'a':
                            selected = True
                            set_pdf_file_value(a_file.name, doi_dat[0], ukchapp_db)
                            print("assing and go to next")
                        elif usr_select == 'b':
                            #working_file[art_num]['ignore']=3 # visual inspection
                            selected = True
                            print("going to next")
                else: 
                    print("Already assingned to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
            
    else:
        print("********", a_file.name, "*******")
        not_assigned2.append(a_file.name)
        


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [20]:
not_assigned2

[]

In [21]:
second_round

[]

In [22]:
len(not_assigned2)

0

In [None]:
files_list = get_files_list(Path("pdf_files"))
db_pubs = get_pub_app_data(ukchapp_db)
missing=[]
# check which files are really missing linking
for file in files_list:
    found_in_db = False
    for db_pub in db_pubs:
        if file.name == db_pub[4]:
            found_in_db = True
            break
    if not found_in_db:
        missing.append(file)

# check if all linked files are in the folder
missing2=[]
for db_pub in db_pubs:
    found_in_system = False
    for file in files_list:
        if file.name == db_pub[4] or db_pub[4] == None:
            found_in_system = True
            break
    if not found_in_system:
        missing2.append(db_pub)


In [None]:
missing


In [None]:
len(missing)

In [None]:
files_list

In [None]:
db_pubs