# Get PDF Files for publications in the UK Catalysis Hub app db
A list of publications is obtained from the app database. This list contains titles, IDs and DOIs, which need to be explored to look for associated data (supplementary data, raw data, processed data). 

The steps of the process are: 

 -a) get a Title, DOI, and URL for each publication
 -b) get the DOI landing page and see if it contains references to data
 -c) add a new dataset entry each time a new ds is found
 -d) link the dataset to the publication.

In [1]:
# Libraries
# library containing functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searchs
import lib.handle_urls as urlh
# managing files and file paths
from pathlib import Path
# add a progress bar
from tqdm import tqdm_notebook 
# library for getting data from crossref
import lib.crossref_api as cr_api
#library for handling json files
import json
# library for using regular expressions
import re
# library for handling http requests
import requests

In [2]:
# Custom Functions
# get the crossreference json page from doi
def get_cr_json_object(cr_doi):
    """Return the Crossref record for a DOI, using a local JSON cache.

    The cache file lives in ``json_files/`` and is named after the DOI with
    every '/' replaced by '_' and lower-cased. When the file exists it is
    loaded; otherwise the record is requested through
    ``cr_api.getBibData`` and written to the cache before being returned.

    Args:
        cr_doi: DOI string of the publication.

    Returns:
        A ``(record, cache_file_name)`` tuple.
    """
    doi_file = 'json_files/' + cr_doi.replace('/','_').lower() + '.json'
    # cache hit: read the stored record and stop here
    if Path(doi_file).is_file():
        with open(doi_file, 'r', encoding='utf-8-sig') as cached:
            return json.load(cached), doi_file
    # cache miss: ask Crossref and keep a copy for the next run
    crjd = cr_api.getBibData(cr_doi)
    with open(doi_file, 'w', encoding='utf-8-sig', errors='ignore') as out:
        json.dump(crjd, out, ensure_ascii=False, indent=4)
    return crjd, doi_file

# get the landing page for the publication from uri
def get_pub_html_doi(cr_doi):
    """Return the landing page HTML for a DOI, using a local file cache.

    The cache file lives in ``html_files/`` and is named after the DOI with
    every '/' replaced by '_' and lower-cased. When the file exists its text
    is returned; otherwise the page is fetched with
    ``urlh.getPageFromDOI`` and stored as UTF-8.

    Args:
        cr_doi: DOI string of the publication.

    Returns:
        A ``(page_text, html_file_name)`` tuple; ``page_text`` is a ``str``
        on both the cached and the freshly downloaded path.
    """
    html_file = 'html_files/' + cr_doi.replace('/','_').lower() + '.html'
    if Path(html_file).is_file():
        # read with the same encoding used when writing the cache
        with open(html_file, 'r', encoding='utf-8') as f:
            page_content = f.read()
    else:
        # the original passed an undefined name (doi_text) here
        raw_page = urlh.getPageFromDOI(cr_doi)
        # presumably bytes (the original called .decode on it); tolerate str
        if isinstance(raw_page, bytes):
            page_content = raw_page.decode("utf-8")
        else:
            page_content = raw_page
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(page_content)
    return page_content, html_file
             
def get_titles(str_pub_title, db_name = "prev_search.sqlite3"):
    """Fetch (Num, Title) rows from 'prev_pop_searches' by title prefix.

    The filter is ``Title like '<prefix>%'`` where the prefix is
    ``str_pub_title[0]``.
    NOTE(review): with a plain string argument that prefix is only the first
    character; with a list it is the first item - confirm what callers pass.

    Args:
        str_pub_title: value whose first item is used as the title prefix.
        db_name: path of the database file to query.

    Returns:
        Whatever ``DataBaseAdapter.get_values`` returns for the query.
    """
    print(db_name)
    adapter = dbh.DataBaseAdapter(db_name)
    title_filter = "Title like '" + str_pub_title[0] + "%';"
    rows = adapter.get_values('prev_pop_searches', "Num, Title", title_filter)
    adapter.close()
    return rows

def get_titles_and_dois(str_pub_title, db_name = "app_db.sqlite3"):
    """Fetch (id, title, doi) rows from 'articles' by title prefix.

    The filter is ``Title like '<prefix>%'`` where the prefix is
    ``str_pub_title[0]``.
    NOTE(review): with a plain string argument that prefix is only the first
    character; with a list it is the first item - confirm what callers pass.

    Args:
        str_pub_title: value whose first item is used as the title prefix.
        db_name: path of the database file to query.

    Returns:
        Whatever ``DataBaseAdapter.get_values`` returns for the query.
    """
    print(db_name)
    adapter = dbh.DataBaseAdapter(db_name)
    title_filter = "Title like '" + str_pub_title[0] + "%';"
    rows = adapter.get_values('articles', "id, title, doi", title_filter)
    adapter.close()
    return rows

def get_pub_app_data(db_name = "app_db.sqlite3"):
    """Fetch the publications whose status is 'Added' from 'articles'.

    Args:
        db_name: path of the app database file.

    Returns:
        The rows returned by ``DataBaseAdapter.get_values``; the requested
        columns are, in order: id, title, doi, link, pdf_file, html_file.
    """
    columns = "id, title, doi, link, pdf_file, html_file"
    adapter = dbh.DataBaseAdapter(db_name)
    rows = adapter.get_values('articles', columns, "status = 'Added'")
    adapter.close()
    return rows

# get the current csv working file
def get_working_file(nr_wf):
    """Load the csv working file and work out the current pass number.

    When ``nr_wf`` is an existing file it is read with
    ``csvh.get_csv_data`` (keyed by 'Num'). The rows are then scanned in
    order; the pass number is the largest integer found in an 'ignore'
    column, and scanning stops at the first row that has no 'ignore' key.

    Args:
        nr_wf: path of the csv working file.

    Returns:
        ``(working_file, wf_fields, current_pass)``; the first two are None
        and the pass is 0 when the file does not exist.
    """
    rows = None
    field_names = None
    highest_pass = 0
    if Path(nr_wf).is_file():
        rows, field_names = csvh.get_csv_data(nr_wf,'Num')
        for row_key in tqdm_notebook(rows):
            row = rows[row_key]
            if 'ignore' not in row.keys():
                # rows past this point are not inspected
                break
            highest_pass = max(highest_pass, int(row['ignore']))
    print("Current pass:", highest_pass)
    return rows, field_names, highest_pass

def get_pub_html_url(text_url, entry_id):
    """Return the HTML page for a URL, using a local file cache.

    The cache file is ``html_files/<entry_id>.html``. When it exists its
    text is returned; otherwise the page is fetched with
    ``urlh.getPageFromURL`` and stored as UTF-8.

    Args:
        text_url: URL of the page to retrieve.
        entry_id: identifier used to name the cache file (str or int).

    Returns:
        A ``(page_content, html_file_name)`` tuple.
    """
    # f-string so that numeric ids work as well as strings
    html_file = f'html_files/{entry_id}.html'
    if not Path(html_file).is_file():
        print("")
        page_content = urlh.getPageFromURL(text_url)
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(page_content)
    else:
        # read with the encoding used for writing, and close the handle
        with open(html_file, 'r', encoding='utf-8') as f:
            page_content = f.read()
    return page_content, html_file

def valid_doi(cr_doi):
    """Return True when the whole of ``cr_doi`` looks like a Crossref DOI.

    The pattern follows the first expression recommended by Crossref
    (https://www.crossref.org/blog/dois-and-matching-regular-expressions/):
    ``/^10.\\d{4,9}/[-._;()/:A-Z0-9]+$/i``.

    Args:
        cr_doi: candidate DOI string, or None.

    Returns:
        True if the complete string matches the pattern, False otherwise
        (including when ``cr_doi`` is None).
    """
    if cr_doi is None:
        return False
    # Raw string so "\d" is not treated as a string escape; the dot after
    # "10" is escaped so that only a literal "." is accepted there.
    cr_re_01 = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+'
    return re.fullmatch(cr_re_01, cr_doi, re.IGNORECASE) is not None
    
# get a semicolon separated list of authors from CR json data
def get_cr_author_list(article_data):
    """Build a '; '-separated author list from a Crossref record.

    Each author is rendered as ``family, given`` (or just ``family`` when no
    given name is present). Entries without a 'family' key fall back to
    their 'name' value, then to 'given'; entries with none of these are
    skipped instead of raising KeyError.

    Args:
        article_data: Crossref record (dict) that may contain an 'author'
            list of dicts.

    Returns:
        The joined author string, or "" when there are no authors.
    """
    authors = []
    for author in article_data.get('author') or []:
        # NOTE(review): organisation authors are assumed to carry 'name'
        # instead of 'family' - confirm against the Crossref format docs
        surname = author.get('family') or author.get('name')
        given = author.get('given')
        if surname and given:
            authors.append(surname + ", " + given)
        elif surname or given:
            authors.append(surname or given)
    return "; ".join(authors)

# get the publication date from CR json data
def _cr_year_from(container, key):
    """Return the year in ``container[key]['date-parts'][0][0]``, or 0."""
    if not container:
        return 0
    date_info = container.get(key)
    if not date_info:
        return 0
    date_parts = date_info.get('date-parts')
    # guard against missing, empty or [[None]] date parts
    if not date_parts or not date_parts[0] or date_parts[0][0] is None:
        return 0
    return int(date_parts[0][0])


def get_cr_year_published(article_data):
    """Return the earliest publication year found in a Crossref record.

    The print year is read from 'published-print' at the top level and, if
    that is absent, from the same key under 'journal-issue'; the online year
    is read the same way from 'published-online'. (The original stored the
    journal-issue online year in the print variable, overwriting it.)

    Args:
        article_data: Crossref record (dict).

    Returns:
        The smaller of the print and online years when both are known, the
        one that is known when only one is, and 0 when neither is.
    """
    issue = article_data.get('journal-issue')
    year_print = (_cr_year_from(article_data, 'published-print')
                  or _cr_year_from(issue, 'published-print'))
    year_online = (_cr_year_from(article_data, 'published-online')
                   or _cr_year_from(issue, 'published-online'))
    known_years = [year for year in (year_print, year_online) if year != 0]
    return min(known_years) if known_years else 0

def get_pdf_from_url(pdf_url):
    """Download a file from ``pdf_url`` into ``pdf_files/``.

    Nothing is saved when the response content type contains 'text'.
    Otherwise the name is taken from the ``filename=`` part of the
    content-disposition header and the body is written to
    ``pdf_files/<name>.pdf``.

    Args:
        pdf_url: URL expected to serve the PDF.

    Returns:
        The name of the file written inside ``pdf_files/`` (including the
        appended '.pdf'), or "" when nothing was saved. The original
        returned the header name without the suffix, which never matched
        the file on disk.
    """
    saved_name = ""
    try:
        response = requests.get(pdf_url)
        content_type = response.headers['content-type']
        if 'text' not in content_type:
            cd = response.headers['content-disposition']
            fname = re.findall("filename=(.+)", cd)[0]
            target_name = fname + '.pdf'
            with open('pdf_files/' + target_name, 'wb') as f:
                f.write(response.content)
            # only report a name once the file has actually been written
            saved_name = target_name
    except Exception as err:
        # best effort: report and carry on. Unlike the former bare except
        # with `return` inside `finally`, KeyboardInterrupt still propagates.
        print("Error getting file from: ", pdf_url, "-", err)
    return saved_name
    
def set_pdf_file_value(file_name, pub_id, db_name = "app_db.sqlite3"):
    """Store ``file_name`` in the pdf_file column of an 'articles' record.

    Args:
        file_name: value to write to the "pdf_file" column.
        pub_id: identifier of the record, passed to ``set_value_table``.
        db_name: path of the app database file.

    Returns:
        The result of ``DataBaseAdapter.set_value_table``.
    """
    adapter = dbh.DataBaseAdapter(db_name)
    outcome = adapter.set_value_table('articles', pub_id, "pdf_file", file_name)
    adapter.close()
    return outcome

Get the name of the current app db file:

In [3]:
# default app db location; keep asking until the path names an existing file
ukchapp_db = "db_files/app_db.sqlite3"
while True:
    if Path(ukchapp_db).is_file():
        break
    print('Please enter the name of app db file:')
    ukchapp_db = input()

## Get pdf files for publications

Read database and try to recover pdf files

In [4]:
# get publication data from the ukch app
db_pubs = get_pub_app_data(ukchapp_db)

for a_pub in tqdm_notebook(db_pubs):
    # columns arrive in the order requested by get_pub_app_data
    pub_id = a_pub[0]
    pub_title = a_pub[1]
    pub_doi = a_pub[2]
    pub_url = a_pub[3]
    pub_pdf = a_pub[4]
    pub_html = a_pub[5]
    # publications that already have a pdf recorded need no work
    if pub_pdf is not None:
        continue
    # a missing link (None) is sent to the DOI route instead of raising
    if pub_url and "pdf" in pub_url:
        print ("try to get the pdf from url")
        try:
            response = requests.get(pub_url)
            content_type = response.headers['content-type']
            if 'text' not in content_type:
                cd = response.headers['content-disposition']
                fname = re.findall("filename=(.+)", cd)[0]
                # name of the file as stored on disk; the original tested an
                # undefined `pdf_file` here, which the bare except then hid
                pdf_file = fname + '.pdf'
                if not Path('pdf_files/' + pdf_file).is_file():
                    with open('pdf_files/' + pdf_file, 'wb') as f:
                        f.write(response.content)
                # record the file whether it was just saved or already there
                set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
        except Exception as err:
            # best effort: report the publication and go on with the next one
            print("ID: ", pub_id, "\nPublication: ",pub_title,
                   "\nDOI: ", pub_doi, "\nURL: ", pub_url, "\nError: ", err)
    else:
        print("try to see if json file has link to pdf", pub_doi)
        print(pub_title)
        if not valid_doi(pub_doi):
            continue
        crjd, doi_file = get_cr_json_object(pub_doi)
        # the crossref lookup may yield nothing at all
        if not crjd or "link" not in crjd:
            print("no links in json", pub_doi)
            continue
        for a_link in crjd["link"]:
            cr_url = a_link.get("URL")
            if not cr_url:
                continue
            # "content-type" is optional, so do not index it directly
            if "pdf" in cr_url or "pdf" in a_link.get("content-type", ""):
                pdf_file = get_pdf_from_url(cr_url)
                # if the name corresponds to an existing file, assign value to db_record
                if pdf_file and Path('pdf_files/' + pdf_file).is_file():
                    print("File name:", pdf_file)
                    set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                else:
                    print("could not get file from", cr_url)
                

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=342.0), HTML(value='')))

try to see if json file has link to pdf 10.1016/j.bmc.2018.10.015
Simplified lipid II-binding antimicrobial peptides: Design, synthesis and antimicrobial activity of bioconjugates of nisin rings A and B with pore-forming peptides
try to get the pdf from url
try to see if json file has link to pdf 10.1016/j.bmc.2017.03.068
Concise synthesis of artemisinin from a farnesyl diphosphate analogue
try to get the pdf from url
try to get the pdf from url
try to get the pdf from url
try to get the pdf from url
try to see if json file has link to pdf 10.1016/j.apcata.2018.11.026
Operando HERFD-XANES/XES studies reveal differences in the activity of Fe-species in MFI and CHA structures for the standard selective catalytic reduction of NO with NH3
try to get the pdf from url
try to see if json file has link to pdf 10.1016/j.apcatb.2018.07.008
Directed aqueous-phase reforming of glycerol through tailored platinum nanoparticles
try to see if json file has link to pdf 10.1016/j.micromeso.2017.12.015
S

try to get the pdf from url
try to get the pdf from url
try to get the pdf from url
try to see if json file has link to pdf 10.1016/j.susc.2015.12.024
XPS and STM studies of the oxidation of hydrogen chloride at Cu(100) surfaces
try to get the pdf from url
try to see if json file has link to pdf 10.1016/j.susc.2015.11.010
The surface of iron molybdate catalysts used for the selective oxidation of methanol
try to see if json file has link to pdf 10.1016/j.jcat.2016.03.017
Pd/ZnO catalysts for direct CO2 hydrogenation to methanol
try to see if json file has link to pdf 10.1016/j.susc.2016.01.001
From surface science to catalysis: The importance of methoxy and formate species on Cu single crystals and industrial catalysts
try to see if json file has link to pdf 10.1016/j.apcata.2015.10.023
Exploring the mechanisms of metal co-catalysts in photocatalytic reduction reactions: Is Ag a good candidate?
try to see if json file has link to pdf 10.1002/cssc.201500503
Oxidation of Aliphatic Alcoho

In [8]:
from chemdataextractor import Document

# A function for getting a list of files from the directory
# This will be modified to get the list from a csv file
def get_files_list (source_dir):
    """Return the ``*.pdf`` entries directly inside ``source_dir``, sorted.

    Args:
        source_dir: a ``pathlib.Path`` pointing at the directory to scan.

    Returns:
        A list of paths in sorted order.
    """
    return sorted(source_dir.glob('*.pdf'))

def cde_read_pdfs(a_file):
    """Parse a PDF file into a chemdataextractor ``Document``.

    Args:
        a_file: path of the PDF file to read.

    Returns:
        The ``Document`` produced by ``Document.from_file``.
    """
    # The handle is closed as soon as parsing returns; the original left it
    # open for every file processed.
    # NOTE(review): assumes Document.from_file consumes the stream before
    # returning - confirm against the chemdataextractor documentation.
    with open(a_file, 'rb') as pdf_f:
        return Document.from_file(pdf_f)

def find_doi(element_text):
    """Return the first DOI-looking substring of ``element_text``.

    Args:
        element_text: text to search.

    Returns:
        The matched DOI, or "" when the text contains none. Since '.' is an
        allowed DOI character, a sentence-ending full stop directly after
        the DOI is part of the match.
    """
    # Raw string so "\d" is not treated as a string escape; the dot after
    # "10" is escaped so that only a literal "." is accepted there.
    cr_re_01 = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+'
    found = re.search(cr_re_01, element_text, re.IGNORECASE)
    if found is None:
        return ""
    return found.group()

def get_db_id(doi_value, db_name = "app_db.sqlite3"):
    """Look up the id of the 'articles' record with the given DOI.

    Args:
        doi_value: DOI to look for in the "doi" column.
        db_name: path of the app database file.

    Returns:
        The first item of the row returned by ``DataBaseAdapter.get_value``,
        or 0 when that call returns None.
    """
    adapter = dbh.DataBaseAdapter(db_name)
    row = adapter.get_value('articles', "id", "doi", doi_value)
    adapter.close()
    return 0 if row is None else row[0]

def get_db_title(doi_value, db_name = "app_db.sqlite3"):
    """Look up the title of the 'articles' record with the given DOI.

    Args:
        doi_value: DOI to look for in the "doi" column.
        db_name: path of the app database file.

    Returns:
        The first item of the row returned by ``DataBaseAdapter.get_value``,
        or 0 (not an empty string) when that call returns None.
    """
    adapter = dbh.DataBaseAdapter(db_name)
    row = adapter.get_value('articles', "title", "doi", doi_value)
    adapter.close()
    return 0 if row is None else row[0]

def get_close_dois(str_name, db_name = "prev_search.sqlite3"):
    """Fetch 'articles' rows whose doi contains ``str_name``.

    Args:
        str_name: text to look for inside the doi column; in this notebook
            it is a PDF file name stem or a DOI extracted from a PDF.
        db_name: path of the database file to query.

    Returns:
        The rows returned by ``DataBaseAdapter.get_values``; the requested
        columns are, in order: id, doi, title, pdf_file.
    """
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, doi, title, pdf_file"
    # The value is pasted into a quoted SQL literal, so double any single
    # quote to keep names such as "o'brien.pdf" from breaking the statement.
    # NOTE(review): a parameterised query would be safer, but the adapter
    # only takes a ready-made filter string here.
    safe_name = str_name.replace("'", "''")
    filter_str = "doi like '%" + safe_name + "%';"

    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

In [6]:


# Try to link every PDF to a publication through its file name: the name
# without ".pdf", lower-cased, is searched for inside the stored DOIs.
files_list = get_files_list(Path("pdf_files"))
not_assigned = []
for a_file in tqdm_notebook(files_list):
    search_this = a_file.name.replace(".pdf", "").lower()
    print(a_file.name,"\t",search_this)
    close_dois = get_close_dois(search_this, ukchapp_db)
    print(len(close_dois))

    # zero or several candidates: keep the file for the later passes
    if len(close_dois) != 1:
        not_assigned.append(a_file)
        continue

    doi_dat = close_dois[0]
    selected = False
    # a pdf is already recorded for this publication: only report it
    if doi_dat[3] is not None:
        print("Assigned in db: ",  doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
        continue

    # ask until the user answers with one of the two options
    while not selected:
        print("Assign file: ", a_file.name, " to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
        print('***************************************************************')
        print("Options:\n\ta) assign\n\tb)go to next")
        print("selection:")
        usr_select = input()
        if usr_select == 'a':
            selected = True
            set_pdf_file_value(a_file.name, doi_dat[0], ukchapp_db)
            print("assing and go to next")
        elif usr_select == 'b':
            selected = True
            print("going to next")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=244.0), HTML(value='')))

1-s2.0-S0021951719301459-main.pdf 	 1-s2.0-s0021951719301459-main
0
1-s2.0-S0920586118316456-main.pdf 	 1-s2.0-s0920586118316456-main
0
1-s2.0-S1385894719325884-main.pdf 	 1-s2.0-s1385894719325884-main
0
1.5039294.pdf.pdf 	 1.5039294
1
Assigned in db:  29 10.1063/1.5039294 QENS study of methane diffusion in Mo/H-ZSM-5 used for the methane dehydroaromatisation reaction 1.5039294.pdf.pdf
2015_Article_.pdf 	 2015_article_
0
2017_Article_.pdf 	 2017_article_
0
2018_Article_.pdf 	 2018_article_
0
2190-4286-10-191.pdf 	 2190-4286-10-191
0
287598212.pdf 	 287598212
0
acs.chemmater.5b00866.pdf.pdf 	 acs.chemmater.5b00866
1
Assigned in db:  129 10.1021/acs.chemmater.5b00866 Restructuring of AuPd Nanoparticles Studied by a Combined XAFS/DRIFTS Approach acs.chemmater.5b00866.pdf.pdf
acs.chemmater.7b02552.pdf.pdf 	 acs.chemmater.7b02552
1
Assigned in db:  94 10.1021/acs.chemmater.7b02552 Combined In Situ XAFS/DRIFTS Studies of the Evolution of Nanoparticle Structures from Molecular Precursors acs.

1
Assigned in db:  148 10.1039/c3cp52653d Molecular dynamics simulations of longer n-alkanes in silicalite: a comparison of framework and hydrocarbon models C3CP52653D.pdf
C4CC04024D.pdf 	 c4cc04024d
1
Assigned in db:  142 10.1039/c4cc04024d Selective photocatalytic oxidation of benzene for the synthesis of phenol using engineered Au–Pd alloy nanoparticles supported on titanium dioxide C4CC04024D.pdf
C4CP00753K.pdf 	 c4cp00753k
1
Assigned in db:  550 10.1039/c4cp00753k Segregation effects on the properties of (AuAg)147 C4CP00753K.pdf
C4CP04693E.pdf 	 c4cp04693e
1
Assigned in db:  145 10.1039/c4cp04693e Optimised photocatalytic hydrogen production using core–shell AuPd promoters with controlled shell thickness C4CP04693E.pdf
C4DT01309C.pdf 	 c4dt01309c
1
Assigned in db:  146 10.1039/c4dt01309c Well-controlled metal co-catalysts synthesised by chemical vapour impregnation for photocatalytic hydrogen production and water purification C4DT01309C.pdf
C4GC00087K.pdf 	 c4gc00087k
1
Assigned i

1
Assigned in db:  558 10.1039/c7cy00875a Reactivity of cationic α-diimine cyclopentadienyl nickel complexes towards AlEt2Cl: synthesis, characterisation and ethylene polymerisation C7CY00875A.pdf
C7CY01553D.pdf 	 c7cy01553d
1
Assigned in db:  417 10.1039/c7cy01553d Towards the upgrading of fermentation broths to advanced biofuels: a water tolerant catalyst for the conversion of ethanol to isobutanol C7CY01553D.pdf
C7DT01022B.pdf 	 c7dt01022b
1
Assigned in db:  89 10.1039/c7dt01022b A ruthenium(ii) bis(phosphinophosphinine) complex as a precatalyst for transfer-hydrogenation and hydrogen-borrowing reactions C7DT01022B.pdf
C7DT02167D.pdf 	 c7dt02167d
1
Assigned in db:  195 10.1039/c7dt02167d Ring opening polymerisation of lactide with uranium(iv) and cerium(iv) phosphinoaryloxide complexes C7DT02167D.pdf
C7DT03395H.pdf 	 c7dt03395h
1
Assigned in db:  67 10.1039/c7dt03395h Catalytic applications of small bite-angle diphosphorus ligands with single-atom linkers C7DT03395H.pdf
c7dt04805j.p

Assigned in db:  55 10.1002/cctc.201900795 Carbidisation of Pd Nanoparticles by Ethene Decomposition with Methane Production cctc.201900795.pdf
cctc.201901166.pdf 	 cctc.201901166
1
Assigned in db:  596 10.1002/cctc.201901166 Implications of the Molybdenum Coordination Environment in MFI Zeolites on Methane Dehydroaromatisation Performance cctc.201901166.pdf
cctc.201901268.pdf 	 cctc.201901268
1
Assigned in db:  3 10.1002/cctc.201901268 In Situ Monitoring of Nanoparticle Formation during Iridium‐Catalysed Oxygen Evolution by Real‐Time Small Angle X‐Ray Scattering cctc.201901268.pdf
cctc.201901955.pdf 	 cctc.201901955
1
Assigned in db:  583 10.1002/cctc.201901955 Influence of Synthesis Conditions on the Structure of Nickel Nanoparticles and their Reactivity in Selective Asymmetric Hydrogenation cctc.201901955.pdf
celc.201800052.pdf 	 celc.201800052
1
Assigned in db:  36 10.1002/celc.201800052 Effect of Ba Content on the Activity of La1-xBaxMnO3 Towards the Oxygen Reduction Reaction celc

1
Assigned in db:  586 10.1038/s41467-020-15445-z Synergistic ultraviolet and visible light photo-activation enables intensified low-temperature methanol synthesis over copper/zinc oxide/alumina s41467-020-15445-z.pdf
s41467-020-17852-8.pdf 	 s41467-020-17852-8
0
s41563-019-0562-6.pdf 	 s41563-019-0562-6
1
Assigned in db:  601 10.1038/s41563-019-0562-6 Quantitative production of butenes from biomass-derived γ-valerolactone catalysed by hetero-atomic MFI zeolite s41563-019-0562-6.pdf
s41563-020-0800-y.pdf 	 s41563-020-0800-y
1
Assigned in db:  604 10.1038/s41563-020-0800-y Insight into the effects of confined hydrocarbon species on the lifetime of methanol conversion catalysts s41563-020-0800-y.pdf
s41586-020-2733-7.pdf 	 s41586-020-2733-7
0
s41589-018-0154-9.pdf 	 s41589-018-0154-9
1
Assigned in db:  50 10.1038/s41589-018-0154-9 Functional and informatics analysis enables glycosyltransferase activity prediction s41589-018-0154-9.pdf
s41929-018-0197-z.pdf 	 s41929-018-0197-z
1
Assigned 

In [7]:
# number of PDF files that did not match exactly one db record by file name
len(not_assigned)

40

In [9]:
# Second pass: read each unmatched PDF, collect the DOIs mentioned in its
# text and, when exactly one is found, offer to link the file to the record
# with that DOI.
for a_file in tqdm_notebook(not_assigned):
    pdf_doc = cde_read_pdfs(a_file)
    print(a_file.name)
    dois_list = []
    for element in pdf_doc.elements:
        element_text = str(element)
        if 'doi' not in element_text:
            continue
        # drop a sentence-ending full stop captured with the DOI
        found_doi = find_doi(element_text).rstrip(".")
        # find_doi returns "" when the text mentions 'doi' without an actual
        # DOI; the original stored that empty value, which made the
        # "exactly one DOI" test below fail for otherwise usable files
        if found_doi and found_doi not in dois_list:
            dois_list.append(found_doi)

    # only an unambiguous document (a single distinct DOI) is used
    if len(dois_list) != 1:
        continue
    close_dois = get_close_dois(dois_list[0], ukchapp_db)
    if len(close_dois) != 1:
        continue
    doi_dat = close_dois[0]
    if doi_dat[3] is not None:
        print("Already assingned to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
        continue

    # ask until the user answers with one of the two options
    selected = False
    while not selected:
        print("Assign file: ",a_file.name, " to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
        print('***************************************************************')
        print("Options:\n\ta) assign\n\tb)go to next")
        print("selection:")
        usr_select = input()
        if usr_select == 'a':
            selected = True
            set_pdf_file_value(a_file.name, doi_dat[0], ukchapp_db)
            print("assing and go to next")
        elif usr_select == 'b':
            selected = True
            print("going to next")
                

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40.0), HTML(value='')))

1-s2.0-S0021951719301459-main.pdf
Already assingned to:
	 365 10.1016/j.jcat.2019.03.037 Combined spatially resolved operando spectroscopy: New insights into kinetic oscillations of CO oxidation on Pd/γ-Al2O3 1-s2.0-S0021951719301459-main.pdf
1-s2.0-S0920586118316456-main.pdf
Already assingned to:
	 282 10.1016/j.cattod.2019.01.065 In-depth characterisation of metal-support compounds in spent Co/SiO2 Fischer-Tropsch model catalysts 1-s2.0-S0920586118316456-main.pdf
1-s2.0-S1385894719325884-main.pdf
2015_Article_.pdf
2017_Article_.pdf
2018_Article_.pdf
2190-4286-10-191.pdf
287598212.pdf
acssuschemeng.8b03268.pdf
Adams_Metal_Oxide_Catalysts_for_Solar_Driven_Water_Splitting.pdf
aic.17007.pdf
Bahruji2018_Article_SolventFreeSynthesisOfPdZnTiO2.pdf.pdf
Already assingned to:
	 338 10.1007/s11244-018-0885-6 Solvent Free Synthesis of PdZn/TiO2 Catalysts for the Hydrogenation of CO2 to Methanol Bahruji2018_Article_SolventFreeSynthesisOfPdZnTiO2.pdf.pdf
Bowker2015_Article_ThePhotocatalyticWindowP

In [15]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

files_list = get_files_list(Path("pdf_files"))
db_pubs = get_pub_app_data(ukchapp_db)
# check which files are really missing linking: a file is linked when its
# name is stored as pdf_file (column 4) of some publication
linked_names = {db_pub[4] for db_pub in db_pubs}
missing = [file for file in files_list if file.name not in linked_names]

second_round = missing
not_assigned2=[]
for a_file in tqdm_notebook(second_round):
    # read the "Info" metadata and close the file straight away
    with open(a_file, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        # a PDF may carry no Info dictionary at all
        doc_info = doc.info[0] if doc.info else {}
        has_subject = 'Subject' in doc_info.keys()
        subject = doc_info['Subject'] if has_subject else None

    if not has_subject:
        print("********", a_file.name, "*******")
        not_assigned2.append(a_file.name)
        continue

    print("File name: ", a_file.name, " Subject ", subject)
    found_doi = find_doi(str(subject)).rstrip(".")
    print("DOI in metadata: ",found_doi)
    # find_doi returns "" (never None) when there is no DOI; querying with
    # an empty string would match every record, so skip the lookup
    if not found_doi:
        continue
    close_dois = get_close_dois(found_doi, ukchapp_db)
    if len(close_dois) != 1:
        continue
    doi_dat = close_dois[0]
    if doi_dat[3] is not None:
        print("Already assingned to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
        continue

    # ask until the user answers with one of the two options
    selected = False
    while not selected:
        print("Assign file: ",a_file.name, " to:\n\t", doi_dat[0],doi_dat[1],doi_dat[2], doi_dat[3])
        print('***************************************************************')
        print("Options:\n\ta) assign\n\tb)go to next")
        print("selection:")
        usr_select = input()
        if usr_select == 'a':
            selected = True
            set_pdf_file_value(a_file.name, doi_dat[0], ukchapp_db)
            print("assing and go to next")
        elif usr_select == 'b':
            selected = True
            print("going to next")
        


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=24.0), HTML(value='')))

File name:  1-s2.0-S1385894719325884-main.pdf  Subject  b'Chemical Engineering Journal, 383 (2019) 123176. doi:10.1016/j.cej.2019.123176'
DOI in metadata:  10.1016/j.cej.2019.123176
File name:  acsaem.8b00873.pdf  Subject  b'ACS Appl. Energy Mater. 2018.1:5233-5244'
DOI in metadata:  
File name:  acscatal.9b00685.pdf  Subject  b'ACS Catal. 2019.9:7166-7178'
DOI in metadata:  
File name:  acssuschemeng.8b03268.pdf  Subject  b'ACS Sustainable Chem. Eng. 2018.6:14704-14712'
DOI in metadata:  
File name:  aic.17007.pdf  Subject  b'AIChE Journal 2020.66:e17007'
DOI in metadata:  
File name:  ange.201703550.pdf  Subject  b'Angewandte Chemie 2017.129:9479-9483'
DOI in metadata:  
File name:  anie.201609557.pdf  Subject  b'Angew. Chem. Int. Ed. 2017.56:4347-4350'
DOI in metadata:  
File name:  anie.201705753.pdf  Subject  b'Angew. Chem. Int. Ed. 2017.56:13596-13600'
DOI in metadata:  
File name:  cctc.201700516.pdf  Subject  b'ChemCatChem 2017.9:1897-1900'
DOI in metadata:  
File name:  cctc.2

In [12]:
# names of the PDF files whose metadata has no 'Subject' entry
not_assigned2

['287598212.pdf',
 'c6nr00053c.pdf',
 'd0sc02253e1.pdf',
 'EDANN Final thesis 25-04-19.pdf',
 's23.pdf',
 's5.pdf']

In [13]:
# names of the PDF files whose metadata has no 'Subject' entry (shown again)
not_assigned2

['287598212.pdf',
 'c6nr00053c.pdf',
 'd0sc02253e1.pdf',
 'EDANN Final thesis 25-04-19.pdf',
 's23.pdf',
 's5.pdf']

In [14]:
# number of PDF files whose metadata has no 'Subject' entry
len(not_assigned2)

6

In [None]:
# Recompute which PDF files on disk are not linked to any publication: a
# file counts as linked when its name equals the pdf_file value (column 4)
# of one of the rows read from the app db.
files_list = get_files_list(Path("pdf_files"))
db_pubs = get_pub_app_data(ukchapp_db)
missing=[]
for file in files_list:
    is_linked = any(file.name == db_pub[4] for db_pub in db_pubs)
    if not is_linked:
        missing.append(file)



In [None]:
# PDF files whose name is not stored as pdf_file for any publication read from the app db
missing


In [None]:
# number of PDF files not linked to any publication read from the app db
len(missing)