# Verification of references to UK Catalysis Hub 
A list of articles is obtainded from publish or perish. This list will contain a titles and some IDs whic need to be verified. 

The criteria for adding a publication to the database are: 
a) has an explicit acknowledgement of UK Catalysis Hub
b) mentions one of the UK Catalysis Hub grants
c) has two or more authors with affiliation to UK Catalysis Hub
d) acknowledges support from a scientist affiliated to UK Catalysis Hub.

In [14]:
# Libraries
# library containign functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searchs
import lib.handle_urls as urlh
# managing files and file paths
from pathlib import Path
# add aprogress bar
from tqdm import tqdm_notebook 
# library for getting data from crossref
import lib.crossref_api as cr_api
#library for handling json files
import json
# library for using regular expressions
import re


In [29]:
# get the crossreference json page from doi
def get_cr_json_object(cr_doi):
  crjd = None
  doi_file = 'json_files/' + cr_doi.replace('/','_').lower() + '.json'
  if not Path(doi_file).is_file():
    crjd = cr_api.getBibData(cr_doi)
    with open(doi_file, 'w', encoding='utf-8') as f:
                json.dump(crjd, f, ensure_ascii=False, indent=4)
  else:
    jf = open(doi_file, 'r')
    crjd = json.load(jf)
  # return the content and the file name 
  return crjd, doi_file

# get the landing page for the publication from uri
def get_pub_html_doi(cr_doi):
    html_file = 'html_files/' + cr_doi.replace('/','_').lower() + '.html'
    if not Path(html_file).is_file():
        page_content = urlh.getPageFromDOI(doi_text)
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(page_content.decode("utf-8") )
    else:
        f = open(html_file, "r")
        page_content = f.read()
    return page_content, html_file
             
def get_titles(str_pub_title, db_name = "prev_search.sqlite3"):
    print(db_name)
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'prev_pop_searches'
    fields_required = "Num, Title"
    filter_str = "Title like '"+str_pub_title[0]+"%';"

    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

def get_titles_and_dois(str_pub_title, db_name = "app_db.sqlite3"):
    print(db_name)
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, title, doi"
    filter_str = "Title like '"+str_pub_title[0]+"%';"
    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

# get the current csv working file
def get_working_file(nr_wf):
    working_file = wf_fields = None
    current_pass = 0
    if Path(nr_wf).is_file():
        working_file, wf_fields = csvh.get_csv_data(nr_wf,'Num')
        for art_num in tqdm_notebook(working_file):
            if 'ignore' in working_file[art_num].keys():
                if current_pass < int(working_file[art_num]['ignore']):
                    current_pass = int(working_file[art_num]['ignore'])
            else:
                break
    print("Current pass:", current_pass)
    return working_file, wf_fields, current_pass



def get_pub_html_url(text_url, entry_id):
    html_file = 'html_files/' +  entry_id + '.html'
    if not Path(html_file).is_file():
        print("")
        page_content = urlh.getPageFromURL(text_url)
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(page_content)
    else:
        f = open(html_file, "r")
        page_content = f.read()
    return page_content, html_file

def valid_doi(cr_doi):
    # CR DOIS: https://www.crossref.org/blog/dois-and-matching-regular-expressions/
    # CR DOIs re1
    # /^10.\d{4,9}/[-._;()/:A-Z0-9]+$/i
    cr_re_01 = '^10.\d{4,9}/[-._;()/:A-Z0-9]+'
    compare = re.match(cr_re_01, cr_doi, re.IGNORECASE)
    if compare != None and cr_doi == compare.group():
        return True
    else:
        return False
    
# get a semicolon separated list of authors from CR json data
def get_cr_author_list(article_data):
    authors = []
    if 'author' in article_data.keys():
        for author in article_data['author']:
            new_author=""
            new_author = author['family']
            if 'given' in author.keys():
                new_author += ", " + author['given']
            authors.append(new_author)
    return ("; ").join(authors)

# get the publication date from CR json data
def get_cr_year_published(article_data):
    year_print = 0
    if 'published-print' in article_data.keys() \
        and article_data['published-print'] != None \
        and article_data['published-print']['date-parts'][0] != None:
        year_print = int(article_data['published-print']['date-parts'][0][0])    
    elif 'journal-issue' in article_data.keys() \
        and article_data['journal-issue'] != None \
        and 'published-print' in article_data['journal-issue'].keys() \
        and article_data['journal-issue']['published-print'] != None \
        and article_data['journal-issue']['published-print']['date-parts'][0] != None:
        year_print = int(article_data['journal-issue']['published-print']['date-parts'][0][0])

    year_online = 0
    if 'published-online' in article_data.keys() \
        and article_data['published-online'] != None \
        and article_data['published-online']['date-parts'][0] != None:
        year_online = int(article_data['published-online']['date-parts'][0][0])    
    elif 'journal-issue' in article_data.keys() \
        and article_data['journal-issue'] != None \
        and 'published-online' in article_data['journal-issue'].keys() \
        and article_data['journal-issue']['published-online'] != None \
        and article_data['journal-issue']['published-online']['date-parts'][0] != None:
        year_print = int(article_data['journal-issue']['published-online']['date-parts'][0][0])
    
    if year_print != 0 and year_online != 0:
        return year_print if year_print < year_online else year_online
    else:
        return year_print if year_online == 0 else year_online
    return 0


Get the name of the file with the results of the PoP search:

In [3]:
# input file with path: pop_searches/PoPCites20201017.csv
new_results_file = ""
while not Path(new_results_file).is_file():
    print('Please enter the name of the input file:')
    new_results_file = input()

Please enter the name of the input file:
pop_searches/PoPCites20201017CR.csv


Get the name of the db file with previous results of the PoP search:

In [4]:
# previous results db file with path: db_files/prev_search.sqlite3
previous_db = ""
while not Path(previous_db).is_file():
    print('Please enter the name of the previous results file:')
    previous_db = input()

Please enter the name of the previous results file:
db_files/prev_search.sqlite3


Get the name of the current app db file:

In [5]:
# app db file with path: db_files/app_db.sqlite3
ukchapp_db = ""
while not Path(ukchapp_db).is_file():
    print('Please enter the name of app db file:')
    ukchapp_db = input()

Please enter the name of the previous results file:
db_files/app_db.sqlite3


Set the name of the output file


In [16]:
nr_wf = new_results_file[:-4]+"_wf.csv"
print("Verifying if the articles listed in: \n\t", Path(new_results_file).name)
print("where included in previous searches: \n\t", Path(previous_db).name)

print("The results will bt saved in: \n\t", nr_wf)

Verifying if the articles listed in: 
	 PoPCites20201017CR.csv
where included in previous searches: 
	 prev_search.sqlite3
The results will bt saved in: 
	 pop_searches/PoPCites20201017CR_wf.csv


In [30]:
# get the working file before each step
working_file = wf_fields = None
working_file, wf_fields, current_pass = get_working_file(nr_wf)
# in first pass then make working file = new results
if working_file == None:
    working_file, wf_fields, current_pass = get_working_file(new_results_file)
    csvh.write_csv_data(working_file, nr_wf) 

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Current pass: 1


## Verify if already processed titles are included
Read data and verify if results in file have already been included in previous searches


In [31]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)
if current_pass == 0:
    current_initial = ""
    db_titles = []
    for art_num in tqdm_notebook(working_file):
        new_title = working_file[art_num]['Title'].lower()
        working_file[art_num]['ignore'] = 0 
        working_file[art_num]['previous'] = 0 
        working_file[art_num]['similarity'] = 0.0
        if current_initial == "" or current_initial != new_title[0]:
            print("new intital ", new_title[0])
            current_initial = new_title[0]
            db_titles = get_titles(current_initial, previous_db)
            
        for prev_pair in db_titles:
            prev_num = prev_pair[0]
            used_title = prev_pair[1].lower()
            # if titles match exactly or simialarity > 0.8 ignore
            title_similarity = txtc.similar(new_title, used_title)
            if title_similarity > 0.80:
                #print(art_num, 'Title:', new_title, "already processed", prev_num, used_title)
                working_file[art_num]['ignore'] = 1
                working_file[art_num]['previous'] = prev_num
                working_file[art_num]['similarity'] = title_similarity
                break

    csvh.write_csv_data(working_file, nr_wf)  
    print(nr_wf)

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Current pass: 1


## Check Titles in app
Verify if the title is in the app

In [32]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)
# verify that titles are not in the app_db (if they are  also get DOI)
if current_pass == 1: 
    db_titles = []
    current_initial = ""
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            new_title = working_file[art_num]['Title'].lower()
            if current_initial == "" or current_initial != new_title[0]:
                print("new intital ", new_title[0])
                current_initial = new_title[0]
                db_titles = get_titles_and_dois(current_initial, ukchapp_db)
            for art_in_db in db_titles:
                prev_num = art_in_db[0]
                used_title = art_in_db[1].lower()
                # if titles match exactly or simialarity > 0.8 ignore
                title_similarity = txtc.similar(new_title, used_title)
                if title_similarity > 0.80:
                    #print(art_num, 'Title:', new_title, "already processed", prev_num, used_title)
                    working_file[art_num]['ignore'] = 2
                    working_file[art_num]['previous'] = prev_num
                    working_file[art_num]['similarity'] = title_similarity
                    working_file[art_num]['DOIcr'] = art_in_db[2]
                    break                
    csvh.write_csv_data(working_file, nr_wf)

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Current pass: 1


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

new intital  0
db_files/app_db.sqlite3
new intital  1
db_files/app_db.sqlite3
new intital  2
db_files/app_db.sqlite3
new intital  4
db_files/app_db.sqlite3
new intital  a
db_files/app_db.sqlite3
new intital  b
db_files/app_db.sqlite3
new intital  c
db_files/app_db.sqlite3
new intital  d
db_files/app_db.sqlite3
new intital  e
db_files/app_db.sqlite3
new intital  f
db_files/app_db.sqlite3
new intital  g
db_files/app_db.sqlite3
new intital  h
db_files/app_db.sqlite3
new intital  i
db_files/app_db.sqlite3
new intital  j
db_files/app_db.sqlite3
new intital  l
db_files/app_db.sqlite3
new intital  m
db_files/app_db.sqlite3
new intital  n
db_files/app_db.sqlite3
new intital  o
db_files/app_db.sqlite3
new intital  p
db_files/app_db.sqlite3
new intital  r
db_files/app_db.sqlite3
new intital  s
db_files/app_db.sqlite3
new intital  t
db_files/app_db.sqlite3
new intital  u
db_files/app_db.sqlite3
new intital  v
db_files/app_db.sqlite3
new intital  w
db_files/app_db.sqlite3
new intital  n
db_files/a

## Check Title Wording
Using the workds in previous catalysis hub papers check if the title is likely to be a cat hub title

In [33]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)
if current_pass < 3:
    # pass 2
    # check titles for likelihood of being catalysis articles using keywords from titles in current DB 
    print("Get word list from DB")
    x = dbh.DataBaseAdapter(ukchapp_db)
    db_titles = x.get_value_list('articles','title')
    title_words = set()
    ignore_words=set(['the','of','to','and','a','in','is','it', 'their', 'so', 'as'])
    average = 0
    words_sum = 0.0
    for title in db_titles:
        one_title = set(title.lower().split())
        one_title = one_title - ignore_words
        title_words = title_words.union(one_title)
        words_sum += len(one_title) 
        
    average = words_sum /len(db_titles)
    print("Average words per title:", average)
    title_words = title_words - ignore_words
    for art_num in tqdm_notebook(working_file):
        if 0 == int(working_file[art_num]['ignore']):
            art_title = working_file[art_num]['Title']
            art_words = set(art_title.lower().split())
            occurrences = len(title_words.intersection(art_words))
            working_file[art_num]['keywords']=occurrences
            if occurrences == 0:
                print("occurrences:", occurrences, "in title:", art_title)
                working_file[art_num]['ignore']=3
            else:
                print("occurrences:", occurrences, "in title:", art_title)
    csvh.write_csv_data(working_file, nr_wf)
    x.close()
    current_pass = 3

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Current pass: 1
Get word list from DB
Average words per title: 10.11271676300578


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

occurrences: 6 in title: 0 2 0)-Textured tungsten trioxide nanostructure with enhanced photoelectrochemical activity
occurrences: 0 in title: 1. Dangerous Classes
occurrences: 1 in title: 2. The Exchange Hub
occurrences: 0 in title: 4. Hacking Infrastructures
occurrences: 2 in title: A Case of External Auditory Canal Sebaceous Carcinoma: Literature Review and Treatment Discussion
occurrences: 10 in title: A detailed speciation of iron on FCC catalysts based on an integrated use of advanced characterisation methods and thermodynamic equilibrium simulation
occurrences: 3 in title: A Non-Invasive Flexible Glucose Monitoring Sensor using a Broadband Reject Filter
occurrences: 4 in title: An allosteric interaction controls the activation mechanism of SHP2 tyrosine phosphatase
occurrences: 5 in title: Anylogic Simulation Research on Passenger Evacuation System of Urban Transportation Hub
occurrences: 11 in title: Beyond surface redox and oxygen mobility at pd-polar ceria (100) interface: Und

In [41]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)
if current_pass == 3:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            inspected = False
            while not inspected:
                new_title = working_file[art_num]['Title']
                keywords = int(working_file[art_num]['keywords'])
                #print (keywords, new_title)
                if keywords <= 4 and not ("cataly" in new_title.lower()):
                # ignore  it because it does not contains cataly in title
                    working_file[art_num]['ignore']=4 # visual inspection
                    inspected = True
                else:
                    inspected = True
    print("To Process:", i, "Pass:", current_pass)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 4

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Current pass: 4


## Get DOIs for Articles
The remaining titles need to be further analysed. Recovering their DOIs helps to obtain abstracts and acknowledgement statements. 

In [39]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)
if current_pass == 4:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0' and not ('DOIcr' in working_file[art_num].keys() \
        or working_file[art_num]['ignore']=='0' and working_file[art_num]['DOIcr']=="":
            new_title = working_file[art_num]['Title']
            new_doi = cr_api.getDOIForTitle(new_title)
            if new_doi == "":
                #print("Missing DOI:", new_title)
                working_file[art_num]['ignore'] = '5'
                i +=1
            else:
                #print("DOI found:", new_doi, "for:", new_title)
                working_file[art_num]['DOIcr'] = new_doi
                working_file[art_num]['ignore'] = '0'
    print("without DOI:", i)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 5

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Current pass: 4


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


without DOI: 0


## Verify DOIs in DB
Verify that articles do not exist in the DB

In [44]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)

if current_pass >= 4:
    i = 0
    db_conn = dbh.DataBaseAdapter(ukchapp_db)
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            new_title = working_file[art_num]['Title']
            new_doi = working_file[art_num]['DOIcr'].strip()
            db_title = db_conn.get_title(new_doi)
            if db_title == None:
                print("Not in DB:", new_doi, new_title)
            else:
                print("Already in DB:", new_doi, "for:", new_title, db_title)
                working_file[art_num]['ignore'] = '6'
    print("without DOI:", i)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 6
    dbh.close()

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Current pass: 4


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

Not in DB: 10.1016/j.jcat.2020.06.012 0 2 0)-Textured tungsten trioxide nanostructure with enhanced photoelectrochemical activity
Not in DB: 10.1016/j.apcata.2020.117597 A detailed speciation of iron on FCC catalysts based on an integrated use of advanced characterisation methods and thermodynamic equilibrium simulation
Not in DB: 10.5220/0010002800050011 Anylogic Simulation Research on Passenger Evacuation System of Urban Transportation Hub
Not in DB: 10.1016/j.apcatb.2020.118843 Beyond surface redox and oxygen mobility at pd-polar ceria (100) interface: Underlying principle for strong metal-support interactions in green catalysis
Not in DB: 10.1016/j.cattod.2020.05.024 Carbon dioxide decomposition through gas exchange in barium calcium iron niobates
Not in DB: 10.1063/5.0012381 Computational prediction of muon stopping sites: A novel take on the unperturbed electrostatic potential method
Not in DB: 10.1016/j.biombioe.2020.105757 Decarbonising Kenya's domestic & industry Sectors throu

AttributeError: module 'lib.handle_db' has no attribute 'close'

## Get full json files for remaining articles 

In [45]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)

if current_pass >= 4:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            article_title = working_file[art_num]['Title']
            article_doi = working_file[art_num]['DOIcr']
            article_url =working_file[art_num]['ArticleURL']
            data, file_name = get_cr_json_object(article_doi)
            if data != {}:
                working_file[art_num]['file'] = file_name
    csvh.write_csv_data(working_file, nr_wf)

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Current pass: 4


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




## Check if CR json files contain funder details for UKCH grants

In [46]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)

if current_pass >= 4:
    i = 1
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            article_title = working_file[art_num]['Title']
            article_doi = working_file[art_num]['DOIcr']
            article_url =working_file[art_num]['ArticleURL']
            data, file_name = get_cr_json_object(article_doi)
            print(i, article_title, article_doi)
            #print(data.keys())
            epsrc_keys = ['EP/R026645/1', 'EP/K014668/1', 'EP/K014714/1', 'EP/R026815/1', 'EP/R026939/1',
                          'EP/M013219/1', 'EP/R027129/1', 'EP/K014854/1', 'EP/K014706/2']
            confirmed_in_cr = []
            if 'funder' in data.keys():
                for a_funder in data['funder']:
                    for an_award in a_funder['award']:
                        if an_award in epsrc_keys:
                            print("Found", an_award)
                            confirmed_in_cr.append(an_award)
                working_file[art_num]['award_in_cr'] = ', '.join(confirmed_in_cr)
            i += 1
    csvh.write_csv_data(working_file, nr_wf)
        

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Current pass: 4


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

1 0 2 0)-Textured tungsten trioxide nanostructure with enhanced photoelectrochemical activity 10.1016/j.jcat.2020.06.012
2 A detailed speciation of iron on FCC catalysts based on an integrated use of advanced characterisation methods and thermodynamic equilibrium simulation 10.1016/j.apcata.2020.117597
3 Anylogic Simulation Research on Passenger Evacuation System of Urban Transportation Hub 10.5220/0010002800050011
4 Beyond surface redox and oxygen mobility at pd-polar ceria (100) interface: Underlying principle for strong metal-support interactions in green catalysis 10.1016/j.apcatb.2020.118843
5 Carbon dioxide decomposition through gas exchange in barium calcium iron niobates 10.1016/j.cattod.2020.05.024
6 Computational prediction of muon stopping sites: A novel take on the unperturbed electrostatic potential method 10.1063/5.0012381
7 Decarbonising Kenya's domestic & industry Sectors through bioenergy: An assessment of biomass resource potential & GHG performances 10.1016/j.biombio

## Get full HTML files for remaining articles 

In [47]:
nr_wf = "pop_searches/PoPCites20201017_wf.csv"
working_file, wf_fields, current_pass = get_working_file(nr_wf)

if current_pass >= 6:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            article_id = working_file[art_num]['Num']
            article_title = working_file[art_num]['Title']
            article_doi = working_file[art_num]['DOIcr'].strip().lower()
            article_url =working_file[art_num]['ArticleURL']
            article_type =working_file[art_num]['type']
            html_content = file_name = None
            if valid_doi(article_doi):
                html_content, file_name = get_pub_html_doi(article_doi)
            else:
                #try with url
                html_content = None
                #identifier = "id" + str((1000000 + int(article_id)))[1,6] + article_type 
                #html_content, file_name = get_pub_html_doi(article_url, identifier)
            if html_content != None:
                working_file[art_num]['html_file'] = file_name
                
    csvh.write_csv_data(working_file, nr_wf)



HBox(children=(IntProgress(value=0, max=999), HTML(value='')))


Current pass: 7


HBox(children=(IntProgress(value=0, max=999), HTML(value='')))




## Get HTML page from DOI and verify if it contains UKCH acknowledgement

In [48]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)

from IPython.display import IFrame
from IPython.display import HTML


if current_pass >= 6:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0' and working_file[art_num]['ack_fragment'] == "":
            article_id = working_file[art_num]['Num']
            article_title = working_file[art_num]['Title']
            article_doi = working_file[art_num]['DOIcr']
            request_str = "https://doi.org/" + article_doi 
            if valid_doi(article_doi):
                request_str = "https://doi.org/" + article_doi 
                print(request_str)
                
                #display(HTML('<h1>Hello, world!</h1>'))
                #%%html
                #<iframe src=request_str  width="600" height="400"></iframe>
                IFrame(request_str, width=700, height=350)
                inspected = False
                while not inspected:
                    #new_title = working_file[art_num]['Title']
                    print('Title: ', article_title)
                    print('***************************************************************')
                    print("Options:\n\ta) add ack text\n\tb) mark as not relevant\n\tc) go to next")
                    print("selection:")
                    usr_select = input()
                    if usr_select == 'c':
                        #working_file[art_num]['ignore']=3 # visual inspection
                        inspected = True
                        working_file[art_num]['send_to_corinne'] = 'no'
                        working_file[art_num]['reason_send'] = "not acknowledged, no UKCH authors"
                        print("going to next")
                    elif usr_select == 'b':
                        #working_file[art_num]['ignore']=3 # visual inspection
                        inspected = True
                        print("going to next")
                    elif usr_select == 'a':
                        inspected = True
                        ack_text = ""
                        while ack_text == "":
                            print("Enter ack text: ")
                            ack_text = input()
                            working_file[art_num]['ack_fragment'] = ack_text
                            working_file[art_num]['send_to_corinne'] = 'yes'
                            working_file[art_num]['reason_send'] = "confirmed in acknowledgements"
            else:
                print(article_doi, "is not a valid DOI")
    csvh.write_csv_data(working_file, nr_wf)  
    print(nr_wf)

HBox(children=(IntProgress(value=0, max=999), HTML(value='')))


Current pass: 7


HBox(children=(IntProgress(value=0, max=999), HTML(value='')))

https://doi.org/10.1177/1468087420962294
Title:  A comparative study into the effects of pre and post catalyst exhaust gas recirculation on the onset of knock
***************************************************************
Options:
	a) add ack text
	b) mark as not relevant
	c) go to next
selection:
b
going to next
https://doi.org/10.1039/d0cp03852k
Title:  An approach to calculate the free energy changes of surface reactions using free energy decomposition on ab initio brute-force molecular dynamics …
***************************************************************
Options:
	a) add ack text
	b) mark as not relevant
	c) go to next
selection:
b
going to next
https://doi.org/10.1016/j.respol.2020.104045
Title:  Anchor entrepreneurship and industry catalysis: The rise of the Italian Biomedical Valley
***************************************************************
Options:
	a) add ack text
	b) mark as not relevant
	c) go to next
selection:
b
going to next
https://doi.org/10.1039/c8fd90016g
T

KeyboardInterrupt: 

## Get bib data from CR to send for validation

In [None]:
#nr_wf = "pop_searches/PoPCites20201017_wf.csv"
working_file, wf_fields, current_pass = get_working_file(nr_wf)
article_title = ""
article_doi = ""
article_url = ""
data = None    
try:
    if current_pass >= 6:
        for art_num in tqdm_notebook(working_file):
            if working_file[art_num]['send_to_corinne'] == 'yes':
                article_title = working_file[art_num]['Title']
                article_doi = working_file[art_num]['DOIcr']
                article_url =working_file[art_num]['ArticleURL']
                if valid_doi(article_doi):
                    data, file_name = get_cr_json_object(article_doi)
                    # get authors
                    working_file[art_num]['cr_authors'] = get_cr_author_list(data)
                    # get article year
                    working_file[art_num]['cr_year'] = get_cr_year_published(data)
                    working_file[art_num]['cr_title'] = data['title']
                    working_file[art_num]['cr_journal'] = data['container-title']
    csvh.write_csv_data(working_file, nr_wf)
except:
    print(article_title, article_doi, article_url)
    print(data)
    csvh.write_csv_data(working_file, nr_wf)
        

In [None]:
#***************************************************************************************************************
# Wait do not run this yet
#***************************************************************************************************************
if current_pass >= 6:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            article_title = working_file[art_num]['Title']
            article_doi = working_file[art_num]['DOIcr']
            article_url =working_file[art_num]['ArticleURL']
            print("Analysing:", article_title, article_doi, article_url)
            # try to retrive html page for article using link from crossref first
            # and if not try url from pop
            # find reference to uk catalysis hub in html text
            # if found mark as relevant
            found = ""
            referents = ["uk catalysis hub", "uk catalysis", "catalysis hub",
                 'EP/R026645/1', 'resources', 'EP/K014668/1', 'EPSRC', 'EP/K014714/1',
                 'Hub','provided', 'grant', 'biocatalysis', 'EP/R026815/1', 'EP/R026939/1',
                 'support', 'membership', 'EP/M013219/1', 'UK', 'kindly', 'Catalysis',
                 'funded', 'EP/R027129/1', 'Consortium', 'thanked', 'EP/K014854/1', 'EP/K014706/2']
            found = urlh.findFromDOI(article_title, article_doi, referents)
            working_file[art_num]['checked_doi'] = 1
            working_file[art_num]['ack_doi'] = found
            found = urlh.findFromURI(article_title, article_url, referents)
            working_file[art_num]['checked_url'] = 1
            working_file[art_num]['ack_url'] = found
            print("Ack:", found)
    csvh.write_csv_data(working_file, nr_wf)

In [None]:
doi_text = '10.1039/d0cy00036a'

url_text = "https://ethos.bl.uk/OrderDetails.do?uin=uk.bl.ethos.808495"

html_content, file_name = get_pub_html_doi(doi_text)

print(file_name)#, html_content)

import re

#print(len(doi_text))
# CR DOIS: https://www.crossref.org/blog/dois-and-matching-regular-expressions/
# CR DOIs re1
# /^10.\d{4,9}/[-._;()/:A-Z0-9]+$/i

cr_re_01 = '^10.\d{4,9}/[-._;()/:A-Z0-9]+'

compare = re.match(cr_re_01, doi_text, re.IGNORECASE)

print(compare)
print(compare.start())
print(compare.end())
print(compare.group())

if compare != None and doi_text == compare.group():
    print("This is a DOI: ", doi_text)
else:
    print("This is not a DOI: ", doi_text)

compare = re.match(cr_re_01, url_text, re.IGNORECASE)
    
print(url_text, valid_doi(url_text))
print(doi_text, valid_doi(doi_text))

# url_text = "https://ethos.bl.uk/OrderDetails.do?uin=uk.bl.ethos.808495"
# id = id000069_thesis
entry_id = 'id000069_thesis'

html_content, file_name = get_pub_html_url(url_text, entry_id)
print(file_name, html_content)

In [None]:
doi_text = '10.1039/d0cy00036a'
print(doi_text, valid_doi(doi_text))

In [None]:
epsrc_keys = ['EP/R026645/1', 'EP/K014668/1', 'EP/K014714/1', 'EP/R026815/1', 'EP/R026939/1',
                          'EP/M013219/1', 'EP/R027129/1', 'EP/K014854/1', 'EP/K014706/2']
', '.join(epsrc_keys)