# Verification of references to UK Catalysis Hub 
A list of articles is obtainded from publish or perish. This list will contain a titles and some IDs whic need to be verified. 

The criteria for adding a publication to the database are: 
a) has an explicit acknowledgement of UK Catalysis Hub
b) mentions one of the UK Catalysis Hub grants
c) has two or more authors with affiliation to UK Catalysis Hub
d) acknowledges support from a scientist affiliated to UK Catalysis Hub.

In [2]:
# Libraries
# library containign functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searchs
import lib.handle_urls as urlh
# managing files and file paths
from pathlib import Path
# add aprogress bar
from tqdm import tqdm_notebook 
#library for handling json files
import json
# library for using regular expressions
import re


In [3]:
# get the crossreference json page from doi
def get_cr_json_object(cr_doi):
  crjd = None
  doi_file = 'json_files/' + cr_doi.replace('/','_').lower() + '.json'
  if not Path(doi_file).is_file():
    crjd = cr_api.getBibData(cr_doi)
    with open(doi_file, 'w', encoding='utf-8') as f:
                json.dump(crjd, f, ensure_ascii=False, indent=4)
  else:
    jf = open(doi_file, 'r', encoding='utf-8')
    crjd = json.load(jf)
  # return the content and the file name 
  return crjd, doi_file

# get the landing page for the publication from uri
def get_pub_html_doi(cr_doi):
    html_file = 'html_files/' + cr_doi.replace('/','_').lower() + '.html'
    if not Path(html_file).is_file():
        page_content = urlh.getPageFromDOI(doi_text)
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(page_content.decode("utf-8") )
    else:
        f = open(html_file, "r")
        page_content = f.read()
    return page_content, html_file
             
def get_titles(str_pub_title, db_name = "prev_search.sqlite3"):
    print(db_name)
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'prev_pop_searches'
    fields_required = "Num, Title"
    filter_str = "Title like '"+str_pub_title[0]+"%';"

    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

def get_titles_and_dois(str_pub_title, db_name = "app_db.sqlite3"):
    print(db_name)
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, title, doi"
    filter_str = "Title like '"+str_pub_title[0]+"%';"
    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

# get the current csv working file
def get_working_file(nr_wf):
    working_file = wf_fields = None
    current_pass = 0
    if Path(nr_wf).is_file():
        working_file, wf_fields = csvh.get_csv_data(nr_wf,'Num')
        for art_num in tqdm_notebook(working_file):
            if 'ignore' in working_file[art_num].keys():
                if current_pass < int(working_file[art_num]['ignore']):
                    current_pass = int(working_file[art_num]['ignore'])
            else:
                break
    print("Current pass:", current_pass)
    return working_file, wf_fields, current_pass



def get_pub_html_url(text_url, entry_id):
    html_file = 'html_files/' +  entry_id + '.html'
    if not Path(html_file).is_file():
        print("")
        page_content = urlh.getPageFromURL(text_url)
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(page_content)
    else:
        f = open(html_file, "r")
        page_content = f.read()
    return page_content, html_file

def valid_doi(cr_doi):
    # CR DOIS: https://www.crossref.org/blog/dois-and-matching-regular-expressions/
    # CR DOIs re1
    # /^10.\d{4,9}/[-._;()/:A-Z0-9]+$/i
    cr_re_01 = '^10.\d{4,9}/[-._;()/:A-Z0-9]+'
    compare = re.match(cr_re_01, cr_doi, re.IGNORECASE)
    if compare != None and cr_doi == compare.group():
        return True
    else:
        return False
    
# get a semicolon separated list of authors from CR json data
def get_cr_author_list(article_data):
    authors = []
    if 'author' in article_data.keys():
        for author in article_data['author']:
            new_author=""
            new_author = author['family']
            if 'given' in author.keys():
                new_author += ", " + author['given']
            authors.append(new_author)
    return ("; ").join(authors)

# get the publication date from CR json data
def get_cr_year_published(article_data):
    year_print = 0
    if 'published-print' in article_data.keys() \
        and article_data['published-print'] != None \
        and article_data['published-print']['date-parts'][0] != None:
        year_print = int(article_data['published-print']['date-parts'][0][0])    
    elif 'journal-issue' in article_data.keys() \
        and article_data['journal-issue'] != None \
        and 'published-print' in article_data['journal-issue'].keys() \
        and article_data['journal-issue']['published-print'] != None \
        and article_data['journal-issue']['published-print']['date-parts'][0] != None:
        year_print = int(article_data['journal-issue']['published-print']['date-parts'][0][0])

    year_online = 0
    if 'published-online' in article_data.keys() \
        and article_data['published-online'] != None \
        and article_data['published-online']['date-parts'][0] != None:
        year_online = int(article_data['published-online']['date-parts'][0][0])    
    elif 'journal-issue' in article_data.keys() \
        and article_data['journal-issue'] != None \
        and 'published-online' in article_data['journal-issue'].keys() \
        and article_data['journal-issue']['published-online'] != None \
        and article_data['journal-issue']['published-online']['date-parts'][0] != None:
        year_print = int(article_data['journal-issue']['published-online']['date-parts'][0][0])
    
    if year_print != 0 and year_online != 0:
        return year_print if year_print < year_online else year_online
    else:
        return year_print if year_online == 0 else year_online
    return 0


Get the name of the file with the results of the PoP search:

In [10]:
# input file with path: pop_searches/PoPCites20201017.csv
new_results_file = "pub_search_pop/PoPCites20202023.csv"
while not Path(new_results_file).is_file():
    print('Please enter the name of the input file:')
    new_results_file = input()

Get the name of the db file with previous results of the PoP search:

In [11]:
# previous results db file with path: db_files/prev_search.sqlite3
previous_db = "db_files/prev_search.sqlite3"
while not Path(previous_db).is_file():
    print('Please enter the name of the previous results file:')
    previous_db = input()

Get the name of the current app db file:

In [12]:
# app db file with path: db_files/app_db.sqlite3
ukchapp_db = "db_files/production2023.sqlite3"
while not Path(ukchapp_db).is_file():
    print('Please enter the name of app db file:')
    ukchapp_db = input()

Set the name of the output file


In [13]:
nr_wf = new_results_file[:-4]+"_wf.csv"
print("Verifying if the articles listed in: \n\t", Path(new_results_file).name)
print("where included in previous searches: \n\t", Path(previous_db).name)

print("The results will bt saved in: \n\t", nr_wf)

Verifying if the articles listed in: 
	 PoPCites20202023.csv
where included in previous searches: 
	 prev_search.sqlite3
The results will bt saved in: 
	 pub_search_pop/PoPCites20202023_wf.csv


In [14]:
# get the working file before each step
working_file = wf_fields = None
working_file, wf_fields, current_pass = get_working_file(nr_wf)
# in first pass then make working file = new results
if working_file == None:
    working_file, wf_fields, current_pass = get_working_file(new_results_file)
    csvh.write_csv_data(working_file, nr_wf) 

Current pass: 0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

Current pass: 0


## Verify if already processed titles are included
Read data and verify if results in file have already been included in previous searches


In [15]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)
if current_pass == 0:
    current_initial = ""
    db_titles = []
    for art_num in tqdm_notebook(working_file):
        new_title = working_file[art_num]['Title'].lower()
        if new_title != "":
            working_file[art_num]['ignore'] = 0 
            working_file[art_num]['previous'] = 0 
            working_file[art_num]['similarity'] = 0.0
            if current_initial == "" or current_initial != new_title[0]:
                print("new intital ", new_title[0])
                current_initial = new_title[0]
                db_titles = get_titles(current_initial, previous_db)

            for prev_pair in db_titles:
                prev_num = prev_pair[0]
                used_title = prev_pair[1].lower()
                # if titles match exactly or simialarity > 0.8 ignore
                title_similarity = txtc.similar(new_title, used_title)
                if title_similarity > 0.80:
                    #print(art_num, 'Title:', new_title, "already processed", prev_num, used_title)
                    working_file[art_num]['ignore'] = 1
                    working_file[art_num]['previous'] = prev_num
                    working_file[art_num]['similarity'] = title_similarity
                    break
        else:
            working_file[art_num]['ignore'] = 1

    csvh.write_csv_data(working_file, nr_wf)  
    print(nr_wf)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

Current pass: 0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

new intital  p
db_files/prev_search.sqlite3
new intital  d
db_files/prev_search.sqlite3
new intital  a
db_files/prev_search.sqlite3
new intital  i
db_files/prev_search.sqlite3
new intital  t
db_files/prev_search.sqlite3
new intital  i
db_files/prev_search.sqlite3
new intital  s
db_files/prev_search.sqlite3
new intital  a
db_files/prev_search.sqlite3
new intital  c
db_files/prev_search.sqlite3
new intital  n
db_files/prev_search.sqlite3
new intital  p
db_files/prev_search.sqlite3
new intital  c
db_files/prev_search.sqlite3
new intital  t
db_files/prev_search.sqlite3
new intital  s
db_files/prev_search.sqlite3
new intital  a
db_files/prev_search.sqlite3
new intital  p
db_files/prev_search.sqlite3
new intital  a
db_files/prev_search.sqlite3
new intital  s
db_files/prev_search.sqlite3
new intital  c
db_files/prev_search.sqlite3
new intital  m
db_files/prev_search.sqlite3
new intital  i
db_files/prev_search.sqlite3
new intital  c
db_files/prev_search.sqlite3
new intital  s
db_files/prev_sea

new intital  p
db_files/prev_search.sqlite3
new intital  u
db_files/prev_search.sqlite3
new intital  r
db_files/prev_search.sqlite3
new intital  i
db_files/prev_search.sqlite3
new intital  r
db_files/prev_search.sqlite3
new intital  m
db_files/prev_search.sqlite3
new intital  d
db_files/prev_search.sqlite3
new intital  m
db_files/prev_search.sqlite3
new intital  p
db_files/prev_search.sqlite3
new intital  e
db_files/prev_search.sqlite3
new intital  h
db_files/prev_search.sqlite3
new intital  v
db_files/prev_search.sqlite3
new intital  i
db_files/prev_search.sqlite3
new intital  4
db_files/prev_search.sqlite3
new intital  b
db_files/prev_search.sqlite3
new intital  i
db_files/prev_search.sqlite3
new intital  a
db_files/prev_search.sqlite3
new intital  s
db_files/prev_search.sqlite3
new intital  d
db_files/prev_search.sqlite3
new intital  a
db_files/prev_search.sqlite3
new intital  m
db_files/prev_search.sqlite3
new intital  b
db_files/prev_search.sqlite3
new intital  p
db_files/prev_sea

new intital  a
db_files/prev_search.sqlite3
new intital  u
db_files/prev_search.sqlite3
new intital  m
db_files/prev_search.sqlite3
new intital  s
db_files/prev_search.sqlite3
new intital  t
db_files/prev_search.sqlite3
new intital  d
db_files/prev_search.sqlite3
new intital  s
db_files/prev_search.sqlite3
new intital  o
db_files/prev_search.sqlite3
new intital  t
db_files/prev_search.sqlite3
new intital  h
db_files/prev_search.sqlite3
new intital  c
db_files/prev_search.sqlite3
new intital  r
db_files/prev_search.sqlite3
new intital  q
db_files/prev_search.sqlite3
new intital  s
db_files/prev_search.sqlite3
new intital  d
db_files/prev_search.sqlite3
new intital  s
db_files/prev_search.sqlite3
new intital  a
db_files/prev_search.sqlite3
new intital  d
db_files/prev_search.sqlite3
new intital  s
db_files/prev_search.sqlite3
new intital  b
db_files/prev_search.sqlite3
new intital  a
db_files/prev_search.sqlite3
new intital  f
db_files/prev_search.sqlite3
new intital  q
db_files/prev_sea

new intital  t
db_files/prev_search.sqlite3
new intital  m
db_files/prev_search.sqlite3
new intital  c
db_files/prev_search.sqlite3
new intital  p
db_files/prev_search.sqlite3
new intital  r
db_files/prev_search.sqlite3
new intital  t
db_files/prev_search.sqlite3
new intital  l
db_files/prev_search.sqlite3
new intital  …
db_files/prev_search.sqlite3
new intital  g
db_files/prev_search.sqlite3
new intital  c
db_files/prev_search.sqlite3
new intital  f
db_files/prev_search.sqlite3
new intital  s
db_files/prev_search.sqlite3
new intital  t
db_files/prev_search.sqlite3
new intital  h
db_files/prev_search.sqlite3
new intital  t
db_files/prev_search.sqlite3
new intital  l
db_files/prev_search.sqlite3
new intital  m
db_files/prev_search.sqlite3
new intital  r
db_files/prev_search.sqlite3
new intital  t
db_files/prev_search.sqlite3
new intital  i
db_files/prev_search.sqlite3
new intital  t
db_files/prev_search.sqlite3
new intital  i
db_files/prev_search.sqlite3
new intital  c
db_files/prev_sea

new intital  n
db_files/prev_search.sqlite3
new intital  i
db_files/prev_search.sqlite3
new intital  e
db_files/prev_search.sqlite3
new intital  m
db_files/prev_search.sqlite3
new intital  e
db_files/prev_search.sqlite3
new intital  m
db_files/prev_search.sqlite3
new intital  a
db_files/prev_search.sqlite3
new intital  i
db_files/prev_search.sqlite3
new intital  s
db_files/prev_search.sqlite3
new intital  c
db_files/prev_search.sqlite3
new intital  m
db_files/prev_search.sqlite3
new intital  t
db_files/prev_search.sqlite3
new intital  a
db_files/prev_search.sqlite3
new intital  m
db_files/prev_search.sqlite3
new intital  t
db_files/prev_search.sqlite3
new intital  m
db_files/prev_search.sqlite3
new intital  b
db_files/prev_search.sqlite3
new intital  t
db_files/prev_search.sqlite3
new intital  g
db_files/prev_search.sqlite3
new intital  i
db_files/prev_search.sqlite3
new intital  t
db_files/prev_search.sqlite3
new intital  h
db_files/prev_search.sqlite3
new intital  w
db_files/prev_sea

## Check Titles in app
Verify if the title is in the app

In [16]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)
# verify that titles are not in the app_db (if they are  also get DOI)
if current_pass == 1: 
    db_titles = []
    current_initial = ""
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            new_title = working_file[art_num]['Title'].lower()
            if new_title != "":
                if current_initial == "" or current_initial != new_title[0]:
                    print("new intital ", new_title[0])
                    current_initial = new_title[0]
                    db_titles = get_titles_and_dois(current_initial, ukchapp_db)
                for art_in_db in db_titles:
                    prev_num = art_in_db[0]
                    used_title = art_in_db[1].lower()
                    # if titles match exactly or simialarity > 0.8 ignore
                    title_similarity = txtc.similar(new_title, used_title)
                    if title_similarity > 0.80:
                        #print(art_num, 'Title:', new_title, "already processed", prev_num, used_title)
                        working_file[art_num]['ignore'] = 2
                        working_file[art_num]['previous'] = prev_num
                        working_file[art_num]['similarity'] = title_similarity
                        working_file[art_num]['DOIcr'] = art_in_db[2]
                        break                
    csvh.write_csv_data(working_file, nr_wf)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

Current pass: 1


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

new intital  d
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  i
db_files/production2023.sqlite3
new intital  t
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  c
db_files/production2023.sqlite3
new intital  n
db_files/production2023.sqlite3
new intital  c
db_files/production2023.sqlite3
new intital  t
db_files/production2023.sqlite3
new intital  p
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  s
db_files/production2023.sqlite3
new intital  c
db_files/production2023.sqlite3
new intital  i
db_files/production2023.sqlite3
new intital  c
db_files/production2023.sqlite3
new intital  s
db_files/production2023.sqlite3
new intital  d
db_files/production2023.sqlite3
new intital  o
db_files/production2023.sqlite3
new intital  e
db_files/production2023.sqlite3
new intital  i
db_files/production2023.sqlite3
new intital  m
db_files/production2023.sqlite3
new intital  

new intital  a
db_files/production2023.sqlite3
new intital  g
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  m
db_files/production2023.sqlite3
new intital  o
db_files/production2023.sqlite3
new intital  c
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  c
db_files/production2023.sqlite3
new intital  i
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  v
db_files/production2023.sqlite3
new intital  d
db_files/production2023.sqlite3
new intital  e
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  s
db_files/production2023.sqlite3
new intital  p
db_files/production2023.sqlite3
new intital  c
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  h
db_files/production2023.sqlite3
new intital  m
db_files/production2023.sqlite3
new intital  t
db_files/production2023.sqlite3
new intital  

new intital  e
db_files/production2023.sqlite3
new intital  s
db_files/production2023.sqlite3
new intital  i
db_files/production2023.sqlite3
new intital  m
db_files/production2023.sqlite3
new intital  s
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  r
db_files/production2023.sqlite3
new intital  w
db_files/production2023.sqlite3
new intital  p
db_files/production2023.sqlite3
new intital  m
db_files/production2023.sqlite3
new intital  q
db_files/production2023.sqlite3
new intital  i
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  t
db_files/production2023.sqlite3
new intital  r
db_files/production2023.sqlite3
new intital  d
db_files/production2023.sqlite3
new intital  t
db_files/production2023.sqlite3
new intital  w
db_files/production2023.sqlite3
new intital  m
db_files/production2023.sqlite3
new intital  c
db_files/production2023.sqlite3
new intital  b
db_files/production2023.sqlite3
new intital  

new intital  m
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  e
db_files/production2023.sqlite3
new intital  c
db_files/production2023.sqlite3
new intital  f
db_files/production2023.sqlite3
new intital  s
db_files/production2023.sqlite3
new intital  t
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  p
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  t
db_files/production2023.sqlite3
new intital  p
db_files/production2023.sqlite3
new intital  r
db_files/production2023.sqlite3
new intital  e
db_files/production2023.sqlite3
new intital  d
db_files/production2023.sqlite3
new intital  i
db_files/production2023.sqlite3
new intital  t
db_files/production2023.sqlite3
new intital  p
db_files/production2023.sqlite3
new intital  c
db_files/production2023.sqlite3
new intital  i
db_files/production2023.sqlite3
new intital  h
db_files/production2023.sqlite3
new intital  

new intital  a
db_files/production2023.sqlite3
new intital  z
db_files/production2023.sqlite3
new intital  o
db_files/production2023.sqlite3
new intital  w
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  s
db_files/production2023.sqlite3
new intital  c
db_files/production2023.sqlite3
new intital  e
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  g
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  s
db_files/production2023.sqlite3
new intital  c
db_files/production2023.sqlite3
new intital  s
db_files/production2023.sqlite3
new intital  n
db_files/production2023.sqlite3
new intital  t
db_files/production2023.sqlite3
new intital  r
db_files/production2023.sqlite3
new intital  a
db_files/production2023.sqlite3
new intital  g
db_files/production2023.sqlite3
new intital  i
db_files/production2023.sqlite3
new intital  w
db_files/production2023.sqlite3
new intital  

## Check Title Wording
Using the workds in previous catalysis hub papers check if the title is likely to be a cat hub title

In [17]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)
if current_pass < 3:
    # pass 2
    # check titles for likelihood of being catalysis articles using keywords from titles in current DB 
    print("Get word list from DB")
    x = dbh.DataBaseAdapter(ukchapp_db)
    db_titles = x.get_value_list('articles','title')
    title_words = set()
    ignore_words=set(['the','of','to','and','a','in','is','it', 'their', 'so', 'as'])
    average = 0
    words_sum = 0.0
    for title in db_titles:
        one_title = set(title.lower().split())
        one_title = one_title - ignore_words
        title_words = title_words.union(one_title)
        words_sum += len(one_title) 
        
    average = words_sum /len(db_titles)
    print("Average words per title:", average)
    title_words = title_words - ignore_words
    for art_num in tqdm_notebook(working_file):
        if 0 == int(working_file[art_num]['ignore']):
            art_title = working_file[art_num]['Title']
            art_words = set(art_title.lower().split())
            occurrences = len(title_words.intersection(art_words))
            working_file[art_num]['keywords']=occurrences
            if occurrences == 0:
                print("occurrences:", occurrences, "in title:", art_title)
                working_file[art_num]['ignore']=3
            else:
                print("occurrences:", occurrences, "in title:", art_title)
    csvh.write_csv_data(working_file, nr_wf)
    x.close()
    current_pass = 3

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

Current pass: 2
Get word list from DB
Average words per title: 10.359861591695502


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

occurrences: 4 in title: A portal for indexing distributed FAIR digital objects for catalysis research
occurrences: 8 in title: Ammonia Synthesis: Photocatalytic Nitrogen Reduction by Ti3C2 MXene Derived Oxygen Vacancy‐Rich C/TiO2 (Adv. Sustainable Syst. 4/2021)
occurrences: 8 in title: In the Lab: Heterogeneous Catalysis Mediated Interconversion between NAD (P)+ and NAD (P) H Accompanied by Consumption and Generation of Hydrogen
occurrences: 4 in title: Tailored approaches for the reusage of catalytic data
occurrences: 9 in title: A Bayesian framework for adsorption energy prediction on bimetallic alloy catalysts
occurrences: 2 in title: Catalysis Science &Technology
occurrences: 7 in title: Case Study 1: Data and simulation driven understanding of catalytic activity
occurrences: 6 in title: The Electrophilicity of Surface Carbon Species in the Redox Reactions of CuO‐CeO2 Catalysts
occurrences: 6 in title: Photocatalytic Nitrogen Reduction by Ti3C2 MXene Derived Oxygen Vacancy‐Rich C/

In [18]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)
if current_pass == 3:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            inspected = False
            while not inspected:
                new_title = working_file[art_num]['Title']
                keywords = int(working_file[art_num]['keywords'])
                #print (keywords, new_title)
                if keywords <= 4 and not ("cataly" in new_title.lower()):
                # ignore  it because it does not contains cataly in title
                    working_file[art_num]['ignore']=4 # visual inspection
                    inspected = True
                else:
                    inspected = True
    print("To Process:", i, "Pass:", current_pass)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 4

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

Current pass: 3


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

To Process: 0 Pass: 3


## Get DOIs for Articles
The remaining titles need to be further analysed. Recovering their DOIs helps to obtain abstracts and acknowledgement statements. 

In [19]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)
if current_pass == 4:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if (working_file[art_num]['ignore']=='0' and not 'DOIcr' in working_file[art_num].keys()) \
        or (working_file[art_num]['ignore']=='0' and working_file[art_num]['DOIcr']==""):
            new_title = working_file[art_num]['Title']
            new_doi = cr_api.getDOIForTitle(new_title)
            if new_doi == "":
                #print("Missing DOI:", new_title)
                working_file[art_num]['ignore'] = '5'
                i +=1
            else:
                #print("DOI found:", new_doi, "for:", new_title)
                working_file[art_num]['DOIcr'] = new_doi
                working_file[art_num]['ignore'] = '0'
    print("without DOI:", i)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 5

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

Current pass: 4


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

without DOI: 52


## Verify DOIs in DB
Verify that articles do not exist in the DB

In [20]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)

if current_pass >= 4:
    i = 0
    db_conn = dbh.DataBaseAdapter(ukchapp_db)
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            new_title = working_file[art_num]['Title']
            new_doi = working_file[art_num]['DOIcr'].strip()
            db_title = db_conn.get_title(new_doi)
            if db_title == None:
                print("Not in DB:", new_doi, new_title)
            else:
                print("Already in DB:", new_doi, "for:", new_title, db_title)
                working_file[art_num]['ignore'] = '6'
    print("without DOI:", i)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 6
    db_conn.close()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

Current pass: 5


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

Not in DB: 10.3897/rio.8.e95770 A portal for indexing distributed FAIR digital objects for catalysis research
Not in DB: 10.1595/205651323x16686913816837 In the Lab: Heterogeneous Catalysis Mediated Interconversion between NAD (P)+ and NAD (P) H Accompanied by Consumption and Generation of Hydrogen
Not in DB: 10.1038/s41524-020-00447-8 A Bayesian framework for adsorption energy prediction on bimetallic alloy catalysts
Not in DB: 10.1002/anie.202102570 The Electrophilicity of Surface Carbon Species in the Redox Reactions of CuO‐CeO2 Catalysts
Not in DB: 10.1039/d2cp01572b A transferable prediction model of molecular adsorption on metals based on adsorbate and substrate properties
Not in DB: 10.1002/aoc.6352 Structure and photocatalytic performance comparison of two distinctive copper phenylacetylides
Not in DB: 10.1080/08940886.2020.1701366 Synchrotron Consortia for Catalysis and Electrocatalysis Research
Not in DB: 10.1038/s41467-021-25263-6 Blurring the boundary between homogenous and

Not in DB: 10.1002/anie.202014960 Palladium‐Catalysed C− H Bond Zincation of Arenes: Scope, Mechanism, and the Role of Heterometallic Intermediates
Not in DB: 10.1080/1360144x.2021.1899931 Building integrated networks to develop teaching and learning: the critical role of hubs
Not in DB: 10.1002/anie.202202933 Amine‐Catalyzed Copper‐Mediated C− H Sulfonylation of Benzaldehydes via a Transient Imine Directing Group
Not in DB: 10.7717/peerj.10594 Identification of hub genes and biological pathways in hepatocellular carcinoma by integrated bioinformatics analysis
Not in DB: 10.1007/7355_2021_116 Biocatalysis in Flow for Drug Discovery
Not in DB: 10.1007/s00018-020-03695-5 Human RNase3 immune modulation by catalytic-dependent and independent modes in a macrophage-cell line infection model
Not in DB: 10.1038/s41578-020-00268-7 Artificial channels for confined mass transport at the sub-nanometre scale
Not in DB: 10.1038/s41396-020-0689-0 Potential utilization of terrestrially derived dissolv

## Get full json files for remaining articles 

In [21]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)

if current_pass >= 4:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            article_title = working_file[art_num]['Title']
            article_doi = working_file[art_num]['DOIcr']
            article_url =working_file[art_num]['ArticleURL']
            data, file_name = get_cr_json_object(article_doi)
            if data != {}:
                working_file[art_num]['file'] = file_name
    csvh.write_csv_data(working_file, nr_wf)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

Current pass: 6


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

## Check if CR json files contain funder details for UKCH grants

In [35]:
working_file, wf_fields, current_pass = get_working_file(nr_wf)

if current_pass >= 4:
    i = 1
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            article_title = working_file[art_num]['Title']
            article_doi = working_file[art_num]['DOIcr']
            article_url =working_file[art_num]['ArticleURL']
            try:
                data, file_name = get_cr_json_object(article_doi)
                print(i, article_title, article_doi)
                #print(data.keys())
                epsrc_keys = ['EP/R026645/1', 'EP/K014668/1', 'EP/K014714/1', 'EP/R026815/1', 'EP/R026939/1',
                              'EP/M013219/1', 'EP/R027129/1', 'EP/K014854/1', 'EP/K014706/2']
                confirmed_in_cr = []
                if 'funder' in data.keys():
                    for a_funder in data['funder']:
                        for an_award in a_funder['award']:
                            if an_award in epsrc_keys:
                                print("Found", an_award)
                                confirmed_in_cr.append(an_award)
                    working_file[art_num]['award_in_cr'] = ', '.join(confirmed_in_cr)
                i += 1
            except:
                working_file[art_num]['ignore'] = 7
                
    csvh.write_csv_data(working_file, nr_wf)
        

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

Current pass: 9


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

1 In the Lab: Heterogeneous Catalysis Mediated Interconversion between NAD (P)+ and NAD (P) H Accompanied by Consumption and Generation of Hydrogen 10.1595/205651323x16686913816837
2 The Electrophilicity of Surface Carbon Species in the Redox Reactions of CuO‐CeO2 Catalysts 10.1002/anie.202102570
3 Blurring the boundary between homogenous and heterogeneous catalysis using palladium nanoclusters with dynamic surfaces 10.1038/s41467-021-25263-6
4 Amine-Catalysed Suzuki–Miyaura-Type Coupling? the Identification and Isolation of the Palladium Culprits. 10.26434/chemrxiv.14237288
5 Integrated Electro‐Biocatalysis for Amine Alkylation with Alcohols 10.1002/cctc.202001757
6 Palladium-doped hierarchical ZSM-5 for catalytic selective oxidation of allylic and benzylic alcohols 10.1098/rsos.211086
7 Facile synthesis of a porous 3D g-C3N4 photocatalyst for the degradation of organics in shale gas brines 10.1016/j.catcom.2022.106480
8 Green synthesis of glycerol carbonate via transesterification of

77 Dynamics of water within Cu-loaded zeolites: A quasielastic neutron scattering study 10.1016/j.catcom.2022.106429
78 3D printed, plastic photocatalytic flow reactors for water purification 10.1007/s43630-022-00242-y
79 The Landscape and Roadmap of the Research and Innovation Infrastructures in Energy: A Review of the Case Study of the UK 10.3390/su14127197
80 Investigations of hydrocarbon species on solid catalysts by inelastic neutron scattering 10.1007/s11244-020-01389-7
81 Stick or Spill? Scaling Relationships for the Binding Energies of Adsorbates on Single-Atom Alloy Catalysts 10.1021/acs.jpclett.2c01519
82 Structure of the native pyruvate dehydrogenase complex reveals the mechanism of substrate insertion 10.1038/s41467-021-25570-y
83 Efficient electrocatalytic valorization of chlorinated organic water pollutant to ethylene 10.1038/s41565-022-01277-z
84 Natural Aluminosilicate-based Y Zeolite for Catalytic Cracking of n-Hexadecane 10.7454/mss.v24i1.11861
85 Environmental and ec

171 Structural snapshots of the minimal PKS system responsible for octaketide biosynthesis 10.1038/s41557-020-0491-7
172 T cell-specific deletion of Pgam1 reveals a critical role for glycolysis in T cell responses 10.1038/s42003-020-01122-w
173 Compensatory mutations modulate the competitiveness and dynamics of plasmid-mediated colistin resistance in Escherichia coli clones 10.1038/s41396-019-0578-6
174 Identification of hub genes and potential biomarkers of neutrophilic asthma: evidence from a bioinformatics analysis 10.1080/02770903.2022.2051544
175 Integrated analysis of microarray and RNA-Seq data for the identification of hub genes and networks involved in the pancreatic cancer 10.3389/fgene.2021.663787
176 Identification of hub genes as potential prognostic biomarkers in cervical cancer using comprehensive bioinformatics analysis and validation studies 10.2147/cmar.s282989
177 An integrated empirical analysis of UK rail industry's carbon assessment: An industry perspective 10.101

## Get full HTML files for remaining articles 

In [None]:
#nr_wf = "pop_searches/PoPCites20201017_wf.csv"
working_file, wf_fields, current_pass = get_working_file(nr_wf)

if current_pass >= 6:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            article_id = working_file[art_num]['Num']
            article_title = working_file[art_num]['Title']
            article_doi = working_file[art_num]['DOIcr'].strip().lower()
            article_url =working_file[art_num]['ArticleURL']
            article_type =working_file[art_num]['type']
            html_content = file_name = None
            if valid_doi(article_doi):
                html_content, file_name = get_pub_html_doi(article_doi)
            else:
                #try with url
                html_content = None
                #identifier = "id" + str((1000000 + int(article_id)))[1,6] + article_type 
                #html_content, file_name = get_pub_html_doi(article_url, identifier)
            if html_content != None:
                working_file[art_num]['html_file'] = file_name
                
    csvh.write_csv_data(working_file, nr_wf)



## Get HTML page from DOI and verify if it contains UKCH acknowledgement

In [42]:
cworking_file, wf_fields, current_pass = get_working_file(nr_wf)

from IPython.display import IFrame
from IPython.display import HTML

end_now = False

if current_pass >= 4:
    for art_num in tqdm_notebook(working_file):
        if (working_file[art_num]['ignore']=='0' and not 'ack_fragment' in working_file[art_num].keys()) or \
           (working_file[art_num]['ignore']=='0' and working_file[art_num]['ack_fragment'] == ""):
            article_id = working_file[art_num]['Num']
            article_title = working_file[art_num]['Title']
            article_kws = working_file[art_num]['keywords']
            article_doi = working_file[art_num]['DOIcr']
            request_str = "https://doi.org/" + article_doi 
            if valid_doi(article_doi):
                request_str = "https://doi.org/" + article_doi 
                print(request_str)
                #display(HTML('<h1>Hello, world!</h1>'))
                #%%html
                #<iframe src=request_str  width="600" height="400"></iframe>
                IFrame(request_str, width=700, height=350)
                inspected = False
                while not inspected:
                    #new_title = working_file[art_num]['Title']
                    print('Title: ', article_title)
                    print('Keywords:', article_kws )
                    print('***************************************************************')
                    print("Options:\n\ta) add ack text\n\tb) mark as not relevant\n\tc) go to next\n\td) end now")
                    print("selection:")
                    usr_select = input()
                    if usr_select == 'b':
                        #working_file[art_num]['ignore']=3 # visual inspection
                        inspected = True
                        working_file[art_num]['ignore'] = '9'
                        working_file[art_num]['send_to_corinne'] = 'no'
                        working_file[art_num]['reason_send'] = "not acknowledged, no UKCH authors"
                        print("going to next")
                    elif usr_select == 'c':
                        #working_file[art_num]['ignore']=3 # visual inspection
                        inspected = True
                        print("going to next")
                    elif usr_select == 'd':
                        #working_file[art_num]['ignore']=3 # visual inspection
                        end_now = True
                        inspected = True
                        print("going to next")
                    elif usr_select == 'a':
                        inspected = True
                        ack_text = ""
                        while ack_text == "":
                            print("Enter ack text: ")
                            ack_text = input()
                            working_file[art_num]['ack_fragment'] = ack_text
                            working_file[art_num]['send_to_corinne'] = 'yes'
                            working_file[art_num]['reason_send'] = "confirmed in acknowledgements"
            else:
                print(article_doi, "is not a valid DOI")
            if end_now:
                break
    csvh.write_csv_data(working_file, nr_wf)  
    print(nr_wf)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

Current pass: 9


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

pub_search_pop/PoPCites20202023_wf.csv


## Get bib data from CR to send for validation

In [43]:
#nr_wf = "pop_searches/PoPCites20201017_wf.csv"
working_file, wf_fields, current_pass = get_working_file(nr_wf)
article_title = ""
article_doi = ""
article_url = ""
data = None    
try:
    if current_pass >= 6:
        for art_num in tqdm_notebook(working_file):
            if working_file[art_num]['send_to_corinne'] == 'yes':
                article_title = working_file[art_num]['Title']
                article_doi = working_file[art_num]['DOIcr']
                article_url =working_file[art_num]['ArticleURL']
                if valid_doi(article_doi):
                    data, file_name = get_cr_json_object(article_doi)
                    # get authors
                    working_file[art_num]['cr_authors'] = get_cr_author_list(data)
                    # get article year
                    working_file[art_num]['cr_year'] = get_cr_year_published(data)
                    working_file[art_num]['cr_title'] = data['title']
                    working_file[art_num]['cr_journal'] = data['container-title']
    csvh.write_csv_data(working_file, nr_wf)
except:
    print(article_title, article_doi, article_url)
    print(data)
    csvh.write_csv_data(working_file, nr_wf)
        

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

Current pass: 9


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for art_num in tqdm_notebook(working_file):


  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
#***************************************************************************************************************
# Wait do not run this yet
#***************************************************************************************************************
if current_pass >= 6:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            article_title = working_file[art_num]['Title']
            article_doi = working_file[art_num]['DOIcr']
            article_url =working_file[art_num]['ArticleURL']
            print("Analysing:", article_title, article_doi, article_url)
            # try to retrive html page for article using link from crossref first
            # and if not try url from pop
            # find reference to uk catalysis hub in html text
            # if found mark as relevant
            found = ""
            referents = ["uk catalysis hub", "uk catalysis", "catalysis hub",
                 'EP/R026645/1', 'resources', 'EP/K014668/1', 'EPSRC', 'EP/K014714/1',
                 'Hub','provided', 'grant', 'biocatalysis', 'EP/R026815/1', 'EP/R026939/1',
                 'support', 'membership', 'EP/M013219/1', 'UK', 'kindly', 'Catalysis',
                 'funded', 'EP/R027129/1', 'Consortium', 'thanked', 'EP/K014854/1', 'EP/K014706/2']
            found = urlh.findFromDOI(article_title, article_doi, referents)
            working_file[art_num]['checked_doi'] = 1
            working_file[art_num]['ack_doi'] = found
            found = urlh.findFromURI(article_title, article_url, referents)
            working_file[art_num]['checked_url'] = 1
            working_file[art_num]['ack_url'] = found
            print("Ack:", found)
    csvh.write_csv_data(working_file, nr_wf)

In [None]:
doi_text = '10.1039/d0cy00036a'

url_text = "https://ethos.bl.uk/OrderDetails.do?uin=uk.bl.ethos.808495"

html_content, file_name = get_pub_html_doi(doi_text)

print(file_name)#, html_content)

import re

#print(len(doi_text))
# CR DOIS: https://www.crossref.org/blog/dois-and-matching-regular-expressions/
# CR DOIs re1
# /^10.\d{4,9}/[-._;()/:A-Z0-9]+$/i

cr_re_01 = '^10.\d{4,9}/[-._;()/:A-Z0-9]+'

compare = re.match(cr_re_01, doi_text, re.IGNORECASE)

print(compare)
print(compare.start())
print(compare.end())
print(compare.group())

if compare != None and doi_text == compare.group():
    print("This is a DOI: ", doi_text)
else:
    print("This is not a DOI: ", doi_text)

compare = re.match(cr_re_01, url_text, re.IGNORECASE)
    
print(url_text, valid_doi(url_text))
print(doi_text, valid_doi(doi_text))

# url_text = "https://ethos.bl.uk/OrderDetails.do?uin=uk.bl.ethos.808495"
# id = id000069_thesis
entry_id = 'id000069_thesis'

html_content, file_name = get_pub_html_url(url_text, entry_id)
print(file_name, html_content)

In [None]:
doi_text = '10.1039/d0cy00036a'
print(doi_text, valid_doi(doi_text))

In [None]:
epsrc_keys = ['EP/R026645/1', 'EP/K014668/1', 'EP/K014714/1', 'EP/R026815/1', 'EP/R026939/1',
                          'EP/M013219/1', 'EP/R027129/1', 'EP/K014854/1', 'EP/K014706/2']
', '.join(epsrc_keys)