# Verification of references to UK Catalysis Hub 
A list of articles is obtainded from publish or perish. This list will contain a titles and some IDs whic need to be verified. 

The criteria for adding a publication to the database are: 
a) has an explicit acknowledgement of UK Catalysis Hub
b) mentions one of the UK Catalysis Hub grants
c) has two or more authors with affiliation to UK Catalysis Hub
d) acknowledges support from a scientist affiliated to UK Catalysis Hub.

In [5]:
# Libraries
# library containign functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searchs
import lib.handle_urls as urlh
# managing files and file paths
from pathlib import Path
# add aprogress bar
from tqdm import tqdm_notebook 


# input files
new_results_file = 'pop_searches/PoPCites20201017.csv'
#previous_results = 'pop_searches/ukch_pop_prev_res.csv'

In [36]:
def get_titles_list(str_pub_title, ukchapp_db = "prev_search.sqlite3"):
    
    ukchapp_db = "prev_search.sqlite3"

    db_conn = dbh.DataBaseAdapter(ukchapp_db)
    search_in = 'prev_pop_searches'
    fields_required = "Num, Title"
    filter_str = "Title like '"+str_pub_title[0]+"%';"

    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    
    return db_titles

#first = "p62: linking protein homeostasis and the DNA damage response during ageing"[0]
#db_titles = get_titles_list(first)
#print (db_titles)

Get the name of the file with the results of the PoP search:

In [None]:
new_results_file = ""
while not Path(new_results_file).is_file():
    print('Please enter the name of the input file:')
    new_results_file = input()

Get the name of the file with previous results of the PoP search:

In [None]:
previous_results = ""
while not Path(previous_results).is_file():
    print('Please enter the name of the previous results file:')
    previous_results = input()

Set the name of the output file


In [37]:
nr_wf = new_results_file[:-4]+"a_wf.csv"
print("Verifying if the articles listed in: \n\t", Path(new_results_file).name)
print("where included in previous searches: \n\t", Path(previous_results).name)

print("The results will bt saves in: \n\t", nr_wf)

Verifying if the articles listed in: 
	 PoPCites20201017.csv
where included in previous searches: 
	 ukch_pop_prev_res.csv
The results will bt saves in: 
	 pop_searches/PoPCites20201017a_wf.csv


In [39]:
working_file = wf_fields = None
current_pass = 0
if Path(nr_wf).is_file():
    working_file, wf_fields = csvh.get_csv_data(nr_wf,'Num')
    for art_num in tqdm_notebook(working_file):
        if current_pass < int(working_file[art_num]['ignore']):
            current_pass = int(working_file[art_num]['ignore'])

HBox(children=(IntProgress(value=0, max=998), HTML(value='')))




## Verify if already processed titles are included
Read data and verify if results in file have already been included in previous searches


In [40]:
if current_pass == 0:
    csv_articles, fn_articles = csvh.get_csv_data(new_results_file,'Num')
    prev_articles, fn_prev = csvh.get_csv_data(previous_results,'Num')
    current_initial = ""
    db_titles = []
    for art_num in tqdm_notebook(csv_articles):
        new_title = csv_articles[art_num]['Title'].lower()
        csv_articles[art_num]['ignore'] = 0 
        csv_articles[art_num]['previous'] = 0 
        csv_articles[art_num]['similarity'] = 0.0
        if current_initial == "" or current_initial != new_title[0]:
            print("new intital ", new_title[0])
            current_initial = new_title[0]
            db_titles = get_titles_list(current_initial)
            
        for prev_pair in db_titles:
            prev_num = prev_pair[0]
            used_title = prev_pair[1].lower()
            # if titles match exactly or simialarity > 0.8 ignore
            title_similarity = txtc.similar(new_title, used_title)
            if title_similarity > 0.80:
                #print(art_num, 'Title:', new_title, "already processed", prev_num, used_title)
                csv_articles[art_num]['ignore'] = 1
                csv_articles[art_num]['previous'] = prev_num
                csv_articles[art_num]['similarity'] = title_similarity
                break

    csvh.write_csv_data(csv_articles, nr_wf)
    if Path(nr_wf).is_file():
        working_file, wf_fields = csvh.get_csv_data(nr_wf,'Num')
        for art_num in working_file:
            if current_pass < int(working_file[art_num]['ignore']) :
                current_pass = int(working_file[art_num]['ignore'])
    
    print(nr_wf)

## Check Title Wording
Using the workds in previous catalysis hub papers check if the title is likely to be a cat hub title

In [41]:
if current_pass in [0,1]:
    # pass 2
    # check titles for likelihood of being catalysis articles using keywords from titles in current DB 
    print("Get word list from DB")
    x = dbh.DataBaseAdapter('ukch_articles.sqlite')
    db_titles = x.get_value_list('articles','title')
    title_words = set()
    ignore_words=set(['the','of','to','and','a','in','is','it', 'their', 'so', 'as'])
    average = 0
    words_sum = 0.0
    for title in db_titles:
        one_title = set(title.lower().split())
        one_title = one_title - ignore_words
        title_words = title_words.union(one_title)
        words_sum += len(one_title) 
        
    average = words_sum /len(db_titles)
    print("Average words per title:", average)
    title_words = title_words - ignore_words
    for art_num in working_file:
        if 0 == int(working_file[art_num]['ignore']):
            art_title = working_file[art_num]['Title']
            art_words = set(art_title.lower().split())
            occurrences = len(title_words.intersection(art_words))
            working_file[art_num]['keywords']=occurrences
            if occurrences <= 4:
                print("occurrences:", occurrences, "in title:", art_title)
                working_file[art_num]['ignore']=2
            #elif occurrences <= 7:
            else:
                print("occurrences:", occurrences, "in title:", art_title)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 2

In [42]:

if current_pass == 2:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            inspected = False
            while not inspected:
                new_title = working_file[art_num]['Title']
                keywords = working_file[art_num]['keywords']
                #print (keywords, new_title)
                if keywords < 9 and not ("cataly" in new_title.lower()):
                # ignore  it because it does not contains cataly in title
                    working_file[art_num]['ignore']=3 # visual inspection
                    inspected = True
                elif keywords >= 9 and not ("cataly" in new_title.lower()):
                    print('Title:', working_file[art_num]['Title'])
                    print('***************************************************************')
                    print("Options:\n\ta) add\n\tb) ignore")
                    print("selection:")
                    usr_select = input()
                    if usr_select == 'b':
                        working_file[art_num]['ignore']=3 # visual inspection
                        inspected = True
                    elif usr_select == 'a':
                        inspected = True
                        i += 1
                else:
                    inspected = True
    print("To Process:", i, "Pass:", current_pass)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 3

## Get DOIs for Articles
The remaining titles need to be further analysed. Recovering their DOIs can help obtain abstracts and acknowledgement statements. 

In [43]:
if current_pass == 3:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0' and working_file[art_num]['DOIcr']=="":
            new_title = working_file[art_num]['Title']
            new_doi = cr_api.getDOIForTitle(new_title)
            if new_doi == "":
                #print("Missing DOI:", new_title)
                working_file[art_num]['ignore'] = '4'
                i +=1
            else:
                #print("DOI found:", new_doi, "for:", new_title)
                working_file[art_num]['DOIcr'] = new_doi
                working_file[art_num]['ignore'] = '0'
    print("without DOI:", i)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 4

## Verify DOIs in DB
Verify that articles do not exist in the DB

In [44]:
ukchapp_db = "../railsapp/ukchapp/db/development.sqlite3"
if current_pass >= 4:
    i = 0
    db_conn = dbh.DataBaseAdapter(ukchapp_db)
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            new_title = working_file[art_num]['Title']
            new_doi = working_file[art_num]['DOIcr'].strip()
            db_title = db_conn.get_title(new_doi)
            if db_title == None:
                print("Not in DB:", new_doi, new_title)
            else:
                print("Already in DB:", new_doi, "for:", new_title, db_title)
                working_file[art_num]['ignore'] = '5'
    print("without DOI:", i)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 5


HBox(children=(IntProgress(value=0, max=998), HTML(value='')))

Not in DB: 10.1007/s11244-020-01330-y Preface to Special Issue on 5th UK Catalysis Conference (UKCC 2019)
Not in DB: 10.1016/j.joule.2020.07.024 Improving Photocatalytic Energy Conversion via NAD (P) H
Not in DB: 10.1080/08940886.2018.1460180 Synchrotron Radiation and Neutrons for Catalysis, Materials Research and Development
Not in DB:  Application of “Smart” Amine Donors for Rapid Screening and Scale‐Up of Transaminase‐Mediated Biotransformations
Not in DB:  Methanation of Carbon Dioxide over Zeolite‐Encapsulated Nickel Nanoparticles
Not in DB:  Alcohol Dehydrogenase Triggered Oxa‐Michael Reaction for the Asymmetric Synthesis of Disubstituted Tetrahydropyrans and Tetrahydrofurans
Not in DB:  Hydrogen generation by photocatalytic reforming of potential biofuels: Polyols, cyclic alcohols, and saccharides
Not in DB: 10.1098/rsta.2020.0058 Methanol photo-reforming with water on pure titania for hydrogen production
Not in DB:  Operando Spectroscopic Studies of Cu–SSZ-13 for NH3–SCR deNOx 

In [46]:
ukchapp_db = "../railsapp/ukchapp/db/development.sqlite3"
#verify that titles are not in the app_db
if current_pass >= 5: 
    db_titles = []
    current_initial = ""
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            new_title = working_file[art_num]['Title'].lower()
            new_doi = working_file[art_num]['DOIcr'].strip()
            if current_initial == "" or current_initial != new_title[0]:
                print("new intital ", new_title[0])
                current_initial = new_title[0]
                db_titles = get_titles_list(current_initial, ukchapp_db)
            for prev_pair in db_titles:
                prev_num = prev_pair[0]
                used_title = prev_pair[1].lower()
                # if titles match exactly or simialarity > 0.8 ignore
                title_similarity = txtc.similar(new_title, used_title)
                if title_similarity > 0.80:
                    #print(art_num, 'Title:', new_title, "already processed", prev_num, used_title)
                    csv_articles[art_num]['ignore'] = 6
                    csv_articles[art_num]['previous'] = prev_num
                    csv_articles[art_num]['similarity'] = title_similarity
                    break
                    
    csvh.write_csv_data(csv_articles, nr_wf)

HBox(children=(IntProgress(value=0, max=998), HTML(value='')))

new intital  p
new intital  i
new intital  s
new intital  a
new intital  m
new intital  a
new intital  h
new intital  m
new intital  o
new intital  c
new intital  a
new intital  c
new intital  w
new intital  t
new intital  e
new intital  o
new intital  a
new intital  e
new intital  s
new intital  r
new intital  c
new intital  s
new intital  p
new intital  s
new intital  m
new intital  l
new intital  b
new intital  e
new intital  p
new intital  m
new intital  o
new intital  h
new intital  t
new intital  a
new intital  u
new intital  r
new intital  m
new intital  p
new intital  d
new intital  a
new intital  s
new intital  p
new intital  o
new intital  s
new intital  e
new intital  r
new intital  t
new intital  e
new intital  c
new intital  e
new intital  a
new intital  i
new intital  s
new intital  v
new intital  c
new intital  s
new intital  g
new intital  p
new intital  r
new intital  n
new intital  h
new intital  a
new intital  c
new intital  i
new intital  m
new intital  c
new intita

## Get Acknowledgement statements

In [None]:
if current_pass >= 5:
    i = 0
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            article_title = working_file[art_num]['Title']
            article_doi = working_file[art_num]['DOIcr']
            article_url =working_file[art_num]['ArticleURL']
            print("Analysing:", article_title, article_doi, article_url)
            # try to retrive html page for article using link from crossref first
            # and if not try url from pop
            # find reference to uk catalysis hub in html text
            # if found mark as relevant
            found = ""
            referents = ["uk catalysis hub", "uk catalysis", "catalysis hub",
                 'EP/R026645/1', 'resources', 'EP/K014668/1', 'EPSRC', 'EP/K014714/1',
                 'Hub','provided', 'grant', 'biocatalysis', 'EP/R026815/1', 'EP/R026939/1',
                 'support', 'membership', 'EP/M013219/1', 'UK', 'kindly', 'Catalysis',
                 'funded', 'EP/R027129/1', 'Consortium', 'thanked', 'EP/K014854/1', 'EP/K014706/2']
            found = urlh.findFromDOI(article_title, article_doi, referents)
            working_file[art_num]['checked_doi'] = 1
            working_file[art_num]['ack_doi'] = found
            found = urlh.findFromURI(article_title, article_url, referents)
            working_file[art_num]['checked_url'] = 1
            working_file[art_num]['ack_url'] = found
            print("Ack:", found)
    csvh.write_csv_data(working_file, nr_wf)