# Verification of references to UK Catalysis Hub 
A list of articles is obtained from Publish or Perish. This list will contain titles and some IDs which need to be verified. 

The criteria for adding a publication to the database are: 
a) has an explicit acknowledgement of UK Catalysis Hub
b) mentions one of the UK Catalysis Hub grants
c) has two or more authors with affiliation to UK Catalysis Hub
d) acknowledges support from a scientist affiliated to UK Catalysis Hub.

In [1]:
# Libraries
# library containing functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searches
import lib.handle_urls as urlh
# managing files and file paths
from pathlib import Path
# add a progress bar
from tqdm import tqdm_notebook 


# input files: the previously processed results and the new search export
previous_results = 'pop_searches/ukch_pop_prev_res.csv'
new_results_file = 'pop_searches/PoPCites20201017.csv'

# output file: the input name with its last four characters (".csv" here)
# replaced by "_wf.csv"
nr_wf = "{}_wf.csv".format(new_results_file[:-4])


Get the name of the file with the results of the PoP search:

In [None]:
# Keep prompting until the name typed in points to an existing file.
new_results_file = ""
while True:
    print('Please enter the name of the input file:')
    new_results_file = input()
    if Path(new_results_file).is_file():
        break

Get the name of the file with previous results of the PoP search:

In [None]:
# Keep prompting until the name typed in points to an existing file.
previous_results = ""
while True:
    print('Please enter the name of the previous results file:')
    previous_results = input()
    if Path(previous_results).is_file():
        break

Set the name of the output file


In [2]:
# Working-file name: the input name with its last four characters replaced
# by "_wf.csv".
nr_wf = "{}_wf.csv".format(new_results_file[:-4])

# Report which files are read (file names only) and where results are written.
for _label, _shown in (
    ("Verifying if the articles listed in: \n\t", Path(new_results_file).name),
    ("where included in previous searches: \n\t", Path(previous_results).name),
    ("The results will bt saves in: \n\t", nr_wf),
):
    print(_label, _shown)

Verifying if the articles listed in: 
	 PoPCites20201017.csv
where included in previous searches: 
	 ukch_pop_prev_res.csv
The results will bt saves in: 
	 pop_searches/PoPCites20201017_wf.csv


In [3]:
# If a working file already exists, load it and set current_pass to the
# highest 'ignore' value it contains; otherwise start from pass 0 with no
# working data.
working_file = None
wf_fields = None
current_pass = 0
if Path(nr_wf).is_file():
    working_file, wf_fields = csvh.get_csv_data(nr_wf, 'Num')
    for art_num in tqdm_notebook(working_file):
        current_pass = max(current_pass, int(working_file[art_num]['ignore']))

HBox(children=(IntProgress(value=0, max=998), HTML(value='')))




## Verify if already processed titles are included
Read data and verify if results in file have already been included in previous searches


In [4]:
if current_pass == 0:
    # Pass 1: mark with ignore=1 every new article whose title matches a
    # title from the previous results; unmatched articles get ignore=0
    # (unless the row already carries an 'ignore' value).
    csv_articles, fn_articles = csvh.get_csv_data(new_results_file,'Num')
    prev_articles, fn_prev = csvh.get_csv_data(previous_results,'Num')
    # previous titles in file order (used for the approximate match) and as
    # a set, so the exact match is a single lookup instead of a rescan of
    # every previous title for each new article
    prev_titles = [prev_articles[prev_num]['Title'] for prev_num in prev_articles]
    known_titles = set(prev_titles)
    # pass 1a exact match
    for art_num in tqdm_notebook(csv_articles):
        article = csv_articles[art_num]
        if article['Title'] in known_titles:
            article['ignore'] = 1
        elif 'ignore' not in article.keys():
            article['ignore'] = 0
    # pass 1b approximate match: similarity score above 0.80 counts as the
    # same title; any() stops at the first previous title that is close enough
    for art_num in csv_articles:
        article = csv_articles[art_num]
        if article['ignore'] == 0:
            new_title = article['Title']
            if any(txtc.similar(new_title, prev_title) > 0.80
                   for prev_title in prev_titles):
                article['ignore'] = 1
    csvh.write_csv_data(csv_articles, nr_wf)
    # reload the file just written so later passes work on the saved data,
    # and advance current_pass to the highest 'ignore' value found
    if Path(nr_wf).is_file():
        working_file, wf_fields = csvh.get_csv_data(nr_wf,'Num')
        for art_num in working_file:
            current_pass = max(current_pass, int(working_file[art_num]['ignore']))

    print(nr_wf)

## Check Title Wording
Using the words in previous Catalysis Hub papers, check if the title is likely to be a Catalysis Hub title

In [5]:
if current_pass in [0,1]:
    # pass 2
    # check titles for likelihood of being catalysis articles using keywords
    # from titles in current DB
    print("Get word list from DB")
    x = dbh.DataBaseAdapter('ukch_articles.sqlite')
    db_titles = x.get_value_list('articles','title')
    # common words that carry no subject information
    ignore_words = {'the','of','to','and','a','in','is','it', 'their', 'so', 'as'}
    title_words = set()
    words_sum = 0.0
    for title in db_titles:
        # distinct lower-cased words of this title, without the common words
        one_title = set(title.lower().split()) - ignore_words
        title_words |= one_title
        words_sum += len(one_title)

    # guard the division: an empty title list would raise ZeroDivisionError
    average = words_sum / len(db_titles) if len(db_titles) else 0
    print("Average words per title:", average)
    for art_num in working_file:
        if 0 == int(working_file[art_num]['ignore']):
            art_title = working_file[art_num]['Title']
            art_words = set(art_title.lower().split())
            # number of distinct title words also seen in the DB titles
            occurrences = len(title_words & art_words)
            working_file[art_num]['keywords'] = occurrences
            print("occurrences:", occurrences, "in title:", art_title)
            # four or fewer shared words: drop the article in this pass
            if occurrences <= 4:
                working_file[art_num]['ignore'] = 2
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 2

In [6]:

if current_pass == 2:
    # Pass 3: titles containing "cataly" are kept; of the rest, those with
    # fewer than 9 keyword matches are dropped (ignore=3) and the others are
    # shown to the user, who chooses to add or ignore each one.
    i = 0  # number of titles the user chose to add
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            new_title = working_file[art_num]['Title']
            # the value is a string when the working file was re-read from
            # CSV, so convert before the numeric comparison
            keywords = int(working_file[art_num]['keywords'])
            if "cataly" in new_title.lower():
                continue
            if keywords < 9:
                # ignore it because it does not contain cataly in title
                working_file[art_num]['ignore'] = 3 # visual inspection
                continue
            # ask again until one of the two valid options is entered
            usr_select = None
            while usr_select not in ('a', 'b'):
                print('Title:', working_file[art_num]['Title'])
                print('***************************************************************')
                print("Options:\n\ta) add\n\tb) ignore")
                print("selection:")
                usr_select = input()
            if usr_select == 'b':
                working_file[art_num]['ignore'] = 3 # visual inspection
            else:
                i += 1
    print("To Process:", i, "Pass:", current_pass)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 3

## Get DOIs for Articles
The remaining titles need to be further analysed. Recovering their DOIs can help obtain abstracts and acknowledgement statements. 

In [7]:
if current_pass == 3:
    # Pass 4: look up a DOI for every title still under consideration;
    # titles for which no DOI is returned are dropped (ignore='4').
    i = 0  # number of titles without a DOI
    # tqdm_notebook is the progress bar this notebook imports; the bare
    # name tqdm is never imported and raised NameError here
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            new_title = working_file[art_num]['Title']
            new_doi = cr_api.getDOIForTitle(new_title)
            # treat any empty result ("" or None) as a missing DOI
            if not new_doi:
                working_file[art_num]['ignore'] = '4'
                i += 1
            else:
                working_file[art_num]['DOIcr'] = new_doi
    print("without DOI:", i)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 4

## Verify DOIs in DB
Verify that articles do not exist in the DB

In [8]:
ukchapp_db = "../railsapp/ukchapp/db/development.sqlite3"
if current_pass == 4:
    # Pass 5: drop (ignore='5') every remaining article whose DOI already
    # has a title in the application DB.
    i = 0  # number of articles found in the DB
    db_conn = dbh.DataBaseAdapter(ukchapp_db)
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            new_title = working_file[art_num]['Title']
            new_doi = working_file[art_num]['DOIcr']
            db_title = db_conn.get_title(new_doi)
            if db_title is None:
                print("Not in DB:", new_doi, new_title)
            else:
                print("Already in DB:", new_doi, "for:", new_title, db_title)
                working_file[art_num]['ignore'] = '5'
                i += 1
    # the original summary printed "without DOI:" with a counter that was
    # never incremented; report what this pass actually counts
    print("already in DB:", i)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 5


## Get Acknowledgement statements

In [None]:
if current_pass >= 5:
    # For every remaining article, search for the referent terms through
    # the DOI and through the article URL, and store what each search
    # returns in the working file.
    # The term list does not change between articles, so build it once.
    referents = ["uk catalysis hub", "uk catalysis", "catalysis hub",
         'EP/R026645/1', 'resources', 'EP/K014668/1', 'EPSRC', 'EP/K014714/1',
         'Hub','provided', 'grant', 'biocatalysis', 'EP/R026815/1', 'EP/R026939/1',
         'support', 'membership', 'EP/M013219/1', 'UK', 'kindly', 'Catalysis',
         'funded', 'EP/R027129/1', 'Consortium', 'thanked', 'EP/K014854/1', 'EP/K014706/2']
    for art_num in tqdm_notebook(working_file):
        if working_file[art_num]['ignore']=='0':
            article_title = working_file[art_num]['Title']
            article_doi = working_file[art_num]['DOIcr']
            article_url = working_file[art_num]['ArticleURL']
            print("Analysing:", article_title, article_doi, article_url)
            # first search via the DOI obtained from crossref
            found = urlh.findFromDOI(article_title, article_doi, referents)
            working_file[art_num]['checked_doi'] = 1
            working_file[art_num]['ack_doi'] = found
            # then search via the URL that came with the search results
            found = urlh.findFromURI(article_title, article_url, referents)
            working_file[art_num]['checked_url'] = 1
            working_file[art_num]['ack_url'] = found
            # only the URL-based result is echoed
            print("Ack:", found)
    csvh.write_csv_data(working_file, nr_wf)

HBox(children=(IntProgress(value=0, max=998), HTML(value='')))

Analysing: Preface to Special Issue on 5th UK Catalysis Conference (UKCC 2019) 10.1007/s11244-020-01330-y https://link.springer.com/article/10.1007/s11244-020-01330-y
Ack: 
Analysing: Improving Photocatalytic Energy Conversion via NAD (P) H 10.1016/j.joule.2020.07.024 https://www.sciencedirect.com/science/article/pii/S2542435120303421
Ack: <span class="anchor-text">Contact and support</span>
Analysing: Synchrotron Radiation and Neutrons for Catalysis, Materials Research and Development 10.1080/08940886.2018.1460180 https://www.tandfonline.com/doi/full/10.1080/08940886.2018.1460180
Ack: 
Analysing: Neutron spectroscopy studies of methanol to hydrocarbons catalysis over ZSM-5 10.1016/j.cattod.2020.05.030 https://www.sciencedirect.com/science/article/pii/S0920586120303096
Ack: <span class="anchor-text">Contact and support</span>
Analysing: Methanol photo-reforming with water on pure titania for hydrogen production 10.1098/rsta.2020.0058 https://royalsocietypublishing.org/doi/abs/10.1098/r

Ack: <span class="article_header-suppInfo-text">Supporting Info (1)</span>
Analysing: Al-doped Fe 2 O 3 as a support for molybdenum oxide methanol oxidation catalysts 10.1039/d0cp01192d https://pubs.rsc.org/en/content/articlehtml/2020/cp/d0cp01192d
Ack: <span>We are grateful to Diamond Light Source for a part studentship to PH and for support from the UK Catalysis Hub, funded through EPSRC grants EP/I038748/1 and EP/K014714/1, and to the Research Complex at Harwell (RCaH) for the provision of facilities. We thank the Swiss Light Source (SLS) for provision of beamtime under award number 20181979.</span>
Analysing: Towards a deeper understanding of catalytic activity in supported precious metal catalysts, EPSRC 10.21820/23987073.2018.5.16 https://www.ingentaconnect.com/content/sil/impact/2018/00002018/00000005/art00005
Ack: 
Analysing: Rapid synthesis of [Au25 (Cys) 18] nanoclusters via carbon monoxide in microfluidic liquid-liquid segmented flow system and their antimicrobial performanc

Ack: <span class="u-visually-hidden" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization"><meta content="The Grantham Institute, Imperial College London" itemprop="name"/><meta content="0000 0001 2113 8111, grid.7445.2, The Grantham Institute, Imperial College London, London, UK" itemprop="address"/></span>
Analysing: Isolated Pd sites as selective catalysts for electrochemical and direct hydrogen peroxide synthesis 10.1021/acscatal.0c01305 https://pubs.acs.org/doi/abs/10.1021/acscatal.0c01305
Ack: <span class="article_header-suppInfo-text">Supporting Info (1)</span>
Analysing: Asymmetric synthesis of primary amines catalyzed by thermotolerant fungal reductive aminases 10.1039/d0sc02253e https://pubs.rsc.org/en/content/articlehtml/2020/sc/d0sc02253e
Ack: <span>J. M.-S. and M. S. were funded by grant BB/M006832/1 from the UK Biotechnology and Biological Sciences Research Council. N. J. T. is grateful to the ERC for the award of an Advanced Grant (Grant 