# Verification of references to UK Catalysis Hub 
A list of articles is obtained from Publish or Perish. This list contains titles and some IDs which need to be verified. 

The criteria for adding a publication to the database are: 
a) has an explicit acknowledgement of UK Catalysis Hub
b) mentions one of the UK Catalysis Hub grants
c) has two or more authors with affiliation to UK Catalysis Hub
d) acknowledges support from a scientist affiliated to UK Catalysis Hub.

In [1]:
# Libraries
# library containing functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searches
import lib.handle_urls as urlh

from pathlib import Path


# Input files: the new search results and the titles handled in earlier searches.
new_results_file = 'repo_searches/diamond_pub_search.csv'
previous_results = 'pop_searches/ukch_pop_prev_res.csv'

# Output file: the working file sits next to the input, with the last four
# characters of the input name (".csv" here) replaced by "_wf.csv".
nr_wf = f"{new_results_file[:-4]}_wf.csv"

# working_file / wf_fields stay None until a working file has been read.
working_file = wf_fields = None
# current_pass is the highest 'ignore' code found in an existing working
# file, or 0 when there is no working file yet.
current_pass = 0
if Path(nr_wf).is_file():
    working_file, wf_fields = csvh.get_csv_data(nr_wf,'Num')
    current_pass = max(
        [0] + [int(working_file[art_num]['ignore']) for art_num in working_file]
    )

## Verify if already processed titles are included
Read data and verify if results in file have already been included in previous searches


In [8]:
if current_pass == 0:
    # Pass 1: flag articles whose title was already handled in a previous
    # search. 'ignore' is set to 1 for those and to 0 for the rest (unless the
    # row already carries an 'ignore' value).
    csv_articles, fn_articles = csvh.get_csv_data(new_results_file,'Num')
    prev_articles, fn_prev = csvh.get_csv_data(previous_results,'Num')
    # Previous titles in file order (needed for the approximate scan) and as a
    # set, so the exact-match test is one lookup instead of a scan over every
    # previous title for every new article.
    prev_titles = [prev_articles[prev_num]['Title'] for prev_num in prev_articles]
    known_titles = set(prev_titles)
    # pass 1a exact match
    for art_num in csv_articles:
        article = csv_articles[art_num]
        if article['Title'] in known_titles:
            article['ignore'] = 1
        elif 'ignore' not in article:
            article['ignore'] = 0
    # pass 1b approximate match: only for articles still marked 0; any previous
    # title scoring above 0.80 with txtc.similar counts as already processed.
    for art_num in csv_articles:
        article = csv_articles[art_num]
        if article['ignore'] == 0:
            new_title = article['Title']
            if any(txtc.similar(new_title, prev_title) > 0.80
                   for prev_title in prev_titles):
                article['ignore'] = 1
    csvh.write_csv_data(csv_articles, nr_wf)
    # Reload the working file just written so later cells use the stored
    # values, and advance current_pass to the highest 'ignore' code in it.
    if Path(nr_wf).is_file():
        working_file, wf_fields = csvh.get_csv_data(nr_wf,'Num')
        current_pass = max(
            [current_pass]
            + [int(working_file[art_num]['ignore']) for art_num in working_file]
        )

    print(nr_wf)

## Check Title Wording
Using the words in the titles of previous Catalysis Hub papers, check whether the title is likely to be a Catalysis Hub title.

In [9]:
if current_pass in [0,1]:
    # Pass 2: score each remaining title by how many of its distinct words
    # also appear in the titles already stored in the DB.
    print("Get word list from DB")
    x = dbh.DataBaseAdapter('ukch_articles.sqlite')
    db_titles = x.get_value_list('articles','title')
    # Common words that are left out of the DB vocabulary.
    ignore_words = {'the', 'of', 'to', 'and', 'a', 'in', 'is', 'it', 'their', 'so', 'as'}
    title_words = set()
    words_sum = 0.0
    for title in db_titles:
        # Distinct lower-cased words of this title, minus the common words.
        one_title = set(title.lower().split()) - ignore_words
        title_words |= one_title
        words_sum += len(one_title)

    average = words_sum / len(db_titles)
    print("Average words per title:", average)
    for art_num in working_file:
        article = working_file[art_num]
        if int(article['ignore']) != 0:
            continue
        art_title = article['Title']
        art_words = set(art_title.lower().split())
        occurrences = len(title_words & art_words)
        article['keywords'] = occurrences
        print("occurrences:", occurrences, "in title:", art_title)
        # Four or fewer shared words: mark the article as ignored in pass 2.
        if occurrences <= 4:
            article['ignore'] = 2
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 2

Get word list from DB
Average words per title: 10.132686084142394
occurrences: 3 in title: Synchrotron radiation and catalytic science
occurrences: 5 in title: Rewiring the ‘push-pull’ catalytic machinery of a heme enzyme using an expanded genetic code
occurrences: 9 in title: Correlation of the ratio of metallic to oxide species with activity of PdPt catalysts for methane oxidation
occurrences: 8 in title: Understanding the mechanochemical synthesis of the perovskite LaMnO 3 and its catalytic behaviour
occurrences: 6 in title: Capping agent effect on Pd-supported nanoparticles in the hydrogenation of furfural
occurrences: 9 in title: Tuning of catalytic sites in Pt/TiO2 catalysts for the chemoselective hydrogenation of 3-nitrostyrene
occurrences: 13 in title: In‐situ monitoring of nanoparticle formation during iridium‐catalysed oxygen evolution by real‐time small angle X‐ray scattering
occurrences: 14 in title: Elementary steps in the formation of hydrocarbons from surface methoxy gro

In [10]:
if current_pass == 2:
    # Pass 3: show each remaining title and ask the user whether to keep it.
    i = 0  # number of titles shown to the user
    for art_num in working_file:
        article = working_file[art_num]
        if article['ignore'] != '0':
            continue
        new_title = article['Title']
        # Keep prompting until the answer is exactly 'a' or 'b'.
        while True:
            print('Title:', new_title)
            print('***************************************************************')
            print("Oprions:\n\ta) add\n\tb) ignore")
            print("selection:")
            usr_select = input()
            if usr_select == 'a':
                break
            if usr_select == 'b':
                article['ignore'] = 3 # visual inspection
                break
        i += 1
    print("To Process:", i, "Pass:", current_pass)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 3

Title: Rewiring the ‘push-pull’ catalytic machinery of a heme enzyme using an expanded genetic code
***************************************************************
Oprions:
	a) add
	b) ignore
selection:
a
Title: Correlation of the ratio of metallic to oxide species with activity of PdPt catalysts for methane oxidation
***************************************************************
Oprions:
	a) add
	b) ignore
selection:
a
Title: Understanding the mechanochemical synthesis of the perovskite LaMnO 3 and its catalytic behaviour
***************************************************************
Oprions:
	a) add
	b) ignore
selection:
a
Title: Capping agent effect on Pd-supported nanoparticles in the hydrogenation of furfural
***************************************************************
Oprions:
	a) add
	b) ignore
selection:
a
Title: Tuning of catalytic sites in Pt/TiO2 catalysts for the chemoselective hydrogenation of 3-nitrostyrene
************************************************************

KeyboardInterrupt: 

## Get DOIs for Articles
The remaining titles need to be further analysed. Recovering their DOIs can help obtain abstracts and acknowledgement statements. 

In [5]:
if current_pass == 3:
    # Pass 4: ask cr_api for a DOI for every article still marked '0'.
    i = 0  # articles for which an empty DOI came back
    for art_num in working_file:
        article = working_file[art_num]
        if article['ignore'] != '0':
            continue
        new_title = article['Title']
        new_doi = cr_api.getDOIForTitle(new_title)
        if new_doi == "":
            # No DOI: mark the article with code '4' and count it.
            print("Missing DOI:", new_title)
            article['ignore'] = '4'
            i += 1
            continue
        # DOI found: store it; 'ignore' is already '0' and stays that way.
        print("DOI found:", new_doi, "for:", new_title)
        article['DOIcr'] = new_doi
    print("without DOI:", i)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 4

## Verify DOIs in DB
Verify that articles do not exist in the DB

In [6]:
if current_pass == 4:
    # Pass 5: mark with code '5' every remaining article whose DOI (the
    # 'DOIcr' field) already has a title stored in the articles DB.
    i = 0  # number of articles found in the DB during this pass
    db_conn = dbh.DataBaseAdapter('ukch_articles.sqlite')
    for art_num in working_file:
        if working_file[art_num]['ignore']=='0':
            new_title = working_file[art_num]['Title']
            new_doi = working_file[art_num]['DOIcr']
            # NOTE(review): get_title presumably returns None when the DOI is
            # not in the DB - confirm against lib.handle_db.
            db_title = db_conn.get_title(new_doi)
            if db_title is None:
                print("Not in DB:", new_doi, new_title)
            else:
                print("Already in DB:", new_doi, "for:", new_title, db_title)
                working_file[art_num]['ignore'] = '5'
                i += 1
    # The summary previously printed "without DOI:" with a counter that was
    # never incremented; report how many articles this pass excluded instead.
    print("Total already in DB:", i)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 5


## Get Acknowledgement statements

In [None]:
if current_pass >= 5:
    i = 0
    # Terms passed to the urlh finders: hub names, grant numbers and words
    # that commonly appear in acknowledgement statements.
    referent_terms = (
        "uk catalysis hub", "uk catalysis", "catalysis hub",
        'EP/R026645/1', 'resources', 'EP/K014668/1', 'EPSRC', 'EP/K014714/1',
        'Hub', 'provided', 'grant', 'biocatalysis', 'EP/R026815/1', 'EP/R026939/1',
        'support', 'membership', 'EP/M013219/1', 'UK', 'kindly', 'Catalysis',
        'funded', 'EP/R027129/1', 'Consortium', 'thanked', 'EP/K014854/1', 'EP/K014706/2',
    )
    for art_num in working_file:
        article = working_file[art_num]
        if article['ignore'] != '0':
            continue
        article_title = article['Title']
        article_doi = article['DOIcr']
        article_url = article['ArticleURL']
        print("Analysing:", article_title, article_doi, article_url)
        # A fresh list per article, shared by both lookups below.
        referents = list(referent_terms)
        # Both lookups always run: first via the DOI, then via the article
        # URL. Each result is stored in its own column.
        found = urlh.findFromDOI(article_title, article_doi, referents)
        article['checked_doi'] = 1
        article['ack_doi'] = found
        found = urlh.findFromURI(article_title, article_url, referents)
        article['checked_url'] = 1
        article['ack_url'] = found
        # Only the URL lookup result is echoed here.
        print("Ack:", found)
    csvh.write_csv_data(working_file, nr_wf)