# Verification of references to UK Catalysis Hub 
A list of articles is obtained from Publish or Perish. This list contains titles and some IDs which need to be verified. 

The criteria for adding a publication to the database are: 
a) has an explicit acknowledgement of UK Catalysis Hub
b) mentions one of the UK Catalysis Hub grants
c) has two or more authors with affiliation to UK Catalysis Hub
d) acknowledges support from a scientist affiliated to UK Catalysis Hub.

In [1]:
# Libraries
# library containing functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searches
import lib.handle_urls as urlh

from pathlib import Path


# input files
new_results_file = 'pop_searches/PoPCites5.csv'
previous_results = 'pop_searches/ukch_pop_prev_res.csv'

# output files
# working file: the new results path with its last four characters (".csv")
# replaced by "_wf.csv"
nr_wf = new_results_file[:-4] + "_wf.csv"

# Both names stay None until a working file has been read, so `working_file`
# is always defined even when no working file exists yet.
working_file = wf_fields = None

# Highest 'ignore' value found in the working file (0 when there is no working
# file). The cells below compare against it to decide which pass to run.
current_pass = 0
if Path(nr_wf).is_file():
    working_file, wf_fields = csvh.get_csv_data(nr_wf, 'Num')
    current_pass = max(
        [0] + [int(working_file[art_num]['ignore']) for art_num in working_file]
    )

## Verify if already processed titles are included
Read data and verify if results in file have already been included in previous searches


In [2]:
if current_pass == 0:
    # Pass 1: flag new results whose title already appears in the previous
    # results. 'ignore' is set to 1 for a match and to 0 otherwise (unless the
    # row already carries an 'ignore' value).
    csv_articles, fn_articles = csvh.get_csv_data(new_results_file, 'Num')
    prev_articles, fn_prev = csvh.get_csv_data(previous_results, 'Num')
    prev_titles = [prev_articles[prev_num]['Title'] for prev_num in prev_articles]
    # a set gives a direct lookup instead of rescanning every previous title
    known_titles = set(prev_titles)

    # pass 1a exact match
    for art_num in csv_articles:
        article = csv_articles[art_num]
        if article['Title'] in known_titles:
            article['ignore'] = 1
        elif 'ignore' not in article:
            article['ignore'] = 0

    # pass 1b approximate match: only rows still marked 0 are compared, and
    # the comparison stops at the first previous title scoring above 0.80
    for art_num in csv_articles:
        article = csv_articles[art_num]
        if article['ignore'] != 0:
            continue
        new_title = article['Title']
        if any(txtc.similar(new_title, prev_title) > 0.80
               for prev_title in prev_titles):
            article['ignore'] = 1

    csvh.write_csv_data(csv_articles, nr_wf)

    # reload what was just written so later cells work from the working file,
    # and move current_pass up to the highest 'ignore' value it contains
    if Path(nr_wf).is_file():
        working_file, wf_fields = csvh.get_csv_data(nr_wf, 'Num')
        current_pass = max(
            [current_pass]
            + [int(working_file[art_num]['ignore']) for art_num in working_file]
        )

## Check Title Wording
Using the words in the titles of previous Catalysis Hub papers, check whether each title is likely to be a Catalysis Hub title.

In [3]:
if current_pass in [0,1]:
    # pass 2
    # check titles for likelihood of being catalysis articles using keywords
    # from titles in current DB
    print("Get word list from DB")
    db_conn = dbh.DataBaseAdapter('ukch_articles.sqlite')
    db_titles = db_conn.get_value_list('articles','title')
    ignore_words = {'the','of','to','and','a','in','is','it', 'their', 'so', 'as'}

    # vocabulary of all lower-cased words used in DB titles, minus ignore_words
    title_words = set()
    words_sum = 0.0
    for title in db_titles:
        one_title = set(title.lower().split()) - ignore_words
        title_words |= one_title
        words_sum += len(one_title)

    # an empty title list would otherwise raise ZeroDivisionError
    title_count = len(db_titles)
    average = words_sum / title_count if title_count else 0.0
    print("Average words per title:", average)

    for art_num in working_file:
        if 0 == int(working_file[art_num]['ignore']):
            art_title = working_file[art_num]['Title']
            art_words = set(art_title.lower().split())
            # number of distinct title words that also occur in DB titles
            occurrences = len(title_words & art_words)
            working_file[art_num]['keywords'] = occurrences
            print("occurrences:", occurrences, "in title:", art_title)
            # titles sharing four or fewer words with the DB vocabulary are
            # dropped from further processing (ignore = 2)
            if occurrences <= 4:
                working_file[art_num]['ignore'] = 2
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 2

In [4]:
if current_pass == 2:
    # pass 3: visual inspection. Every title still marked '0' is shown to the
    # user, who either keeps it (a) or ignores it (b, ignore = 3).
    i = 0  # titles presented for inspection, including those then ignored
    for art_num in working_file:
        if working_file[art_num]['ignore'] != '0':
            continue
        inspected = False
        # keep asking until the answer is one of the two options
        while not inspected:
            print('Title:', working_file[art_num]['Title'])
            print('***************************************************************')
            print("Oprions:\n\ta) add\n\tb) ignore")
            print("selection:")
            # tolerate surrounding whitespace and upper-case answers
            usr_select = input().strip().lower()
            if usr_select == 'b':
                working_file[art_num]['ignore'] = 3 # visual inspection
                inspected = True
            elif usr_select == 'a':
                inspected = True
        i += 1
    print("To Process:", i, "Pass:", current_pass)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 3

## Get DOIs for Articles
The remaining titles need to be further analysed. Recovering their DOIs can help obtain abstracts and acknowledgement statements. 

In [5]:
if current_pass == 3:
    # pass 4: look up a DOI by title for every article still marked '0'.
    # Articles without a DOI are marked ignore = '4'; the others get the DOI
    # stored under 'DOIcr'.
    i = 0  # titles for which no DOI was found
    for art_num in working_file:
        if working_file[art_num]['ignore'] != '0':
            continue
        new_title = working_file[art_num]['Title']
        new_doi = cr_api.getDOIForTitle(new_title)
        # treat None the same as an empty string so a failed lookup is never
        # recorded as a DOI
        if not new_doi:
            print("Missing DOI:", new_title)
            working_file[art_num]['ignore'] = '4'
            i += 1
        else:
            print("DOI found:", new_doi, "for:", new_title)
            working_file[art_num]['DOIcr'] = new_doi
    print("without DOI:", i)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 4

## Verify DOIs in DB
Verify that articles do not exist in the DB

In [6]:
if current_pass == 4:
    # pass 5: drop articles whose DOI is already stored in the DB
    # (ignore = '5'); the others stay marked '0'.
    i = 0  # articles found in the DB
    db_conn = dbh.DataBaseAdapter('ukch_articles.sqlite')
    for art_num in working_file:
        if working_file[art_num]['ignore'] != '0':
            continue
        new_title = working_file[art_num]['Title']
        new_doi = working_file[art_num]['DOIcr']
        db_title = db_conn.get_title(new_doi)
        if db_title is None:
            print("Not in DB:", new_doi, new_title)
        else:
            print("Already in DB:", new_doi, "for:", new_title, db_title)
            working_file[art_num]['ignore'] = '5'
            i += 1
    # the summary used to print an untouched counter under a "without DOI:"
    # label copied from the DOI lookup cell
    print("Total already in DB:", i)
    csvh.write_csv_data(working_file, nr_wf)
    current_pass = 5


## Get Acknowledgement statements

In [8]:
if current_pass >= 5:
    # Search each remaining article (ignore == '0') for the terms below, once
    # through its DOI and once through the article URL from the csv. Both
    # lookups always run and their results are stored separately.
    # The list is the same for every article, so it is built once.
    # NOTE(review): generic terms such as 'support', 'provided' and 'UK'
    # probably explain the site boilerplate in the results (e.g. "Contact
    # and support", "Access provided by") - consider narrowing the list.
    referents = ["uk catalysis hub", "uk catalysis", "catalysis hub",
         'EP/R026645/1', 'resources', 'EP/K014668/1', 'EPSRC', 'EP/K014714/1',
         'Hub','provided', 'grant', 'biocatalysis', 'EP/R026815/1', 'EP/R026939/1',
         'support', 'membership', 'EP/M013219/1', 'UK', 'kindly', 'Catalysis',
         'funded', 'EP/R027129/1', 'Consortium', 'thanked', 'EP/K014854/1', 'EP/K014706/2']
    for art_num in working_file:
        if working_file[art_num]['ignore'] != '0':
            continue
        article_title = working_file[art_num]['Title']
        article_doi = working_file[art_num]['DOIcr']
        article_url = working_file[art_num]['ArticleURL']
        print("Analysing:", article_title, article_doi, article_url)

        # lookup through the DOI
        found = urlh.findFromDOI(article_title, article_doi, referents)
        working_file[art_num]['checked_doi'] = 1
        working_file[art_num]['ack_doi'] = found

        # lookup through the article URL
        found = urlh.findFromURI(article_title, article_url, referents)
        working_file[art_num]['checked_url'] = 1
        working_file[art_num]['ack_url'] = found

        # only the URL lookup result is printed; the DOI result is in 'ack_doi'
        print("Ack:", found)
    csvh.write_csv_data(working_file, nr_wf)

Analysing: Rapid synthesis of [Au25 (Cys) 18] nanoclusters via carbon monoxide in microfluidic liquid-liquid segmented flow system and their antimicrobial performance 10.1016/j.cej.2019.123176 https://www.sciencedirect.com/science/article/pii/S1385894719325884
Ack: <span class="anchor-text">Contact and support</span>
Analysing: Enantioselective Synthesis of Chiral Vicinal Amino Alcohols Using Amine Dehydrogenases 10.1021/acscatal.9b03889 https://pubs.acs.org/doi/abs/10.1021/acscatal.9b03889
Ack: <span class="institution-info-wrapper"><span class="institution__intro">Access provided by</span><span class="institution__name">The Chadwick &amp; RAL Libraries</span></span>
<span class="institution__intro">Access provided by</span>
<span class="article_header-suppInfo-text">Supporting Info (1)</span>
Analysing: Understanding the Role of Internal Diffusion Barriers in Pt/Beta Zeolite Catalyzed Isomerization of n‐Heptane 10.1002/anie.201913660 https://onlinelibrary.wiley.com/doi/abs/10.1002/an

Ack: <span class="institution-info-wrapper"><span class="institution__intro">Access provided by</span><span class="institution__name">The Chadwick &amp; RAL Libraries</span></span>
<span class="institution__intro">Access provided by</span>
<span class="article_header-suppInfo-text">Supporting Info (1)</span>
Analysing: The gem-Dialkyl Effect in Diphosphine Ligands: Synthesis, Coordination Behavior and Application in Pd-catalyzed Hydroformylation 10.1021/acscatal.9b03007 https://pubs.acs.org/doi/abs/10.1021/acscatal.9b03007
Ack: <span class="institution-info-wrapper"><span class="institution__intro">Access provided by</span><span class="institution__name">The Chadwick &amp; RAL Libraries</span></span>
<span class="institution__intro">Access provided by</span>
<span class="article_header-suppInfo-text">Supporting Info (2)</span>
Analysing: Rapid, High‐Yield Fructose Dehydration to 5‐Hydroxymethylfurfural in Mixtures of Water and the Noncoordinating Ionic Liquid [bmim][OTf] 10.1002/cssc.2

Ack: <span class="institution-info-wrapper"><span class="institution__intro">Access provided by</span><span class="institution__name">The Chadwick &amp; RAL Libraries</span></span>
<span class="institution__intro">Access provided by</span>
<span class="article_header-suppInfo-text">Supporting Info (1)</span>
Analysing: Analysis for Science Librarians of the 2018 Nobel Prize in Chemistry: Directed Evolution of Enzymes and Phage Display of Peptides and Antibodies 10.1080/0194262x.2019.1579159 https://www.tandfonline.com/doi/abs/10.1080/0194262X.2019.1579159
Ack: 
Analysing: Supported Transition Metal Phosphides: Activity Survey for HER, ORR, OER, and Corrosion Resistance in Acid and Alkaline Electrolytes 10.1021/acscatal.9b03359 https://pubs.acs.org/doi/abs/10.1021/acscatal.9b03359
Ack: <span class="institution-info-wrapper"><span class="institution__intro">Access provided by</span><span class="institution__name">The Chadwick &amp; RAL Libraries</span></span>
<span class="institution__in

Ack: <span class="anchor-text">Contact and support</span>
Analysing: Identifying pseudoenzymes using functional annotation. How loss of function correlates with mutations in the catalytic site 10.1111/febs.15142 https://febs.onlinelibrary.wiley.com/doi/abs/10.1111/febs.15142
Ack: <span aria-labelledby="febs15142-note-0001_25-controller" class="footNotePopup" id="febs15142-note-0001_25"><button class="footNoteClose"><i aria-hidden="true" class="icon-tools_close"></i></button>
In the Swiss‐Prot combined dataset, proteins are considered enzymes if they are associated with either a catalytic KW or a catalytic experimental GO annotation. See main text for details. In the Swiss‐Prot experimental dataset, the same rule is used but applied only to the sequences with functional annotation supported by experimental evidence.
<span class="chevronDown"></span></span>
<span aria-labelledby="febs15142-note-0001_26-controller" class="footNotePopup" id="febs15142-note-0001_26"><button class="footNoteC

Ack: <span class="institution-info-wrapper"><span class="institution__intro">Access provided by</span><span class="institution__name">The Chadwick &amp; RAL Libraries</span></span>
<span class="institution__intro">Access provided by</span>
<span class="article_header-suppInfo-text">Supporting Info (1)</span>
Analysing: Near-Complete Structure and Model of Tel1ATM from Chaetomium thermophilum Reveals a Robust Autoinhibited ATP State 10.1016/j.str.2019.10.013 https://www.sciencedirect.com/science/article/pii/S0969212619303545
Ack: <span class="anchor-text">Contact and support</span>
Analysing: DFT-assisted Spectroscopic Studies on the Coordination of Small Ligands to Palladium: From Isolated Ions to Nanoparticles 10.1021/acs.jpcc.9b09791 https://pubs.acs.org/doi/abs/10.1021/acs.jpcc.9b09791
Ack: <span class="institution-info-wrapper"><span class="institution__intro">Access provided by</span><span class="institution__name">The Chadwick &amp; RAL Libraries</span></span>
<span class="instit

Ack: <span class="anchor-text">Contact and support</span>
Analysing: Synthesis, crystal structure and bovine serum albumin–binding studies of a new Cd (II) complex incorporating 2, 2′-(propane-1, 3-diyl) bis (1H-imidazole-4, 5 … 10.1177/1747519819895240 https://journals.sagepub.com/doi/abs/10.1177/1747519819895240
Ack: 
Analysing: Hydrophilic microporous membranes for selective ion separation and flow-battery energy storage 10.1038/s41563-019-0536-8 https://idp.nature.com/authorize/casa?redirect_uri=https://www.nature.com/articles/s41563-019-0536-8&casa_token=tx2b8oiDfK0AAAAA:zORBLeozH26mbMl4HnPzUE0YDF2K3dtBOi94x8aNb6Z-R-pEettkC32Gk3T1HylngGNlClA_zZl1kk7V
Ack: 
Analysing: Structural mechanism of DNA-end synapsis in the non-homologous end joining pathway for repairing double-strand breaks: bridge over troubled ends 10.1042/bst20180518 https://portlandpress.com/biochemsoctrans/article-abstract/47/6/1609/221466
Ack: 
Analysing: Understanding supported noble metal catalysts using first-pri

Ack: <span>Supporting Information</span>
Analysing: Ag nanoparticle-decorated, ordered mesoporous silica as an efficient electrocatalyst for alkaline water oxidation reaction 10.1039/c8dt04159h https://pubs.rsc.org/en/content/articlehtml/2019/dt/c8dt04159h
Ack: 
Analysing: Synthesis of Selenoaziridines: A Study on Stereochemical Outcomes of the Reaction of Aziridine Radicals and Anions Generated from Iodoaziridines 10.1021/acsomega.8b03019 https://pubs.acs.org/doi/abs/10.1021/acsomega.8b03019
Ack: <span class="institution-info-wrapper"><span class="institution__intro">Access provided by</span><span class="institution__name">The Chadwick &amp; RAL Libraries</span></span>
<span class="institution__intro">Access provided by</span>
<span class="article_header-suppInfo-text">Supporting Info (1)</span>
Analysing: Catalytic Friedel‐Crafts Reactions on Saturated Heterocycles and Small Rings for sp3‐sp2 Coupling of Medicinally Relevant Fragments 10.1002/ejoc.201900498 https://onlinelibrary.wile

Ack: <span class="institution-info-wrapper"><span class="institution__intro">Access provided by</span><span class="institution__name">The Chadwick &amp; RAL Libraries</span></span>
<span class="institution__intro">Access provided by</span>
<span class="article_header-suppInfo-text">Supporting Info (3)</span>
Analysing: Conversion of polyethylene terephthalate to high-quality terephthalic acid by hydrothermal hydrolysis: the study of process parameters 10.1177/0040517519893714 https://journals.sagepub.com/doi/abs/10.1177/0040517519893714
('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Ack: 
Analysing: Hydrothermal liquefaction of microalgae using Fe3O4 nanostructures as efficient catalyst for the production of bio-oil: Optimization of reaction parameters by response … 10.1016/j.biombioe.2019.105417 https://www.sciencedirect.com/science/article/pii/S0961953419303666
Ack: <span class="anchor-text">Contact and support</span>
Analysing: Single Step Plasma Proc

Ack: 
Analysing: Graphene and molybdenum disulphide hybrids for energy applications: an update 10.1016/j.mtadv.2019.100053 https://www.sciencedirect.com/science/article/pii/S2590049819301274
Ack: <span class="anchor-text">Contact and support</span>
Analysing: Effect of squalene rich fraction from shark liver on mechanical, barrier and thermal properties of fish (Probarbus Jullieni) skin gelatin film 10.1016/j.foodhyd.2019.05.019 https://www.sciencedirect.com/science/article/pii/S0268005X19306563
Ack: <span class="anchor-text">Contact and support</span>
Analysing: Investigating the co-firing characteristics of bamboo wastes and coal through cone calorimetry and thermogravimetric analysis coupled with Fourier transform infrared … 10.1177/0734242x19893018 https://journals.sagepub.com/doi/abs/10.1177/0734242X19893018
('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Ack: <span class="contribDegrees"><a aria-label="Open contributor information pop-up for Wanhe H

Ack: <span>Supporting Information</span>
Analysing: Recent advancements in sorption technology for solar thermal energy storage applications 10.1016/j.solener.2018.06.102 https://www.sciencedirect.com/science/article/pii/S0038092X18306546
Ack: <span class="anchor-text">Contact and support</span>
Analysing: Nylon 612/TiO2 composites by anionic copolymerization-molding process: Comparative evaluation of thermal and mechanical performance 10.1177/0021998319862345 https://journals.sagepub.com/doi/abs/10.1177/0021998319862345
('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Ack: 
