# Search in crossref archive

Verify which ids are stored in Crossref when a funder is included.
This could help build a query that looks for a specific funder instead of retrieving all records with any funder.

Could do the same for affiliations?
Could launch a search for the whole year?


In [1]:
# Connecting to the db
import lib.handle_db as dbh

# read and write csv files
import lib.handle_csv as csv_rw

# date functions
from datetime import datetime, date, timedelta

# managing files and file paths
from pathlib import Path

# import custom functions (common to various notebooks)
import processing_functions as pr_fns

#CR libraries
from crossref.restful import Works, Etiquette



# search for UKCH Awards in CR record
def award_in_crossref(aw, award_list):
    """Split Crossref work records by whether they cite one of the given awards.

    Args:
        aw: iterable of Crossref work records (dict-like).
        award_list: award numbers to look for.

    Returns:
        A tuple ``(ukch_wks, not_revised)``: the works where at least one
        funder entry lists an award from ``award_list``, and the works that
        have no 'funder' key at all (so they could not be checked).
    """
    ukch_wks = []
    not_revised = []
    for wk in aw:
        # records without funder data cannot be checked; report them separately
        if 'funder' not in wk.keys():
            not_revised.append(wk)
            continue
        # every funder entry is inspected; within one entry the scan stops at
        # the first matching award
        matching_funders = [
            fdr for fdr in wk['funder']
            if 'award' in fdr.keys()
            and any(awd in award_list for awd in fdr['award'])
        ]
        if matching_funders:
            ukch_wks.append(wk)
    return ukch_wks, not_revised


In [2]:
# output location of the funder report
base_dir = './pub_search_crossref'
csv_file_out = 'cr_funder202401_c.csv'
out_file = Path(base_dir) / csv_file_out

# CR etiquette: details identifying this client to Crossref
# NOTE(review): argument order assumed to be application name, version, URL,
# contact e-mail -- confirm against the crossref library documentation
my_etiquette = Etiquette(
    'UK Catalysis Hub - Catalysis Data Infrastructure',
    'Prototype 1',
    'https://ukcatalysishub.co.uk/core/',
    'nieva@rc-harwell.ac.uk',
)

# cr api for retrieving works
works = Works(etiquette=my_etiquette)

# award numbers to look for in the Crossref funder data
award_list = [
    'EP/R026939/1',
    'EP/R026815/1',
    'EP/R026645/1',
    'EP/R027129/1',
    'EP/M013219/1',
    'EP/K014706/2',
    'EP/K014668/1',
    'EP/K014854/1',
    'EP/K014714/1',
]


In [None]:
# open DB
db_name = 'production'

# 1 current app DB
ukchapp_db = "db_files/" + db_name + ".sqlite3"

# get publication data from the ukch app
app_pubs = pr_fns.get_pub_data(ukchapp_db)

# one output row per publication, keyed by the app's publication id
csv_data = {}
for a_pub in app_pubs:
    # only publications with an id above 633 are looked up
    # NOTE(review): presumably lower ids were covered by an earlier run -- confirm
    if a_pub[0] > 633 :
        pub_id = a_pub[0]
        pub_title = a_pub[1]
        pub_doi = a_pub[2]
        pub_url = a_pub[3]
        pub_pdf = a_pub[4]

        db_record = {"pub_id": pub_id, "pub_title": pub_title, "pub_doi":pub_doi, "pub_url":pub_url, "pub_pdf": pub_pdf}
        # defaults written to the CSV when Crossref has no matching data
        funder_data = {'funder_doi':'','funder_name':'','funder_doi_asserted_by':'','funder_awards':[]}
        cr_dates={'indexed':'','created':'','deposited':''}
        # Query Crossref only when there is a usable DOI. The previous test
        # (`!= None or == ''`) let empty strings through to the API.
        if pub_doi is not None and pub_doi.strip() != '':
            cr_works = works.filter(doi=pub_doi.strip())
            print(cr_works.url)

            for crwk in cr_works:
                awd_list = []
                if 'funder' in crwk.keys():
                    for fdr in crwk['funder']:
                        if 'award' in fdr.keys():
                            for awd in fdr['award']:
                                if awd in award_list:
                                    awd_list.append(awd)
                                    # funder details are taken from the first
                                    # matching funder that provides a name
                                    if funder_data['funder_name'] == "":
                                        if 'DOI' in fdr.keys(): funder_data['funder_doi'] = fdr['DOI']
                                        if 'name' in fdr.keys(): funder_data['funder_name'] = fdr['name']
                                        if 'doi-asserted-by' in fdr.keys(): funder_data['funder_doi_asserted_by'] = fdr['doi-asserted-by']
                    funder_data["funder_awards"] = awd_list
                # Crossref record timestamps
                cr_dates['indexed']=crwk['indexed']['date-time']
                cr_dates['created']=crwk['created']['date-time']
                cr_dates['deposited']=crwk['deposited']['date-time']
        # flatten app data, funder data and Crossref dates into one CSV row
        db_record = {**db_record, **funder_data, **cr_dates}
        csv_data[pub_id] =db_record
    

In [3]:
# WRITE TO FILE
# nothing is written when no publication rows were collected
if csv_data:
    csv_rw.write_csv_data(csv_data, out_file)

NameError: name 'csv_data' is not defined

In [4]:
# UKCH grant references. Each grant is listed with its suffix ("/1", "/2")
# and again without it, because both spellings occur in Crossref award data.
ukch_awards = [
    'EP/R026939/1', 'EP/R026815/1', 'EP/R026645/1', 'EP/R027129/1',
    'EP/M013219/1', 'EP/K014706/2', 'EP/K014668/1', 'EP/K014854/1',
    'EP/K014714/1',
]
# append the suffix-less spelling of every grant, in the same order
ukch_awards += [ref.rsplit('/', 1)[0] for ref in ukch_awards]

# MCC grant references, again with and without the suffix.
mcc_awards = [
    'EP/R029431/1', 'EP/P020194/1', 'EP/T022213/1', 'EP/D504872/1',
    'EP/F067496/1', 'EP/X035859/1', 'EP/W032260/1', 'EP/L000202/1',
]
mcc_awards += [ref.rsplit('/', 1)[0] for ref in mcc_awards]
# NOTE(review): the notes below come from a fragmentary pasted copy of the
# acknowledgement guidance -- confirm against the original text.
#   EP/P020194 is acknowledged for THOMAS.
#   EP/D504872, EP/F067496, EP/L000202, EP/R029431 (EP/X035859) appear with
#   'ARCHER2 UK National Supercomputing Service (http://www.archer2.ac.uk)'
#   for ARCHER2.
#   'the UK Materials and Molecular Modelling Hub for computational resources,
#   MMM Hub, which is partially funded by EPSRC (EP/T022213)' is for YOUNG.
#   The same MMM Hub wording with (EP/W032260) is for the GPU nodes of YOUNG.

# Alternatives: mcc_awards, or ukch_awards + mcc_awards
award_list = ukch_awards
# the active search covers only the one missing mcc grant
award_list = ['EP/L000202/1', 'EP/L000202']

In [5]:
def get_authors(wk):
    """Return the authors of a Crossref work as one comma-separated string.

    Each author contributes "family, given" (or just "family" when no given
    name is recorded). Authors without a 'family' key fall back to their
    'name' value, so such entries no longer raise ``KeyError``.

    Args:
        wk: Crossref work record (dict-like); the 'author' key is optional.

    Returns:
        A string such as "Smith, J., Doe, A.", or "" when there are no authors.
    """
    name_parts = []
    for autr in wk.get('author') or []:
        surname = autr.get('family') or autr.get('name')
        # skip missing/empty pieces so no stray separators are produced
        name_parts.extend(part for part in (surname, autr.get('given')) if part)
    return ", ".join(name_parts)

def get_awards(wk):
    """Collect the searched awards cited by a Crossref work.

    Reads the module-level ``award_list``, ``ukch_awards`` and ``mcc_awards``.
    Records without a 'funder' key, and funder entries without an 'award'
    key, are treated as citing no awards instead of raising ``KeyError``.

    Args:
        wk: Crossref work record (dict-like).

    Returns:
        A tuple ``(pub_awards, mcc_pub, ukch_pub)``: the matching award
        numbers joined with ", " (repeats are kept), and two booleans saying
        whether any match is in ``mcc_awards`` / ``ukch_awards``.
    """
    mcc_pub = ukch_pub = False
    matched = []
    for fdr in wk.get('funder', []):
        for awd in fdr.get('award', []):
            if awd in award_list:
                matched.append(awd)
                if awd in ukch_awards:
                    ukch_pub = True
                if awd in mcc_awards:
                    mcc_pub = True
    pub_awards = ", ".join(matched)
    # progress trace: one line per inspected work
    print (pub_awards, mcc_pub, ukch_pub)
    return pub_awards, mcc_pub, ukch_pub

def get_pub_year(wk):
    """Return the earliest publication year recorded for a Crossref work.

    The year is read from the first 'date-parts' entry of 'published-online'
    and 'published-print'. When both give a positive year the smaller one is
    returned; when only one does, that one is returned.

    Args:
        wk: Crossref work record (dict-like).

    Returns:
        The publication year as an int, or 0 when neither date is available.
    """
    candidate_years = []
    for date_field in ('published-online', 'published-print'):
        if date_field in wk.keys() and 'date-parts' in wk[date_field].keys():
            year = int(wk[date_field]['date-parts'][0][0])
            # non-positive values count as "no year"
            if year > 0:
                candidate_years.append(year)
    return min(candidate_years) if candidate_years else 0

In [6]:
out_dir = './pub_search_crossref/cr_results_202404/mcc/'
# loop through dates:
# Crossref is queried in consecutive 7-day windows on the record's created
# date, once per award in award_list; each window with results gets its own CSV.

start_date = date(2013, 11, 1)
stop_date = date(2024, 4, 30)

# windows are always 7 days long, so the last one may run past stop_date
while start_date < stop_date:
    end_date = start_date + timedelta(days=6)
    # works found in this window, keyed by DOI so a work citing several of
    # the searched awards is kept only once
    publications = {}
    print('Looking up :', str(start_date),"to", str(end_date))
    for this_aw in award_list:
        pub_w_grant = works.filter(from_created_date=str(start_date)).filter(until_created_date=str(end_date)).filter(award__number=this_aw)
        for wk in pub_w_grant:
            if not wk['DOI'] in publications:
                publications[wk['DOI']] = wk

    # CSV rows for this window, keyed by DOI
    foud_pubs = {}
    for wk_doi, wk in publications.items():
        art_authors = get_authors(wk)
        found_awards, mcc_pub, ukch_pub = get_awards(wk)
        pub_year = get_pub_year(wk)
        # a missing or empty title must not abort the whole harvest;
        # such records get an empty title
        titles = wk.get('title') or ['']
        foud_pubs[wk_doi] = {'authors': art_authors,
                             'year': pub_year,
                             'title': titles[0],
                             'DOI': wk['DOI'],
                             'awards': found_awards,
                             'mcc': int(mcc_pub),
                             'ukch': int(ukch_pub)}
    print ("Found", len(foud_pubs), "publications on the", str(start_date),"to",str(end_date))

    if len(foud_pubs) > 0:
        # output file is named after the last day of the window
        csv_rw.write_csv_data(foud_pubs, out_dir+'cr_check_'+str(end_date)+'a.csv')
        foud_pubs = {}
    start_date = end_date + timedelta(days=1)

Looking up : 2013-11-01 to 2013-11-07
Found 0 publications on the 2013-11-01 to 2013-11-07
Looking up : 2013-11-08 to 2013-11-14
Found 0 publications on the 2013-11-08 to 2013-11-14
Looking up : 2013-11-15 to 2013-11-21
Found 0 publications on the 2013-11-15 to 2013-11-21
Looking up : 2013-11-22 to 2013-11-28
Found 0 publications on the 2013-11-22 to 2013-11-28
Looking up : 2013-11-29 to 2013-12-05
Found 0 publications on the 2013-11-29 to 2013-12-05
Looking up : 2013-12-06 to 2013-12-12
Found 0 publications on the 2013-12-06 to 2013-12-12
Looking up : 2013-12-13 to 2013-12-19
Found 0 publications on the 2013-12-13 to 2013-12-19
Looking up : 2013-12-20 to 2013-12-26
Found 0 publications on the 2013-12-20 to 2013-12-26
Looking up : 2013-12-27 to 2014-01-02
Found 0 publications on the 2013-12-27 to 2014-01-02
Looking up : 2014-01-03 to 2014-01-09
Found 0 publications on the 2014-01-03 to 2014-01-09
Looking up : 2014-01-10 to 2014-01-16
Found 0 publications on the 2014-01-10 to 2014-01-16

EP/L000202 True False
EP/L000202 True False
Found 2 publications on the 2015-05-29 to 2015-06-04
Looking up : 2015-06-05 to 2015-06-11
EP/L000202 True False
EP/L000202 True False
Found 2 publications on the 2015-06-05 to 2015-06-11
Looking up : 2015-06-12 to 2015-06-18
EP/L000202/1 True False
Found 1 publications on the 2015-06-12 to 2015-06-18
Looking up : 2015-06-19 to 2015-06-25
EP/L000202 True False
Found 1 publications on the 2015-06-19 to 2015-06-25
Looking up : 2015-06-26 to 2015-07-02
EP/L000202 True False
Found 1 publications on the 2015-06-26 to 2015-07-02
Looking up : 2015-07-03 to 2015-07-09
EP/L000202 True False
Found 1 publications on the 2015-07-03 to 2015-07-09
Looking up : 2015-07-10 to 2015-07-16
EP/L000202 True False
EP/L000202 True False
Found 2 publications on the 2015-07-10 to 2015-07-16
Looking up : 2015-07-17 to 2015-07-23
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 3 publications on the 2015-07-17 to 2015-07-23
Looking up : 2015-07-2

EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 4 publications on the 2016-09-09 to 2016-09-15
Looking up : 2016-09-16 to 2016-09-22
EP/L000202 True False
Found 1 publications on the 2016-09-16 to 2016-09-22
Looking up : 2016-09-23 to 2016-09-29
EP/L000202/1 True False
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 4 publications on the 2016-09-23 to 2016-09-29
Looking up : 2016-09-30 to 2016-10-06
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 3 publications on the 2016-09-30 to 2016-10-06
Looking up : 2016-10-07 to 2016-10-13
EP/L000202 True False
Found 1 publications on the 2016-10-07 to 2016-10-13
Looking up : 2016-10-14 to 2016-10-20
EP/L000202 True False
EP/L000202 True False
Found 2 publications on the 2016-10-14 to 2016-10-20
Looking up : 2016-10-21 to 2016-10-27
EP/L000202 True False
EP/L000202 True False
Found 2 publications on the 2016-10-21 to 2016-10-27
Looking up : 2016-10-28 t

Found 0 publications on the 2017-11-17 to 2017-11-23
Looking up : 2017-11-24 to 2017-11-30
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 4 publications on the 2017-11-24 to 2017-11-30
Looking up : 2017-12-01 to 2017-12-07
EP/L000202 True False
EP/L000202 True False
Found 2 publications on the 2017-12-01 to 2017-12-07
Looking up : 2017-12-08 to 2017-12-14
EP/L000202/1 True False
EP/L000202 True False
EP/L000202 True False
Found 3 publications on the 2017-12-08 to 2017-12-14
Looking up : 2017-12-15 to 2017-12-21
Found 0 publications on the 2017-12-15 to 2017-12-21
Looking up : 2017-12-22 to 2017-12-28
Found 0 publications on the 2017-12-22 to 2017-12-28
Looking up : 2017-12-29 to 2018-01-04
Found 0 publications on the 2017-12-29 to 2018-01-04
Looking up : 2018-01-05 to 2018-01-11
EP/L000202 True False
Found 1 publications on the 2018-01-05 to 2018-01-11
Looking up : 2018-01-12 to 2018-01-18
EP/L000202 True False
EP/L000202 True False
EP/L00

EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 4 publications on the 2019-01-25 to 2019-01-31
Looking up : 2019-02-01 to 2019-02-07
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 4 publications on the 2019-02-01 to 2019-02-07
Looking up : 2019-02-08 to 2019-02-14
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 4 publications on the 2019-02-08 to 2019-02-14
Looking up : 2019-02-15 to 2019-02-21
EP/L000202 True False
Found 1 publications on the 2019-02-15 to 2019-02-21
Looking up : 2019-02-22 to 2019-02-28
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 4 publications on the 2019-02-22 to 2019-02-28
Looking up : 2019-03-01 to 2019-03-07
EP/L000202/1 True False
EP/L000202 True False
Found 2 publications on the 2019-03-01 to 2019-03-07
Looking up : 2019-03-08 to 2019-03-14
EP/L000202 True False
EP/L000202 T

EP/L000202 True False
EP/L000202 True False
Found 2 publications on the 2020-03-20 to 2020-03-26
Looking up : 2020-03-27 to 2020-04-02
EP/L000202 True False
Found 1 publications on the 2020-03-27 to 2020-04-02
Looking up : 2020-04-03 to 2020-04-09
Found 0 publications on the 2020-04-03 to 2020-04-09
Looking up : 2020-04-10 to 2020-04-16
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 3 publications on the 2020-04-10 to 2020-04-16
Looking up : 2020-04-17 to 2020-04-23
EP/L000202 True False
Found 1 publications on the 2020-04-17 to 2020-04-23
Looking up : 2020-04-24 to 2020-04-30
Found 0 publications on the 2020-04-24 to 2020-04-30
Looking up : 2020-05-01 to 2020-05-07
EP/L000202/1 True False
EP/L000202/1, EP/L000202/1 True False
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 6 publications on the 2020-05-01 to 2020-05-07
Looking up : 2020-05-08 to 2020-05-14
Found 0 publications on the 2020-05-08 to 2020-05-14
Lookin

EP/L000202/1 True False
EP/L000202/1 True False
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 5 publications on the 2021-05-28 to 2021-06-03
Looking up : 2021-06-04 to 2021-06-10
EP/L000202 True False
EP/L000202 True False
Found 2 publications on the 2021-06-04 to 2021-06-10
Looking up : 2021-06-11 to 2021-06-17
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 3 publications on the 2021-06-11 to 2021-06-17
Looking up : 2021-06-18 to 2021-06-24
EP/L000202/1 True False
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 4 publications on the 2021-06-18 to 2021-06-24
Looking up : 2021-06-25 to 2021-07-01
EP/L000202 True False
Found 1 publications on the 2021-06-25 to 2021-07-01
Looking up : 2021-07-02 to 2021-07-08
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 3 publications on the 2021-07-02 to 2021-07-08
Looking up : 2021-07-09 to 2021-07-15
EP/L000202 True False
EP/L000202 True False
EP/L0002

EP/L000202/1 True False
Found 1 publications on the 2022-08-05 to 2022-08-11
Looking up : 2022-08-12 to 2022-08-18
EP/L000202 True False
Found 1 publications on the 2022-08-12 to 2022-08-18
Looking up : 2022-08-19 to 2022-08-25
Found 0 publications on the 2022-08-19 to 2022-08-25
Looking up : 2022-08-26 to 2022-09-01
EP/L000202 True False
Found 1 publications on the 2022-08-26 to 2022-09-01
Looking up : 2022-09-02 to 2022-09-08
Found 0 publications on the 2022-09-02 to 2022-09-08
Looking up : 2022-09-09 to 2022-09-15
EP/L000202 True False
EP/L000202 True False
Found 2 publications on the 2022-09-09 to 2022-09-15
Looking up : 2022-09-16 to 2022-09-22
EP/L000202 True False
Found 1 publications on the 2022-09-16 to 2022-09-22
Looking up : 2022-09-23 to 2022-09-29
Found 0 publications on the 2022-09-23 to 2022-09-29
Looking up : 2022-09-30 to 2022-10-06
EP/L000202 True False
EP/L000202 True False
EP/L000202 True False
Found 3 publications on the 2022-09-30 to 2022-10-06
Looking up : 2022-1

Found 0 publications on the 2024-01-05 to 2024-01-11
Looking up : 2024-01-12 to 2024-01-18
Found 0 publications on the 2024-01-12 to 2024-01-18
Looking up : 2024-01-19 to 2024-01-25
Found 0 publications on the 2024-01-19 to 2024-01-25
Looking up : 2024-01-26 to 2024-02-01
EP/L000202 True False
EP/L000202 True False
Found 2 publications on the 2024-01-26 to 2024-02-01
Looking up : 2024-02-02 to 2024-02-08
EP/L000202 True False
Found 1 publications on the 2024-02-02 to 2024-02-08
Looking up : 2024-02-09 to 2024-02-15
EP/L000202 True False
Found 1 publications on the 2024-02-09 to 2024-02-15
Looking up : 2024-02-16 to 2024-02-22
Found 0 publications on the 2024-02-16 to 2024-02-22
Looking up : 2024-02-23 to 2024-02-29
Found 0 publications on the 2024-02-23 to 2024-02-29
Looking up : 2024-03-01 to 2024-03-07
Found 0 publications on the 2024-03-01 to 2024-03-07
Looking up : 2024-03-08 to 2024-03-14
EP/L000202 True False
Found 1 publications on the 2024-03-08 to 2024-03-14
Looking up : 2024-