# CR Vs DB Verify
Check whether the results from the CR (Crossref) search are already in the DB:
- get the list of CSV search results files
- join the searches as a set (unique references)
- open the DB
- search for occurrences in the DB
- create a list of new candidates

In [1]:
# Connecting to the db
import lib.handle_db as dbh

# read and write csv files
import lib.handle_csv as csv_rw

# date functions
from datetime import datetime, date, timedelta

# managing files and file paths
from pathlib import Path

# import custom functions (common to various notebooks)
import processing_functions as pr_fns

In [4]:
# 01 search if the data was in previous WoS results
#
# Reads the new WoS search results and the cumulative table of previously
# reviewed WoS results, sets 'prv_checked' to 1 on every new result whose DOI
# or WoS ID already appears in the previous table (0 otherwise), and writes
# the annotated results to a new CSV file.

search_dir = './WebOfScience/wos_202311/'
prev_s_dir = './WebOfScience/'

prv_file_name = "wos_ukch_previous.csv"

in_file_name = "wos_202311_ukch.csv"
out_file_name = "wos_202311_ukch_01_wosp.csv"

in_file = Path(search_dir, in_file_name)
out_file = Path(search_dir, out_file_name)
prv_file = Path(prev_s_dir, prv_file_name)

prv_data, _ = csv_rw.get_csv_data(prv_file)

in_data, _ = csv_rw.get_csv_data(in_file)

# Index the identifiers of the previous results once, so each new result is
# checked with a set lookup instead of a scan over every previous row.
# Blank identifiers are deliberately left out: two records that both lack a
# DOI (or a WoS ID) must not be treated as the same publication.
prv_dois = {prv_data[prv_id]['doi'] for prv_id in prv_data
            if prv_data[prv_id]['doi']}
prv_wos_ids = {prv_data[prv_id]['wos_ID'] for prv_id in prv_data
               if prv_data[prv_id]['wos_ID']}

# compare search to previous results
for in_id in in_data:
    seen_before = (in_data[in_id]['doi'] in prv_dois
                   or in_data[in_id]['wos_ID'] in prv_wos_ids)
    in_data[in_id]['prv_checked'] = 1 if seen_before else 0

# WRITE TO FILE
if in_data:
    csv_rw.write_csv_data(in_data, out_file)

In [6]:
# 02 search if results are in the DB already
#
# Reads the output of step 01, sets 'in_db' to 1 on every result whose DOI
# (or its /anie. <-> /ange. counterpart) is already stored in the app DB
# (0 otherwise), and writes the annotated results to a new CSV file.

# WoS search results
base_dir = './WebOfScience/wos_202311/'

in_file_name = "wos_202311_ukch_01_wosp.csv"
out_file_name = "wos_202311_ukch_02_db.csv"

# open DB
db_name = 'production'

# 1 current app DB
ukchapp_db = "db_files/" + db_name + ".sqlite3"

out_file = Path(base_dir, out_file_name)
in_file = Path(base_dir, in_file_name)

csv_data, csv_headings = csv_rw.get_csv_data(in_file)

# get publication data from the ukch app
app_pubs = pr_fns.get_pub_data(ukchapp_db)

# DOIs stored in the app DB; the DOI is read from index 2 of each row.
# Blank/missing DOIs are skipped so that an empty DOI on either side can
# never count as a match.
db_dois = {a_pub[2] for a_pub in app_pubs if a_pub[2]}

for collected_id in csv_data:
    a_doi = csv_data[collected_id]['doi']
    # The same DOI with the "/anie." and "/ange." segments swapped is treated
    # as the same publication; alt_doi stays blank when neither is present.
    alt_doi = ""
    if "/anie." in a_doi:
        alt_doi = a_doi.replace("/anie.", "/ange.")
    elif "/ange." in a_doi:
        alt_doi = a_doi.replace("/ange.", "/anie.")
    # db_dois holds no blank entries, so a blank a_doi/alt_doi never matches
    found_in_db = 1 if (a_doi in db_dois or alt_doi in db_dois) else 0
    csv_data[collected_id]['in_db'] = found_in_db

# WRITE TO FILE
if csv_data:
    csv_rw.write_csv_data(csv_data, out_file)

    

In [7]:
# 03 Search if results are in CR search for same month

# if not in checked before, not in DB and not in CR then leave ask blank


# 04 search the docs directly
#
# Walks through the results of step 03 and, for every row whose 'ask' column
# is still blank, shows the row and its DOI link, then reads the 'ask' answer
# and the 'Reason' text from the keyboard. All rows are written to the step 04
# file afterwards.

# WoS search results
base_dir = './WebOfScience/wos_202311/'

in_file_name = "wos_202311_ukch_03_cr.csv"
out_file_name = "wos_202311_ukch_04_check.csv"

in_file = Path(base_dir) / in_file_name
out_file = Path(base_dir) / out_file_name

csv_data, csv_headings = csv_rw.get_csv_data(in_file)

for an_id in csv_data:
    # rows that already have an answer are left untouched
    # (a further filter on the 'Reason' column used to exist here; disabled)
    if csv_data[an_id]['ask'] != '':
        continue

    # show the record and a clickable link to the article
    print(csv_data[an_id])
    print("https://doi.org/" + csv_data[an_id]['doi'])

    # first answer: should the authors be asked about this publication?
    print("ask if should be added")
    should_ask = input()
    csv_data[an_id]['ask'] = should_ask

    # second answer: free-text justification
    print("reason for asking")
    reason_to_ask = input()
    csv_data[an_id]['Reason'] = reason_to_ask

# WRITE TO FILE
if csv_data:
    csv_rw.write_csv_data(csv_data, out_file)

{'num': '152', 'Authors': 'Wang, XD', 'Author Full Names': 'Wang, Xiaodong', 'Article Title': 'Heterogeneous Catalysis Mediated Interconversion between NAD(P)+ and NAD(P)H Accompanied by Consumption and Generation of Hydrogen Johnson Matthey Technology Review features laboratory research', 'Source Title': 'JOHNSON MATTHEY TECHNOLOGY REVIEW', 'Publication Year': '2023', 'Volume': '67', 'Issue': '4', 'Start Page': '452', 'End Page': '454', 'Article Number': '', 'doi': '10.1595/205651323X16686913816837', 'DOI Link': 'http://dx.doi.org/10.1595/205651323X16686913816837', 'wos_ID': 'WOS:001083261100010', 'prv_checked': '0', 'in_db': '0', 'in_cr_search': '0', 'ask': ''}
https://doi.org/10.1595/205651323X16686913816837
ask if should be added
No
reason for asking
article not available
{'num': '164', 'Authors': 'Al Sobhi, S; AlShibane, I; Catlow, CRA; Daisley, A; Hargreaves, JSJ; Hector, AL; Higham, MD; Zeinalipour-Yazdi, CD', 'Author Full Names': 'Al Sobhi, Samia; AlShibane, Ihfaf; Catlow, C. R

In [13]:
# 05
# add new results to previous wos search
#
# This cell only resolves the file locations and loads both tables; the rows
# are merged and written by the following cell.

# directories
search_dir = './WebOfScience/wos_202311/'
prev_s_dir = './WebOfScience/'

# file names
prv_file_name = "wos_ukch_previous.csv"
in_file_name = "wos_202311_ukch_04_check.csv"
out_file_name = "wos_ukch_previous_a.csv"

# full paths: reviewed results of this search, cumulative table (in / out)
in_file = Path(search_dir) / in_file_name
prv_file = Path(prev_s_dir) / prv_file_name
out_file = Path(prev_s_dir) / out_file_name

# load the cumulative table first, then the newly reviewed results
prv_data, prev_head = csv_rw.get_csv_data(prv_file)
in_data, new_head = csv_rw.get_csv_data(in_file)



['authors', 'title', 'pub_year', 'doi', 'wos_ID', 'cut_date']
['num', 'Authors', 'Author Full Names', 'Article Title', 'Source Title', 'Publication Year', 'Volume', 'Issue', 'Start Page', 'End Page', 'Article Number', 'doi', 'DOI Link', 'wos_ID', 'prv_checked', 'in_db', 'in_cr_search', 'ask', 'Reason']


In [20]:
# Append every result of the current search that was not found in the
# previous WoS results to the cumulative table, then write the table to
# out_file. Relies on prv_data, in_data and out_file from the previous cell.

# date stamped on every row added by this run
cut_date = "2023-12-10"

# Highest row key used so far; new rows are numbered from the next one.
# default=0 keeps this working when the previous-results table is empty.
max_count = max(prv_data, default=0)

for in_id in in_data:
    # Values read back from CSV are text ('0' / '1'), so comparing against the
    # integer 0 never matches and no row would ever be added. Compare as text;
    # str() also accepts an in-memory integer 0.
    if str(in_data[in_id]['prv_checked']).strip() != '0':
        continue
    # map the WoS export columns onto the columns of the cumulative table
    new_wos_result = {'authors': in_data[in_id]['Authors'],
                      'title': in_data[in_id]['Article Title'],
                      'pub_year': in_data[in_id]['Publication Year'],
                      'doi': in_data[in_id]['doi'],
                      'wos_ID': in_data[in_id]['wos_ID'],
                      'cut_date': cut_date}
    max_count += 1
    print(new_wos_result)
    prv_data[max_count] = new_wos_result

# WRITE TO FILE
if prv_data:
    csv_rw.write_csv_data(prv_data, out_file)

## OLD code 

In [None]:
## Run the first cell and then run this one
#
# Interactive review of a single check file: every row whose 'ask' column is
# blank is shown together with its DOI link, and the 'ask' and 'Reason'
# answers are read from the keyboard. The file is read from and written back
# to the same path.
base_dir = './WebOfScience'
out_file_name = "cr_check_wos_2023c.csv"

out_file = Path(base_dir) / out_file_name

csv_data, csv_headings = csv_rw.get_csv_data(out_file)

for an_id in csv_data:
    # only rows without an answer need attention
    # (a further filter on the 'Reason' column used to exist here; disabled)
    if csv_data[an_id]['ask'] != '':
        continue

    print(csv_data[an_id])
    print("https://doi.org/" + csv_data[an_id]['doi'])

    print("ask if should be added")
    should_ask = input()
    csv_data[an_id]['ask'] = should_ask

    print("reason for asking")
    reason_to_ask = input()
    csv_data[an_id]['Reason'] = reason_to_ask

# WRITE TO FILE (once, after the whole review loop)
if csv_data:
    csv_rw.write_csv_data(csv_data, out_file)

In [None]:
# get list of CSV search results files
#
# base_dir       - folder holding the search result files
# csv_flies_list - names of the files to merge (looked up inside base_dir)
# collect_csv    - receives the merged, de-duplicated rows (starts empty)
# out_file       - where the merged rows are written
base_dir = './pub_search_crossref'
csv_flies_list = [
    'cr_ol_202204.csv',
    'cr_online_lookup.csv',
    'cr_archive_lookup.csv',
]
collect_csv = dict()
out_file = Path(base_dir) / "cr_all202204.csv"

In [None]:
# join searches as a set (unique references)
#
# Merges the rows of every file in csv_flies_list into collect_csv, keeping
# one entry per DOI. A DOI and the same DOI with its "/anie." / "/ange."
# segment swapped count as the same reference. Each kept entry carries an
# 'occurrence' counter and the 'alt_doi' of the last duplicate seen.

# DOI -> id of the collect_csv entry holding it, so duplicates are found with
# a dictionary lookup instead of a scan over everything collected so far.
# Seeded from collect_csv in case the cell is re-run on a non-empty table.
doi_index = {}
for collected_id in collect_csv:
    collected_doi = collect_csv[collected_id]['doi']
    if collected_doi:
        doi_index.setdefault(collected_doi, collected_id)

for a_csv in csv_flies_list:
    a_file = Path(base_dir, a_csv)
    if not a_file.is_file():
        continue
    print("**********************************************")
    print("checking ", a_file)
    csv_data, csv_headings = csv_rw.get_csv_data(a_file)
    print("Found: ", a_file, " entries: ", len(csv_data))
    for an_id in csv_data:
        a_doi = csv_data[an_id]['doi']
        alt_doi = ""
        if "/anie." in a_doi:
            alt_doi = a_doi.replace("/anie.", "/ange.")
        elif "/ange." in a_doi:
            alt_doi = a_doi.replace("/ange.", "/anie.")

        # Blank DOIs are never looked up: records without a DOI cannot be
        # told apart by DOI, so each of them is kept as its own entry rather
        # than being folded into the first DOI-less record.
        collected_id = None
        for candidate in (a_doi, alt_doi):
            if candidate and candidate in doi_index:
                collected_id = doi_index[candidate]
                break

        if collected_id is not None:
            print('reviewing:', csv_data[an_id]['doi'], "\nFound ", collect_csv[collected_id])
            collect_csv[collected_id]['occurrence'] += 1
            collect_csv[collected_id]['alt_doi'] = alt_doi
        else:
            new_id = len(collect_csv) + 1
            print("adding", new_id, a_doi)
            collect_csv[new_id] = csv_data[an_id]
            collect_csv[new_id]['occurrence'] = 1
            collect_csv[new_id]['alt_doi'] = ''
            if a_doi:
                doi_index[a_doi] = new_id

# WRITE TO FILE
if collect_csv:
    csv_rw.write_csv_data(collect_csv, out_file)

In [None]:
# Save the rows currently held in csv_data to out_file; nothing is written
# when csv_data is empty.
# NOTE(review): out_file may still point at the merged-results file set up
# earlier - confirm it is the intended target before running.
if csv_data:
    csv_rw.write_csv_data(csv_data, out_file)

In [None]:
# Save csv_data to out_file again (same statement as the cell before);
# skipped when csv_data holds no rows.
if csv_data:
    csv_rw.write_csv_data(csv_data, out_file)

In [None]:
# Award references searched for in the 'Reason' text of every row.
ukch_awards = [
    'EP/R026939/1', 'EP/R026815/1', 'EP/R026645/1', 'EP/R027129/1',
    'EP/M013219/1', 'EP/K014706/2', 'EP/K014668/1', 'EP/K014854/1',
    'EP/K014714/1',
]

# Rows with a non-blank 'Reason' that mentions at least one award get an
# 'award_in_ack' column holding the text form of the list of awards found;
# all other rows are left without that column.
for an_id in csv_data:
    reason_text = csv_data[an_id]['Reason']
    if reason_text == '':
        continue
    awards_list = [an_aw for an_aw in ukch_awards if an_aw in reason_text]
    if awards_list:
        csv_data[an_id]['award_in_ack'] = str(awards_list)

# WRITE TO FILE
if csv_data:
    csv_rw.write_csv_data(csv_data, out_file)

In [None]:
# Bare expression: in a notebook this displays the current contents of
# csv_data as the cell's output (it has no effect when run as a script).
csv_data