# Check existence of records in list in the publications DB
Compare a list of DOIs to the existing records in the DB and determine if the records are present in the DB. Additionally, determine which records in the DB are not in the list (when the task is to verify origins of DB entries against other sourcess such as Cross Ref).


In [1]:
# Connecting to the db
import lib.handle_db as dbh

# read and write csv files
import lib.handle_csv as csv_rw

# date functions
from datetime import datetime, date, timedelta

# managing files and file paths
from pathlib import Path

# import custom functions (common to various notebooks)
import processing_functions as pr_fns

#CR libraries
from crossref.restful import Works, Etiquette


In [2]:
# open DB and get the set of dois in db
def get_db_dois_set(app_db):
    app_pubs = pr_fns.get_pub_data(app_db)
    return set([a_pub[2].lower() for a_pub in app_pubs])

# get a set of dois from a csv_file
def get_csv_dois_set(csv_file):
    pubs_list, _ = csv_rw.get_csv_data(csv_file)
    return set([pubs_list[a_pub]['doi'].lower() for a_pub in pubs_list])

In [3]:
ws_search_csv = "../mcc_data/mcc_2024.csv"

# get list of publications in WS
ws_dois = get_csv_dois_set(ws_search_csv)
print("DOIs in website:", len(ws_dois))


di_app_db = "../mcc_data/development.sqlite3"
# get the set of all dois in DB
db_dois = get_db_dois_set(di_app_db)

print("DOIs in DB:", len(db_dois))

cr_search_csv = "../mcc_data/cr_check_2024.csv"
# get list of publications found in CR
cr_dois = get_csv_dois_set(cr_search_csv)
print("DOIs in CR:", len(cr_dois))




DOIs in website: 901
DOIs in DB: 1129
DOIs in CR: 1019


In [4]:
# these comparisons are not important as the base source is the content of WS, not he app db
# dois in db and not in search: 
in_db_not_cr = db_dois - cr_dois

print("Number of DOIs in DB not in search", len (in_db_not_cr))

# dois in search and not in db: 
in_cr_only =  cr_dois - db_dois

print("Number of DOIs only on search", len (in_cr_only))

# dois in search and in db:
in_db_and_search = db_dois.intersection(cr_dois)

print("Number of DOIs only in db and search", len (in_db_and_search))



Number of DOIs in DB not in search 639
Number of DOIs only on search 529
Number of DOIs only in db and search 490


In [5]:
# dois in db and not in search: 
in_db_not_ws = db_dois - ws_dois
print("Number of DOIs in DB not in WS\t", len (in_db_not_ws), "\t** need to ask if keep")

# dois in search and not in db: 
in_ws_only =  ws_dois - db_dois

print("Number of DOIs only in WS\t", len (in_ws_only), "\t** need to add to DB")

# dois in search and in db:
in_db_and_ws = db_dois.intersection(ws_dois)

print("Number of DOIs in DB and WS\t", len (in_db_and_ws))

Number of DOIs in DB not in WS	 241 	** need to ask if keep
Number of DOIs only in WS	 13 	** need to add to DB
Number of DOIs in DB and WS	 888


In [6]:
# dois in db and not in search: 
in_cr_not_ws = cr_dois - ws_dois
print("Number of DOIs in CR not in WS\t", len (in_cr_not_ws), "\t** ask if they should be added")

# dois in ws and not in cr: 
in_ws_not_cr =  ws_dois - cr_dois

print("Number of DOIs in WS not in CR\t", len (in_ws_not_cr))

# dois in search and in ws:
in_cr_and_ws = cr_dois.intersection(ws_dois)

print("Number of DOIs in CR and WS\t", len (in_cr_and_ws))

Number of DOIs in CR not in WS	 641 	** ask if they should be added
Number of DOIs in WS not in CR	 523
Number of DOIs in CR and WS	 378


In [7]:
# dois on CR or DB not in WS
in_cr_or_db_not_ws = in_cr_not_ws.union(in_db_not_ws)
print("Number of DOIs in CR and DB, but not in WS\t", len (in_cr_or_db_not_ws), "\t** total of pubs to be reported")

#dois in DB not in search 
in_db_not_cr_not_ws = in_db_not_ws-in_cr_not_ws
print("Number of DOIs in DB, but not in WS an CR \t", len (in_db_not_cr_not_ws), "\t** pubs reported previously, not in WS")


Number of DOIs in CR and DB, but not in WS	 769 	** total of pubs to be reported
Number of DOIs in DB, but not in WS an CR 	 128 	** pubs reported previously, not in WS


In [9]:
# CR report
# Build reporting file from:
#   901 record in WS -- all in the DB from 2022 search
#   641 records from CR search not in WS report to scott and ask if they shuold be added
#   128 records from DB and not in WS repost to scott and ask if they shoul be ignored

# full list of mcc 
mcc_ws_list, _ = csv_rw.get_csv_data(ws_search_csv)



for a_pub in mcc_ws_list:
    this_doi = mcc_ws_list[a_pub]['doi']
    mcc_ws_list[a_pub]['in_db']=0
    for db_pub in app_pubs:
        try:
            if this_doi.strip().lower() == db_pub[2].strip().lower():
                mcc_ws_list[a_pub]['in_db']= 1
                print(mcc_ws_list[a_pub])
                break
        except:
            print ("issue with",  mcc_ws_list[a_pub])
            break

NameError: name 'app_pubs' is not defined

In [None]:
if len(mcc_ws_list) > 0:
    csv_rw.write_csv_data(mcc_ws_list, '../mcc_data/cr_check_2024a.csv') 

In [None]:
mcc_ws_list