# Check existence of records in list in the publications DB
Compare a list of DOIs to the existing records in the DB and determine if the records are present in the DB. Additionally, determine which records in the DB are not in the list (when the task is to verify origins of DB entries against other sourcess such as Cross Ref).


In [1]:
# Connecting to the db
import lib.handle_db as dbh

# read and write csv files
import lib.handle_csv as csv_rw

# date functions
from datetime import datetime, date, timedelta

# managing files and file paths
from pathlib import Path

# import custom functions (common to various notebooks)
import processing_functions as pr_fns

#CR libraries
from crossref.restful import Works, Etiquette


In [2]:
# open DB and get the set of dois in db
def get_db_dois_set(app_db):
    app_pubs = pr_fns.get_pub_data(app_db)
    return set([a_pub[2].lower() for a_pub in app_pubs])

# get a set of dois from a csv_file
def get_csv_dois_set(csv_file):
    pubs_list, _ = csv_rw.get_csv_data(csv_file)
    return set([pubs_list[a_pub]['doi'].lower() for a_pub in pubs_list])

In [3]:
# get the set of all dois in DB
dois_in_db = get_db_dois_set("../mcc_data/development.sqlite3")

print("DOIs in DB:", len(dois_in_db))

# get list of publications found in CR
dois_in_cr = get_csv_dois_set("../mcc_data/cr_check_2024.csv")
print("DOIs in CR:", len(dois_in_cr ))

# get list of publications found in CR
dois_in_ws = get_csv_dois_set("../mcc_data/mcc_2024.csv")
print("DOIs in website:", len(dois_in_ws ))


DOIs in DB: 1129
DOIs in CR: 1019
DOIs in website: 901


In [4]:
# dois in db and not in search: 
ind_db_not_search = dois_in_db - dois_in_cr

print("Number of DOIs in DB not in search", len (ind_db_not_search))

# dois in search and not in db: 
ind_search_only =  dois_in_cr - dois_in_db

print("Number of DOIs only on search", len (ind_search_only))

# dois in search and in db:
in_db_and_search = dois_in_db.intersection(dois_in_cr)

print("Number of DOIs only in db and search", len (in_db_and_search))


Number of DOIs in DB not in search 639
Number of DOIs only on search 529
Number of DOIs only in db and search 490


In [8]:
# dois in db and not in search: 
ind_db_not_ws = dois_in_db - dois_in_ws
print("Number of DOIs in DB not in WS", len (ind_db_not_ws), "**")

# dois in search and not in db: 
ind_ws_only =  dois_in_ws - dois_in_db

print("Number of DOIs only in WS", len (ind_ws_only))

# dois in search and in db:
in_db_and_ws = dois_in_db.intersection(dois_in_ws)

print("Number of DOIs in DB and WS", len (in_db_and_ws))

Number of DOIs in DB not in WS 241 **
Number of DOIs only in WS 13
Number of DOIs in DB and WS 888


In [7]:
# dois in db and not in search: 
ind_cr_not_ws = dois_in_cr - dois_in_ws
print("Number of DOIs in CR not in WS", len (ind_cr_not_ws), "**")

# dois in ws and not in cr: 
ind_ws_not_cr =  dois_in_ws - dois_in_cr

print("Number of DOIs in WS not in CR", len (ind_ws_not_cr))

# dois in search and in ws:
in_cr_and_ws = dois_in_cr.intersection(dois_in_ws)

print("Number of DOIs in CR and WS", len (in_cr_and_ws))

Number of DOIs in CR not in WS 641 **
Number of DOIs in WS not in CR 523
Number of DOIs in CR and WS 378


In [10]:
# dois on CR or DB not in WS
in_cr_or_db_not_ws = ind_cr_not_ws.union(ind_db_not_ws)
print("Number of DOIs in CR and DB, but not in WS", len (in_cr_or_db_not_ws), "**")

Number of DOIs in CR and DB, but not in WS 769 **


In [11]:
for a_pub in mcc_ws_list:
    this_doi = mcc_ws_list[a_pub]['doi']
    mcc_ws_list[a_pub]['in_db']=0
    for db_pub in app_pubs:
        try:
            if this_doi.strip().lower() == db_pub[2].strip().lower():
                mcc_ws_list[a_pub]['in_db']= 1
                print(mcc_ws_list[a_pub])
                break
        except:
            print ("issue with",  mcc_ws_list[a_pub])
            break

NameError: name 'mcc_ws_list' is not defined

In [None]:
if len(mcc_ws_list) > 0:
    csv_rw.write_csv_data(mcc_ws_list, '../mcc_data/cr_check_2024a.csv') 

In [None]:
mcc_ws_list