# Check existence of records in list in the publications DB
Compare a list of DOIs to the existing records in the DB and determine whether each listed record is present in the DB. Additionally, determine which records in the DB are not in the list (useful when the task is to verify the origin of DB entries against other sources such as Crossref).


In [2]:
# Connecting to the db
import lib.handle_db as dbh

# read and write csv files
import lib.handle_csv as csv_rw

# date functions
from datetime import datetime, date, timedelta

# managing files and file paths
from pathlib import Path

# import custom functions (common to various notebooks)
import processing_functions as pr_fns

#CR libraries
from crossref.restful import Works, Etiquette


In [43]:
# open DB and get the set of dois in db
def get_db_dois_set(app_db):
    """Return the set of lower-cased DOIs stored in a publications DB.

    Args:
        app_db: location of the DB, passed unchanged to
            ``pr_fns.get_pub_data``.

    Returns:
        A set holding the lower-cased field at index 2 (the DOI) of every
        record that ``pr_fns.get_pub_data`` returns for ``app_db``.
    """
    # Query the DB named by the argument.  The earlier version read the
    # notebook-level ``ukchapp_db`` here, so ``app_db`` was silently ignored.
    app_pubs = pr_fns.get_pub_data(app_db)
    return {a_pub[2].lower() for a_pub in app_pubs}

# name of the DB to inspect (file name without its extension)
db_name = 'development'

# path of the current app DB
ukchapp_db = f"../mcc_data/{db_name}.sqlite3"

# read the publication records stored in the ukch app DB
app_pubs = pr_fns.get_pub_data(ukchapp_db)

In [44]:
# lower-cased DOI (field 2) of every publication record read from the DB
dois_in_db = {db_record[2].lower() for db_record in app_pubs}
# the same kind of set, obtained through the helper from an explicit DB path
other_dois_in_db = get_db_dois_set("../mcc_data/development.sqlite3")


In [45]:
# report how many distinct DOIs each of the two DB reads produced
print("DOIs in DB:", len(dois_in_db))

print("DOIs in DB:", len(other_dois_in_db))

# load the Crossref (CR) search results and collect their DOIs, lower-cased
cr_search_file = "../mcc_data/cr_check_2024.csv"
mcc_cr_list, _ = csv_rw.get_csv_data(cr_search_file)
dois_in_cr = {mcc_cr_list[cr_key]['doi'].lower() for cr_key in mcc_cr_list}
print("DOIs in CR:", len(dois_in_cr))

# load the website (WS) publication list and collect its DOIs, lower-cased
mcc_ws_files = "../mcc_data/mcc_2024.csv"
mcc_ws_list, _ = csv_rw.get_csv_data(mcc_ws_files)
dois_in_ws = {mcc_ws_list[ws_key]['doi'].lower() for ws_key in mcc_ws_list}
print("DOIs in website:", len(dois_in_ws))

DOIs in DB: 1129
DOIs in DB: 1129
DOIs in CR: 1019
DOIs in website: 901


In [41]:
# Compare the DOIs held in the DB with the DOIs found by the CR search.

# DOIs in the DB that the search did not return
ind_db_not_search = dois_in_db - dois_in_cr

# report the set computed just above; the earlier version printed
# ``ind_db_only``, a name this notebook never assigns (NameError on a fresh run)
print("Number of DOIs in DB not in search", len(ind_db_not_search))

# DOIs returned by the search that the DB does not hold
ind_search_only = dois_in_cr - dois_in_db

print("Number of DOIs only on search", len(ind_search_only))

# DOIs present both in the DB and in the search results
in_db_and_search = dois_in_db & dois_in_cr

print("Number of DOIs only in db and search", len(in_db_and_search))


Number of DOIs in DB not in search 639
Number of DOIs only on search 529
Number of DOIs only in db and search 490


In [42]:
# DOIs held in the DB that the website (WS) list does not contain
ind_db_not_ws = dois_in_db.difference(dois_in_ws)
print("Number of DOIs in DB not in WS", len(ind_db_not_ws))

# DOIs on the website list that the DB does not hold
ind_ws_only = dois_in_ws.difference(dois_in_db)

print("Number of DOIs only in WS", len(ind_ws_only))

# DOIs present both in the DB and on the website list
in_db_and_ws = dois_in_db & dois_in_ws

print("Number of DOIs in DB and WS", len(in_db_and_ws))

Number of DOIs in DB not in WS 241
Number of DOIs only in WS 13
Number of DOIs in DB and WS 888


In [7]:
# Flag every website (WS) record: in_db = 1 when its DOI is already stored in
# the DB, in_db = 0 otherwise.  Matching ignores case and surrounding spaces.

# Normalise the DB DOIs once, so each website record needs a single set lookup
# instead of a scan over every DB row.  DB rows whose DOI is not text are
# skipped: they cannot match, and they must not stop later rows from being
# checked (the earlier loop gave up at the first such row).
db_doi_lookup = {
    db_pub[2].strip().lower()
    for db_pub in app_pubs
    if isinstance(db_pub[2], str)
}

for a_pub in mcc_ws_list:
    ws_record = mcc_ws_list[a_pub]
    this_doi = ws_record['doi']
    ws_record['in_db'] = 0
    if not isinstance(this_doi, str):
        # a missing or non-text DOI cannot be compared: report it and move on
        # (explicit check instead of a bare ``except`` that hid every error)
        print("issue with", ws_record)
        continue
    if this_doi.strip().lower() in db_doi_lookup:
        ws_record['in_db'] = 1
        print(ws_record)

{'num': '1', 'authors': 'Sotelo‐Vazquez, Carlos, Quesada‐Cabrera, Raul, Ling, Min, Scanlon, David O., Kafizas, Andreas, Thakur, Pardeep Kumar, Lee, Tien‐Lin, Taylor, Alaric, Watson, Graeme W., Palgrave, Robert G., Durrant, James R., Blackman, Christopher S., Parkin, Ivan P.', 'year': '2017', 'title': 'Evidence and Effect of Photogenerated Charge Transfer for Enhanced Photocatalysis in WO<sub>3</sub>/TiO<sub>2</sub> Heterojunction Films: A Computational and Experimental Study', 'doi': '10.1002/adfm.201605413', 'awards': 'EP/L000202', 'mcc': '1', 'cut_date': '30/03/2017', 'in_db': 1}
{'num': '3', 'authors': 'Swallow, Jack E. N., Williamson, Benjamin A. D., Whittles, Thomas J., Birkett, Max, Featherstone, Thomas J., Peng, Nianhua, Abbott, Alex, Farnworth, Mark, Cheetham, Kieran J., Warren, Paul, Scanlon, David O., Dhanak, Vin R., Veal, Tim D.', 'year': '2017', 'title': 'Self‐Compensation in Transparent Conducting F‐Doped SnO<sub>2</sub>', 'doi': '10.1002/adfm.201701900', 'awards': 'EP/L00

{'num': '398', 'authors': 'Kubas, Adam, Gajdos, Fruzsina, Heck, Alexander, Oberhofer, Harald, Elstner, Marcus, Blumberger, Jochen', 'year': '2015', 'title': 'Electronic couplings for molecular charge transfer: benchmarking CDFT, FODFT and FODFTB against high-level ab initio calculations. II', 'doi': '10.1039/c4cp04749d', 'awards': 'EP/L000202', 'mcc': '1', 'cut_date': '25/12/2014', 'in_db': 1}
{'num': '399', 'authors': 'Brincat, Nicholas A., Parker, Stephen C., Molinari, Marco, Allen, Geoffrey C., Storr, Mark T.', 'year': '2015', 'title': 'Density functional theory investigation of the layered uranium oxides U<sub>3</sub>O<sub>8</sub> and U<sub>2</sub>O<sub>5</sub>', 'doi': '10.1039/c4dt02493a', 'awards': 'EP/L000202', 'mcc': '1', 'cut_date': '13/11/2014', 'in_db': 1}
{'num': '401', 'authors': 'Murray, Alexander T., Frost, Jarvist M., Hendon, Christopher H., Molloy, Christopher D., Carbery, David R., Walsh, Aron', 'year': '2015', 'title': 'Modular design of SPIRO-OMeTAD analogues as ho

{'num': '728', 'authors': 'Skelton, Jonathan M., Jackson, Adam J., Dimitrievska, Mirjana, Wallace, Suzanne K., Walsh, Aron', 'year': '2015', 'title': 'Vibrational spectra and lattice thermal conductivity of kesterite-structured Cu2ZnSnS4 and Cu2ZnSnSe4', 'doi': '10.1063/1.4917044', 'awards': 'EP/L000202', 'mcc': '1', 'cut_date': '09/04/2015', 'in_db': 1}
{'num': '729', 'authors': 'Webster, R., Bernasconi, L., Harrison, N. M.', 'year': '2015', 'title': 'Optical properties of alkali halide crystals from all-electron hybrid TD-DFT calculations', 'doi': '10.1063/1.4921822', 'awards': 'EP/L000202', 'mcc': '1', 'cut_date': '04/06/2015', 'in_db': 1}
{'num': '730', 'authors': 'Shayeghi, A., Schäfer, R., Rayner, D. M., Johnston, R. L., Fielicke, A.', 'year': '2015', 'title': 'Charge-induced dipole vs. relativistically enhanced covalent interactions in Ar-tagged Au-Ag tetramers and pentamers', 'doi': '10.1063/1.4923255', 'awards': 'EP/L000202', 'mcc': '1', 'cut_date': '16/07/2015', 'in_db': 1}
{

In [8]:
# save the flagged website list, unless there is nothing to save
# NOTE(review): the file name says "cr_check" although the data written is the
# website list - confirm this is the intended output name
flagged_csv = '../mcc_data/cr_check_2024a.csv'
if mcc_ws_list:
    csv_rw.write_csv_data(mcc_ws_list, flagged_csv)

In [9]:
# display the website list so the records and their fields can be inspected
mcc_ws_list

{1: {'num': '1',
  'authors': 'Sotelo‐Vazquez, Carlos, Quesada‐Cabrera, Raul, Ling, Min, Scanlon, David O., Kafizas, Andreas, Thakur, Pardeep Kumar, Lee, Tien‐Lin, Taylor, Alaric, Watson, Graeme W., Palgrave, Robert G., Durrant, James R., Blackman, Christopher S., Parkin, Ivan P.',
  'year': '2017',
  'title': 'Evidence and Effect of Photogenerated Charge Transfer for Enhanced Photocatalysis in WO<sub>3</sub>/TiO<sub>2</sub> Heterojunction Films: A Computational and Experimental Study',
  'doi': '10.1002/adfm.201605413',
  'awards': 'EP/L000202',
  'mcc': '1',
  'cut_date': '30/03/2017',
  'in_db': 1},
 2: {'num': '2',
  'authors': 'Li, Bo, Yuan, Fukang, He, Guanjie, Han, Xiaoyu, Wang, Xin, Qin, Jinbao, Guo, Zheng Xiao, Lu, Xinwu, Wang, Qian, Parkin, Ivan P., Wu, Chengtie',
  'year': '2017',
  'title': 'Ultrasmall CuCo<sub>2</sub>S<sub>4</sub> Nanocrystals: All‐in‐One Theragnosis Nanoplatform with Magnetic Resonance/Near‐Infrared Imaging for Efficiently Photothermal Therapy of Tumors',