## Verify if files exist in UK Catalysis Hub App DB

A list of articles is obtained from the UKCH publications pages. This list contains titles and some IDs, which need to be checked to see if they are already in the DB.

All missing titles need to be added to the DB.

In [8]:
# Libraries
# library containing functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searches
import lib.handle_urls as urlh

from pathlib import Path


# Default locations of the two files this notebook works with.
# app database (SQLite file inside the rails app tree)
ukchapp_db = '../railsapp/ukchapp/db/development.sqlite3'
# CSV list of publications to verify against the database
ukch_pubs_list = "scripts/UKCH20201018_mod_1.csv"

Get the name of the file with the current publications in the UKCH site

In [10]:
# Keep prompting until the answer is the path of an existing file.
# The name starts out empty, which never resolves to a file, so the
# question is always asked at least once.
ukch_pubs_list = ""
while True:
    print('Please enter the name of the input file:')
    ukch_pubs_list = input()
    if Path(ukch_pubs_list).is_file():
        break

Please enter the name of the input file:
scripts/UKCH20201018_mod_1.csv


Get the name of the DB file to use

In [12]:
# Ask for the path of the app DB file until an existing file is given.
# The name starts out empty, so the question is asked at least once.
ukchapp_db = ""
while not Path(ukchapp_db).is_file():
    # the prompt names the DB file (it used to repeat the input-file prompt)
    print('Please enter the name of the DB file:')
    ukchapp_db = input()

Please enter the name of the input file:
../railsapp/ukchapp/db/development.sqlite3


Set the name of the output file

In [23]:
# output file: same folder and base name as the input list, with the
# extension replaced by "_db.csv". The extension is removed with pathlib
# instead of slicing off the last four characters, so the name is still
# right when the input has no extension or one of a different length.
output_file = str(Path(ukch_pubs_list).with_suffix("")) + "_db.csv"
print("Verifying if the articles listed in: \n\t", Path(ukch_pubs_list).name)
print("Are already in the app DB: \n\t", Path(ukchapp_db).name)

print("The results will bt saves in: \n\t", output_file)

Verifying if the articles listed in: 
	 UKCH20201018_mod_1.csv
Are already in the app DB: 
	 development.sqlite3
The results will bt saves in: 
	 scripts/UKCH20201018_mod_1_db.csv


In [None]:
# Verify every article listed in the input file against the app DB and
# record the outcome in extra columns of the working file:
#   'doi', 'db_id'  - copied from the first DB article with a similar title
#   'Add'           - set to 1 when no DB article matches the title
#   'theme issues'  - set when no article-theme link matches the theme, or
#                     when the link exists but its year does not match
# The annotated working file is then written to output_file.

# a title matches when txtc.similar returns more than this value
TITLE_MATCH_THRESHOLD = 0.80


def _same_value(csv_value, db_value):
    """Return True when a CSV field and a DB field hold the same value.

    Besides plain equality the string forms are compared.
    NOTE(review): this assumes the CSV reader yields strings while the DB
    returns numbers for id/year columns - confirm against lib.handle_csv
    and lib.handle_db.
    """
    if csv_value == db_value:
        return True
    return str(csv_value).strip() == str(db_value).strip()


# open the input file
working_file, wf_fields = csvh.get_csv_data(ukch_pubs_list, 'id')
# create connection to the DB
db_conn = dbh.DataBaseAdapter(ukchapp_db)
# get a list of the articles in the DB
db_articles = db_conn.get_full_table('articles')

# verify each article in the input file against the articles in the DB
for art_num in working_file:
    article = working_file[art_num]
    current_title = article['Title']
    # plain values: a trailing comma here would wrap the value in a
    # one-element tuple and the theme comparison below could never succeed
    current_theme = article['theme_id']
    current_year = article['p_year']

    # look for the first DB article whose title is similar enough;
    # columns used: 0 = id, 1 = doi, 2 = title
    for db_art in db_articles:
        if txtc.similar(current_title, db_art[2]) > TITLE_MATCH_THRESHOLD:
            article['doi'] = db_art[1]
            article['db_id'] = db_art[0]
            break
    else:
        # no title matched: flag the article as one to add to the DB
        article['Add'] = 1
        continue

    # the article is in the DB: verify if theme and year are correct
    found_theme = False
    year_ok = False
    filter_str = "article_id = '" + str(article['db_id']) + "'"
    # NOTE(review): this selects the "article_id" column and then passes the
    # value to get_row as if it were an article_themes row id - confirm
    # against lib.handle_db whether the "id" column was intended.
    art_theme_ids = db_conn.get_values("article_themes", "article_id", filter_str)

    for art_theme in art_theme_ids:
        a_theme = db_conn.get_row("article_themes", art_theme[0])
        if len(a_theme) != 1:
            continue
        theme_row = a_theme[0]
        # column 4 is compared with the theme id, column 6 with the year
        if _same_value(current_theme, theme_row[4]):
            found_theme = True
            if _same_value(current_year, theme_row[6]):
                year_ok = True
                break

    if not found_theme:
        article['theme issues'] = "need to add article theme link"
    elif not year_ok:
        article['theme issues'] = "verify article theme project year"

csvh.write_csv_data(working_file, output_file)