# Get data for new articles and list of all articles not citing data
A list of publications is obtainded from the app database. This list will contain a titles, IDs and DOIs which need to be explored to look for the corresponding pdf files. 
The steps of the process are: 
 1. get a Title, DOI, and URL for each publication
 2. convert the DOI to a pdf file name and try to open de file
 3. use pdfMiner and/or CDE to get the reference to data
 4. add a new dataset entry each time a new data object is found

In [15]:
# Libraries
# library containign functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searchs
import lib.handle_urls as urlh
# managing files and file paths
from pathlib import Path
# add aprogress bar
from tqdm import notebook 
#library for handling json files
import json
# library for using regular expressions
import re
# library for handling http requests
import requests
# import custom functions (common to various notebooks)
import processing_functions as pr_fns

current_step = 1

In [16]:
def pdf_column_populated(data_db):
    ukchapp_db = "db_files/" + data_db + ".sqlite3"
    
    # get publication data from the ukch app
    app_pubs = pr_fns.get_pub_data(ukchapp_db)

    i_indx = 0
    for a_pub in app_pubs:
        if a_pub[4] != None:
            i_indx += 1
    #print (i_indx/len(app_pubs) > 0.9, i_indx/len(app_pubs))
    return (i_indx/len(app_pubs) > 0.9)

## Get pdf and HTML names into app DB

0. Add fields to articles table for holding pdf file names
1. Open the previously verified DB and get the publications list
2. Open the current publication list from the appdb
3. Get pdf and html file names from previous and put it in current



In [17]:
def add_pdf_file_column(db_name, table_name, column_name, column_type):
    if not column_exists(db_name, table_name, column_name):
        ukchapp_db = "db_files/" + db_name + ".sqlite3"
        db_conn = dbh.DataBaseAdapter(ukchapp_db)
        db_conn.add_column(table_name, column_name, column_type)
    else:
        print (column_name, "Alredy exists in ", table_name)
        
def column_exists(db_name, table_name, column_name):
    ukchapp_db = "db_files/" + db_name + ".sqlite3"
    db_conn = dbh.DataBaseAdapter(ukchapp_db)
    ti=db_conn.get_table_info('articles')
    for a_col in ti:
        if a_col[1] == column_name:
            return True
    return False
    
    
def add_pdf_file_names(prev_db, curr_db):
    has_file_names = False
    
    prevapp_db = "db_files/"+prev_db +".sqlite3"

    while not Path(prevapp_db).is_file():
        print('Please enter the name of app db file:')
        prevapp_db = input()

    # get publication data from the db
    prev_pubs = pr_fns.get_pub_data(prevapp_db)

    #2 currend app DB
    ukchapp_db = "db_files/" + curr_db + ".sqlite3"
    while not Path(ukchapp_db).is_file():
        print('Please enter the name of app db file:')
        curr_db = input()
        ukchapp_db = "db_files/" + curr_db + ".sqlite3"
    
    # get publication data from the ukch app
    app_pubs = pr_fns.get_pub_data(ukchapp_db)
    
    # check if file names have been added
    # return true if OK
    if pdf_column_populated(curr_db):
        has_file_names = True
        return curr_db, has_file_names

    # 3 get pdf file name from previous and put it in current
    for a_pub in notebook.tqdm(prev_pubs):
        pub_id = a_pub[0]
        pub_title = a_pub[1]
        pub_doi = a_pub[2]
        pub_url = a_pub[3]
        pub_pdf = a_pub[4]
        match_found = False
        for curr_pub in app_pubs:
            if curr_pub[2] == pub_doi and pub_doi != None:
                pr_fns.set_pdf_file_value(pub_pdf, curr_pub[0], ukchapp_db)
                match_found = True
                break
            elif curr_pub[1] == pub_title:
                pr_fns.set_pdf_file_value(pub_pdf, curr_pub[0], ukchapp_db)
                match_found = True
                break
        if not match_found:
            print("*************\n",a_pub)

        has_file_names = True
    return curr_db, has_file_names

## Check that idexed pdf files exist 

Use the data on the articles table to verify if file are stored in the corresponding folder


In [18]:
def pdf_data_exists(data_db):
    ukchapp_db = "db_files/" + data_db + ".sqlite3"
    
    # get publication data from the ukch app
    app_pubs = pr_fns.get_pub_data(ukchapp_db)

    i_indx = 0
    for a_pub in notebook.tqdm(app_pubs):
        pub_id = a_pub[0]
        pub_title = a_pub[1]
        pub_doi = a_pub[2]
        pub_url = a_pub[3]
        pub_pdf = a_pub[4]
        if pub_pdf == None:
            print("*************************")
            i_indx +=1
            print(i_indx, "Missing PDF for:", pub_doi, pub_id)
            
        else:
            pdf_file = "pdf_files/" + pub_pdf
            if not Path(pdf_file).is_file():
                print("*************************")
                i_indx +=1
                print(i_indx, "Missing file for:", pdf_file, "for", pub_doi, pub_id)
                
    #print(i_indx/len(app_pubs) )
    # If less than 1% if missing that is OK
    return (i_indx/len(app_pubs) < 0.01)


## Check that all PDF files are indexed 

Check that the files in the folder are all accounted for (have a corersponding record)

In [19]:
def check_files_in_db(data_db):
    ukchapp_db = "db_files/" + data_db + ".sqlite3"
    
    # get publication data from the ukch app
    app_pubs = pr_fns.get_pub_data(ukchapp_db)
    files_not_in_DB = 0
    for infile in notebook.tqdm(Path("pdf_files").glob('*.pdf')):
        file_found = False
        for a_pub in app_pubs:
            if infile.name == a_pub[4]:
                file_found = True
                break
        if not file_found:
            print("Not in DB:", infile.name)
            files_not_in_DB += 1
    return files_not_in_DB < 1

## Get missing pdfs
If there are more than 1% missing try to get them 

In [20]:
# use regular expression to check if a given string
# is a valid DOI, using pattern from CR
def valid_doi(cr_doi):
    # CR DOIS: https://www.crossref.org/blog/dois-and-matching-regular-expressions/
    # CR DOIs re1
    # /^10.\d{4,9}/[-._;()/:A-Z0-9]+$/i
    if cr_doi == None:
        return False
    cr_re_01 = '^10.\d{4,9}/[-._;()/:A-Z0-9]+'
    compare = re.match(cr_re_01, cr_doi, re.IGNORECASE)
    if compare != None and cr_doi == compare.group():
        return True
    else:
        return False
    
def get_missing_pdfs(data_db):
    return_val = False
    ukchapp_db = "db_files/" + data_db + ".sqlite3"
    # get publication data from the ukch app
    db_pubs = pr_fns.get_pub_data(ukchapp_db)
    for a_pub in notebook.tqdm(db_pubs):
        if a_pub[0] > 0:
            pub_id = a_pub[0]
            pub_title = a_pub[1]
            pub_doi = a_pub[2]
            pub_url = a_pub[3]
            pub_pdf = a_pub[4]
            if pub_pdf == None:
                not_in_url = True
                print("ID: ", pub_id, "Publication: ",pub_title,
                      "\n\tDOI: ", pub_doi, " URL: ", pub_url)
                if "pdf" in pub_url:
                    print ("\tTry to get the pdf from URL: ", pub_url)
                    try:
                        response = requests.get(pub_url)
                        content_type = response.headers['content-type']
                        if not 'text' in content_type:
                            #print(response.headers)
                            cd= response.headers['content-disposition']
                            #print(cd)
                            fname = re.findall("filename=(.+)", cd)[0]
                            #print(fname)
                            if not Path('pdf_files/' + pdf_file).is_file():
                                with open('pdf_files/'+ fname +'.pdf', 'wb') as f:
                                    f.write(response.content)
                            else:
                                set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                            not_in_url = False
                    except:
                        print("ID: ", pub_id, "\nPublication: ",pub_title, 
                               "\nDOI: ", pub_doi, "\nDOI: ", pub_url) 
                if not_in_url:
                    print("\tTry to see if json file has link to pdf: ")
                    if valid_doi(pub_doi):
                        crjd, doi_file = pr_fns.get_cr_json_object(pub_doi)
                        got_pdf = False
                        if "link" in crjd.keys():
                            for a_link in crjd["link"]:
                                if "\tURL" in a_link.keys() and ("pdf" in a_link["URL"] or "pdf" in a_link["content-type"]):
                                    cr_url = a_link["URL"]
                                    #print("URL: ", cr_url)
                                    pdf_file = get_pdf_from_url(cr_url)
                                    # if the name corresponds to a existing file, assign value to db_record
                                    if Path('pdf_files/' + pdf_file).is_file():
                                        print("\tFile name:", pdf_file)
                                        set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                                        got_pdf = True
                                    else:
                                        print("\tcould not get file from", cr_url)
                        else: 
                            print("\tno links in json", pub_doi)
                    if not got_pdf and "elsevier" in pub_url:
                        print("\tTrying elsevier doi:" )
                        pdf_file = pr_fns.get_elsevier_pdf(pub_doi)
                        if Path('pdf_files/' + pdf_file).is_file():
                            print("\tFile name:", pdf_file)
                            pr_fns.set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                            got_pdf = True
                    elif not got_pdf and "wiley" in pub_url:
                        print("\tTrying wiley doi:" )
                        pdf_file = pr_fns.get_wiley_pdf(pub_doi)
                        if Path('pdf_files/' + pdf_file).is_file():
                            print("\tFile name:", pdf_file)
                            pr_fns.set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                            got_pdf = True
                    elif not got_pdf and "pubs.acs" in pub_url:
                        print("\tTrying acs doi:" )
                        pdf_file = pr_fns.get_acs_pdf(pub_doi)
                        if Path('pdf_files/' + pdf_file).is_file():
                            print("\tFile name:", pdf_file)
                            pr_fns.set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                            got_pdf = True
                    if not got_pdf:
                        print("\tTry doi:  https://doi.org/" + pub_doi)
    return return_val
    


## Use pdfminer to get metadata from pdf file

Functions which use pdf miner to get data from pdf_file

In [21]:
import pdfminer
from pdfminer import high_level as pdfmnr_hl

# functions for PDFminer

def get_pdf_text(pdf_file):
    return pdfmnr_hl.extract_text(pdf_file)

# get the paragraph fragments with references to data
def get_ref_sentences(pdf_text):
    sentences = pdf_text.split("\n")
    groups=[]
    for sentence in sentences:
        if pr_fns.is_data_stmt(sentence.lower()):
            idx = sentences.index(sentence)
            groups.append([idx-1, idx, idx+1])
    reduced_groups = []
    for group in groups:
        idx_group = groups.index(group)
        if groups.index(group) > 0:
            set_g = set(group)
            # make the array before current a set
            set_bg = set(groups[idx_group - 1])
            # make the array after current a set
            set_ag = set()
            if idx_group + 1 < len(groups):    
                set_ag = set(groups[idx_group + 1])
            if len(set_bg.intersection(set_g)) > 0:
                ordered_union = list(set_bg.union(set_g))
                ordered_union.sort()
                reduced_groups.append(ordered_union)
            if len(set_ag.intersection(set_g)) > 0:
                ordered_union = list(set_ag.union(set_g))
                ordered_union.sort()
                reduced_groups.append(ordered_union)
            if len(reduced_groups) > 0:
                is_in_rg = False
                for a_rg in reduced_groups:
                    if set_g.issubset(a_rg):
                        is_in_rg = True
                        break
                if not is_in_rg:
                    reduced_groups.append(list(set_g))
    return_group = []
    for sentence_group in reduced_groups:
        full_sentence = ""
        for single_sentence in sentence_group:
            print (single_sentence)
            full_sentence += " " + sentences[single_sentence].strip()
        return_group.append(full_sentence)
        print (full_sentence)
    return return_group

# get the paragraph fragments with references to data
def get_all_data_sentences(pdf_text):
    sentences = pdf_text.split("\n")
    groups=[]
    for sentence in sentences:
        if 'data' in sentence.lower() or 'inform' in sentence.lower():
            idx = sentences.index(sentence)
            groups.append([idx-1, idx, idx+1])
    reduced_groups = []
    for group in groups:
        idx_group = groups.index(group)
        if groups.index(group) > 0:
            set_g = set(group)
            # make the array before current a set
            set_bg = set(groups[idx_group - 1])
            # make the array after current a set
            set_ag = set()
            if idx_group + 1 < len(groups):    
                set_ag = set(groups[idx_group + 1])
            if len(set_bg.intersection(set_g)) > 0:
                ordered_union = list(set_bg.union(set_g))
                ordered_union.sort()
                reduced_groups.append(ordered_union)
            if len(set_ag.intersection(set_g)) > 0:
                ordered_union = list(set_ag.union(set_g))
                ordered_union.sort()
                reduced_groups.append(ordered_union)
            if len(reduced_groups) > 0:
                is_in_rg = False
                for a_rg in reduced_groups:
                    if set_g.issubset(a_rg):
                        is_in_rg = True
                        break
                if not is_in_rg:
                    reduced_groups.append(list(set_g))
    return_group = []
    for sentence_group in reduced_groups:
        full_sentence = ""
        for single_sentence in sentence_group:
            full_sentence += sentences[single_sentence].strip()
        if not full_sentence in return_group:
            return_group.append(full_sentence)
    return return_group

# get the http strings from references to data
def get_http_ref(sentence):
    http_frag = ""
    if 'http' in sentence.lower():
        idx_http = sentence.lower().index('http')
        http_frag = sentence[idx_http:]
        space_in_ref = True
        while " " in http_frag:
            space_idx = http_frag.rfind(" ")
            http_frag = http_frag[:space_idx]
        if(http_frag[-1:]=="."):
            http_frag = http_frag[:-1]
    return http_frag

## Get data mentions from pdf files
Write the results to a csv file to be checked to verify data mentions

In [22]:
def get_data_refs(data_db, start_processing, stop_processing, work_dir):
    ukchapp_db = "db_files/" + data_db + ".sqlite3"
    out_name =  'pdf_mentions' + "_" + str(start_processing).zfill(4)+ "_" + str(stop_processing).zfill(4)
    out_file = Path(work_dir, out_name + ".csv")
    if out_file.is_file():
        print ("Already checked for data refences in:", data_db, "saved as", out_file)
        return out_name

    # get publication data from the ukch app
    db_pubs = pr_fns.get_pub_data(ukchapp_db)

    # get the list of dois already mined for data 
    input_file = './data_load/pub_data_add202012.csv'
    id_field = 'num'
    processed, headings = csvh.get_csv_data(input_file, id_field)
    for id_num in processed:
        current_title = processed[id_num]['doi']
    processed[1]['num']

    processed_dois = []
    for entry in processed:
        if not processed[entry]['doi'] in processed_dois:
            processed_dois.append( processed[entry]['doi'])

    data_records = {}
    data_mentions = {}
    ref_count = mention_count = 0
    for a_pub in notebook.tqdm(db_pubs):
        data_refs = []
        data_sents = []
        if a_pub[0] >= start_processing:
            pub_id = a_pub[0]
            pub_title = a_pub[1]
            pub_doi = a_pub[2]
            pub_url = a_pub[3]
            pub_pdf = a_pub[4]
            if pub_pdf == 'None':
                print("*************************")
                print("Missing PDF for:", pub_doi)
                print("*************************")
            else:
                pdf_file = "pdf_files/" + pub_pdf
                if not Path(pdf_file).is_file():
                    print("*************************")
                    print("Missing file for:", pdf_file, "for", pub_doi)
                    print("*************************")
                else: 
                    print("PDF filename", pdf_file)
                    pdf_text = get_pdf_text(pdf_file) # gets the whole PDF text
                    ref_sentences = get_ref_sentences(pdf_text) # filter only references which mention data or information
                    data_sentences = get_all_data_sentences(pdf_text)
                    for r_sentence in ref_sentences:
                        dt_link = get_http_ref(r_sentence)
                        if 'supplem' in r_sentence.lower():
                            data_refs.append({'type':'supplementary',"desc":r_sentence, 'data_url':dt_link})
                        else:
                            data_refs.append({'type':'supporting',"desc":r_sentence, 'data_url':dt_link})
                    for d_sentence in data_sentences:
                        dt_link = get_http_ref(d_sentence)
                        if 'supplem' in d_sentence.lower():
                            data_sents.append({'type':'supplementary',"desc":d_sentence, 'data_url':dt_link})
                        else:
                            data_sents.append({'type':'supporting',"desc":d_sentence, 'data_url':dt_link})
            if data_refs != []:
                for data_ref in data_refs:
                    data_record = {'id':pub_id, 'doi':pub_doi}    
                    data_record.update(data_ref)
                    data_records[ref_count] = data_record
                    ref_count += 1
            if data_sents != []:
                for data_sent in data_sents:
                    sentence_record = {'id':pub_id, 'doi':pub_doi}    
                    sentence_record.update(data_sent)
                    data_mentions[mention_count] = sentence_record
                    mention_count += 1
        if a_pub[0] >= stop_processing :
            break # for debugging           
    # csvh.write_csv_data(data_records, 'pdf_data.csv')
    if len(data_mentions) > 0:
        csvh.write_csv_data(data_mentions, out_file)
    return out_name

## Mark for review

Verify if the mentions of data or information actually can be linked to data objects.

Results need to be reviewed interactively

In [23]:
def review_interactivex(data_refs, work_dir):
    in_name = data_refs + "_int"
    out_name = data_refs + "_rev"
    out_file = Path(work_dir, out_name + ".csv")
    if out_file.is_file():
        print ("Already checked data refences see:", out_file)
        return out_name
        
    print('Input File: ', in_name)
    # Open results file
    data_mentions, dm_headers = csvh.get_csv_data(Path(work_dir, in_name+'.csv'))
    print(dm_headers)
    art_id = ''
    terminate = False
    additional_rows = {}
    for dm in data_mentions:
        if data_mentions[dm]['action']=='review':
            clear_output()
            print ("*******************************************")
            print ("Article id  :", data_mentions[dm]['id'])
            print ("DOI         :", data_mentions[dm]['doi'])
            print ("Type        :", data_mentions[dm]['type'], '\tLine:', dm)
            print ("Description :\n\t", data_mentions[dm]['desc'])
            print ("data_url :", data_mentions[dm]['data_url'])
            print ("*******************************************")
            decide_action = False
            while not decide_action:
                print('Action:')
                print('\tr) review: https://doi.org/'+data_mentions[dm]['doi'])
                print('\ta) add new row')
                print('\tn) next')
                print('\tt) terminate')
                print('\tSelect r, a, n, t:')
                lts = input()
                if lts == "r":
                    data_mentions[dm]['action'] = 'reviewed'
                    print ('https://doi.org/'+data_mentions[dm]['doi'])
                    print ('link:',data_mentions[dm]['link'])
                    add_this = input()
                    data_mentions[dm]['link'] = add_this
                    print ('issue:',data_mentions[dm]['issue'])
                    add_this = input()
                    data_mentions[dm]['issue'] = add_this
                    print ('name:',data_mentions[dm]['name'])
                    add_this = input()
                    data_mentions[dm]['name'] = add_this
                    print ('file:',data_mentions[dm]['file'])
                    add_this = input()
                    data_mentions[dm]['file'] = add_this
                if lts == "a":
                    #add a new row
                    new_idx = len(data_mentions) + len(additional_rows) + 1
                    additional_rows[new_idx] = {}
                    additional_rows[new_idx]['id'] = data_mentions[dm]['id']
                    additional_rows[new_idx]['doi'] = data_mentions[dm]['doi']
                    additional_rows[new_idx]['type'] = data_mentions[dm]['type']
                    additional_rows[new_idx]['desc'] = data_mentions[dm]['desc']
                    additional_rows[new_idx]['action'] = 'reviewed'
                    print ('link:')
                    add_this = input()
                    additional_rows[new_idx]['link'] = add_this
                    print ('issue:')
                    add_this = input()
                    additional_rows[new_idx]['issue'] = add_this
                    print ('name:')
                    add_this = input()
                    additional_rows[new_idx]['name'] = add_this
                    print ('file:')
                    add_this = input()
                    additional_rows[new_idx]['file'] = add_this
                elif lts == "n":
                    if data_mentions[dm]['action'] != 'reviewed':
                        data_mentions[dm]['action'] = 'none'
                    decide_action = True
                elif lts == 't':
                    decide_action = True
                    terminate = True
        art__id = data_mentions[dm]['id']
        if dm > 1700 or terminate:
            break
    if len(additional_rows)> 0 :
        for nr in additional_rows:
           data_mentions[nr] = additional_rows[nr]
    if len(data_mentions) > 0:
       csvh.write_csv_data(data_mentions, out_file)
    return out_name

In [24]:
# clear the output after each loop cycle
from IPython.display import clear_output

def review_interactive(data_refs, work_dir):
    out_name = data_refs + "_int"
    out_file = Path(work_dir, out_name + ".csv")
    if out_file.is_file():
        print ("Already checked data refences see:", out_file)
        return out_name

    # Open results file
    data_mentions, dm_headers = csvh.get_csv_data(Path(work_dir,data_refs+ '.csv'))
    art_id = ''
    for dm in data_mentions:
        # only review if data statement is true
        if data_mentions[dm]['DataStatement'] == "1":
            clear_output()
            print ("*******************************************")
            print ("Article id  :", data_mentions[dm]['id'])
            print ("DOI         :", data_mentions[dm]['doi'])
            print ("Type        :", data_mentions[dm]['type'], '\tLine:', dm)
            print ("Description :\n\t", data_mentions[dm]['desc'])
            print ("data_url :", data_mentions[dm]['data_url'])
            print ("*******************************************")
            decide_action = False
            while not decide_action:
                print('Action:')
                print('\ta) review')
                print('\tb) none')
                print('\tSelect a or b:')
                lts = input()
                if lts == "a":
                    data_mentions[dm]['action'] = 'review'
                    decide_action = True
                elif lts == "b":
                    data_mentions[dm]['action'] = 'none'
                    decide_action = True
        else:
              data_mentions[dm]['action'] = 'none'
        art_id = data_mentions[dm]['id']
    if len(data_mentions) > 0:
        csvh.write_csv_data(data_mentions, out_file)
    return out_name

## Review references interactively

Check each marked reference to determine if they should be added

Run next to get the ones which need to be reviewed online

In [25]:
def revise_online(revised_refs, db_name, work_dir):
    print (revised_refs, db_name, work_dir)
    out_name = 'html_'+db_name
    out_file = Path(work_dir,out_name+'.csv')
    if out_file.is_file():
        print ("Already checked refences online:", out_file)
        return out_name
    in_file = Path(Path(work_dir),revised_refs+'.csv')
    data_mentions, dm_headers = csvh.get_csv_data(in_file)
    filter_mentions = {}
    for dm in data_mentions:
        if 'add' in data_mentions[dm].keys() and data_mentions[dm]['add'] == '1':
            filter_mentions[dm]={}
            for a_field in dm_headers:
                filter_mentions[dm][a_field] = data_mentions[dm][a_field]
    print('filtered mentions:', len(filter_mentions))

    new_do_id_list =[]
    for fm in filter_mentions:
        art_id = int(filter_mentions[fm]["id"])
        if not art_id in new_do_id_list:
            new_do_id_list.append(art_id)

    # currend app DB
    ukchapp_db = "db_files/"+db_name+".sqlite3"

    no_data_pubs = pr_fns.get_pub_app_no_data(ukchapp_db)

    print(len(no_data_pubs))
    print(new_do_id_list, len(new_do_id_list))
    filter_mentions


    int_idx = 0
    revised_list = {}
    if Path("./html_revised202111.csv").is_file():
        revised_list, rl_headers = csvh.get_csv_data('html_revised202111.csv')
        int_idx = len(revised_list)

    already_revised =[]
    for fm in revised_list:
        art_id = int(revised_list[fm]["id"])
        if not art_id in already_revised:
            already_revised.append(art_id)

    for ndp in no_data_pubs:
        if not ndp[0] in new_do_id_list and ndp[0] > 786 and not ndp[0] in already_revised:
            int_idx += 1
            pub_id = ndp[0]
            pub_title = ndp[1]
            pub_doi = ndp[2]
            pub_url = ndp[3]
            data_record = {'id':pub_id, 'doi':pub_doi, 'title':pub_title} 
            print ('id',pub_id, '\n', pub_title)
            decide_action = False
            terminate = False
            while not decide_action:
                print('Action:')
                print(pub_url)
                print("https://doi.org/"+pub_doi)
                print('\ts) skip (no data)' )
                print('\tr) review')
                print('\tn) next')
                print('\tt) terminate')
                print('\tSelect s, r, n, t:')
                lts = input()
                if lts == "s":
                    data_record['action'] = 'no data'
                    data_record['issue'] = "no data availability or supplementary data mentioned in html or pdf versions or article"
                    revised_list[int_idx] = data_record
                    decide_action = True
                if lts == "r":
                    data_record['action'] = 'review'
                    if 'issue' in data_mentions[dm].keys():
                        print ('issue:',data_mentions[dm]['issue'])
                    add_this = input()
                    data_record['issue'] = add_this
                    revised_list[int_idx] = data_record
                    decide_action = True
                if lts == "n":
                    decide_action = True
                elif lts == 't':
                    decide_action = True
                    terminate = True
            if terminate:
                break

    if len(revised_list) > 0:
        csvh.write_csv_data(revised_list, out_file)
    return out_name

In [34]:
# (0) Add column for pdf_file names
db_name = 'production202412'
add_pdf_file_column(db_name, "articles", "pdf_file", "varchar")

#(1) previously verified files:
last_processed = 800
prev_db_name = "production202410"

db_name, names_added = add_pdf_file_names(prev_db_name, db_name)
# working dir
pdf_data_search_dir = "./data_search_pdf_b"

if (names_added):
    print ("1. PDF file names copied to", db_name);
pdfs_ok = pdf_data_exists(db_name)
if pdfs_ok:
    print ("2. PDF file names copied to", db_name);
    not_indexed = check_files_in_db(db_name)
    if not_indexed:
        print ("3. All PDFs are indexed in", db_name )
else:
    pdfs_ok = get_missing_pdfs(db_name)


pdf_file Alredy exists in  articles
1. PDF file names copied to production202412


  0%|          | 0/774 [00:00<?, ?it/s]

*************************
1 Missing file for: pdf_files/Not available for 10.1002/9783527804085.ch10 64
*************************
2 Missing file for: pdf_files/NA for 10.1016/b978-0-12-805324-9.09989-1 599
*************************
3 Missing file for: pdf_files/Not available for 10.1142/q0035 603
*************************
4 Missing file for: pdf_files/NA for 10.1142/q0354 925
*************************
5 Missing file for: pdf_files/NA for None 1029
2. PDF file names copied to production202412


0it [00:00, ?it/s]

Not in DB: 10.1002_cplu.202300413.pdf
Not in DB: ChemBioChem-2023-Wahart-Harnessing_a_Biocatalyst_to_Bioremediate.pdf
Not in DB: ChemPlusChem - 2023 - Price - Impact of Porous Silica Nanosphere Architectures on the Catalytic Performance of Supported.pdf
Not in DB: ChemPlusChem-2023-Aljohani-Enhancing_Hydrogen_Production_from_the_Photoreforming _of_Lignin.pdf
Not in DB: ChemPlusChem-2023-Peng-A_Facile_Synthesis_Route_to_AuPd_Alloys.pdf
Not in DB: ChemSusChem-2023-Al_Sobhi-A_Comparison_of_the_Reactivity_of_the_Lattice_Nitrogen.pdf
Not in DB: dorota_matras_phd.PDF
Not in DB: Synology_RS816_Data_Sheet_enu.pdf


In [36]:
#review data references
start_from = 1047
stop_at = 1071
pdf_mentions = "pdf_mentionsproduction202402"
if pdfs_ok:
    pdf_mentions = get_data_refs(db_name, start_from, stop_at, pdf_data_search_dir)
    print ("4. PDFs data references stored in", pdf_mentions)
    # use perceptron to filter data
    ######
    # review marked as datasentences
    review_marked = review_interactive(pdf_mentions, pdf_data_search_dir)
    print ("5. PDFs data references marked for review", review_marked)
    html_refs = revise_online(pdf_mentions, db_name, pdf_data_search_dir)
    #print ("6. Online references reviewed", html_refs)    

  0%|          | 0/774 [00:00<?, ?it/s]

PDF filename pdf_files/d4ey00044g.pdf
PDF filename pdf_files/s41929-024-01181-w.pdf
PDF filename pdf_files/wang-et-al-2024-amphiphilic-janus-particles-for-aerobic-alcohol-oxidation-in-oil-foams.pdf
PDF filename pdf_files/farooq-et-al-2024-chemical-imaging-of-carbide-formation-and-its-effect-on-alcohol-selectivity-in-fischer-tropsch.pdf
PDF filename pdf_files/hardy-et-al-2024-probing-ferryl-reactivity-in-a-nonheme-iron-oxygenase-using-an-expanded-genetic-code.pdf
PDF filename pdf_files/catalysts-14-00463.pdf
PDF filename pdf_files/AdvancedScience-2024-ulHaq-DielectricBarrierPlasmaDischargeExsolutionNanoparticlesRoomTemperature.pdf
1303
1304
1305
1306
 repeated, are provided in the corresponding sections of this paper. Ad- ditional results and raw data underlying this work are available in the Supporting Information or on request following instructions provided at https://doi.org/10.15129/e2e11901-92c4-4b2e-a83e-ﬀ25052e972a.
1303
1304
1305
1306
 repeated, are provided in the correspondin

KeyError: 'DataStatement'

In [None]:
if len(revised_list) > 0:
    csvh.writre_csv_data(revised_list, 'html_revised202301.csv')
revised_list

Get the name of the current app db file:

In [None]:
# app db file with path: db_files/app_db.sqlite3
ukchapp_db = "db_files/app_db2.sqlite3"
while not Path(ukchapp_db).is_file():
    print('Please enter the name of app db file:')
    ukchapp_db = input()
ukchapp_db



In [None]:
# get names and links for references in data mentions
data_mentions, dm_fields = csvh.get_csv_data('pdf_mentions_filtered_02.csv', 'num')

for dm in data_mentions:
    print("https://doi.org/" + data_mentions[dm]['doi'])
    
    ref_name = data_mentions[dm]['ref_name']
    while ref_name == "":
        print('Please enter the name of data object:')
        ref_name = input()
    ref_link = data_mentions[dm]['ref_link']
    while ref_link == "":
        print('Please enter the data object link:')
        ref_link = input()
    data_mentions[dm]['ref_name'] = ref_name
    data_mentions[dm]['ref_link'] = ref_link


In [None]:
len(data_records)

In [None]:
data_mentions

In [None]:
from inspect import getmembers, isfunction

In [None]:
help(pdfminer.high_level)