# Get data for new articles and list of all articles not citing data
A list of publications is obtainded from the app database. This list will contain a titles, IDs and DOIs which need to be explored to look for the corresponding pdf files. 
The steps of the process are: 
 1. get a Title, DOI, and URL for each publication
 2. convert the DOI to a pdf file name and try to open de file
 3. use pdfMiner and/or CDE to get the reference to data
 4. add a new dataset entry each time a new data object is found

In [1]:
# Libraries
# library containign functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searchs
import lib.handle_urls as urlh
# managing files and file paths
from pathlib import Path
# add aprogress bar
from tqdm import tqdm_notebook 
from tqdm import tqdm
#library for handling json files
import json
# library for using regular expressions
import re
# library for handling http requests
import requests
# import custom functions (common to various notebooks)
import processing_functions as pr_fns

current_step = 2

## Get pdf and HTML names into app DB

0. Add fields to articles table for holding pdf and html names
1. Open the previously verified DB and get the publications list
2. Open the current publication list from the appdb
3. Get pdf and html file names from previous and put it in current



In [2]:

# (0) Add fields
# ALTER TABLE articles 
# ADD COLUMN pdf_file text;
# ALTER TABLE articles 
# ADD COLUMN html_file text;

#(1) previously verified files:
prevapp_db = "db_files/app_db2.sqlite3"
while not Path(prevapp_db).is_file():
    print('Please enter the name of app db file:')
    prevapp_db = input()

# get publication data from the db
prev_pubs = pr_fns.get_pub_data(prevapp_db)

#2 currend app DB
ukchapp_db = "db_files/app_db20210702.sqlite3"
while not Path(ukchapp_db).is_file():
    print('Please enter the name of app db file:')
    ukchapp_db = input()

# get publication data from the ukch app
app_pubs = pr_fns.get_pub_data(ukchapp_db)

# 3 get pdf and html name from previous and put it in current
if current_step == 1:
    
    for a_pub in tqdm_notebook(prev_pubs):
        pub_id = a_pub[0]
        pub_title = a_pub[1]
        pub_doi = a_pub[2]
        pub_url = a_pub[3]
        pub_pdf = a_pub[4]
        pub_html = a_pub[5]
        match_found = False
        for curr_pub in app_pubs:
            if curr_pub[2] == pub_doi and pub_doi != None:
                pr_fns.set_pdf_file_value(pub_pdf, curr_pub[0], ukchapp_db)
                match_found = True
                break
            elif curr_pub[1] == pub_title:
                pr_fns.set_pdf_file_value(pub_pdf, curr_pub[0], ukchapp_db)
                match_found = True
                break
        if not match_found:
            print("*******************\n",a_pub)
    current_step = 2

## Check that pdf files exist 

Use the data on the articles table to verify if file are stored in the corresponding folder
We also check that the files in the folder are all accounted for (have a corersponding record)

In [6]:
if current_step == 2:
    # get publication data from the ukch app
    app_pubs = pr_fns.get_pub_data(ukchapp_db)

    i_indx = 1
    for a_pub in tqdm_notebook(app_pubs):
        pub_id = a_pub[0]
        pub_title = a_pub[1]
        pub_doi = a_pub[2]
        pub_url = a_pub[3]
        pub_pdf = a_pub[4]
        pub_html = a_pub[5]
        if pub_pdf == None:
            print("*************************")
            print(i_indx, "Missing PDF for:", pub_doi, pub_id)
            i_indx +=1
        else:
            pdf_file = "pdf_files/" + pub_pdf
            if not Path(pdf_file).is_file():
                print("*************************")
                print(i_indx, "Missing file for:", pdf_file, "for", pub_doi, pub_id)
                i_indx +=1



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=402.0), HTML(value='')))

*************************
1 Missing PDF for: 10.1002/9783527804085.ch10 64
*************************
2 Missing PDF for: 10.1016/b978-0-12-805324-9.09989-1 599
*************************
3 Missing PDF for: 10.1142/q0035 603



In [4]:
for infile in tqdm_notebook(Path("pdf_files").glob('*.pdf')):
    file_found = False
    for a_pub in app_pubs:
        if infile.name == a_pub[4]:
            file_found = True
            break
    if not file_found:
        print("Not in DB:", infile.name)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

Not in DB: Inns2021_Article_EvaluatingTheActivityAndStabil(1).pdf



## Get missing pdfs

In [5]:
# use regular expression to check if a given string
# is a valid DOI, using pattern from CR
def valid_doi(cr_doi):
    # CR DOIS: https://www.crossref.org/blog/dois-and-matching-regular-expressions/
    # CR DOIs re1
    # /^10.\d{4,9}/[-._;()/:A-Z0-9]+$/i
    if cr_doi == None:
        return False
    cr_re_01 = '^10.\d{4,9}/[-._;()/:A-Z0-9]+'
    compare = re.match(cr_re_01, cr_doi, re.IGNORECASE)
    if compare != None and cr_doi == compare.group():
        return True
    else:
        return False
# get publication data from the ukch app
db_pubs = pr_fns.get_pub_data(ukchapp_db)

for a_pub in tqdm_notebook(db_pubs):
    if a_pub[0] > 616:
        pub_id = a_pub[0]
        pub_title = a_pub[1]
        pub_doi = a_pub[2]
        pub_url = a_pub[3]
        pub_pdf = a_pub[4]
        pub_html = a_pub[5]
        if pub_pdf == None:
            not_in_url = True
            print("ID: ", pub_id, "Publication: ",pub_title,
                  "\n\tDOI: ", pub_doi, " URL: ", pub_url)
            if "pdf" in pub_url:
                print ("\tTry to get the pdf from URL: ", pub_url)
                try:
                    response = requests.get(pub_url)
                    content_type = response.headers['content-type']
                    if not 'text' in content_type:
                        #print(response.headers)
                        cd= response.headers['content-disposition']
                        #print(cd)
                        fname = re.findall("filename=(.+)", cd)[0]
                        #print(fname)
                        if not Path('pdf_files/' + pdf_file).is_file():
                            with open('pdf_files/'+ fname +'.pdf', 'wb') as f:
                                f.write(response.content)
                        else:
                            set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                        not_in_url = False
                except:
                    print("ID: ", pub_id, "\nPublication: ",pub_title, 
                           "\nDOI: ", pub_doi, "\nDOI: ", pub_url) 
            if not_in_url:
                print("\tTry to see if json file has link to pdf: ")
                if valid_doi(pub_doi):
                    crjd, doi_file = pr_fns.get_cr_json_object(pub_doi)
                    got_pdf = False
                    if "link" in crjd.keys():
                        for a_link in crjd["link"]:
                            if "\tURL" in a_link.keys() and ("pdf" in a_link["URL"] or "pdf" in a_link["content-type"]):
                                cr_url = a_link["URL"]
                                #print("URL: ", cr_url)
                                pdf_file = get_pdf_from_url(cr_url)
                                # if the name corresponds to a existing file, assign value to db_record
                                if Path('pdf_files/' + pdf_file).is_file():
                                    print("\tFile name:", pdf_file)
                                    set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                                    got_pdf = True
                                else:
                                    print("\tcould not get file from", cr_url)
                    else: 
                        print("\tno links in json", pub_doi)
                if not got_pdf and "elsevier" in pub_url:
                    print("\tTrying elsevier doi:" )
                    pdf_file = pr_fns.get_elsevier_pdf(pub_doi)
                    if Path('pdf_files/' + pdf_file).is_file():
                        print("\tFile name:", pdf_file)
                        set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                        got_pdf = True
                elif not got_pdf and "wiley" in pub_url:
                    print("\tTrying elsevier doi:" )
                    pdf_file = pr_fns.get_wiley_pdf(pub_doi)
                    if Path('pdf_files/' + pdf_file).is_file():
                        print("\tFile name:", pdf_file)
                        set_pdf_file_value(pdf_file, pub_id, ukchapp_db)
                        got_pdf = True
                if not got_pdf:
                    print("\tTry doi:  https://doi.org/" + pub_doi)
    


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=402.0), HTML(value='')))




## Use pdfminer to get metadata from pdf file

In [9]:
import pdfminer
from pdfminer.high_level import extract_text

# functions for PDFminer

def get_pdf_text(pdf_file):
    return extract_text(pdf_file)

# get the paragraph fragments with references to data
def get_ref_sentences(pdf_text):
    sentences = pdf_text.split("\n")
    groups=[]
    for sentence in sentences:
        if pr_fns.is_data_stmt(sentence.lower()):
            idx = sentences.index(sentence)
            groups.append([idx-1,idx,idx+1])
    reduced_groups = []
    for group in groups:
        idx_group = groups.index(group)
        if groups.index(group) > 0:
            set_g = set(group)
            # make the array before current a set
            set_bg = set(groups[idx_group - 1])
            # make the array after current a set
            set_ag = set()
            if idx_group + 1 < len(groups):    
                set_ag = set(groups[idx_group + 1])
            if len(set_bg.intersection(set_g)) > 0:
                ordered_union = list(set_bg.union(set_g))
                ordered_union.sort()
                reduced_groups.append(ordered_union)
            if len(set_ag.intersection(set_g)) > 0:
                ordered_union = list(set_ag.union(set_g))
                ordered_union.sort()
                reduced_groups.append(ordered_union)
            if len(reduced_groups) > 0:
                is_in_rg = False
                for a_rg in reduced_groups:
                    if set_g.issubset(a_rg):
                        is_in_rg = True
                        break
                if not is_in_rg:
                    reduced_groups.append(list(set_g))
    return_group = []
    for sentence_group in reduced_groups:
        full_sentence = ""
        for single_sentence in sentence_group:
            full_sentence += sentences[single_sentence].strip()
        return_group.append(full_sentence)
    return return_group

# get the paragraph fragments with references to data
def get_all_data_sentences(pdf_text):
    sentences = pdf_text.split("\n")
    groups=[]
    for sentence in sentences:
        if 'data' in sentence.lower() or 'inform' in sentence.lower():
            idx = sentences.index(sentence)
            groups.append([idx-1, idx, idx+1])
    reduced_groups = []
    for group in groups:
        idx_group = groups.index(group)
        if groups.index(group) > 0:
            set_g = set(group)
            # make the array before current a set
            set_bg = set(groups[idx_group - 1])
            # make the array after current a set
            set_ag = set()
            if idx_group + 1 < len(groups):    
                set_ag = set(groups[idx_group + 1])
            if len(set_bg.intersection(set_g)) > 0:
                ordered_union = list(set_bg.union(set_g))
                ordered_union.sort()
                reduced_groups.append(ordered_union)
            if len(set_ag.intersection(set_g)) > 0:
                ordered_union = list(set_ag.union(set_g))
                ordered_union.sort()
                reduced_groups.append(ordered_union)
            if len(reduced_groups) > 0:
                is_in_rg = False
                for a_rg in reduced_groups:
                    if set_g.issubset(a_rg):
                        is_in_rg = True
                        break
                if not is_in_rg:
                    reduced_groups.append(list(set_g))
    return_group = []
    for sentence_group in reduced_groups:
        full_sentence = ""
        for single_sentence in sentence_group:
            full_sentence += sentences[single_sentence].strip()
        if not full_sentence in return_group:
            return_group.append(full_sentence)
    return return_group

# get the http strings from references to data
def get_http_ref(sentence):
    http_frag = ""
    if 'http' in sentence.lower():
        idx_http = sentence.lower().index('http')
        http_frag = sentence[idx_http:]
        space_in_ref = True
        while " " in http_frag:
            space_idx = http_frag.rfind(" ")
            http_frag = http_frag[:space_idx]
        if(http_frag[-1:]=="."):
            http_frag = http_frag[:-1]
    return http_frag

In [10]:
# get publication data from the ukch app
db_pubs = pr_fns.get_pub_data(ukchapp_db)

# get the list of dois already mined for data 
input_file = 'pub_data_add202012.csv'
id_field = 'num'
processed, headings = csvh.get_csv_data(input_file, id_field)
for id_num in processed:
    current_title = processed[id_num]['doi']
processed[1]['num']

processed_dois = []
for entry in processed:
    if not processed[entry]['doi'] in processed_dois:
        processed_dois.append( processed[entry]['doi'])

data_records = {}
data_mentions = {}
ref_count = mention_count = 0
for a_pub in tqdm_notebook(db_pubs):
    data_refs = []
    data_sents = []
    if a_pub[0] > 616:
        pub_id = a_pub[0]
        pub_title = a_pub[1]
        pub_doi = a_pub[2]
        pub_url = a_pub[3]
        pub_pdf = a_pub[4]
        pub_html = a_pub[5]
        if pub_pdf == 'None':
            print("*************************")
            print("Missing PDF for:", pub_doi)
            print("*************************")
        else:
            pdf_file = "pdf_files/" + pub_pdf
            if not Path(pdf_file).is_file():
                print("*************************")
                print("Missing file for:", pdf_file, "for", pub_doi)
                print("*************************")
            else: 
                print("PDF filename", pdf_file)
                pdf_text = get_pdf_text(pdf_file)
                ref_sentences = get_ref_sentences(pdf_text)
                data_sentences = get_all_data_sentences(pdf_text)
                for r_sentence in ref_sentences:
                    dt_link = get_http_ref(r_sentence)
                    if 'supplem' in r_sentence.lower():
                        data_refs.append({'type':'supplementary',"desc":r_sentence, 'data_url':dt_link})
                    else:
                        data_refs.append({'type':'supporting',"desc":r_sentence, 'data_url':dt_link})
                for d_sentence in data_sentences:
                    dt_link = get_http_ref(d_sentence)
                    if 'supplem' in d_sentence.lower():
                        data_sents.append({'type':'supplementary',"desc":d_sentence, 'data_url':dt_link})
                    else:
                        data_sents.append({'type':'supporting',"desc":d_sentence, 'data_url':dt_link})
        if data_refs != []:
            for data_ref in data_refs:
                data_record = {'id':pub_id, 'doi':pub_doi}    
                data_record.update(data_ref)
                data_records[ref_count] = data_record
                ref_count += 1
        if data_sents != []:
            for data_sent in data_sents:
                sentence_record = {'id':pub_id, 'doi':pub_doi}    
                sentence_record.update(data_sent)
                data_mentions[mention_count] = sentence_record
                mention_count += 1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=402.0), HTML(value='')))

PDF filename pdf_files/s41467-020-17852-8.pdf
PDF filename pdf_files/chem.202000067.pdf
PDF filename pdf_files/rsta.2020.0056.pdf
PDF filename pdf_files/aic.17007.pdf
PDF filename pdf_files/ange.202008370.pdf
PDF filename pdf_files/chem.201801871.pdf
PDF filename pdf_files/1-s2.0-S0926337320306718-main.pdf
PDF filename pdf_files/1-s2.0-S0920586118303055-main.pdf
PDF filename pdf_files/1-s2.0-S0920586120303096-main.pdf
PDF filename pdf_files/1-s2.0-S0920586120303370-main.pdf
PDF filename pdf_files/1-s2.0-S0021951718300496-main.pdf
PDF filename pdf_files/1-s2.0-S1385894719325884-main.pdf
PDF filename pdf_files/acscatal.9b03889.pdf
PDF filename pdf_files/acssuschemeng.8b03268.pdf
PDF filename pdf_files/s41586-020-2733-7.pdf
PDF filename pdf_files/d0cp01227k.pdf
PDF filename pdf_files/d0cy01061h.pdf
PDF filename pdf_files/btaa643.pdf
PDF filename pdf_files/rsta.2020.0058.pdf
PDF filename pdf_files/rsta.2020.0063.pdf
PDF filename pdf_files/daniel_dervin_thesis_final.pdf
PDF filename pdf_fil

In [14]:
#if len(data_records) > 0:
#    csvh.write_csv_data(data_records, 'pdf_data.csv')
    
if len(data_mentions) > 0:
    csvh.write_csv_data(data_mentions, 'pdf_mentions202107.csv')

In [None]:
# functions for ChemDataExtractor
# not used for mining data references (suplementary/raw) or to get pdf metadata
from chemdataextractor import Document

# A function for getting a list of files from the directory
# This will be modified to get the list from a csv file
def get_files_list (source_dir):
    i_counter = 0
    files_list = []
    for filepath in sorted(source_dir.glob('*.pdf')):
        i_counter += 1
        files_list.append(filepath)
    return files_list

def cde_read_pdfs(a_file):
    pdf_f = open(a_file, 'rb')
    doc = Document.from_file(pdf_f)
    return doc

def find_doi(element_text):
    cr_re_01 = '10.\d{4,9}/[-._;()/:A-Z0-9]+'
    compare = re.search(cr_re_01, element_text, re.IGNORECASE)
    if compare != None:
        return compare.group()
    return ""

def get_db_id(doi_value, db_name = "app_db.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    table = 'articles'   
    id_val = db_conn.get_value(table, "id", "doi", doi_value)
    db_conn.close()
    if id_val != None:
        return id_val[0]
    else:
        return 0

def get_db_title(doi_value, db_name = "app_db.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    table = 'articles'   
    id_val = db_conn.get_value(table, "title", "doi", doi_value)
    db_conn.close()
    if id_val != None:
        return id_val[0]
    else:
        return 0

def get_close_dois(str_name, db_name = "prev_search.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, doi, title, pdf_file"
    filter_str = "doi like '%"+str_name+"%';"

    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

Get the name of the current app db file:

In [None]:
# app db file with path: db_files/app_db.sqlite3
ukchapp_db = "db_files/app_db2.sqlite3"
while not Path(ukchapp_db).is_file():
    print('Please enter the name of app db file:')
    ukchapp_db = input()
ukchapp_db



In [None]:
# get names and links for references in data mentions
data_mentions, dm_fields = csvh.get_csv_data('pdf_mentions_filtered_02.csv', 'num')

for dm in data_mentions:
    print("https://doi.org/" + data_mentions[dm]['doi'])
    ref_name = data_mentions[dm]['ref_name']
    while ref_name == "":
        print('Please enter the name of data object:')
        ref_name = input()
    ref_link = data_mentions[dm]['ref_link']
    while ref_link == "":
        print('Please enter the data object link:')
        ref_link = input()
    data_mentions[dm]['ref_name'] = ref_name
    data_mentions[dm]['ref_link'] = ref_link
                      

In [None]:
len(data_records)

In [11]:
data_mentions

{0: {'id': 619,
  'doi': '10.1038/s41467-020-17852-8',
  'type': 'supporting',
  'desc': 'EDS investigations. Data were obtained from the probe-corrected JEM ARM200CF (JEOL, Japan) with large solid-angle dual EDS detectors for X-ray spec-troscopy and elemental mapping. The EDS data acquisition was carried out inSTEM imaging mode, with a probe current of 143 pA (probe size is 5 C) at 200 keV',
  'data_url': ''},
 1: {'id': 619,
  'doi': '10.1038/s41467-020-17852-8',
  'type': 'supporting',
  'desc': 'mappings were merged. Gatan Microscopy Suite Software was used for EDSspectrum imaging data acquisition.',
  'data_url': ''},
 2: {'id': 619,
  'doi': '10.1038/s41467-020-17852-8',
  'type': 'supporting',
  'desc': 'setup and data acquisition parameters. The catalysts powder was packed into aKapton foil reaction tube (diameter 6 mm) with quartz wool at both ends. Thecatalysts were performed in a plug-ﬂow microreactor with the same X-ray beam',
  'data_url': ''},
 3: {'id': 619,
  'doi': '10