# Get PDF Files for publications in the UK Catalysis Hub app db
A list of publications is obtainded from the app database. This list will contain a titles, IDs and DOIs which need to be explored to look for the corresponding pdf files. 
The steps of the process are: 
 1. get a Title, DOI, and URL for each publication
 2. convert the DOI to a pdf file name and try to open de file
 3. use pdfMiner and/or CDE to get the reference to data
 4. add a new dataset entry each time a new data object is found

In [1]:
# Libraries
# library containign functions that read and write to csv files
import lib.handle_csv as csvh
# library for connecting to the db
import lib.handle_db as dbh
# library for handling text matchings
import lib.text_comp as txtc
# library for getting data from crossref
import lib.crossref_api as cr_api
# library for handling url searchs
import lib.handle_urls as urlh
# managing files and file paths
from pathlib import Path
# add aprogress bar
from tqdm import tqdm_notebook 
from tqdm import tqdm
#library for handling json files
import json
# library for using regular expressions
import re
# library for handling http requests
import requests

# import custom functions (common to various notebooks)
import processing_functions as pr_fns



In [2]:
# functions for ChemDataExtractor
# not used for mining data references (suplementary/raw) or to get pdf metadata
from chemdataextractor import Document

# A function for getting a list of files from the directory
# This will be modified to get the list from a csv file
def get_files_list (source_dir):
    i_counter = 0
    files_list = []
    for filepath in sorted(source_dir.glob('*.pdf')):
        i_counter += 1
        files_list.append(filepath)
    return files_list

def cde_read_pdfs(a_file):
    pdf_f = open(a_file, 'rb')
    doc = Document.from_file(pdf_f)
    return doc

def find_doi(element_text):
    cr_re_01 = '10.\d{4,9}/[-._;()/:A-Z0-9]+'
    compare = re.search(cr_re_01, element_text, re.IGNORECASE)
    if compare != None:
        return compare.group()
    return ""

def get_db_id(doi_value, db_name = "app_db.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    table = 'articles'   
    id_val = db_conn.get_value(table, "id", "doi", doi_value)
    db_conn.close()
    if id_val != None:
        return id_val[0]
    else:
        return 0

def get_db_title(doi_value, db_name = "app_db.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    table = 'articles'   
    id_val = db_conn.get_value(table, "title", "doi", doi_value)
    db_conn.close()
    if id_val != None:
        return id_val[0]
    else:
        return 0

def get_close_dois(str_name, db_name = "prev_search.sqlite3"):
    db_conn = dbh.DataBaseAdapter(db_name)
    search_in = 'articles'
    fields_required = "id, doi, title, pdf_file"
    filter_str = "doi like '%"+str_name+"%';"

    db_titles = db_conn.get_values(search_in, fields_required, filter_str)
    db_conn.close()
    return db_titles

In [3]:
import pdfminer
from pdfminer.high_level import extract_text

# functions for PDFminer

def get_pdf_text(pdf_file):
    return extract_text(pdf_file)

# get the paragraph fragments with references to data
def get_ref_sentences(pdf_text):
    sentences = pdf_text.split("\n")
    groups=[]
    for sentence in sentences:
        if pr_fns.is_data_stmt(sentence.lower()):
            idx = sentences.index(sentence)
            groups.append([idx-1,idx,idx+1])
    reduced_groups = []
    for group in groups:
        idx_group = groups.index(group)
        if groups.index(group) > 0:
            set_g = set(group)
            # make the array before current a set
            set_bg = set(groups[idx_group - 1])
            # make the array after current a set
            set_ag = set()
            if idx_group + 1 < len(groups):    
                set_ag = set(groups[idx_group + 1])
            if len(set_bg.intersection(set_g)) > 0:
                ordered_union = list(set_bg.union(set_g))
                ordered_union.sort()
                reduced_groups.append(ordered_union)
            if len(set_ag.intersection(set_g)) > 0:
                ordered_union = list(set_ag.union(set_g))
                ordered_union.sort()
                reduced_groups.append(ordered_union)
            if len(reduced_groups) > 0:
                is_in_rg = False
                for a_rg in reduced_groups:
                    if set_g.issubset(a_rg):
                        is_in_rg = True
                        break
                if not is_in_rg:
                    reduced_groups.append(list(set_g))
    return_group = []
    for sentence_group in reduced_groups:
        full_sentence = ""
        for single_sentence in sentence_group:
            full_sentence += sentences[single_sentence].strip()
        return_group.append(full_sentence)
    return return_group

# get the paragraph fragments with references to data
def get_all_data_sentences(pdf_text):
    sentences = pdf_text.split("\n")
    groups=[]
    for sentence in sentences:
        if 'data' in sentence.lower() or 'inform' in sentence.lower():
            idx = sentences.index(sentence)
            groups.append([idx-1, idx, idx+1])
    reduced_groups = []
    for group in groups:
        idx_group = groups.index(group)
        if groups.index(group) > 0:
            set_g = set(group)
            # make the array before current a set
            set_bg = set(groups[idx_group - 1])
            # make the array after current a set
            set_ag = set()
            if idx_group + 1 < len(groups):    
                set_ag = set(groups[idx_group + 1])
            if len(set_bg.intersection(set_g)) > 0:
                ordered_union = list(set_bg.union(set_g))
                ordered_union.sort()
                reduced_groups.append(ordered_union)
            if len(set_ag.intersection(set_g)) > 0:
                ordered_union = list(set_ag.union(set_g))
                ordered_union.sort()
                reduced_groups.append(ordered_union)
            if len(reduced_groups) > 0:
                is_in_rg = False
                for a_rg in reduced_groups:
                    if set_g.issubset(a_rg):
                        is_in_rg = True
                        break
                if not is_in_rg:
                    reduced_groups.append(list(set_g))
    return_group = []
    for sentence_group in reduced_groups:
        full_sentence = ""
        for single_sentence in sentence_group:
            full_sentence += sentences[single_sentence].strip()
        if not full_sentence in return_group:
            return_group.append(full_sentence)
    return return_group

# get the http strings from references to data
def get_http_ref(sentence):
    http_frag = ""
    if 'http' in sentence.lower():
        idx_http = sentence.lower().index('http')
        http_frag = sentence[idx_http:]
        space_in_ref = True
        while " " in http_frag:
            space_idx = http_frag.rfind(" ")
            http_frag = http_frag[:space_idx]
        if(http_frag[-1:]=="."):
            http_frag = http_frag[:-1]
    return http_frag

Get the name of the current app db file:

In [4]:
# app db file with path: db_files/app_db.sqlite3
ukchapp_db = "db_files/app_db.sqlite3"
while not Path(ukchapp_db).is_file():
    print('Please enter the name of app db file:')
    ukchapp_db = input()

## Use pdfminer to get metadata from pdf file

In [5]:
# get publication data from the ukch app
db_pubs = pr_fns.get_pub_app_data(ukchapp_db)

# get the list of dois already mined for data 
input_file = 'pub_data_all.csv'
id_field = 'num'
processed, headings = csvh.get_csv_data(input_file, id_field)
for id_num in processed:
    current_title = processed[id_num]['doi']
processed[1]['num']

processed_dois = []
for entry in processed:
    if not processed[entry]['doi'] in processed_dois:
        processed_dois.append( processed[entry]['doi'])

data_records = {}
data_mentions = {}
ref_count = mention_count = 0
for a_pub in tqdm_notebook(db_pubs):
    data_refs = []
    data_sents = []
    pub_id = a_pub[0]
    pub_title = a_pub[1]
    pub_doi = a_pub[2]
    pub_url = a_pub[3]
    pub_pdf = a_pub[4]
    pub_html = a_pub[5]
    if pub_pdf == 'None':
        print("*************************")
        print("Missing PDF for:", pub_doi)
        print("*************************")
    else:
        pdf_file = "pdf_files/" + pub_pdf
        if not Path(pdf_file).is_file():
            print("*************************")
            print("Missing file for:", pdf_file, "for", pub_doi)
            print("*************************")
        else: 
            print("PDF filename", pdf_file)
            pdf_text = get_pdf_text(pdf_file)
            ref_sentences = get_ref_sentences(pdf_text)
            data_sentences = get_all_data_sentences(pdf_text)
            for r_sentence in ref_sentences:
                dt_link = get_http_ref(r_sentence)
                if 'supplem' in r_sentence.lower():
                    data_refs.append({'type':'supplementary',"desc":r_sentence, 'data_url':dt_link})
                else:
                    data_refs.append({'type':'supporting',"desc":r_sentence, 'data_url':dt_link})
            for d_sentence in data_sentences:
                dt_link = get_http_ref(d_sentence)
                if 'supplem' in d_sentence.lower():
                    data_sents.append({'type':'supplementary',"desc":d_sentence, 'data_url':dt_link})
                else:
                    data_sents.append({'type':'supporting',"desc":d_sentence, 'data_url':dt_link})
    if data_refs != []:
        for data_ref in data_refs:
            data_record = {'id':pub_id, 'doi':pub_doi}    
            data_record.update(data_ref)
            data_records[ref_count] = data_record
            ref_count += 1
    if data_sents != []:
        for data_sent in data_sents:
            sentence_record = {'id':pub_id, 'doi':pub_doi}    
            sentence_record.update(data_sent)
            data_mentions[mention_count] = sentence_record
            mention_count += 1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=342.0), HTML(value='')))

PDF filename pdf_files/s41929-019-0334-3.pdf
PDF filename pdf_files/acscatal.9b00685.pdf
PDF filename pdf_files/cctc.201901268.pdf
PDF filename pdf_files/chem.201805250.pdf
PDF filename pdf_files/1-s2.0-S0968089618313233-main.pdf
PDF filename pdf_files/acssuschemeng.8b03568.pdf
PDF filename pdf_files/Smart anime donors.pdf
PDF filename pdf_files/C8OB00066B.pdf
PDF filename pdf_files/1-s2.0-S0968089617301268-main.pdf
PDF filename pdf_files/acs.biochem.8b00169.pdf
PDF filename pdf_files/acscatal.8b00389.pdf
PDF filename pdf_files/acscatal.8b00624.pdf
PDF filename pdf_files/jacs.7b12621.pdf
PDF filename pdf_files/acscatal.8b03169.pdf
PDF filename pdf_files/acscatal.9b01820(1).pdf
PDF filename pdf_files/C9CC02459J.pdf
PDF filename pdf_files/s41929-018-0213-3.pdf
PDF filename pdf_files/1-s2.0-S0926860X18305003-main.pdf
PDF filename pdf_files/1-s2.0-S0926860X18305817-main.pdf
PDF filename pdf_files/acs.jpcc.8b08420.pdf
PDF filename pdf_files/1-s2.0-S0926337318306167-main.pdf
PDF filename pdf

PDF filename pdf_files/nature21001.pdf
PDF filename pdf_files/jacs.5b13070.pdf
PDF filename pdf_files/anie.201602930.pdf
PDF filename pdf_files/jacs.6b00710.pdf
PDF filename pdf_files/rsta.2015.0085.pdf
PDF filename pdf_files/rspa.2016.0078.pdf
PDF filename pdf_files/C5SC03494A.pdf
PDF filename pdf_files/C5CC08714G.pdf
PDF filename pdf_files/C5CC08681G.pdf
PDF filename pdf_files/cssc.201501225.pdf
PDF filename pdf_files/jacs.5b09913.pdf
PDF filename pdf_files/cs502038y.pdf
PDF filename pdf_files/acs.inorgchem.5b02038.pdf
PDF filename pdf_files/acs.macromol.5b01293.pdf
PDF filename pdf_files/acs.inorgchem.5b02233.pdf
PDF filename pdf_files/acs.macromol.5b00225.pdf
PDF filename pdf_files/acscatal.5b01327.pdf
PDF filename pdf_files/om501252m.pdf
PDF filename pdf_files/C4RA16127K.pdf
PDF filename pdf_files/ja5062467.pdf
PDF filename pdf_files/C7CY00184C.pdf
PDF filename pdf_files/cctc.201600925.pdf
PDF filename pdf_files/cctc.201601603.pdf
PDF filename pdf_files/cctc.201601692.pdf
PDF file

In [6]:
if len(data_records) > 0:
    csvh.write_csv_data(data_records, 'pdf_data.csv')
    
if len(data_mentions) > 0:
    csvh.write_csv_data(data_mentions, 'pdf_mentions.csv')

In [7]:
len(data_records)

21