# Look up in the scholix registry

Read DOIs from rails app. Look up each DOI in the Scholix registry and save links to a csv file.




In [None]:
# library containign read and write functions to csv file
import lib.handle_csv as csvh

# library for handling url searchs
import lib.handle_urls as urlh

# library for connecting to the db
import lib.handle_db as dbh

# import custom functions (common to various notebooks)
import processing_functions as pr_fns

# managing files and file paths
from pathlib import Path

# add aprogress bar
from tqdm import tqdm_notebook 
from tqdm import tqdm

# regular expressions
import re

import json


In [11]:
# list of already searched DOIs
doi_list = ['10.1002/chem.202000067', '10.1016/j.jcat.2018.01.033', '10.1021/acscatal.9b03889', 
            '10.1039/d0cp01227k', '10.1039/d0cy01061h', '10.1098/rsta.2020.0058', '10.1098/rsta.2020.0063', 
            '10.1039/D0CY01608J', '10.1021/acs.est.0c04279', '10.1039/D0CP01192D', '10.1039/d0cy01779e', 
            '10.1021/acsenergylett.0c02614', '10.1039/d1fd00004g', '10.3390/catal10121370', 
            '10.1039/d1gc00901j', '10.1038/s41467-021-21062-1', '10.1021/acscatal.0c05413',
            '10.1021/acscatal.0c04858', '10.1088/1361-648x/abfe16', '10.1088/1361-6463/abe9e1', 
            '10.1039/d0sc03113e', '10.1007/s11244-021-01447-8', '10.1021/acs.organomet.1c00055', 
            '10.1021/acscatal.0c05019', '10.1021/acs.inorgchem.1c00327', '10.1002/smsc.202100032', 
            '10.1039/d0gc02295k', '10.1002/anie.201901592', '10.1021/acs.organomet.9b00845', 
            '10.1021/jacs.9b13106', '10.1002/anie.202006807', '10.1021/jacs.0c07980', '10.1039/d0cy01484b',
            '10.1039/d0cy02164d', '10.1002/anie.202101180', '10.1002/chem.202101140', 
            '10.1021/acsmacrolett.1c00216', '10.1002/anie.201810245', '10.1039/c9sc00385a', 
            '10.1021/acs.macromol.8b01224', '10.1039/c9dt02918d', '10.1038/s41467-019-10481-w', 
            '10.1002/ange.201901592', '10.1039/c9dt00595a', '10.1039/d1cy00238d', 
            '10.1021/acs.inorgchem.8b02923', '10.1002/ange.202006807', '10.1002/anie.201814320', 
            '10.1007/s10562-019-02876-7', '10.1021/acs.jpcc.9b09050', '10.1016/j.apcatb.2017.01.042',
            '10.1039/d0cc04036c', '10.1002/anie.202015016', '10.1039/d1ta01464a', '10.1002/smtd.202100512',
            '10.1107/s1600576720013576', '10.1039/d0cp00793e', '10.1039/d0ta01398f', 
            '10.1007/s11244-021-01450-z', '10.1039/d0ta08351h', '10.1021/acssuschemeng.1c01451',
            '10.1002/cphc.201800721', '10.1021/acssuschemeng.8b04073', '10.1002/cctc.202100286', 
            '10.1007/s11244-020-01245-8', '10.1021/acscatal.0c03620', '10.1016/j.cattod.2018.06.033', 
            '10.1016/j.apcatb.2020.118752', '10.1016/j.joule.2020.07.024', '10.1002/anie.201814381', 
            '10.1002/ange.201902857']

## Search for references direclty in scholexplorer
The next code makes a search of scholix references to data using the scholexeplorer of OpenAire

In [12]:
# look up in sholix
# Get pdf and html name from previous and put it in current
def search_scolix(db_name, work_dir, start_from = 0, cut_date="202408"):
    out_name = "search_scholix_"+db_name+"_"+cut_date
    out_file = Path(work_dir, out_name + ".csv")
    if out_file.is_file():
        print ("Already searched for", db_name)
        return out_name
    data_links = {}
    a_dl = {}
    url_base = 'http://api.scholexplorer.openaire.eu/v3/Links?sourcePid='
    ignore_types = ['References','IsReferencedBy'] #test reading all references and see what it comes out
    ignore_subtypes = ['IsCitedBy', 'cites', 'Cites','References','IsReferencedBy']
    terminate = False

    for a_pub in tqdm_notebook(app_pubs):
        pub_id = a_pub[0]
        pub_title = a_pub[1]
        pub_doi = a_pub[2]
        pub_url = a_pub[3]
        match_found = False
        if pr_fns.valid_doi(pub_doi) and pub_id > start_from:
            response = urlh.getPageFromURL(url_base + pub_doi.replace('/','%2f'))
            data_results = json.loads(response)
            id_dl = len(data_links)
            for a_result in data_results['result']:
                if not a_result['RelationshipType']['Name'] in ignore_types:
                    if 'SubType' in a_result['RelationshipType'].keys() \
                      and not a_result['RelationshipType']['SubType'] in ignore_subtypes:                        
                        id_dl += 1
                        source_doi = pub_doi
                        source_title = a_result['source']['Title'].replace('\n',' ')
                        source_published = a_result['source']['PublicationDate']
                        target_id = a_result['target']['Identifier'][0]['ID']
                        target_url = a_result['target']['Identifier'][0]['IDURL']
                        target_type = a_result['target']['Type']
                        if not pr_fns.valid_doi(target_id) and target_type != 'literature':
                            if a_result['target']['Identifier'][0]['IDScheme'] in ['uniprot','pdb']:
                                target_id = a_result['target']['Identifier'][0]['IDURL']
                            else:
                                for an_id in a_result['target']['Identifier']:
                                    print ("source", source_doi, "title", source_title)
                                    print (an_id)
                                terminate = True    
                        target_title = a_result['target']['Title'].replace('\n',' ')
                        target_published = a_result['target']['PublicationDate']

                        rel_type = a_result['RelationshipType']['Name']
                        rel_subtype = a_result['RelationshipType']['SubType']
                        a_dl = {"pub_id": pub_id,"pub_doi":source_doi,'source_title':source_title, 'source_published':source_published,
                                'target_id':target_id, 'target_title':target_title, 
                                'target_published': target_published, 'rel_type': rel_type, 'rel_subtype':rel_subtype}
                        data_links[id_dl]=a_dl
        if terminate:
            break
    print ('References found:', len(data_links))

    
    if len(data_links) > 0  and not terminate:
        csvh.write_csv_data(data_links, out_file)
    return out_name

## Remove duplicates
search for potential duplicates in list (hapens when same target with different relationship tipes)

In [13]:
import csv
# search for potential duplicates in list (hapens when same target with different relationship types)
def remove_dups(srf_name, work_dir):
    print(srf_name)
    in_file = Path(work_dir, srf_name + ".csv")
    out_name = srf_name + "_no_dups"
    out_file = Path(work_dir, out_name + ".csv")
    if out_file.is_file():
        print ("already removed duplicates from:", srf_name)
        return out_name
    data_links = {}
    a_dl = {}
    scholix_references, column_names = csvh.get_csv_data(in_file)

    scholix_look_up, look_up_names = csvh.get_csv_data(in_file)

    for a_ref in tqdm_notebook(scholix_references):
        for look_up in scholix_look_up:
            if scholix_look_up[look_up] != scholix_references[a_ref] \
               and not 'duplicate' in scholix_references[a_ref].keys():
                if scholix_look_up[look_up]['pub_doi'] == scholix_references[a_ref]['pub_doi'] \
                   and scholix_look_up[look_up]['target_id'] == scholix_references[a_ref]['target_id'] \
                   and scholix_look_up[look_up]['target_title'] == scholix_references[a_ref]['target_title']:
                    scholix_references[look_up]['duplicate'] = 'TRUE'
                    print (a_ref ,scholix_references[a_ref])
                    print (look_up, scholix_look_up[look_up])
        if not 'duplicate' in scholix_references[a_ref].keys():
            scholix_references[a_ref]['duplicate'] = 'FALSE'
            with open(out_file, 'a', newline='',encoding='utf8') as f:
                writer = csv.writer(f)
                writer.writerow(scholix_references[a_ref].values())
    return out_name
    

## Check if pdf mentions are on DB
Merge results with those of references mined from publications in preparation for fairnes validation before upload to DB

In [14]:
# look if already referenced in DB

def check_if_in_db(ndf_name, srf_name, work_dir):
    in_file = Path(work_dir, srf_name + ".csv")
    out_name = srf_name + "_not_in_DB"
    out_file = Path(work_dir, out_name + ".csv")
    if out_file.is_file():
        print ("Already checked DB for:", ndf_name)
        return out_name

    # get list of references with no duplicates

    scholix_references, column_names = csvh.get_csv_data(in_file)
    int_counter = 0
    unique_refs = {}

    pub_id = ''
    for a_ref in tqdm_notebook(scholix_references):
        pub_id = scholix_references[a_ref]['pub_id']
        ref_id = scholix_references[a_ref]['target_id']
        ref_title = scholix_references[a_ref]['target_title']
        pub_datsets = pr_fns.get_pub_datasets(ukchapp_db, pub_id)
        int_counter += 1
        #print(int_counter, scholix_references[a_ref], pub_datsets)
        print(f'***************PUBLICATION %s******************'%pub_id)
        identifier_found = False
        for a_ds in pub_datsets:
            ds_id = a_ds[0]
            ds_doi = a_ds[1]
            ds_url = a_ds[2]
            ds_name = a_ds[3]
            if ds_doi != None and ds_doi.strip().lower() == ref_id.strip().lower():
                #print ("DOI FOUND")
                identifier_found = True
            elif ds_url.strip().lower() == ref_id.strip().lower():
                #print ("URL FOUND")
                identifier_found = True
            elif '?' in ds_url and not pr_fns.valid_doi(ref_id):
                print("URL with extra parameters",ds_url)
                print ("compared to", ref_id)
            if identifier_found == True:
                scholix_references[a_ref]['in_db'] = 1
                if ds_name == ref_title:
                    #print('DS Name Match')
                    scholix_references[a_ref]['title_match'] = 1
                #else:
                    #print('DS Name Different')
                break
            else:
                scholix_references[a_ref]['in_db'] = 0

    if len(scholix_references) > 0:
        csvh.write_csv_data(scholix_references, out_file)
    return out_name

In [15]:
# get names and links for references in data mentions
def check_pdf_data(db_name, pdf_out_file, pdf_dir):
    in_name = pdf_out_file
    in_file = Path(pdf_dir, in_name)
    if not in_file.is_file():
        print ("In file not found:", in_name)
        return ""
    out_name = in_name.replace('valid','checked')
    out_file = Path(out_name)
    if out_file.is_file():
        print ("Already checked DB for:", in_name)
        return out_name


    data_mentions, dm_fields = csvh.get_csv_data(in_file, 'num')
    int_counter = 0
    for dm in data_mentions:
        pub_id = data_mentions[dm]['id']
        pub_doi = data_mentions[dm]['doi']
        ref_name = data_mentions[dm]['name']
        ref_link = data_mentions[dm]['data_url']
        ref_id =  data_mentions[dm]['do_id']
        #print (ref_name, ref_link, ref_id)
        pub_datsets = pr_fns.get_pub_datasets(ukchapp_db, pub_id)
        int_counter += 1
        #print(int_counter, scholix_references[a_ref], pub_datsets)
        print(f'***************PUBLICATION %s******************'%pub_id)
        identifier_found = False
        if data_mentions[dm]['add'] == '1':
            for a_ds in pub_datsets:
                ds_id = a_ds[0]
                ds_doi = a_ds[1]
                ds_url = a_ds[2]
                ds_name = a_ds[3]
                if ds_doi != None and ds_doi.strip().lower() == ref_id.strip().lower():
                    #print ("DOI FOUND")
                    identifier_found = True
                elif ds_url.strip().lower() == ref_id.strip().lower():
                    #print ("URL FOUND")
                    identifier_found = True
                elif '?' in ds_url and not pr_fns.valid_doi(ref_id):
                    print("URL with extra parameters",ds_url)
                    print ("compared to", ref_id)
                if identifier_found == True:
                    data_mentions[dm]['in_db'] = 1
                    if ds_name == ref_title:
                        #print('DS Name Match')
                        data_mentions[dm]['title_match'] = 1
                    #else:
                        #print('DS Name Different')
                    break

    if len(data_mentions) > 0:
        csvh.write_csv_data(data_mentions, out_name)
    return out_name

## Merge db filtered PDF results and Scholix results
Merge results with those of references mined from publications in preparation for fairnes validation before upload to DB

In [16]:
def merge_results (scholix_results, pdf_results, db_name, sclx_dir, pdf_dir, out_dir):
    scholix_file = Path(sclx_dir, scholix_results+".csv" )
    pdfresu_file = Path(pdf_dir, pdf_results)    
    print(scholix_file,pdfresu_file)
    # if the required files do not exist
    if (not scholix_file.is_file()) and (not pdfresu_file.is_file()):
        print ("In file not found:", in_name)
        return ""
    out_name = 'new_references_'+db_name
    out_file = Path(out_dir, out_name + ".csv")
    if out_file.is_file():
        print ("Already created merge DB for:", scholix_results, pdf_results)
        return out_name

    # get names and links for references in db checked data mentions
    data_mentions, dm_fields = csvh.get_csv_data(pdfresu_file, 'num')
    # get list of references with no duplicates
    scholix_references, column_names = csvh.get_csv_data(scholix_file)

    merged_references ={}
    # first just copy all the references in scholix to the merged set
    print("Copy all the references in scholix to the merged set")
    for a_ref in tqdm_notebook(scholix_references):
        if scholix_references[a_ref]['in_db']!= '1':
            merged_references[a_ref] = scholix_references[a_ref]

    print ("Check if the data mention is in the scholix references")
    new_idx = len(scholix_references) # start adding after the highest index for scholix
    ccdc_count = len(scholix_references)
    found_count = 0
    for dm in tqdm_notebook(data_mentions):
        pub_id = data_mentions[dm]['id']
        pub_doi = data_mentions[dm]['doi']
        ref_name = data_mentions[dm]['dataset_name']
        ref_link = data_mentions[dm]['data_url']
        ref_id =  data_mentions[dm]['do_id']
        ref_rel = data_mentions[dm]['type']
        print (pub_id,pub_doi,"REF ",ref_name,ref_link,ref_id,ref_rel)
        found_match = False
        if data_mentions[dm]['add'] == '1' and data_mentions[dm]['in_db'] != '1' :
            for a_ref in merged_references:
                if ccdc_count < a_ref:
                    break
                mr_pub_id = merged_references[a_ref]['pub_id']
                mr_pub_doi = merged_references[a_ref]['pub_doi']
                mr_id = merged_references[a_ref]['target_id']
                mr_title = merged_references[a_ref]['target_title']
                # pub_id, pub_doi, and ref_id must match if the reference is already found in scholix
                if pub_doi.strip().lower() == mr_pub_doi.strip().lower() and \
                    pub_id == mr_pub_id and \
                    ref_id.strip().lower() == mr_id.strip().lower():
                    found_count += 1
                    print("found match", found_count, dm, a_ref)
                    found_match = True
                    merged_references[a_ref]['in_pdf']=1
                    print (pub_doi.strip().lower(), mr_pub_doi.strip().lower(), pub_id, mr_pub_id,
                           ref_id, mr_id)
                    break
            if not found_match:
                new_idx += 1 
                a_dl = {"pub_id": pub_id,"pub_doi":pub_doi,'source_title':'', 
                        'source_published':'',
                        'target_id':ref_id, 
                        'target_title':ref_name, 
                        'target_published': '', 
                        'rel_type': ref_rel,
                        'in_pdf':1}
                if not pr_fns.valid_doi(ref_id):
                    #print(ref_id, ref_link)
                    a_dl['target_id'] = ref_link
                merged_references[new_idx] = a_dl

    if len(merged_references) > 0:
        csvh.write_csv_data(merged_references, out_file)
        print (len(merged_references), "merged references")
    return out_name

In [19]:
# Set the name of currend app DB
db_name = 'production202412'
ukchapp_db = "db_files/"+db_name+".sqlite3"
cut_date = "20241231"

pdf_results = "pdf_mentionsproduction202412_checked.csv"

while not Path(ukchapp_db).is_file():
    print('Please enter the name of app db file:')
    ukchapp_db = input()
start_from = 1047
# working dirs
pdf_data_search_dir = "./data_search_pdf"
scholix_data_search_dir = "./data_search_scholix"
data_load_dir = "./data_load"
    
# Get publication data from the ukch app
app_pubs = pr_fns.get_pub_app_data(ukchapp_db)

schlx_search_result = search_scolix(db_name, scholix_data_search_dir, start_from, cut_date)

print ("1. Shcolix results stored in:", schlx_search_result+".csv") 
schlx_search_nd = remove_dups(schlx_search_result, scholix_data_search_dir)
print ("2. Non duplicate shcolix results stored in:", schlx_search_nd+".csv") 
schlx_search_ndb = check_if_in_db(schlx_search_nd, schlx_search_result, scholix_data_search_dir)
print ("3. Not in DB shcolix results stored in:", schlx_search_ndb+".csv")
pdfsrc_db = check_pdf_data(db_name, pdf_results, pdf_data_search_dir)
print ("4. Not in DB pdf results stored in:", pdfsrc_db+".csv")
merged_res = merge_results(schlx_search_ndb, pdfsrc_db, db_name, scholix_data_search_dir,pdf_data_search_dir, data_load_dir)
print ("5. all data mentions stored in:", merged_res +".csv")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for a_pub in tqdm_notebook(app_pubs):


  0%|          | 0/774 [00:00<?, ?it/s]

References found: 11
1. Shcolix results stored in: search_scholix_production202412_20241231.csv
search_scholix_production202412_20241231


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for a_ref in tqdm_notebook(scholix_references):


  0%|          | 0/11 [00:00<?, ?it/s]

4 {'pub_id': '1056', 'pub_doi': '10.1002/cssc.201600246', 'source_title': 'Design of Highly Selective Platinum Nanoparticle Catalysts for the Aerobic Oxidation of KA‐Oil using Continuous‐Flow Chemistry', 'source_published': '2016-03-02', 'target_id': '10.1016/s0920-5861(98)00198-9', 'target_title': 'Catalytic synthesis of 2,6-dimethylphenol from methanol and KA-oil over magnesium oxide catalysts', 'target_published': '1998-09-01', 'rel_type': 'IsRelatedTo', 'rel_subtype': 'isamongtopnsimilardocuments'}
6 {'pub_id': '1056', 'pub_doi': '10.1002/cssc.201600246', 'source_title': 'Design of Highly Selective Platinum Nanoparticle Catalysts for the Aerobic Oxidation of KA‐Oil using Continuous‐Flow Chemistry', 'source_published': '2016-03-02', 'target_id': '10.1016/s0920-5861(98)00198-9', 'target_title': 'Catalytic synthesis of 2,6-dimethylphenol from methanol and KA-oil over magnesium oxide catalysts', 'target_published': '1998-09-01', 'rel_type': 'IsRelatedTo', 'rel_subtype': 'hasamongtopnsi

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for a_ref in tqdm_notebook(scholix_references):


  0%|          | 0/11 [00:00<?, ?it/s]

***************PUBLICATION 1055******************
***************PUBLICATION 1055******************
***************PUBLICATION 1055******************
***************PUBLICATION 1056******************
***************PUBLICATION 1056******************
***************PUBLICATION 1056******************
***************PUBLICATION 1056******************
***************PUBLICATION 1056******************
***************PUBLICATION 1056******************
***************PUBLICATION 1056******************
***************PUBLICATION 1056******************
3. Not in DB shcolix results stored in: search_scholix_production202412_20241231_not_in_DB.csv
In file not found: pdf_mentionsproduction202412_checked.csv
4. Not in DB pdf results stored in: .csv
data_search_scholix\search_scholix_production202412_20241231_not_in_DB.csv data_search_pdf


PermissionError: [Errno 13] Permission denied: 'data_search_pdf'

## Getting Data from article landing pages...

In [9]:
# Scrap landing page
# get abstract and graphic abstract url if possible
# also get reference to SI
# Get pdf and html name from previous and put it in current
def get_doi_landings(db_name, work_dir, start_from = 0, cut_date="202310"):
    out_name = "search_scholix_"+db_name+"_"+cut_date
    out_file = Path(work_dir, out_name + ".csv")
    if out_file.is_file():
        print ("Already searched for", db_name)
        return out_name
    data_links = {}
    a_dl = {}
    url_base = 'https://www.doi.org/'
    ignore_types = ['References','IsReferencedBy']

    terminate = False

    for a_pub in tqdm_notebook(app_pubs):
        pub_id = a_pub[0]
        pub_title = a_pub[1]
        pub_doi = a_pub[2]
        pub_url = a_pub[3]
        match_found = False
        if pr_fns.valid_doi(pub_doi) and pub_id >= start_from:
            print(pub_id,"\t", pub_doi)
            print(url_base+pub_doi)
            list_urls = []
            if '['in pub_url:
                list_urls = eval(pub_url.replace("=>",":"))
            for a_ref in list_urls:
                if "URL" in a_ref.keys() and not 'pdf' in a_ref['URL'] :
                    print ("\t -", a_ref['URL'])
                    #response = urlh.getPageFromURL(a_ref['URL'])
                    #print(len(response))
                    
            #for a_result in data_results['result']:
            #    if not a_result['RelationshipType']['Name'] in ignore_types:
            #        id_dl += 1
            #        source_doi = pub_doi
            #        source_title = a_result['source']['Title'].replace('\n',' ')
            #        source_published = a_result['source']['PublicationDate']
            #        target_id = a_result['target']['Identifier'][0]['ID']
            #        target_type = a_result['target']['Type']
            #        if not pr_fns.valid_doi(target_id) and target_type != 'literature':
            #            if a_result['target']['Identifier'][0]['IDScheme'] in ['uniprot','pdb']:
            #                target_id = a_result['target']['Identifier'][0]['IDURL']
            #            else:
            #                for an_id in a_result['target']['Identifier']:
            #                    print ("source", source_doi, "title", source_title)
            #                    print (an_id)
            #                terminate = True    
            #        target_title = a_result['target']['Title'].replace('\n',' ')
            #        target_published = a_result['target']['PublicationDate']

            #        rel_type = a_result['RelationshipType']['Name']

            #        a_dl = {"pub_id": pub_id,"pub_doi":source_doi,'source_title':source_title, 'source_published':source_published,
            #                'target_id':target_id, 'target_title':target_title, 
            #                'target_published': target_published, 'rel_type': rel_type}
            #        data_links[id_dl]=a_dl
        
        if terminate:
            break
        
        
    #print ('References found:', len(data_links))

    
    #if len(data_links) > 0  and not terminate:
    #    csvh.write_csv_data(data_links, out_file)
    #return out_name

In [10]:
# Set the name of currend app DB
db_name = 'production_n'
ukchapp_db = "db_files/"+db_name+".sqlite3"
cut_date = "20240415"

pdf_results = "pdf_mentionsproduction202312_checked.csv"

while not Path(ukchapp_db).is_file():
    print('Please enter the name of app db file:')
    ukchapp_db = input()
start_from = 972
# working dirs
pdf_data_search_dir = "./data_search_pdf"
scholix_data_search_dir = "./data_search_scholix"
data_load_dir = "./data_load"


# Get publication data from the ukch app
app_pubs = pr_fns.get_pub_app_data(ukchapp_db)

schlx_search_result = get_doi_landings(db_name, scholix_data_search_dir, start_from, cut_date)



Already searched for production_n
