# Look up in the Scholix registry

Read DOIs from the Rails app. Look up each DOI in the Scholix registry and save the links to a CSV file.




In [1]:
# library containing read and write functions for csv files
import lib.handle_csv as csvh

# library for handling url searches
import lib.handle_urls as urlh

# library for connecting to the db
import lib.handle_db as dbh

# import custom functions (common to various notebooks)
import processing_functions as pr_fns

# managing files and file paths
from pathlib import Path

# add a progress bar
from tqdm import tqdm_notebook 
from tqdm import tqdm

# regular expressions
import re

import json


In [2]:
# DOIs that have already been searched (one per line, original order kept)
doi_list = [
    '10.1002/chem.202000067',
    '10.1016/j.jcat.2018.01.033',
    '10.1021/acscatal.9b03889',
    '10.1039/d0cp01227k',
    '10.1039/d0cy01061h',
    '10.1098/rsta.2020.0058',
    '10.1098/rsta.2020.0063',
    '10.1039/D0CY01608J',
    '10.1021/acs.est.0c04279',
    '10.1039/D0CP01192D',
    '10.1039/d0cy01779e',
    '10.1021/acsenergylett.0c02614',
    '10.1039/d1fd00004g',
    '10.3390/catal10121370',
    '10.1039/d1gc00901j',
    '10.1038/s41467-021-21062-1',
    '10.1021/acscatal.0c05413',
    '10.1021/acscatal.0c04858',
    '10.1088/1361-648x/abfe16',
    '10.1088/1361-6463/abe9e1',
    '10.1039/d0sc03113e',
    '10.1007/s11244-021-01447-8',
    '10.1021/acs.organomet.1c00055',
    '10.1021/acscatal.0c05019',
    '10.1021/acs.inorgchem.1c00327',
    '10.1002/smsc.202100032',
    '10.1039/d0gc02295k',
    '10.1002/anie.201901592',
    '10.1021/acs.organomet.9b00845',
    '10.1021/jacs.9b13106',
    '10.1002/anie.202006807',
    '10.1021/jacs.0c07980',
    '10.1039/d0cy01484b',
    '10.1039/d0cy02164d',
    '10.1002/anie.202101180',
    '10.1002/chem.202101140',
    '10.1021/acsmacrolett.1c00216',
    '10.1002/anie.201810245',
    '10.1039/c9sc00385a',
    '10.1021/acs.macromol.8b01224',
    '10.1039/c9dt02918d',
    '10.1038/s41467-019-10481-w',
    '10.1002/ange.201901592',
    '10.1039/c9dt00595a',
    '10.1039/d1cy00238d',
    '10.1021/acs.inorgchem.8b02923',
    '10.1002/ange.202006807',
    '10.1002/anie.201814320',
    '10.1007/s10562-019-02876-7',
    '10.1021/acs.jpcc.9b09050',
    '10.1016/j.apcatb.2017.01.042',
    '10.1039/d0cc04036c',
    '10.1002/anie.202015016',
    '10.1039/d1ta01464a',
    '10.1002/smtd.202100512',
    '10.1107/s1600576720013576',
    '10.1039/d0cp00793e',
    '10.1039/d0ta01398f',
    '10.1007/s11244-021-01450-z',
    '10.1039/d0ta08351h',
    '10.1021/acssuschemeng.1c01451',
    '10.1002/cphc.201800721',
    '10.1021/acssuschemeng.8b04073',
    '10.1002/cctc.202100286',
    '10.1007/s11244-020-01245-8',
    '10.1021/acscatal.0c03620',
    '10.1016/j.cattod.2018.06.033',
    '10.1016/j.apcatb.2020.118752',
    '10.1016/j.joule.2020.07.024',
    '10.1002/anie.201814381',
    '10.1002/ange.201902857',
]

# Path of the current app DB; keep asking until it names an existing file
ukchapp_db = "db_files/app_db20211005.sqlite3"
while True:
    if Path(ukchapp_db).is_file():
        break
    print('Please enter the name of app db file:')
    ukchapp_db = input()

# Load the publication records from the ukch app DB
app_pubs = pr_fns.get_pub_app_data(ukchapp_db)


In [None]:
# Look up each publication DOI in the Scholix registry and collect the links
# it returns, keyed by a running link number that starts at 1.
from urllib.parse import quote

data_links = {}

url_base = 'http://api.scholexplorer.openaire.eu/v2/Links?sourcePid='
# relationship types that are skipped when collecting links
ignore_types = ['References','IsReferencedBy']

# Set when a target identifier is neither a valid DOI nor a uniprot/pdb
# entry: the lookup stops after the current publication and nothing is saved,
# so the unexpected identifiers printed below can be inspected first.
terminate = False

for a_pub in tqdm_notebook(app_pubs):
    pub_id = a_pub[0]
    pub_doi = a_pub[2]
    if not pr_fns.valid_doi(pub_doi):
        continue
    # Percent-encode the whole DOI (not only '/') so that any other reserved
    # character it contains cannot corrupt the query string.
    response = urlh.getPageFromURL(url_base + quote(pub_doi, safe=''))
    data_results = json.loads(response)
    for a_result in data_results['result']:
        rel_type = a_result['RelationshipType']['Name']
        if rel_type in ignore_types:
            continue
        source = a_result['source']
        target = a_result['target']
        # only the first identifier of the target is used to name it
        first_id = target['Identifier'][0]
        target_id = first_id['ID']
        if not pr_fns.valid_doi(target_id):
            if first_id['IDScheme'] in ['uniprot','pdb']:
                # these entries are recorded by their URL instead
                target_id = first_id['IDURL']
            else:
                # unknown scheme: show every identifier of the target
                for an_id in target['Identifier']:
                    print (an_id)
                terminate = True
        a_dl = {"pub_id": pub_id,
                "pub_doi": pub_doi,
                'source_title': source['Title'].replace('\n',' '),
                'source_published': source['PublicationDate'],
                'target_id': target_id,
                'target_title': target['Title'].replace('\n',' '),
                'target_published': target['PublicationDate'],
                'rel_type': rel_type}
        data_links[len(data_links) + 1] = a_dl
    if terminate:
        break
print ('References found:', len(data_links))

if data_links and not terminate:
    csvh.write_csv_data(data_links, "ccdc_scholix3.csv")

In [None]:
# Keep only the references whose 'duplicate' column reads 'FALSE' and save
# them renumbered from 1.
# NOTE(review): the lookup step in this notebook does not write a 'duplicate'
# column, so it is presumably added to the csv by hand - confirm.
scholix_references, column_names = csvh.get_csv_data("ccdc_scholix3.csv")
unique_refs = {}
int_counter = 0
for a_ref in tqdm_notebook(scholix_references):
    reference = scholix_references[a_ref]
    if reference['duplicate'] != 'FALSE':
        continue
    int_counter += 1
    print(int_counter, reference)
    unique_refs[int_counter] = reference

if unique_refs:
    csvh.write_csv_data(unique_refs, "ccdc_scholix3_no_dups.csv")



In [None]:
# Flag the Scholix references whose target is already stored as a dataset of
# the same publication in the app DB ('in_db'), and whether the stored dataset
# name equals the reference title ('title_match').

# get list of references with no duplicates
scholix_references, column_names = csvh.get_csv_data("ccdc_scholix3_no_dups.csv")
# number of references processed so far
int_counter = 0

pub_id = ''
for a_ref in tqdm_notebook(scholix_references):
    reference = scholix_references[a_ref]
    pub_id = reference['pub_id']
    ref_id = reference['target_id']
    ref_title = reference['target_title']
    # identifiers are compared ignoring case and surrounding blanks
    ref_key = ref_id.strip().lower()
    pub_datsets = pr_fns.get_pub_datasets(ukchapp_db, pub_id)
    int_counter += 1
    print(f'***************PUBLICATION {pub_id}******************')
    for a_ds in pub_datsets:
        ds_doi, ds_url, ds_name = a_ds[1], a_ds[2], a_ds[3]
        # either field may be missing (None) for a stored dataset
        doi_found = ds_doi is not None and ds_doi.strip().lower() == ref_key
        url_found = ds_url is not None and ds_url.strip().lower() == ref_key
        if doi_found or url_found:
            reference['in_db'] = 1
            if ds_name == ref_title:
                reference['title_match'] = 1
            break
        if ds_url is not None and '?' in ds_url and not pr_fns.valid_doi(ref_id):
            # a stored URL with a query string may be the same resource;
            # print both so they can be compared by eye
            print("URL with extra parameters",ds_url)
            print ("compared to", ref_id)

if scholix_references:
    csvh.write_csv_data(scholix_references, "ccdc_scholix3_db_checked.csv")

## Check if pdf mentions are on DB
Merge results with those of references mined from publications, in preparation for fairness validation before upload to the DB.

In [None]:
# Flag the data mentions marked for adding ('add' == '1') whose identifier is
# already stored as a dataset of the same publication in the app DB ('in_db'),
# and whether the stored dataset name equals the mention name ('title_match').

# get names and links for references in data mentions
data_mentions, dm_fields = csvh.get_csv_data('pdf_mentions202110_fairness.csv', 'num')

for dm in data_mentions:
    mention = data_mentions[dm]
    pub_id = mention['id']
    ref_name = mention['name']
    ref_id = mention['do_id']
    print(f'***************PUBLICATION {pub_id}******************')
    if mention['add'] != '1':
        continue
    # identifiers are compared ignoring case and surrounding blanks
    ref_key = ref_id.strip().lower()
    # the DB is only queried for the mentions that are actually checked
    pub_datsets = pr_fns.get_pub_datasets(ukchapp_db, pub_id)
    for a_ds in pub_datsets:
        ds_doi, ds_url, ds_name = a_ds[1], a_ds[2], a_ds[3]
        # either field may be missing (None) for a stored dataset
        doi_found = ds_doi is not None and ds_doi.strip().lower() == ref_key
        url_found = ds_url is not None and ds_url.strip().lower() == ref_key
        if doi_found or url_found:
            mention['in_db'] = 1
            # compare against this mention's own name; no other title is
            # defined in this cell
            if ds_name == ref_name:
                mention['title_match'] = 1
            break
        if ds_url is not None and '?' in ds_url and not pr_fns.valid_doi(ref_id):
            # a stored URL with a query string may be the same resource;
            # print both so they can be compared by eye
            print("URL with extra parameters",ds_url)
            print ("compared to", ref_id)

if data_mentions:
    csvh.write_csv_data(data_mentions, "pdf_mentions202110_db_checked.csv")

## Merge db filtered PDF results and Scholix results
Merge results with those of references mined from publications, in preparation for fairness validation before upload to the DB.

In [20]:
# Merge the Scholix references and the PDF data mentions that are not yet in
# the DB into one set. A mention that matches a Scholix reference only marks
# that reference with 'in_pdf'; any other mention is appended as a new row.

# get names and links for references in db checked data mentions
data_mentions, dm_fields = csvh.get_csv_data('pdf_mentions202110_db_checked_1.csv', 'num')
# get list of references with no duplicates
scholix_references, column_names = csvh.get_csv_data("ccdc_scholix3_db_checked.csv")

merged_references = {}
# first just copy all the references in scholix to the merged set
print("Copy all the references in scholix to the merged set")
for a_ref in tqdm_notebook(scholix_references):
    # 'in_db' is only filled in for matched rows, so it may be absent
    if scholix_references[a_ref].get('in_db') != '1':
        merged_references[a_ref] = scholix_references[a_ref]

# Index the copied Scholix references by (pub_id, pub_doi, target_id) so each
# mention is matched with one lookup; setdefault keeps the first reference
# when several share a key. Mentions appended below are never part of the
# index, so they are not matched against each other.
scholix_index = {}
for a_ref, a_link in merged_references.items():
    link_key = (a_link['pub_id'],
                a_link['pub_doi'].strip().lower(),
                a_link['target_id'].strip().lower())
    scholix_index.setdefault(link_key, a_ref)

print ("Check if the data mention is in the scholix references")
# start adding after the highest scholix index; the keys need not be
# contiguous, so their count is not a safe starting point
new_idx = max(scholix_references, default=0)
found_count = 0
for dm in tqdm_notebook(data_mentions):
    mention = data_mentions[dm]
    pub_id = mention['id']
    pub_doi = mention['doi']
    ref_name = mention['name']
    ref_link = mention['data_url']
    ref_id = mention['do_id']
    ref_rel = mention['type']
    if mention['add'] != '1' or mention.get('in_db') == '1':
        continue
    # pub_id, pub_doi, and ref_id must match if the reference is already found in scholix
    mention_key = (pub_id, pub_doi.strip().lower(), ref_id.strip().lower())
    a_ref = scholix_index.get(mention_key)
    if a_ref is not None:
        matched = merged_references[a_ref]
        found_count += 1
        print("found match", found_count, dm, a_ref)
        matched['in_pdf']=1
        print (pub_doi.strip().lower(), matched['pub_doi'].strip().lower(),
               pub_id, matched['pub_id'], ref_id, matched['target_id'])
    else:
        new_idx += 1
        a_dl = {"pub_id": pub_id,"pub_doi":pub_doi,'source_title':'',
                'source_published':'',
                'target_id':ref_id,
                'target_title':ref_name,
                'target_published': '',
                'rel_type': ref_rel,
                'in_pdf':1}
        if not pr_fns.valid_doi(ref_id):
            # mentions without a valid DOI are identified by their link
            a_dl['target_id'] = ref_link
        merged_references[new_idx] = a_dl

if merged_references:
    csvh.write_csv_data(merged_references, "new_references202111.csv")

Copy all the references in scholix to the merged set


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for a_ref in tqdm_notebook(scholix_references):


  0%|          | 0/252 [00:00<?, ?it/s]

Check if the data mention is in the scholix references


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dm in tqdm_notebook(data_mentions):


  0%|          | 0/151 [00:00<?, ?it/s]

found match 1 30 212
10.1021/acs.organomet.9b00845 10.1021/acs.organomet.9b00845 690 690 10.5517/ccdc.csd.cc24486s 10.5517/ccdc.csd.cc24486s
found match 2 36 222
10.1039/d0cy02164d 10.1039/d0cy02164d 696 696 10.5517/ccdc.csd.cc26jv2p 10.5517/ccdc.csd.cc26jv2p
found match 3 37 223
10.1039/d0cy02164d 10.1039/d0cy02164d 696 696 10.5517/ccdc.csd.cc26jv3q 10.5517/ccdc.csd.cc26jv3q
found match 4 38 224
10.1039/d0cy02164d 10.1039/d0cy02164d 696 696 10.5517/ccdc.csd.cc26jv4r 10.5517/ccdc.csd.cc26jv4r
found match 5 45 227
10.1039/c9dt02918d 10.1039/c9dt02918d 704 704 10.5517/ccdc.csd.cc233drd 10.5517/ccdc.csd.cc233drd
found match 6 46 228
10.1039/c9dt02918d 10.1039/c9dt02918d 704 704 10.5517/ccdc.csd.cc233dsf 10.5517/ccdc.csd.cc233dsf
found match 7 47 229
10.1039/c9dt02918d 10.1039/c9dt02918d 704 704 10.5517/ccdc.csd.cc233dtg 10.5517/ccdc.csd.cc233dtg
found match 8 48 230
10.1039/c9dt02918d 10.1039/c9dt02918d 704 704 10.5517/ccdc.csd.cc233dvh 10.5517/ccdc.csd.cc233dvh
found match 9 49 231
10.10

In [21]:
# number of rows in the merged reference set
len(merged_references)


298

In [22]:
# display the merged reference set for a visual check
merged_references

{2: {'pub_id': '10',
  'pub_doi': '10.1021/acs.biochem.8b00169',
  'source_title': 'Biocatalytic Routes to Lactone Monomers for Polymer Production',
  'source_published': '2018-03-13',
  'target_id': 'http://www.ebi.ac.uk/pdbe-srv/view/entry/6er9/summary',
  'target_title': 'crystal structure of cyclohexanone monooxygenase from rhodococcus sp. phi1 bound to nadp+',
  'target_published': '2018-10-02',
  'rel_type': 'IsRelatedTo',
  'duplicate': 'FALSE',
  'more_one': '',
  'in_db': '',
  'title_match': ''},
 3: {'pub_id': '10',
  'pub_doi': '10.1021/acs.biochem.8b00169',
  'source_title': 'Biocatalytic Routes to Lactone Monomers for Polymer Production',
  'source_published': '2018-03-13',
  'target_id': 'http://www.uniprot.org/uniprot/Q84H73',
  'target_title': 'Cyclohexanone monooxygenase',
  'target_published': '2017-04-20',
  'rel_type': 'IsRelatedTo',
  'duplicate': 'FALSE',
  'more_one': '',
  'in_db': '',
  'title_match': ''},
 4: {'pub_id': '10',
  'pub_doi': '10.1021/acs.biochem