# Look up DOIs in the Scholix registry

Read DOIs from the Rails app, look up each DOI in the Scholix registry, and save the links found to a CSV file.




In [None]:
# library containing functions to read and write CSV files
import lib.handle_csv as csvh

# library for handling URL searches
import lib.handle_urls as urlh

# library for connecting to the db
import lib.handle_db as dbh

# import custom functions (common to various notebooks)
import processing_functions as pr_fns

# managing files and file paths
from pathlib import Path

# add a progress bar
from tqdm import tqdm_notebook 
from tqdm import tqdm

import json



# DOIs that have already been searched.
# NOTE(review): nothing in the visible part of this cell reads doi_list;
# confirm whether it is still needed.
doi_list = [
    '10.1002/chem.202000067',
    '10.1016/j.jcat.2018.01.033',
    '10.1021/acscatal.9b03889',
    '10.1039/d0cp01227k',
    '10.1039/d0cy01061h',
    '10.1098/rsta.2020.0058',
    '10.1098/rsta.2020.0063',
    '10.1039/D0CY01608J',
    '10.1021/acs.est.0c04279',
    '10.1039/D0CP01192D',
    '10.1039/d0cy01779e',
    '10.1021/acsenergylett.0c02614',
    '10.1039/d1fd00004g',
    '10.3390/catal10121370',
    '10.1039/d1gc00901j',
    '10.1038/s41467-021-21062-1',
    '10.1021/acscatal.0c05413',
    '10.1021/acscatal.0c04858',
    '10.1088/1361-648x/abfe16',
    '10.1088/1361-6463/abe9e1',
    '10.1039/d0sc03113e',
    '10.1007/s11244-021-01447-8',
    '10.1021/acs.organomet.1c00055',
    '10.1021/acscatal.0c05019',
    '10.1021/acs.inorgchem.1c00327',
    '10.1002/smsc.202100032',
    '10.1039/d0gc02295k',
    '10.1002/anie.201901592',
    '10.1021/acs.organomet.9b00845',
    '10.1021/jacs.9b13106',
    '10.1002/anie.202006807',
    '10.1021/jacs.0c07980',
    '10.1039/d0cy01484b',
    '10.1039/d0cy02164d',
    '10.1002/anie.202101180',
    '10.1002/chem.202101140',
    '10.1021/acsmacrolett.1c00216',
    '10.1002/anie.201810245',
    '10.1039/c9sc00385a',
    '10.1021/acs.macromol.8b01224',
    '10.1039/c9dt02918d',
    '10.1038/s41467-019-10481-w',
    '10.1002/ange.201901592',
    '10.1039/c9dt00595a',
    '10.1039/d1cy00238d',
    '10.1021/acs.inorgchem.8b02923',
    '10.1002/ange.202006807',
    '10.1002/anie.201814320',
    '10.1007/s10562-019-02876-7',
    '10.1021/acs.jpcc.9b09050',
    '10.1016/j.apcatb.2017.01.042',
    '10.1039/d0cc04036c',
    '10.1002/anie.202015016',
    '10.1039/d1ta01464a',
    '10.1002/smtd.202100512',
    '10.1107/s1600576720013576',
    '10.1039/d0cp00793e',
    '10.1039/d0ta01398f',
    '10.1007/s11244-021-01450-z',
    '10.1039/d0ta08351h',
    '10.1021/acssuschemeng.1c01451',
    '10.1002/cphc.201800721',
    '10.1021/acssuschemeng.8b04073',
    '10.1002/cctc.202100286',
    '10.1007/s11244-020-01245-8',
    '10.1021/acscatal.0c03620',
    '10.1016/j.cattod.2018.06.033',
    '10.1016/j.apcatb.2020.118752',
    '10.1016/j.joule.2020.07.024',
    '10.1002/anie.201814381',
    '10.1002/ange.201902857',
]


# Default location of the current app DB. If that path is not an existing
# file, keep asking on stdin until one is given.
ukchapp_db = "db_files/app_db20211005.sqlite3"
while True:
    if Path(ukchapp_db).is_file():
        break
    print('Please enter the name of app db file:')
    ukchapp_db = input()

# Publication records read from the app DB
app_pubs = pr_fns.get_pub_app_data(ukchapp_db)

# Links endpoint of the registry; the (encoded) source DOI is appended to it
url_base = 'http://api.scholexplorer.openaire.eu/v2/Links?sourcePid='

# Relationship types that are skipped when collecting links
ignore_types = ['References', 'IsReferencedBy']

# data_links collects one record per link kept; a_dl holds the record
# currently being built
data_links, a_dl = {}, {}

# Set to True by the lookup loop when it has to stop early
terminate = False


# Query the registry for every publication with a valid DOI, keep one record
# per link whose relationship type is not in ignore_types, and save the
# records to a CSV file.
#
# Reads:   app_pubs, url_base, ignore_types
# Updates: data_links (keys 1..n, one record per kept link), a_dl, terminate

# tqdm.tqdm_notebook is deprecated; tqdm.notebook.tqdm is its replacement.
# Imported under a distinct name so the plain `tqdm` import is not shadowed.
from tqdm.notebook import tqdm as notebook_tqdm
from urllib.parse import quote

for a_pub in notebook_tqdm(app_pubs):
    # records are indexed positionally; only the id (0) and DOI (2) are used
    pub_id = a_pub[0]
    pub_doi = a_pub[2]
    if not pr_fns.valid_doi(pub_doi):
        continue

    # Percent-encode the whole DOI (not only '/') so reserved characters
    # such as '#', '&' or ';' cannot break the query string.
    response = urlh.getPageFromURL(url_base + quote(pub_doi, safe=''))
    # NOTE(review): only the 'result' list of this single response is read;
    # confirm whether the endpoint pages its results.
    data_results = json.loads(response)

    for a_result in data_results['result']:
        rel_type = a_result['RelationshipType']['Name']
        if rel_type in ignore_types:
            continue

        source = a_result['source']
        target = a_result['target']

        # The first identifier of the target is used as its id.
        first_id = target['Identifier'][0]
        target_id = first_id['ID']
        if not pr_fns.valid_doi(target_id):
            if first_id['IDScheme'] in ['uniprot', 'pdb']:
                # non-DOI schemes that are accepted: keep their URL instead
                target_id = first_id['IDURL']
            else:
                # Unrecognised identifier: show every identifier of the
                # target for inspection and stop once this publication is
                # finished (nothing is written to the CSV in that case).
                for an_id in target['Identifier']:
                    print(an_id)
                terminate = True

        a_dl = {
            "pub_id": pub_id,
            "pub_doi": pub_doi,
            'source_title': source['Title'],
            'source_published': source['PublicationDate'],
            'target_id': target_id,
            'target_title': target['Title'],
            'target_published': target['PublicationDate'],
            'rel_type': rel_type,
        }
        # keys are consecutive integers starting at 1
        data_links[len(data_links) + 1] = a_dl

    if terminate:
        break

print('References found:', len(data_links))

# Only write when there is something to save and the run was not aborted
if data_links and not terminate:
    csvh.write_csv_data(data_links, "ccdc_scholix3.csv")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for a_pub in tqdm_notebook(app_pubs):


  0%|          | 0/451 [00:00<?, ?it/s]