We run a process via a cloud serverless pipeline to fetch and cache DOI metadata for any DOIs encountered in any of our other processes. That cache serves as a meta-store for building graphs and indexes.

In [1]:
import isaid_helpers
import requests
import json
import pandas as pd
import os
import pickle
import re
import datetime
import click
from copy import copy
from pylinkedcmd import utilities, doi
import validators
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm



In [2]:
%%time
# Either rebuild the local raw DOI file from the remote cache, or reuse the file
# written by a previous run. In both cases `doi_cache` ends up as the in-memory
# list of DOI documents used by the cells below.
if click.confirm('Are you sure you want to run the process to get all DOI data from the cache?', default=True):
    doi_cache = isaid_helpers.cache_chs_cache("doi")
    # Context manager so the file handle is flushed and closed before its mtime is read.
    with open(isaid_helpers.f_raw_doi, "wb") as raw_doi_file:
        pickle.dump(doi_cache, raw_doi_file)
    print(
        isaid_helpers.f_raw_doi,
        "CREATED",
        datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_raw_doi))
    )
else:
    # NOTE(security): pickle.load can execute arbitrary code; only load a file that
    # this pipeline produced itself.
    with open(isaid_helpers.f_raw_doi, "rb") as raw_doi_file:
        doi_cache = pickle.load(raw_doi_file)
    print("doi_cache loaded to memory from cache file")

# Drop error records regardless of which branch ran, so the in-memory cache has the
# same content whether it was freshly fetched or loaded from file. The raw file
# written above is left unfiltered.
doi_cache = [i for i in doi_cache if "error" not in i]

Are you sure you want to run the process to get all DOI data from the cache? [Y/n]: n
doi_cache loaded to memory from cache file
CPU times: user 1.38 s, sys: 257 ms, total: 1.64 s
Wall time: 3.44 s


In [3]:
# Cypher query returning the DOI of every CreativeWork node that has one.
graph_doi_query = """
    MATCH (w:CreativeWork)
    WHERE NOT w.doi IS NULL
    RETURN w.doi AS doi
    """

with isaid_helpers.graph_driver.session(database=isaid_helpers.graphdb) as session:
    dois_in_graph = [record["doi"] for record in session.run(graph_doi_query).data()]

# DOIs already held locally; records carrying an "error" key are skipped.
dois_in_cache = [cached["DOI"] for cached in doi_cache if "error" not in cached]

# DOIs referenced in the graph that have no usable record in the local cache.
dois_missing = list(np.setdiff1d(dois_in_graph, dois_in_cache))

for label, dois in (("In Cache", dois_in_cache), ("Missing", dois_missing)):
    print(label, len(dois))

In Cache 15221
Missing 6611


In [4]:
# Base endpoint of the serverless DOI cache search API.
DOI_CACHE_API = "https://49l0anek31.execute-api.us-west-2.amazonaws.com/prod"


def accumulator(doi_id, api_url=DOI_CACHE_API, timeout=30, session=None, cache=None):
    """Look up one DOI in the remote cache and append its record to the local cache.

    Args:
        doi_id: DOI string to search for in the ``cache_doi`` index.
        api_url: Endpoint to query; defaults to the production cache API.
        timeout: Seconds to wait for the HTTP request before giving up.
        session: Object exposing ``get(url, params=..., timeout=...)`` (for example a
            ``requests.Session``). Defaults to the ``requests`` module.
        cache: List that receives the record. Defaults to the notebook-level
            ``doi_cache`` list.

    Returns:
        None. The first hit's ``_source`` is appended to the cache when the search
        returns at least one hit and that hit carries no "error" key.

    Raises:
        Whatever the HTTP client raises for connection problems, timeouts or a
        non-success status code; these are left to propagate to the caller.
    """
    http = requests if session is None else session
    target = doi_cache if cache is None else cache

    # Pass the DOI through `params` so reserved characters in the identifier
    # (e.g. "&", "#", "<") are URL-encoded instead of corrupting the query string.
    response = http.get(
        api_url,
        params={"es_search_index": "cache_doi", "identifier": doi_id},
        timeout=timeout,
    )
    # Fail with a clear HTTP error rather than a KeyError on an error payload.
    response.raise_for_status()

    hits = response.json()["hits"]["hits"]
    if hits and "error" not in hits[0]["_source"]:
        target.append(hits[0]["_source"])

In [5]:
# Fetch every missing DOI from the remote cache using a pool of 10 threads.
# `accumulator` appends each usable record to `doi_cache` as a side effect.
fetch_error = None
try:
    Parallel(n_jobs=10, prefer="threads")(
        delayed(accumulator)(doi_id) for doi_id in tqdm(dois_missing)
    )
except Exception as e:
    # Keep the records gathered so far; the error is reported after the dump below.
    fetch_error = e
finally:
    # Always persist the cache: previously it was written only when the run failed,
    # so a fully successful run lost every newly fetched record.
    # Dump a copy in case worker threads are still appending (precaution; whether
    # joblib leaves threads running after a failure is not verified here).
    doi_cache_snapshot = list(doi_cache)
    with open(isaid_helpers.f_raw_doi, "wb") as raw_doi_file:
        pickle.dump(doi_cache_snapshot, raw_doi_file)
    print(f"dumped {len(doi_cache_snapshot)} documents to DOI cache file")
    if fetch_error is not None:
        print(fetch_error)

 28%|██▊       | 1860/6611 [00:56<02:18, 34.23it/s]

dumped 15223 documents to DOI cache file
HTTPSConnectionPool(host='49l0anek31.execute-api.us-west-2.amazonaws.com', port=443): Max retries exceeded with url: /prod?es_search_index=cache_doi&identifier=10.15781/t2xw48b73 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x151954e80>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))


 28%|██▊       | 1860/6611 [01:10<02:18, 34.23it/s]

In [7]:
# Convert each cached DOI record into a graph entity, keeping only the records
# for which doi.entity_from_doi returns something other than None.
graphable_doi_entities = [
    entity
    for entity in map(doi.entity_from_doi, doi_cache)
    if entity is not None
]

# Write the entity table to CSV and report when the file was written.
df_doi_entities = pd.DataFrame(graphable_doi_entities)
df_doi_entities.to_csv(isaid_helpers.f_graphable_doi, index=False)

entities_written_at = datetime.datetime.fromtimestamp(
    os.path.getmtime(isaid_helpers.f_graphable_doi)
)
print(isaid_helpers.f_graphable_doi, "CREATED", entities_written_at)
display(df_doi_entities.head())

data/graphable_table_doi.csv CREATED 2021-06-08 07:21:18.471204


Unnamed: 0,doi,name,url,publisher,date_qualifier,year_published,entity_type,journal,description,event
0,10.3133/ofr2000247,Fish community structure in relation to enviro...,http://dx.doi.org/10.3133/OFR2000247,US Geological Survey,2021-03-31T06:34:09.293702,2000,CreativeWork,USGS Open-File Report,,
1,10.1002/eap.2243,Resistance and resilience of pelagic and litto...,http://dx.doi.org/10.1002/EAP.2243,Wiley,2021-03-31T06:37:06.414581,2021,CreativeWork,Ecological Applications,,
2,10.3133/sir20175079,"Groundwater discharge by evapotranspiration, f...",http://dx.doi.org/10.3133/SIR20175079,US Geological Survey,2021-04-01T07:31:15.102431,2017,CreativeWork,USGS Scientific Investigations Report,,
3,10.3133/ofr92484,Selected meteorological data for an arid site ...,http://dx.doi.org/10.3133/OFR92484,US Geological Survey,2021-04-01T07:31:17.221993,1992,CreativeWork,USGS Open-File Report,,
4,10.3133/ofr20071239,Visitor and community survey results for Prime...,http://dx.doi.org/10.3133/OFR20071239,US Geological Survey,2021-04-01T00:30:40.548623,2007,CreativeWork,USGS Open-File Report,,


In [8]:
# Flatten the funder rows of every cached DOI record that has a "funder" key.
graphable_doi_funders = [
    funder
    for funded_doc in doi_cache
    if "funder" in funded_doc
    for funder in doi.funders_from_doi(funded_doc)
]

# Write the funder table to CSV and report when the file was written.
df_doi_funders = pd.DataFrame(graphable_doi_funders)
df_doi_funders.to_csv(isaid_helpers.f_graphable_doi_funders, index=False)

funders_written_at = datetime.datetime.fromtimestamp(
    os.path.getmtime(isaid_helpers.f_graphable_doi_funders)
)
print(isaid_helpers.f_graphable_doi_funders, "CREATED", funders_written_at)
display(df_doi_funders.head())

data/graphable_table_doi_funders.csv CREATED 2021-06-08 07:21:26.182458


Unnamed: 0,doi,reference,date_qualifier,name,rel_type,entity_type,funder_doi,funder_award
0,10.1002/eap.2243,http://dx.doi.org/10.1002/EAP.2243,2021,Department of Water Resources,FUNDER_OF,Organization,10.13039/100004813,
1,10.1002/eap.2243,http://dx.doi.org/10.1002/EAP.2243,2021,Bureau of Reclamation,FUNDER_OF,Organization,10.13039/100006450,
2,10.1016/j.jglr.2013.12.011,http://dx.doi.org/10.1016/J.JGLR.2013.12.011,2014,U.S. Environmental Protection Agency,FUNDER_OF,Organization,10.13039/100000139,
3,10.1080/10871209.2017.1324069,http://dx.doi.org/10.1080/10871209.2017.1324069,2017,"South Dakota Game, Fish and Parks",FUNDER_OF,Organization,10.13039/100014298,T-59-R1
4,10.1016/j.biocon.2016.05.015,http://dx.doi.org/10.1016/J.BIOCON.2016.05.015,2017,U.S. Forest Service and the Ecological Society...,FUNDER_OF,Organization,,12-CA-11221633-096


In [22]:
# Collect the ORCIDs of every Person node in the graph that has one.
with isaid_helpers.graph_driver.session(database=isaid_helpers.graphdb) as session:
    results = session.run("""
    MATCH (p:Person)
    WHERE NOT p.orcid IS NULL
    RETURN p.orcid AS orcid
    """)
    orcids_in_graph = [i["orcid"] for i in results.data()]

# Set for constant-time membership tests; filtering against the list re-scanned
# every graph ORCID for every contact.
orcid_lookup = set(orcids_in_graph)

# Flatten the contact rows of every cached DOI record.
graphable_doi_contacts = list()
for doi_doc in doi_cache:
    graphable_doi_contacts.extend(doi.contacts_from_doi(doi_doc))

# Keep only contacts whose ORCID already exists on a Person node in the graph.
graphable_doi_contacts = [i for i in graphable_doi_contacts if i["orcid"] in orcid_lookup]

# Write the contact table to CSV and report when the file was written.
df_doi_contacts = pd.DataFrame(graphable_doi_contacts)
df_doi_contacts.to_csv(isaid_helpers.f_graphable_doi_contacts, index=False)
print(
    isaid_helpers.f_graphable_doi_contacts, 
    "CREATED", 
    datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_graphable_doi_contacts))
)
display(df_doi_contacts.head())

data/graphable_table_doi_contacts.csv CREATED 2021-06-08 07:48:33.260185


Unnamed: 0,doi,reference,date_qualifier,orcid,sequence,rel_type,entity_type,name
0,10.3133/ofr2000247,http://dx.doi.org/10.3133/OFR2000247,2000,0000-0001-6702-4531,additional,AUTHOR_OF,Person,Larry R. Brown
1,10.1002/eap.2243,http://dx.doi.org/10.1002/EAP.2243,2021,0000-0001-6702-4531,additional,AUTHOR_OF,Person,Larry Brown
2,10.3133/sir20175079,http://dx.doi.org/10.3133/SIR20175079,2017,0000-0002-2086-0417,additional,AUTHOR_OF,Person,Brian J. Andraski
3,10.3133/sir20175079,http://dx.doi.org/10.3133/SIR20175079,2017,0000-0003-3776-3565,additional,AUTHOR_OF,Person,C. Amanda Garcia
4,10.3133/ofr92484,http://dx.doi.org/10.3133/OFR92484,1992,0000-0002-2086-0417,additional,AUTHOR_OF,Person,Brian J. Andraski


In [10]:
# Flatten the subject-term rows of every cached DOI record.
graphable_doi_terms = list()
for doi_doc in doi_cache:
    graphable_doi_terms.extend(doi.terms_from_doi(doi_doc))

# Build the table once. The earlier mid-cell `pd.DataFrame(...).head()` was removed:
# its result was discarded (only a cell's last expression is displayed) and it
# built the frame a second time for nothing.
df_doi_terms = pd.DataFrame(graphable_doi_terms)
df_doi_terms.to_csv(isaid_helpers.f_graphable_doi_terms, index=False)
print(
    isaid_helpers.f_graphable_doi_terms, 
    "CREATED", 
    datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_graphable_doi_terms))
)
display(df_doi_terms.head())

data/graphable_table_doi_terms.csv CREATED 2021-06-08 07:21:32.146592


Unnamed: 0,doi,reference,date_qualifier,name,rel_type,entity_type
0,10.1002/eap.2243,http://dx.doi.org/10.1002/EAP.2243,2021,Ecology,ADDRESSES_SUBJECT,UndefinedSubjectMatter
1,10.1016/j.jglr.2013.12.011,http://dx.doi.org/10.1016/J.JGLR.2013.12.011,2014,Ecology,ADDRESSES_SUBJECT,UndefinedSubjectMatter
2,10.1016/j.jglr.2013.12.011,http://dx.doi.org/10.1016/J.JGLR.2013.12.011,2014,Aquatic Science,ADDRESSES_SUBJECT,UndefinedSubjectMatter
3,10.1016/j.jglr.2013.12.011,http://dx.doi.org/10.1016/J.JGLR.2013.12.011,2014,"Ecology, Evolution, Behavior and Systematics",ADDRESSES_SUBJECT,UndefinedSubjectMatter
4,10.1111/conl.12095,http://dx.doi.org/10.1111/CONL.12095,2014,Ecology,ADDRESSES_SUBJECT,UndefinedSubjectMatter
