We run a process via a cloud serverless pipeline to fetch and cache DOI metadata for any DOIs encountered in our other processes. The cached metadata serves as a meta-store for building graphs and indexes.

In [1]:
import isaid_helpers
import requests
import json
import pandas as pd
import os
import pickle
import re
import datetime
import click
from copy import copy
from pylinkedcmd import utilities, doi
import validators
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm



In [2]:
%%time
if click.confirm('Are you sure you want to run the process to get all DOI data from the cache?', default=True):
    doi_cache = isaid_helpers.cache_chs_cache("doi")
    pickle.dump(doi_cache, open(isaid_helpers.f_raw_doi, "wb"))
    print(
        isaid_helpers.f_raw_doi, 
        "CREATED", 
        datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_raw_doi))
    )
else:
    doi_cache = pickle.load(open(isaid_helpers.f_raw_doi, "rb"))
    doi_cache = [i for i in doi_cache if "error" not in i]
    print("doi_cache loaded to memory from cache file")

Are you sure you want to run the process to get all DOI data from the cache? [Y/n]: n
doi_cache loaded to memory from cache file
CPU times: user 1.36 s, sys: 269 ms, total: 1.63 s
Wall time: 3.15 s


In [3]:
# Convert every non-error DOI document into a graphable entity record,
# skipping documents for which no entity could be produced (None).
entity_candidates = (
    doi.entity_from_doi(doc)
    for doc in doi_cache
    if "error" not in doc
)
graphable_doi_entities = [
    entity for entity in entity_candidates if entity is not None
]

# Persist the entity table and report when the file was written.
df_doi_entities = pd.DataFrame(graphable_doi_entities)
df_doi_entities.to_csv(isaid_helpers.f_graphable_doi, index=False)
entities_written_at = datetime.datetime.fromtimestamp(
    os.path.getmtime(isaid_helpers.f_graphable_doi)
)
print(isaid_helpers.f_graphable_doi, "CREATED", entities_written_at)
display(df_doi_entities.head())

data/graphable_table_doi.csv CREATED 2021-06-14 08:28:21.507282


Unnamed: 0,doi,name,url,publisher,date_qualifier,year_published,entity_type,journal,description,event
0,10.3133/ofr2000247,Fish community structure in relation to enviro...,http://dx.doi.org/10.3133/OFR2000247,US Geological Survey,2021-03-31T06:34:09.293702,2000,CreativeWork,USGS Open-File Report,,
1,10.1002/eap.2243,Resistance and resilience of pelagic and litto...,http://dx.doi.org/10.1002/EAP.2243,Wiley,2021-03-31T06:37:06.414581,2021,CreativeWork,Ecological Applications,,
2,10.3133/70039590,National Cartographic Information Center Newsl...,http://dx.doi.org/10.3133/70039590,US Geological Survey,2021-03-31T22:08:47.934746,1975,CreativeWork,USGS Newsletter,,
3,10.3133/wsp2163,"Ground-water levels in the United States, 1972...",http://dx.doi.org/10.3133/WSP2163,US Geological Survey,2021-03-31T22:25:31.855010,1977,CreativeWork,USGS [],,
4,10.3133/ofr83254,Water-resources investigations of the U.S. Geo...,http://dx.doi.org/10.3133/OFR83254,US Geological Survey,2021-03-31T22:28:57.198706,1983,CreativeWork,USGS Open-File Report,,


In [4]:
# Flatten the funder records of every DOI document that carries a "funder"
# key into one list of graphable rows.
graphable_doi_funders = [
    funder_record
    for doc in doi_cache
    if "funder" in doc
    for funder_record in doi.funders_from_doi(doc)
]

# Persist the funder table and report when the file was written.
df_doi_funders = pd.DataFrame(graphable_doi_funders)
df_doi_funders.to_csv(isaid_helpers.f_graphable_doi_funders, index=False)
funders_written_at = datetime.datetime.fromtimestamp(
    os.path.getmtime(isaid_helpers.f_graphable_doi_funders)
)
print(isaid_helpers.f_graphable_doi_funders, "CREATED", funders_written_at)
display(df_doi_funders.head())

data/graphable_table_doi_funders.csv CREATED 2021-06-14 08:28:33.690177


Unnamed: 0,doi,reference,date_qualifier,name,rel_type,entity_type,funder_doi,funder_award
0,10.1002/eap.2243,http://dx.doi.org/10.1002/EAP.2243,2021,Department of Water Resources,FUNDER_OF,Organization,10.13039/100004813,
1,10.1002/eap.2243,http://dx.doi.org/10.1002/EAP.2243,2021,Bureau of Reclamation,FUNDER_OF,Organization,10.13039/100006450,
2,10.1016/j.jglr.2013.12.011,http://dx.doi.org/10.1016/J.JGLR.2013.12.011,2014,U.S. Environmental Protection Agency,FUNDER_OF,Organization,10.13039/100000139,
3,10.1080/10871209.2017.1324069,http://dx.doi.org/10.1080/10871209.2017.1324069,2017,"South Dakota Game, Fish and Parks",FUNDER_OF,Organization,10.13039/100014298,T-59-R1
4,10.1016/j.biocon.2016.05.015,http://dx.doi.org/10.1016/J.BIOCON.2016.05.015,2017,U.S. Forest Service and the Ecological Society...,FUNDER_OF,Organization,,12-CA-11221633-096


In [5]:
# Fetch the ORCID of every Person node in the graph that has one; only
# contacts matching one of these ORCIDs are kept below.
with isaid_helpers.graph_driver.session(database=isaid_helpers.graphdb) as session:
    results = session.run("""
    MATCH (p:Person)
    WHERE NOT p.orcid IS NULL
    RETURN p.orcid AS orcid
    """)
    orcids_in_graph = [i["orcid"] for i in results.data()]

# Membership is tested once per contact, so use a set for constant-time
# lookups instead of scanning the list each time.
known_orcids = set(orcids_in_graph)

# Gather the contact records from every cached DOI document.
graphable_doi_contacts = list()
for doi_doc in doi_cache:
    graphable_doi_contacts.extend(doi.contacts_from_doi(doi_doc))

# Keep only contacts whose ORCID is already present in the graph.
graphable_doi_contacts = [i for i in graphable_doi_contacts if i["orcid"] in known_orcids]

df_doi_contacts = pd.DataFrame(graphable_doi_contacts)
df_doi_contacts.to_csv(isaid_helpers.f_graphable_doi_contacts, index=False)
print(
    isaid_helpers.f_graphable_doi_contacts,
    "CREATED",
    datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_graphable_doi_contacts))
)
display(df_doi_contacts.head())

data/graphable_table_doi_contacts.csv CREATED 2021-06-14 08:28:40.049818


Unnamed: 0,doi,reference,date_qualifier,orcid,sequence,rel_type,entity_type,name
0,10.3133/ofr2000247,http://dx.doi.org/10.3133/OFR2000247,2000,0000-0001-6702-4531,additional,AUTHOR_OF,Person,Larry R. Brown
1,10.1002/eap.2243,http://dx.doi.org/10.1002/EAP.2243,2021,0000-0001-6702-4531,additional,AUTHOR_OF,Person,Larry Brown
2,10.3133/sir20085004,http://dx.doi.org/10.3133/SIR20085004,2008,0000-0002-6060-9729,first,AUTHOR_OF,Person,Nathan Wood
3,10.3133/sir20085236,http://dx.doi.org/10.3133/SIR20085236,2008,0000-0002-4381-0746,first,AUTHOR_OF,Person,Stephen B. Gingerich
4,10.3133/fs20063139,http://dx.doi.org/10.3133/FS20063139,2006,0000-0002-6214-6182,first,AUTHOR_OF,Person,Cynthia A. Gardner


In [6]:
# Gather the subject-term records from every cached DOI document.
graphable_doi_terms = list()
for doi_doc in doi_cache:
    graphable_doi_terms.extend(doi.terms_from_doi(doi_doc))

# Build the table once. The earlier mid-cell `pd.DataFrame(...).head()` was
# removed: it was not the cell's last expression, so it displayed nothing and
# only constructed a throwaway DataFrame.
df_doi_terms = pd.DataFrame(graphable_doi_terms)
df_doi_terms.to_csv(isaid_helpers.f_graphable_doi_terms, index=False)
print(
    isaid_helpers.f_graphable_doi_terms,
    "CREATED",
    datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_graphable_doi_terms))
)
display(df_doi_terms.head())

data/graphable_table_doi_terms.csv CREATED 2021-06-14 08:28:44.259135


Unnamed: 0,doi,reference,date_qualifier,name,rel_type,entity_type
0,10.1002/eap.2243,http://dx.doi.org/10.1002/EAP.2243,2021,Ecology,ADDRESSES_SUBJECT,UndefinedSubjectMatter
1,10.1016/j.jglr.2013.12.011,http://dx.doi.org/10.1016/J.JGLR.2013.12.011,2014,Ecology,ADDRESSES_SUBJECT,UndefinedSubjectMatter
2,10.1016/j.jglr.2013.12.011,http://dx.doi.org/10.1016/J.JGLR.2013.12.011,2014,Aquatic Science,ADDRESSES_SUBJECT,UndefinedSubjectMatter
3,10.1016/j.jglr.2013.12.011,http://dx.doi.org/10.1016/J.JGLR.2013.12.011,2014,"Ecology, Evolution, Behavior and Systematics",ADDRESSES_SUBJECT,UndefinedSubjectMatter
4,10.1111/conl.12095,http://dx.doi.org/10.1111/CONL.12095,2014,Ecology,ADDRESSES_SUBJECT,UndefinedSubjectMatter
