In [1]:
import isaid_helpers
import requests
import json
import pandas as pd
import os
import pickle
import re
import datetime
import click
from copy import copy
from pylinkedcmd import utilities
import validators

In [2]:
# ORCID Stuff
def orcid_doc_relationships(orcid_record):
    """Flatten one cached ORCID document into a list of relationship dicts.

    Every relationship carries three shared fields copied from the record:
    ``orcid`` (from ``"orcid"``), ``date_qualifier`` (from ``"_date_cached"``)
    and ``reference`` (from ``"@id"``).

    Two parts of the record are turned into relationships:

    * ``"@reverse"`` -- each ``creator`` item becomes an ``AUTHOR_OF`` and each
      ``funder`` item a ``FUNDER_OF`` relationship to a ``CreativeWork``. When
      the item has an ``"@id"`` it is stored as ``url`` and the result of
      ``utilities.doi_from_string`` on it is stored as ``doi``.
    * ``"affiliation"`` -- each affiliation becomes an ``AFFILIATED_WITH``
      relationship whose ``entity_type`` is the affiliation's ``"@type"``.
      Optional ``alternate_name``, ``doi``, ``url``, ``grid_id`` and
      ``ringgold_id`` fields are added when the source data provides them.

    Both ``"@reverse"`` values and ``"affiliation"`` may be a single object or
    a list of objects; the affiliation ``"identifier"`` is handled the same
    way.

    Args:
        orcid_record: dict holding one cached ORCID document.

    Returns:
        list of dict: one entry per relationship; empty when the record
        contains an ``"error"`` key or has neither section.

    Raises:
        KeyError: if a required key (``"orcid"``, ``"_date_cached"``,
            ``"@id"``, an item's ``"name"``, an affiliation's ``"@type"``) is
            missing, or a ``"@reverse"`` type other than ``creator`` /
            ``funder`` has items.
    """
    # Need to work identifier, url, alternateName, and alumniOf into this still
    if "error" in orcid_record:
        return list()

    # Maps the "@reverse" property name to the target entity and edge label.
    relationship_mapper = {
        "@reverse": {
            "creator": {
                "entity": "CreativeWork",
                "rel": "AUTHOR_OF"
            },
            "funder": {
                "entity": "CreativeWork",
                "rel": "FUNDER_OF"
            }
        }
    }

    relationships = list()

    # Template copied into every relationship produced from this record.
    rel_stud = {
        "orcid": orcid_record["orcid"],
        "date_qualifier": orcid_record["_date_cached"],
        "reference": orcid_record["@id"]
    }

    if "@reverse" in orcid_record:
        for work_type, works in orcid_record["@reverse"].items():
            # A lone work arrives as a bare object rather than a one-item list.
            item_list = works if isinstance(works, list) else [works]
            for item in item_list:
                work_item = copy(rel_stud)
                work_item["name"] = item["name"]
                work_item["entity_type"] = relationship_mapper["@reverse"][work_type]["entity"]
                work_item["rel_type"] = relationship_mapper["@reverse"][work_type]["rel"]
                if "@id" in item:
                    work_item["url"] = item["@id"]
                    work_item["doi"] = utilities.doi_from_string(item["@id"])
                relationships.append(work_item)

    if "affiliation" in orcid_record:
        if isinstance(orcid_record["affiliation"], dict):
            affiliation_list = [orcid_record["affiliation"]]
        else:
            affiliation_list = orcid_record["affiliation"]

        for affiliation in affiliation_list:
            work_item = copy(rel_stud)
            work_item["name"] = affiliation["name"]
            if "alternateName" in affiliation:
                work_item["alternate_name"] = affiliation["alternateName"]
                # The display name becomes "<name>, <alternateName>".
                work_item["name"] = f"{affiliation['name']}, {affiliation['alternateName']}"
            work_item["entity_type"] = affiliation["@type"]
            work_item["rel_type"] = "AFFILIATED_WITH"
            if "@id" in affiliation:
                doi_in_id = utilities.doi_from_string(affiliation["@id"])
                if doi_in_id:
                    work_item["doi"] = doi_in_id
                if validators.url(affiliation["@id"]):
                    work_item["url"] = affiliation["@id"]
                elif affiliation["@id"].split(".")[0] == "grid":
                    # Non-URL ids whose first dot-separated part is "grid"
                    # are kept as GRID identifiers.
                    work_item["grid_id"] = affiliation["@id"]
            if "identifier" in affiliation:
                # Accept either a single identifier object or a list of them,
                # mirroring how "affiliation" itself is handled above; a list
                # previously raised TypeError on the ["propertyID"] lookup.
                identifiers = affiliation["identifier"]
                if not isinstance(identifiers, list):
                    identifiers = [identifiers]
                for identifier in identifiers:
                    # Skip entries that are not objects or carry no propertyID.
                    if isinstance(identifier, dict) and identifier.get("propertyID") == "RINGGOLD":
                        # The first RINGGOLD identifier wins.
                        work_item["ringgold_id"] = identifier["value"]
                        break

            relationships.append(work_item)

    return relationships

def orcid_relationships(orcid_cache, return_format=None):
    """Collect the relationships of every document in an ORCID cache.

    Args:
        orcid_cache: iterable of ORCID documents, each passed to
            ``orcid_doc_relationships``.
        return_format: pass ``"dataframe"`` to receive a ``pandas.DataFrame``;
            any other value (including the default ``None``) returns a list.

    Returns:
        The concatenated relationship dicts of all documents, as a list or as
        a DataFrame built from that list.
    """
    collected = [
        relationship
        for cached_doc in orcid_cache
        for relationship in orcid_doc_relationships(cached_doc)
    ]

    if return_format != "dataframe":
        return collected
    return pd.DataFrame(collected)

In [3]:
%%time
# Populate `orcid_cache`: on confirmation, rebuild it through
# isaid_helpers.cache_chs_cache("orcid") and pickle it to
# isaid_helpers.f_raw_orcid; otherwise reload the pickle already on disk.
# NOTE(review): the interactive prompt blocks an unattended "Run All".
if click.confirm('Are you sure you want to run the process to get all ORCID data from the cache?', default=True):
    orcid_cache = isaid_helpers.cache_chs_cache("orcid")
    # The context manager closes (and therefore flushes) the file before its
    # modification time is read for the confirmation message below.
    with open(isaid_helpers.f_raw_orcid, "wb") as raw_orcid_file:
        pickle.dump(orcid_cache, raw_orcid_file)
    print(
        isaid_helpers.f_raw_orcid, 
        "CREATED", 
        datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_raw_orcid))
    )
else:
    # NOTE(review): pickle.load can execute arbitrary code from the file; this
    # is only safe while the pickle is one written locally by the branch above.
    with open(isaid_helpers.f_raw_orcid, "rb") as raw_orcid_file:
        orcid_cache = pickle.load(raw_orcid_file)
    print("orcid_cache loaded to memory from cache file")

Are you sure you want to run the process to get all ORCID data from the cache? [Y/n]: y
data/process_orcid.p CREATED 2021-06-24 11:22:44.764465
CPU times: user 461 ms, sys: 160 ms, total: 621 ms
Wall time: 11.9 s


In [4]:
%%time
# Build the relationship table from the cached ORCID documents and write it
# to the graphable CSV path without the DataFrame index column.
graphable_orcid_df = orcid_relationships(orcid_cache, return_format="dataframe")
graphable_orcid_df.to_csv(isaid_helpers.f_graphable_orcid, index=False)

# Echo the output path with the file's modification time as a completion marker.
graphable_orcid_mtime = os.path.getmtime(isaid_helpers.f_graphable_orcid)
print(
    isaid_helpers.f_graphable_orcid,
    "CREATED",
    datetime.datetime.fromtimestamp(graphable_orcid_mtime),
)

data/graphable_table_orcid.csv CREATED 2021-06-24 11:24:44.024396
CPU times: user 374 ms, sys: 21.9 ms, total: 396 ms
Wall time: 418 ms
