In [1]:
import requests
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import json
from sciencebasepy import SbSession
from os import path

In [2]:
def login_sciencebase(username):
    sb = SbSession()
    sb.loginc(username)
    return sb

def lookup_wikidata_by_orcid(orc_id):
    # I previously ran the entire set of USGS persons with ORCID and dumped to a file
    if path.exists('usgs_orcid_wikidata.json'):
        with open('usgs_orcid_wikidata.json', 'r') as f:
            wikidata_pull = json.load(f)
            
        existing_record = next((i for i in wikidata_pull if i["orcid"] == orc_id), None)
        
        if existing_record is None:
            return None
        else:
            return existing_record["wikidata_id"]
        
    else:
        endpoint_url = "https://query.wikidata.org/sparql"

        query = """SELECT ?item ?itemLabel WHERE {
          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
          ?item wdt:P496 '%s'.
        }
        LIMIT 100""" % (orc_id)

        user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
        # TODO adjust user agent; see https://w.wiki/CX6
        sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        result_set = sparql.query().convert()

        if len(result_set["results"]["bindings"]) == 1:
            return result_set["results"]["bindings"][0]["item"]["value"]
        else:
            return None
        
def get_sb_person(q):
    # Will only return a person record that has an ORCID value
    request_url = f"https://www.sciencebase.gov/directory/people?format=json&dataset=all&lq=_exists_:orcId&max=1&q={q}"
    
    r = requests.get(
        f"https://www.sciencebase.gov/directory/people?format=json&dataset=all&lq=_exists_:orcId&max=1&q={q}"
    ).json()
    
    if len(r["people"]) == 0:
        return None
    else:
        return r["people"][0]
    
def package_new_ids(person_record, wikidata_id):
    person_record["identifiers"] = [
                {
                    "id": 1,
                    "key": person_record["orcId"],
                    "type": "ORCID"
                },
                {
                    "id": 2,
                    "key": wikidata_id,
                    "type": "WikiData"
                }
            ]

    return person_record

def update_sb_person(person_record=None, email_address=None):
    if person_record is None and email_address is None:
        return None, None
    
    if person_record is None and email_address is not None:
        current_person_record = get_sb_person(email_address)

        if current_person_record is None:
            return None, None
        
        wikidata_id = lookup_wikidata_by_orcid(current_person_record["orcId"])
        
        if wikidata_id is None:
            return None, None
        
        person_record = package_new_ids(current_person_record, wikidata_id)
    
    sb = login_sciencebase(input("User Name: "))
    
    put_link = person_record["link"]["href"]
    del person_record["link"]
    del person_record["links"]
    del person_record["_classSimpleName"]
    del person_record["permissions"]
    
    r = sb._session.put(
         put_link, 
         data=person_record, 
         headers={
             "content-type": "application/json",
             "accept": "application/json"
         }
    )
    
    return person_record, r

def check_all_orcid_wikidata():
    master_dataset = list()
    next_link = "https://www.sciencebase.gov/directory/people?format=json&dataset=all&lq=_exists_:orcId&max=1000"

    while next_link is not None:
        data = requests.get(next_link).json()

        if len(data["people"]) > 0:
            master_dataset.extend([{"id": i["id"], "displayName": i["displayName"], "email": i["email"], "orcid": i["orcId"], "wikidata_id": lookup_wikidata_by_orcid(i["orcId"])} for i in data["people"]])

        if "nextlink" in data.keys():
            next_link = data["nextlink"]["url"]
        else:
            next_link = None
            
    return master_dataset



I wrapped everything here up into a single function that will take a given email address, check to see if we can get a person record with an ORCID, check for a WikiData ID, and then attempt to update the record in the ScienceBase Directory with the new identifier. It also adds the ORCID to the identifiers list. Once we work out the process, I will just run this for every case and then figure out a way to keep it up to date in future.

There's a file where I already ran through and found as many WikiData identifiers as possible. Unfortunately, I saved that without the id values, so you have to run a lookup anyway. At this point, I'm getting a 400 error when I try to use the sbsession to PUT the new document.

In [3]:
updated_person_record, server_response = update_sb_person(email_address="sbristol@usgs.gov")

User Name: sbristol@usgs.gov
········


In [4]:
updated_person_record

{'type': 'person',
 'id': 1193,
 'name': 'Sky Bristol/RGIO/USGS/DOI',
 'displayName': 'Sky Bristol',
 'displayText': 'Sky Bristol',
 'distinguishedName': 'CN=Sky Bristol,OU=CSS,OU=Users,OU=EIT,OU=DI,DC=gs,DC=doi,DC=net',
 'url': 'https://my.usgs.gov/catalog/Global/catalogParty/show/1193',
 'email': 'sbristol@usgs.gov',
 'description': None,
 'richDescriptionHtml': '',
 'note': None,
 'active': True,
 'aliases': [{'name': 'Bristol, R. Sky'},
  {'name': 'Bristol, RS'},
  {'name': 'R. Sky Bristol'},
  {'name': 'Robert Schuyler Bristol'}],
 'identifiers': [{'id': 1, 'key': '0000-0003-1682-4031', 'type': 'ORCID'},
  {'id': 2,
   'key': 'http://www.wikidata.org/entity/Q98058015',
   'type': 'WikiData'}],
 'primaryLocation': {'id': 1866,
  'name': 'Sky Bristol/RGIO/USGS/DOI - Primary Location',
  'shortName': None,
  'description': None,
  'building': None,
  'buildingCode': 'KBT',
  'phone': '3032024181',
  'faxPhone': '3032024229',
  'areaCode': '303',
  'mailAddress': {'line1': None,
   'l

In [5]:
server_response

<Response [400]>