The [ScienceBase Directory](https://www.sciencebase.gov/directory) provides the best route to personnel records for both current and former USGS staff, going back at least a few years. A daily synchronization service pulls basic properties from the internal Active Directory, and ScienceBase provides a REST API that affords reasonable, though sometimes unstable, access to the Directory data. This notebook works through the process of pulling the raw ScienceBase Directory documents into a local cache and extracting the important pieces of information from them into a table used to build our graph. We may move this process to a regular pipeline at some point.

In [1]:
import isaid_helpers
import requests
import json
import pandas as pd
import xmltodict
import os
import pickle
from copy import copy
import re
import validators
import datetime
import string
import click


In [2]:
def get_raw_sb_people():
    """Download every person document from the ScienceBase Directory people API.

    Starts at the first page (up to 1000 records per request) and keeps
    following the ``nextlink`` URL given in each JSON response until a
    response no longer contains one.

    Returns:
        list: the concatenated contents of each page's ``"people"`` array.
    """
    page_url = "https://www.sciencebase.gov/directory/people?format=json&dataset=all&max=1000"
    collected = []

    while True:
        page = requests.get(page_url).json()

        # Pages whose "people" value is empty/falsy contribute nothing.
        if page["people"]:
            collected += page["people"]

        # The absence of a "nextlink" entry marks the final page.
        if "nextlink" not in page:
            return collected

        page_url = page["nextlink"]["url"]

def sb_person_by_id(sbid, raw_sb_people, return_format="supervisor"):
    """Find the first raw person document whose ``"id"`` equals ``sbid``.

    Args:
        sbid: identifier compared against each document's ``"id"`` value.
        raw_sb_people: iterable of raw person documents (dicts) to search.
        return_format: ``"supervisor"`` (default) returns a small dict with
            ``supervisor_name``, ``supervisor_email`` and ``supervisor_uri``;
            any other value returns the matched document unchanged.

    Returns:
        dict or None: the requested representation, or None if nothing matches.
    """
    for candidate in raw_sb_people:
        if candidate["id"] == sbid:
            break
    else:
        # Loop finished without a match.
        return None

    if return_format != "supervisor":
        return candidate

    return dict(
        supervisor_name=candidate["displayName"],
        supervisor_email=candidate["email"],
        supervisor_uri=candidate["link"]["href"],
    )

def summarize_sb_person(sb_person, raw_sb_people):
    """Flatten one raw ScienceBase Directory person document into a summary dict.

    Args:
        sb_person: raw person document (dict). The required keys are read
            unconditionally; optional keys are copied only when present.
        raw_sb_people: full collection of raw person documents, searched to
            resolve the supervisor reference (when one exists).

    Returns:
        dict or None: the flattened record, or None when the document's
        ``"email"`` value is None.
    """
    # Records without an email address are skipped entirely.
    if sb_person["email"] is None:
        return None

    summary = dict(
        name=sb_person["displayName"],
        last_name=sb_person["lastName"],
        url=sb_person["url"],
        email=sb_person["email"],
        source_id_sb_directory=sb_person["link"]["href"],
        fbms_code=sb_person["extensions"]["usgsPersonExtension"]["orgCode"],
        active=sb_person["active"],
        last_updated=sb_person["lastUpdated"],
    )

    # Optional top-level properties, as (source key, summary key) pairs.
    optional_fields = (
        ("orcId", "orcid"),
        ("firstName", "first_name"),
        ("middleName", "middle_name"),
        ("jobTitle", "job_title"),
    )
    for source_key, summary_key in optional_fields:
        if source_key in sb_person:
            summary[summary_key] = sb_person[source_key]

    if "organization" in sb_person:
        summary["organization_name"] = sb_person["organization"]["displayText"]

    # Merge in the flattened location fields when a usable location exists.
    if "primaryLocation" in sb_person:
        location_fields = summarize_sb_location(sb_person["primaryLocation"])
        if location_fields is not None:
            summary.update(location_fields)

    # Resolve the supervisor id to name/email/uri via the full people list.
    person_extension = sb_person["extensions"]["personExtension"]
    if "supervisor" in person_extension:
        supervisor_fields = sb_person_by_id(
            person_extension["supervisor"]["id"],
            raw_sb_people,
        )
        if supervisor_fields is not None:
            summary.update(supervisor_fields)

    return summary

def sb_person_entities(raw_sb_people, return_format="list"):
    """Summarize every raw person document, dropping the ones that yield None.

    Args:
        raw_sb_people: collection of raw person documents; it is also passed
            to ``summarize_sb_person`` as the lookup set for each record.
        return_format: ``"list"`` (default) for a list of summary dicts or
            ``"dataframe"`` for a pandas DataFrame built from that list.

    Returns:
        list, pandas.DataFrame, or None for any other ``return_format``.
    """
    summaries = []
    for raw_person in raw_sb_people:
        summary = summarize_sb_person(raw_person, raw_sb_people)
        if summary is not None:
            summaries.append(summary)

    if return_format == "dataframe":
        return pd.DataFrame(summaries)
    if return_format == "list":
        return summaries
    return None

def raw_sb_orgs(ignore_cache=False):
    """Return the raw ScienceBase Directory organization documents.

    By default a previously pickled copy at ``cache_file_sb_org`` is returned
    when that file exists. Otherwise (or when ``ignore_cache`` is True) the
    organizations are downloaded page by page from the Directory API and the
    cache file is (re)written with the fresh result.

    NOTE(review): ``cache_file_sb_org`` is read as a module-level name that is
    not defined in the visible part of this notebook — confirm where it is set.

    Args:
        ignore_cache: when True, skip the local cache and always download.

    Returns:
        list: the concatenated contents of each page's ``"organizations"`` array
        (or whatever object was stored in the cache file).
    """
    # Use the cache unless the caller asked to bypass it. (The original test was
    # inverted: it read the cache only when ignore_cache was True.)
    if not ignore_cache and os.path.exists(cache_file_sb_org):
        # NOTE: pickle must only be loaded from trusted files; this one is
        # expected to have been written by this same function.
        with open(cache_file_sb_org, "rb") as f:
            return pickle.load(f)

    next_link = "https://www.sciencebase.gov/directory/organizations/get?format=json&max=1000"
    sb_orgs = list()

    while True:
        r_orgs = requests.get(next_link).json()

        if r_orgs["organizations"]:
            sb_orgs.extend(r_orgs["organizations"])

        # Keep following pagination links until the API stops supplying one.
        if "nextlink" in r_orgs:
            next_link = r_orgs["nextlink"]["url"]
        else:
            break

    with open(cache_file_sb_org, "wb") as f:
        pickle.dump(sb_orgs, f)

    return sb_orgs

def summarize_sb_org(sb_org):
    """Flatten one raw organization document into a summary dict.

    Args:
        sb_org: raw organization document (dict).

    Returns:
        dict or None: name, active flag, last-updated value, directory link,
        url and FBMS code; None when the document's ``"extensions"`` has no
        ``"usgsOrganizationExtension"`` entry.
    """
    extension_key = "usgsOrganizationExtension"

    if extension_key in sb_org["extensions"]:
        return dict(
            name=sb_org["displayName"],
            active=sb_org["active"],
            last_updated=sb_org["lastUpdated"],
            source_id_sb_directory=sb_org["link"]["href"],
            url=sb_org["url"],
            fbms_code=sb_org["extensions"][extension_key]["fbmsCode"],
        )

    # Organizations lacking the USGS extension are not summarized.
    return None

def cache_sb_org(raw_sb_orgs, return_format=None):
    """Summarize raw organization documents and write them to a CSV file.

    The CSV is written to the path held in ``cache_file_sb_org_graph``
    (without the index column) on every call, regardless of ``return_format``.

    Args:
        raw_sb_orgs: iterable of raw organization documents.
        return_format: ``"dict"`` returns the list of summary dicts,
            ``"dataframe"`` returns the pandas DataFrame; anything else
            (including the default None) returns None.

    Returns:
        list, pandas.DataFrame, or None.
    """
    summaries = []
    for raw_org in raw_sb_orgs:
        summary = summarize_sb_org(raw_org)
        # Documents that cannot be summarized come back as None and are skipped.
        if summary is not None:
            summaries.append(summary)

    org_table = pd.DataFrame(summaries)
    org_table.to_csv(cache_file_sb_org_graph, index=False)

    if return_format == "dataframe":
        return org_table
    if return_format == "dict":
        return summaries
    return None

def summarize_sb_location(sb_location):
    """Flatten a raw location document into address fields.

    Args:
        sb_location: raw location document (dict) with ``building``,
            ``description``, ``buildingCode`` and a nested ``streetAddress``.

    Returns:
        dict or None: the flattened fields plus a one-line ``string_address``;
        None when the street address has no first line.
    """
    street = sb_location["streetAddress"]

    # A location without a first address line is treated as unusable.
    if street["line1"] is None:
        return None

    location = dict(
        location_name=sb_location["building"],
        location_description=sb_location["description"],
        building_code=sb_location["buildingCode"],
        address_line_1=street["line1"],
        address_line_2=street["line2"],
        city=street["city"],
        state=street["state"],
        zip=street["zip"],
        country=street["country"],
        string_address=f'{street["line1"]}, {street["city"]}, {street["state"]} {street["zip"]}',
    )

    # Name fallback order: building, then description, then the address string.
    if location["location_name"] is None:
        description = location["location_description"]
        if description is not None:
            location["location_name"] = description
        else:
            location["location_name"] = location["string_address"]

    return location

In [3]:
%%time
# Either refresh the raw people cache from the ScienceBase Directory (slow: the
# recorded run took about five minutes) or reload the most recent pickled copy.
if click.confirm('Are you sure you want to run the full process to get the latest from the ScienceBase Directory?', default=True):
    sb_people_cache = get_raw_sb_people()
    # Context manager guarantees the cache file is flushed and closed before
    # its modification time is read below.
    with open(isaid_helpers.f_raw_sb_people, "wb") as cache_file:
        pickle.dump(sb_people_cache, cache_file)
    print(
        isaid_helpers.f_raw_sb_people, 
        "CREATED", 
        datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_raw_sb_people))
    )
else:
    # NOTE: pickle must only be loaded from trusted files; this one is the
    # cache written by the branch above on a previous run.
    with open(isaid_helpers.f_raw_sb_people, "rb") as cache_file:
        sb_people_cache = pickle.load(cache_file)
    print("sb_people_cache loaded to memory from most recent cache")


Are you sure you want to run the full process to get the latest from the ScienceBase Directory? [Y/n]: Y
data/process_sb_people.p CREATED 2021-06-07 21:34:41.900977
CPU times: user 1.64 s, sys: 745 ms, total: 2.38 s
Wall time: 5min


In [10]:
%%time
# Summarize the cached raw person documents into a table and write it out as
# CSV (without the index column).
graphable_people = sb_person_entities(sb_people_cache, return_format="dataframe")
graphable_people.to_csv(isaid_helpers.f_graphable_sb_people, index=False)

# Report the output path together with the file's modification time.
graphable_created_at = datetime.datetime.fromtimestamp(
    os.path.getmtime(isaid_helpers.f_graphable_sb_people)
)
print(isaid_helpers.f_graphable_sb_people, "CREATED", graphable_created_at)

data/graphable_table_sb_people.csv CREATED 2021-06-07 21:37:32.954801
CPU times: user 17 s, sys: 123 ms, total: 17.1 s
Wall time: 18.3 s
