I pulled a listing of Cost Center names and a few details from an internal web service (SIPP). This gives us a reasonable set of organizations to work against and the important FBMS code that we can tie to other information (e.g., personnel directory from ScienceBase). However, the SIPP center data is horribly out of date when it comes to the placement within Regions and connections to Mission Areas.

This core function builds a flat table of just the pieces of information we want to use from the XML-based web service and dumps that to a local cache for processing. Run this step and the missions/regions step first when building the graph: together they set up the entities that subsequent processes link to.

In [6]:
import requests
import json
import pandas as pd
import xmltodict
import os
import pickle
from copy import copy
import re
import validators
import datetime
import string
import click
import isaid_helpers

In [9]:
# SIPP Stuff
def mas_n_regions(return_format="dataframe"):
    """Load the mission area / region table from its local CSV file.

    The file location is taken from ``isaid_helpers.f_mas_n_regions``.

    Args:
        return_format: ``"dict"`` returns a list with one dict per row;
            any other value returns the pandas DataFrame itself.

    Returns:
        A list of row dicts or a DataFrame, depending on ``return_format``.
    """
    table = pd.read_csv(isaid_helpers.f_mas_n_regions)
    return table.to_dict(orient="records") if return_format == "dict" else table

def simplify_center(center, missions_regions):
    """Flatten one SIPP center record into a list of organization records.

    The first record is the center itself, reduced to a fixed set of
    properties and extended with ``MissionAreaName`` and ``RegionName``
    looked up in ``missions_regions``. Every cost center nested under
    ``center["CostCenters"]["CostCenter"]`` whose code differs from the
    center's primary cost center adds one more record; those child records
    inherit the region, mission area, WSC, director and sub-bureau values
    from the parent.

    Args:
        center: dict for a single center, as parsed from the SIPP XML.
        missions_regions: list of dicts with ``sipp_code`` and ``name`` keys.
            It is scanned more than once, so it must be re-iterable.

    Returns:
        A list of dicts. Empty when the center has no primary cost center
        (``CostCenterPrimary`` missing or ``None``).

    Raises:
        KeyError: if a child cost center lacks one of its own fields
            (``CostCenterCode``, ``CenterCode``, ``CostCenterName``,
            ``Active``, ``LastChangeInBASIS``).
    """
    include_props = frozenset((
        'CenterCode',
        'CenterName',
        'RegionCode',
        'WSC',
        'MissionArea',
        'SubBureauCode',
        'Active',
        'CenterDirectorName',
        'CenterDirectorEmail',
        'CostCenterPrimary',
    ))

    center_base_record = {k: v for k, v in center.items() if k in include_props}

    # Centers without a primary cost center are dropped entirely, so bail out
    # before doing any lookups. ``.get`` also covers a record where the
    # element is absent rather than empty.
    primary_code = center_base_record.get("CostCenterPrimary")
    if primary_code is None:
        return []

    def _name_for(sipp_code):
        # First matching row wins; None when the code is unknown.
        return next(
            (i["name"] for i in missions_regions if i["sipp_code"] == sipp_code),
            None,
        )

    center_base_record["MissionAreaName"] = _name_for(center_base_record.get("MissionArea"))
    center_base_record["RegionName"] = _name_for(center_base_record.get("RegionCode"))

    orgs = [center_base_record]

    # An empty XML element is parsed to None, so "CostCenters" may be present
    # without being a mapping; treat that the same as having no children.
    cost_centers = center.get("CostCenters")
    child_centers = cost_centers.get("CostCenter") if isinstance(cost_centers, dict) else None
    if child_centers is None:
        child_centers = []
    elif isinstance(child_centers, dict):
        # A single child element is parsed to a dict instead of a list.
        child_centers = [child_centers]

    for child_center in child_centers:
        # The primary cost center is already represented by the base record.
        if child_center["CostCenterCode"] == primary_code:
            continue

        orgs.append({
            "parent_cost_center_code": primary_code,
            "CenterCode": child_center["CenterCode"],
            "CenterName": child_center["CostCenterName"],
            "CostCenterPrimary": child_center["CostCenterCode"],
            "Active": child_center["Active"],
            "LastChangeInBASIS": child_center["LastChangeInBASIS"],
            # Inherited from the parent; None when the parent lacks the value.
            "RegionCode": center_base_record.get("RegionCode"),
            "RegionName": center_base_record["RegionName"],
            "WSC": center_base_record.get("WSC"),
            "MissionArea": center_base_record.get("MissionArea"),
            "MissionAreaName": center_base_record["MissionAreaName"],
            "CenterDirectorEmail": center_base_record.get("CenterDirectorEmail"),
            "CenterDirectorName": center_base_record.get("CenterDirectorName"),
            "SubBureauCode": center_base_record.get("SubBureauCode"),
        })

    return orgs

def get_sipp_center_data(return_format="dataframe", timeout=60):
    """Fetch the SIPP center listing and flatten it into organization records.

    Requests the XML document at ``isaid_helpers.center_info_link``, parses
    it with xmltodict, and runs every center through ``simplify_center``
    together with the mission area / region table from ``mas_n_regions``.

    Args:
        return_format: ``"dataframe"`` (default) returns a pandas DataFrame,
            ``"dict"`` returns the list of record dicts. Any other value
            returns ``None``.
        timeout: seconds passed to ``requests.get`` so that an unreachable
            service fails instead of blocking indefinitely.

    Returns:
        A DataFrame, a list of dicts, or ``None`` (see ``return_format``).

    Raises:
        requests.HTTPError: if the service answers with an error status.
        requests.RequestException: on connection problems or timeout.
        KeyError: if the response has no ``Centers``/``Center`` content.
    """
    r_active = requests.get(isaid_helpers.center_info_link, timeout=timeout)
    # Fail on an HTTP error here rather than handing an error page to the
    # XML parser.
    r_active.raise_for_status()

    active_centers = xmltodict.parse(r_active.text, dict_constructor=dict)

    # Deliberately strict indexing: a response without centers should raise
    # rather than produce an empty result that callers may write to disk.
    centers = active_centers["Centers"]["Center"]
    if isinstance(centers, dict):
        # A lone <Center> element is parsed to a dict instead of a list.
        centers = [centers]

    mission_areas = mas_n_regions(return_format="dict")

    active_center_records = [
        record
        for center in centers
        for record in simplify_center(center, mission_areas)
    ]

    if return_format == "dict":
        return active_center_records
    if return_format == "dataframe":
        return pd.DataFrame(active_center_records)
    # Unrecognized formats keep returning None, as before.
    return None
    

In [10]:
%%time
# Rebuild the local cost-center CSV cache, but only after the user confirms
# at the prompt (the default answer is yes).
# NOTE(review): the prompt text suggests the SIPP service is reachable only
# from the internal network - confirm.
if click.confirm('Are you connected to the internal USGS network (TIC)?', default=True):
    # Fetch the flattened center records as a DataFrame and write them to
    # the cache file, without the DataFrame index column.
    get_sipp_center_data(
        return_format="dataframe"
    ).to_csv(isaid_helpers.f_usgs_centers, index=False)
    # Report the file that was written along with its modification time.
    print(
        isaid_helpers.f_usgs_centers, 
        "CREATED", 
        datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_usgs_centers))
    )


Are you connected to the internal USGS network (TIC)? [Y/n]: Y
usgs_cost_centers.csv CREATED 2021-06-07 15:46:53.083927
CPU times: user 138 ms, sys: 30.3 ms, total: 169 ms
Wall time: 5.54 s
