In [1]:
import requests
import json
import pandas as pd
import xmltodict
import os
import pickle
from copy import copy
import re
import validators
import datetime
import string
import click

import isaid_helpers

In [4]:
# SDC Stuff
def get_raw_sdc_docs(limit=1000, timeout=60):
    """Download every raw document from the SDC search API, one page at a time.

    Pages are requested with ``size=limit`` and an increasing ``from`` offset
    until a page comes back with an empty ``hits`` list.

    Args:
        limit: Page size sent as the ``size`` query parameter; also the amount
            by which the ``from`` offset advances after each non-empty page.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of hanging the notebook indefinitely.

    Returns:
        list: The ``_source`` value of every hit, in the order received.

    Raises:
        requests.HTTPError: If any page request returns an error status.
        KeyError: If a successful response has no ``hits`` key, or a hit has
            no ``_source`` key.
    """
    offset = 0
    sdc_data = list()
    while True:
        sdc_url = f"https://4un8324n3h.execute-api.us-west-2.amazonaws.com/prodchs/search?size={limit}&from={offset}"
        response = requests.get(sdc_url, timeout=timeout)
        # Fail loudly on an HTTP error instead of trying to parse an error
        # payload and dying later with an unrelated KeyError / JSON error.
        response.raise_for_status()
        hits = response.json()["hits"]
        if not hits:
            # An empty page marks the end of the result set.
            break
        sdc_data.extend(hit["_source"] for hit in hits)
        offset += limit

    return sdc_data

def sdc_dataset(sdc_record):
    """Map one raw SDC record onto the flat dataset dict used for graphing.

    A record without an ``identifier`` yields ``None``. ``title`` and
    ``description`` are read unconditionally, so a record missing either one
    raises ``KeyError``. ``landingPage`` and ``modified`` are copied only when
    present, under the keys ``url`` and ``last_updated``.
    """
    if "identifier" in sdc_record:
        graphable = dict(
            sdc_internal_id=sdc_record["identifier"],
            name=sdc_record["title"],
            description=sdc_record["description"],
        )
        # Optional source fields paired with the output key each is stored under.
        optional_fields = (("landingPage", "url"), ("modified", "last_updated"))
        for source_key, target_key in optional_fields:
            if source_key in sdc_record:
                graphable[target_key] = sdc_record[source_key]
        return graphable

    return None

def sdc_terms(sdc_record):
    """Build keyword-relationship dicts for one raw SDC record.

    Keywords are read from the ``placeKeyword``, ``usgsThesaurusKeyword`` and
    ``otherKeyword`` fields (each optional, in that order). Every keyword that
    survives filtering becomes a dict with ``entity_type``,
    ``declared_term_source``, ``rel_type`` and ``term``, plus the record-level
    ``sdc_internal_id``, ``reference`` and, when the record has ``modified``,
    ``date_qualifier``.

    A keyword is dropped when, after stripping surrounding whitespace, it is
    empty or a single character. The ``term`` value itself is stored exactly
    as it appears in the record (it is not stripped).

    Args:
        sdc_record: Mapping for one raw SDC document.

    Returns:
        list: Relationship dicts; empty when the record has no ``identifier``.
    """
    viable_terms = list()

    if "identifier" not in sdc_record:
        return viable_terms

    rel_stub = {
        "sdc_internal_id": sdc_record["identifier"],
        "reference": f"https://data.usgs.gov/datacatalog/data/{sdc_record['identifier']}"
    }

    if "modified" in sdc_record:
        rel_stub["date_qualifier"] = sdc_record["modified"]

    # (record field, entity_type, declared_term_source, rel_type)
    keyword_fields = (
        ("placeKeyword", "Location", None, "ADDRESSES_PLACE"),
        ("usgsThesaurusKeyword", "DefinedSubjectMatter", "USGS Thesaurus", "ADDRESSES_SUBJECT"),
        ("otherKeyword", "UndefinedSubjectMatter", None, "ADDRESSES_SUBJECT"),
    )

    for field, entity_type, term_source, rel_type in keyword_fields:
        if field not in sdc_record:
            continue

        for keyword in sdc_record[field]:
            # Judge the keyword on its stripped form so that padded
            # single-character values such as "a " are rejected the same way
            # as "a" (the length check previously looked at the raw string).
            if len(keyword.strip()) <= 1:
                continue

            viable_terms.append({
                "entity_type": entity_type,
                "declared_term_source": term_source,
                "rel_type": rel_type,
                "term": keyword,
                **rel_stub,
            })

    return viable_terms

def graphable_datasets_from_sdc(sdc_cache, return_format="list"):
    """Convert every cached SDC record into a graphable dataset entry.

    Each record is passed through ``sdc_dataset``; records for which it
    returns ``None`` are left out.

    Args:
        sdc_cache: Iterable of raw SDC records.
        return_format: ``"list"`` for a list of dicts, ``"dataframe"`` for a
            ``pandas.DataFrame`` built from that list. Any other value makes
            the function return ``None``.
    """
    converted = (sdc_dataset(raw_record) for raw_record in sdc_cache)
    datasets = [entry for entry in converted if entry is not None]

    if return_format == "list":
        return datasets
    if return_format == "dataframe":
        return pd.DataFrame(datasets)

def graphable_terms_from_sdc(
    sdc_cache, 
    term_source="USGS Thesaurus", 
    valid_terms=None,
    return_format="list"
):
    """Collect keyword relationships from every cached SDC record.

    Args:
        sdc_cache: Iterable of raw SDC records; each is passed to
            ``sdc_terms``.
        term_source: When not ``None``, keep only terms whose
            ``declared_term_source`` equals this value. (The filter used to be
            hard-coded to ``"USGS Thesaurus"`` regardless of this argument;
            the default keeps that behaviour.)
        valid_terms: When not ``None``, keep only terms whose ``term`` value
            is one of these.
        return_format: ``"list"`` for a list of dicts, ``"dataframe"`` for a
            ``pandas.DataFrame``. Any other value makes the function return
            ``None``.
    """
    sdc_graphable_terms = list()
    for record in sdc_cache:
        sdc_graphable_terms.extend(sdc_terms(record))

    if term_source is not None:
        sdc_graphable_terms = [
            i for i in sdc_graphable_terms
            if i["declared_term_source"] == term_source
        ]

    if valid_terms is not None:
        # Hash-based lookup avoids rescanning the whole list for every term;
        # fall back to the caller's container if its items are unhashable.
        try:
            allowed_terms = frozenset(valid_terms)
        except TypeError:
            allowed_terms = valid_terms

        sdc_graphable_terms = [
            i for i in sdc_graphable_terms
            if i["term"] in allowed_terms
        ]

    if return_format == "list":
        return sdc_graphable_terms
    elif return_format == "dataframe":
        return pd.DataFrame(sdc_graphable_terms)
    
def graphable_places_from_sdc(
    sdc_cache, 
    valid_terms=None,
    return_format="list"
):
    """Collect keyword relationships whose term is a known place name.

    Args:
        sdc_cache: Iterable of raw SDC records; each is passed to
            ``sdc_terms``.
        valid_terms: When not ``None``, keep only terms whose ``term`` value
            is one of these. When ``None``, every term is returned.
        return_format: ``"list"`` for a list of dicts, ``"dataframe"`` for a
            ``pandas.DataFrame``. Any other value makes the function return
            ``None``.
    """
    # NOTE(review): terms of every entity type are considered here, not only
    # those with entity_type "Location" -- confirm that is intended.
    sdc_graphable_places = list()
    for record in sdc_cache:
        sdc_graphable_places.extend(sdc_terms(record))

    if valid_terms is not None:
        # Hash-based lookup avoids rescanning the whole list for every term;
        # fall back to the caller's container if its items are unhashable.
        try:
            allowed_terms = frozenset(valid_terms)
        except TypeError:
            allowed_terms = valid_terms

        sdc_graphable_places = [
            i for i in sdc_graphable_places
            if i["term"] in allowed_terms
        ]

    if return_format == "list":
        return sdc_graphable_places
    elif return_format == "dataframe":
        return pd.DataFrame(sdc_graphable_places)
    
def sdc_contacts(sdc_record, return_format="list"):
    """Extract person relationships (contacts and authors) from one SDC record.

    Produces, in this order:
      * a ``METADATA_CONTACT`` entry when ``metadataContact`` has ``hasEmail``,
      * a ``POINT_OF_CONTACT`` entry when ``contactPoint`` has ``hasEmail``,
      * one ``AUTHOR_OF`` entry per item of the ``authors`` list that has a
        truthy ``orcid``.

    Email values keep only the text after the last ``:`` with surrounding
    whitespace removed. Every entry also carries ``sdc_internal_id``,
    ``reference`` and, when the record has ``modified``, ``date_qualifier``.

    A record without ``identifier`` yields an empty list whatever
    ``return_format`` is. Otherwise ``"list"`` returns the list of dicts,
    ``"dataframe"`` a ``pandas.DataFrame`` of them, and any other value
    ``None``.
    """
    contacts = []

    if "identifier" not in sdc_record:
        return contacts

    base = {
        "sdc_internal_id": sdc_record["identifier"],
        "reference": f"https://data.usgs.gov/datacatalog/data/{sdc_record['identifier']}"
    }
    if "modified" in sdc_record:
        base["date_qualifier"] = sdc_record["modified"]

    # The two email-bearing contact fields differ only in the relationship label.
    email_fields = (
        ("metadataContact", "METADATA_CONTACT"),
        ("contactPoint", "POINT_OF_CONTACT"),
    )
    for field, rel_type in email_fields:
        if field in sdc_record and "hasEmail" in sdc_record[field]:
            address = sdc_record[field]["hasEmail"].split(":")[-1].strip()
            contacts.append({
                **base,
                "rel_type": rel_type,
                "entity_type": "Person",
                "email": address,
            })

    if "authors" in sdc_record and isinstance(sdc_record["authors"], list):
        for author in sdc_record["authors"]:
            if "orcid" not in author or not author["orcid"]:
                continue
            contacts.append({
                **base,
                "rel_type": "AUTHOR_OF",
                "entity_type": "Person",
                "orcid": author["orcid"],
            })

    if return_format == "list":
        return contacts
    if return_format == "dataframe":
        return pd.DataFrame(contacts)

def graphable_contacts_from_sdc(
    sdc_cache, 
    return_format="list"
):
    """Gather the contact relationships of every cached SDC record.

    Each record is passed to ``sdc_contacts`` (with its default list output)
    and the per-record results are concatenated in cache order.

    Args:
        sdc_cache: Iterable of raw SDC records.
        return_format: ``"list"`` for a list of dicts, ``"dataframe"`` for a
            ``pandas.DataFrame``. Any other value makes the function return
            ``None``.
    """
    all_contacts = [
        contact
        for raw_record in sdc_cache
        for contact in sdc_contacts(raw_record)
    ]

    if return_format == "list":
        return all_contacts
    if return_format == "dataframe":
        return pd.DataFrame(all_contacts)


In [3]:
%%time
# Either rebuild the local SDC cache from the live API (and pickle it to
# isaid_helpers.f_raw_sdc) or load the previously pickled copy.
if click.confirm('Do you really want to proceed with rebuilding the local SDC cache from source?', default=True):
    sdc_cache = get_raw_sdc_docs()
    # The context manager closes (and so flushes) the file before its
    # modification time is read for the message below.
    with open(isaid_helpers.f_raw_sdc, "wb") as cache_file:
        pickle.dump(sdc_cache, cache_file)
    print(isaid_helpers.f_raw_sdc, "CREATED", datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_raw_sdc)))
else:
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only load a cache file that this notebook produced.
    with open(isaid_helpers.f_raw_sdc, "rb") as cache_file:
        sdc_cache = pickle.load(cache_file)
    print("sdc_cache available in local memory")

Do you really want to proceed with rebuilding the local SDC cache from source? [Y/n]: n
sdc_cache available in local memory
CPU times: user 755 ms, sys: 156 ms, total: 911 ms
Wall time: 2.43 s


In [None]:
%%time
# Write the graphable dataset table to CSV, then report the file's
# modification time.
sdc_datasets_csv = isaid_helpers.f_graphable_sdc

graphable_datasets_from_sdc(
    sdc_cache=sdc_cache,
    return_format="dataframe",
).to_csv(sdc_datasets_csv, index=False)

sdc_datasets_written = datetime.datetime.fromtimestamp(os.path.getmtime(sdc_datasets_csv))
print(sdc_datasets_csv, "CREATED", sdc_datasets_written)

In [None]:
%%time
# Load the verified thesaurus terms, export the SDC keyword relationships
# restricted to those terms, then report the CSV's modification time.
df_verified_thesaurus_terms = pd.read_csv(isaid_helpers.f_graphable_thesaurus_terms)
verified_thesaurus_terms = list(df_verified_thesaurus_terms.term)
thesaurus_rels_csv = isaid_helpers.f_graphable_sdc_rels_usgs_thesaurus

graphable_terms_from_sdc(
    sdc_cache=sdc_cache,
    valid_terms=verified_thesaurus_terms,
    return_format="dataframe",
).to_csv(thesaurus_rels_csv, index=False)

thesaurus_rels_written = datetime.datetime.fromtimestamp(os.path.getmtime(thesaurus_rels_csv))
print(thesaurus_rels_csv, "CREATED", thesaurus_rels_written)

In [5]:
%%time
# Load the verified place names, export the SDC keyword relationships
# restricted to those names, then report the CSV's modification time.
df_verified_places = pd.read_csv(isaid_helpers.f_graphable_place_names)
verified_place_names = list(df_verified_places.term)
place_rels_csv = isaid_helpers.f_graphable_sdc_rels_places

graphable_places_from_sdc(
    sdc_cache=sdc_cache,
    valid_terms=verified_place_names,
    return_format="dataframe",
).to_csv(place_rels_csv, index=False)

place_rels_written = datetime.datetime.fromtimestamp(os.path.getmtime(place_rels_csv))
print(place_rels_csv, "CREATED", place_rels_written)

graphable_table_sdc_places.csv CREATED 2021-06-07 14:57:51.475345
CPU times: user 10.9 s, sys: 287 ms, total: 11.2 s
Wall time: 13.4 s


In [None]:
%%time
# Build one contact-relationship table for the whole cache, then write one
# CSV per relationship type and report each file's modification time.
df_graphable_contacts = graphable_contacts_from_sdc(
    sdc_cache=sdc_cache,
    return_format="dataframe"
)

# NOTE(review): this pairing assumes "_poc" means point of contact and "_md"
# means metadata contact (the two targets were previously the other way
# round) -- confirm against the path definitions in isaid_helpers.
contact_exports = (
    ("POINT_OF_CONTACT", isaid_helpers.f_graphable_sdc_rels_poc),
    ("METADATA_CONTACT", isaid_helpers.f_graphable_sdc_rels_md),
    ("AUTHOR_OF", isaid_helpers.f_graphable_sdc_rels_author),
)

for contact_rel_type, contact_csv in contact_exports:
    df_graphable_contacts.loc[
        df_graphable_contacts.rel_type == contact_rel_type
    ].to_csv(contact_csv, index=False)
    print(
        contact_csv,
        "CREATED",
        datetime.datetime.fromtimestamp(os.path.getmtime(contact_csv))
    )

In [None]:
# Read the author-relationship CSV back from disk; as the cell's last
# expression, the resulting DataFrame is what the cell displays.
pd.read_csv(isaid_helpers.f_graphable_sdc_rels_author)