In [1]:
import requests
import json
import pandas as pd
import xmltodict
import os
import pickle
from copy import copy
import re
import validators
import datetime
import string
import click
from collections import Counter
import dateutil.parser

import isaid_helpers

In [14]:
# SDC Stuff
def get_raw_sdc_docs(limit=1000):
    offset = 0
    sdc_data = list()
    while True:
        sdc_url = f"https://4un8324n3h.execute-api.us-west-2.amazonaws.com/prodchs/search?size={limit}&from={offset}"
        r_sdc = requests.get(sdc_url).json()
        if r_sdc["hits"]:
            sdc_data.extend([i["_source"] for i in r_sdc["hits"]])
            offset += limit
        else:
            break
    
    return sdc_data

def sdc_dataset(sdc_record):
    if "identifier" not in sdc_record:
        return
    
    dataset = {
        "sdc_internal_id": sdc_record["identifier"],
        "name": sdc_record["title"],
        "description": sdc_record["description"],
        "source": "USGS Science Data Catalog",
        "source_reference": "https://data.usgs.gov/catalog/"
    }
    
    if "landingPage" in sdc_record:
        dataset["url"] = sdc_record["landingPage"]
        
    if "modified" in sdc_record:
        dataset["last_updated"] = sdc_record["modified"]
        
    return dataset

def sdc_terms(sdc_record):
    viable_terms = list()

    if "identifier" not in sdc_record:
        return viable_terms
    
    rel_stub = {
        "sdc_internal_id": sdc_record["identifier"],
        "reference": f"https://data.usgs.gov/datacatalog/data/{sdc_record['identifier']}",
        "date_qualifier": None
    }
    
    if "modified" in sdc_record:
        try:
            rel_stub["date_qualifier"] = str(dateutil.parser.parse(sdc_record["modified"]).isoformat())
        except:
            rel_stub["date_qualifier"] = str(datetime.datetime.strptime(sdc_record["modified"], "%Y%m%d").isoformat())
        
    terms = list()
        
    if "placeKeyword"in sdc_record:
        terms.extend([
            {
                "entity_type": "Location", 
                "declared_term_source": None,
                "rel_type": "ADDRESSES_PLACE",
                "term": i.strip()
            } for i in sdc_record["placeKeyword"]
        ])

    if "usgsThesaurusKeyword"in sdc_record:
        terms.extend([
            {
                "entity_type": "DefinedSubjectMatter",
                "declared_term_source": "USGS Thesaurus",
                "rel_type": "ADDRESSES_SUBJECT",
                "term": i.strip()
            } for i in sdc_record["usgsThesaurusKeyword"]
        ])

    if "otherKeyword"in sdc_record:
        terms.extend([
            {
                "entity_type": "UndefinedSubjectMatter", 
                "declared_term_source": None,
                "rel_type": "ADDRESSES_SUBJECT",
                "term": i.strip()
            } for i in sdc_record["otherKeyword"]
        ])
    
    for term in terms:
        check_term = term["term"].strip()
        if len(check_term) == 0:
            continue

        if len(term["term"]) == 1:
            continue
            
        term.update(rel_stub)
        viable_terms.append(term)
    
    return viable_terms

def graphable_datasets_from_sdc(sdc_cache, return_format="list"):
    sdc_graphable_datasets = list()
    for record in sdc_cache:
        dataset = sdc_dataset(record)
        if dataset is not None:
            sdc_graphable_datasets.append(dataset)
    
    if return_format == "list":
        return sdc_graphable_datasets
    elif return_format == "dataframe":
        return pd.DataFrame(sdc_graphable_datasets)

def graphable_places_from_sdc(
    sdc_cache, 
    valid_terms=None,
    return_format="list"
):
    sdc_graphable_places = list()
    for record in sdc_cache:
        sdc_graphable_places.extend(sdc_terms(record))
        
    if valid_terms is not None:
        sdc_graphable_places = [
            i for i in sdc_graphable_places 
            if i["term"] in valid_terms
        ]
        
    if return_format == "list":
        return sdc_graphable_places
    elif return_format == "dataframe":
        return pd.DataFrame(sdc_graphable_places)
    
def sdc_contacts(sdc_record, return_format="list"):
    viable_contacts = list()
    
    if "identifier" not in sdc_record:
        return viable_contacts
    
    rel_stub = {
        "sdc_internal_id": sdc_record["identifier"],
        "reference": f"https://data.usgs.gov/datacatalog/data/{sdc_record['identifier']}"
    }
    
    if "modified" in sdc_record:
        rel_stub["date_qualifier"] = sdc_record["modified"]
    
    if "metadataContact" in sdc_record and "hasEmail" in sdc_record["metadataContact"]:
        metadata_contact = copy(rel_stub)
        metadata_contact["rel_type"] = "METADATA_CONTACT"
        metadata_contact["entity_type"] = "Person"
        metadata_contact["email"] = sdc_record["metadataContact"]["hasEmail"].split(":")[-1].strip()
        viable_contacts.append(metadata_contact)
        
    if "contactPoint" in sdc_record and "hasEmail" in sdc_record["contactPoint"]:
        poc_contact = copy(rel_stub)
        poc_contact["rel_type"] = "POINT_OF_CONTACT"
        poc_contact["entity_type"] = "Person"
        poc_contact["email"] = sdc_record["contactPoint"]["hasEmail"].split(":")[-1].strip()
        viable_contacts.append(poc_contact)
        
    if "authors" in sdc_record and isinstance(sdc_record["authors"], list):
        for author_record in [i for i in sdc_record["authors"] if "orcid" in i and i["orcid"]]:
            author_contact = copy(rel_stub)
            author_contact["rel_type"] = "AUTHOR_OF"
            author_contact["entity_type"] = "Person"
            author_contact["orcid"] = author_record["orcid"]
            viable_contacts.append(author_contact)
        
    if return_format == "list":
        return viable_contacts
    elif return_format == "dataframe":
        return pd.DataFrame(viable_contacts)   

def graphable_contacts_from_sdc(
    sdc_cache, 
    return_format="list"
):
    sdc_graphable_contacts = list()
    for record in sdc_cache:
        sdc_graphable_contacts.extend(sdc_contacts(record))
        
    if return_format == "list":
        return sdc_graphable_contacts
    elif return_format == "dataframe":
        return pd.DataFrame(sdc_graphable_contacts)


In [3]:
%%time
if click.confirm('Do you really want to proceed with rebuilding the local SDC cache from source?', default=True):
    sdc_cache = get_raw_sdc_docs()
    pickle.dump(sdc_cache, open(isaid_helpers.f_raw_sdc, "wb"))
    print(isaid_helpers.f_raw_sdc, "CREATED", datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_raw_sdc)))
else:
    sdc_cache = pickle.load(open(isaid_helpers.f_raw_sdc, "rb"))
    print("sdc_cache available in local memory")

Do you really want to proceed with rebuilding the local SDC cache from source? [Y/n]: n
sdc_cache available in local memory
CPU times: user 761 ms, sys: 137 ms, total: 898 ms
Wall time: 4.23 s


In [4]:
%%time
graphable_datasets_from_sdc(
    sdc_cache=sdc_cache,
    return_format="dataframe"
).to_csv(isaid_helpers.f_graphable_sdc, index=False)
print(
    isaid_helpers.f_graphable_sdc, 
    "CREATED", 
    datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_graphable_sdc))
)

data/graphable_table_sdc.csv CREATED 2021-07-07 14:11:40.666025
CPU times: user 694 ms, sys: 35.7 ms, total: 730 ms
Wall time: 757 ms


In [5]:
reference_terms = pickle.load(open(isaid_helpers.f_ner_reference, "rb"))
display(Counter([i["source"] for i in reference_terms]))
display(Counter(i['concept_label'] for i in reference_terms if "concept_label" in i))

Counter({'Wikidata Mineral Species': 10314,
         'Wikidata Chemical Elements': 667,
         'Wikidata Sedimentary Rocks': 91,
         'Wikidata Clastic Sediments': 7,
         'Wikidata Sovereign States': 1409,
         'Wikidata US States': 50,
         'Wikidata Global Seas and Oceans': 258,
         'Wikidata Global Faults': 3102,
         'Wikidata Global Volcanos': 1548,
         'Wikidata Global Earthquakes': 1500,
         'Wikidata US National Parks': 106,
         'Wikidata US National Monuments': 184,
         'Wikidata US National Forests': 221,
         'Wikidata US Wild and Scenic Rivers': 50,
         'Wikidata Geologic Formations': 9299,
         'Wikidata Aquifers': 27,
         'Wikidata Fields of Science': 457,
         'Wikidata Additional Commodities': 10,
         'Wikidata US Territories': 38,
         'Wikidata US Counties': 3108,
         'EPA Climate Change Glossary': 123,
         'Common geographic areas': 66096,
         'USGS Thesaurus': 1151,
       

Counter({'MINERAL_SPECIES': 10314,
         'CHEMICAL_ELEMENT': 667,
         'SEDIMENTARY_ROCK': 91,
         'CLASTIC_SEDIMENT': 7,
         'SOVEREIGN_STATE': 1409,
         'US_STATE': 50,
         'SEA_OR_OCEAN': 258,
         'GEOLOGIC_FAULT': 3102,
         'NAMED_VOLCANO': 1548,
         'NAMED_EARTHQUAKE': 1500,
         'NATIONAL_PARK': 106,
         'NATIONAL_MONUMENT': 184,
         'NATIONAL_FOREST': 221,
         'WILD_AND_SCENIC_RIVER': 50,
         'GEOLOGIC_FORMATION': 9299,
         'NAMED_GROUNDWATER_AQUIFER': 27,
         'FIELD_OF_SCIENCE': 457,
         'GEOLOGIC_COMMODITY_OR_MATERIAL': 10,
         'US_TERRITORY': 38,
         'US_COUNTY': 3108,
         'CLIMATE_CHANGE_TERM': 123,
         'USGS_COMMON_GEOGRAPHIC_AREAS': 66096,
         'USGS_SCIENCE_TOPICS': 570,
         'USGS_SCIENTIFIC_METHODS': 220,
         'USGS_SCIENTIFIC_DISCIPLINES': 87,
         'USGS_PRODUCT_TYPES': 38,
         'USGS_GEOLOGIC_TIME_PERIODS': 39,
         'USGS_INSTITUTIONAL_STRUCTURE

In [15]:
all_sdc_terms = list()
for record in sdc_cache:
    all_sdc_terms.extend(sdc_terms(record))


In [16]:
%%time
usgs_thesaurus_terms_in_sdc = list(set([i["term"] for i in all_sdc_terms if i["declared_term_source"] == "USGS Thesaurus"]))
usgs_thesaurus_terms_in_sdc.sort()

usgs_thesaurus_terms_in_source = [i["label"] for i in reference_terms if i["source"] == "USGS Thesaurus"]
verified_thesaurus_terms_in_sdc = [i for i in usgs_thesaurus_terms_in_sdc if i in usgs_thesaurus_terms_in_source]

graphable_usgs_thesaurus_linked_datasets = list()
for thesaurus_item in [i for i in reference_terms if i["source"] == "USGS Thesaurus" and i["label"] in verified_thesaurus_terms_in_sdc]:
    for sdc_term in [i for i in all_sdc_terms if i["declared_term_source"] == "USGS Thesaurus" and i["term"] == thesaurus_item["label"]]:
        graphable_usgs_thesaurus_linked_datasets.append({
            "sdc_internal_id": sdc_term["sdc_internal_id"],
            "date_qualifier": sdc_term["date_qualifier"],
            "reference": sdc_term["reference"],
            "DefinedSubjectMatter_url": thesaurus_item["url"],
            "DefinedSubjectMatter_name": thesaurus_item["label"],
            "DefinedSubjectMatter_source": thesaurus_item["source"],
            "DefinedSubjectMatter_source_reference": thesaurus_item["source_reference"],
            "DefinedSubjectMatter_concept_label": thesaurus_item["concept_label"],
            "DefinedSubjectMatter_description": thesaurus_item["description"],
        })

CPU times: user 26.2 s, sys: 106 ms, total: 26.3 s
Wall time: 26.7 s


In [17]:
pd.DataFrame(
    graphable_usgs_thesaurus_linked_datasets
).to_csv(
    isaid_helpers.f_graphable_sdc_rels_usgs_thesaurus, 
    index=False
)

print(
    isaid_helpers.f_graphable_sdc_rels_usgs_thesaurus, 
    "CREATED", 
    datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_graphable_sdc_rels_usgs_thesaurus))
)

data/graphable_table_sdc_usgs_thesaurus.csv CREATED 2021-07-07 14:22:25.128888


In [36]:
%%time
place_terms_in_sdc = list(set([i["term"] for i in all_sdc_terms if i["entity_type"] == "Location"]))
place_terms_in_sdc.sort()

place_concept_labels = [
         'USGS_COMMON_GEOGRAPHIC_AREAS'
         'SOVEREIGN_STATE',
         'US_STATE',
         'SEA_OR_OCEAN',
         'GEOLOGIC_FAULT',
         'NAMED_VOLCANO',
         'NATIONAL_PARK',
         'NATIONAL_MONUMENT',
         'NATIONAL_FOREST',
         'WILD_AND_SCENIC_RIVER',
         'US_TERRITORY',
         'US_COUNTY'
    ]

place_terms_in_source = list(set([
    i["label"] for i in reference_terms 
    if i["concept_label"] in place_concept_labels and not i["label"].isnumeric()
]))
verified_place_terms_in_sdc = [i for i in place_terms_in_sdc if i in place_terms_in_source]
verified_place_terms_in_sdc.sort()

graphable_place_linked_datasets = list()
for found_term in verified_place_terms_in_sdc:
    place_term = next((i for i in reference_terms if i["concept_label"] in place_concept_labels and i["label"] == found_term), None)
    if place_term is not None:
        for sdc_term in [i for i in all_sdc_terms if i["entity_type"] == "Location" and i["term"] == found_term]:
            graphable_place_linked_datasets.append({
                "sdc_internal_id": sdc_term["sdc_internal_id"],
                "date_qualifier": sdc_term["date_qualifier"],
                "reference": sdc_term["reference"],
                "DefinedSubjectMatter_name": place_term["label"],
                "DefinedSubjectMatter_source": place_term["source"],
                "DefinedSubjectMatter_source_reference": place_term["source_reference"],
                "DefinedSubjectMatter_concept_label": place_term["concept_label"],
                "DefinedSubjectMatter_url": place_term["url"] if "url" in place_term else place_term["identifier"],
                "DefinedSubjectMatter_description": place_term["description"] if "description" in place_term else None
            })

CPU times: user 59 s, sys: 232 ms, total: 59.2 s
Wall time: 1min


In [37]:
pd.DataFrame(
    graphable_place_linked_datasets
).to_csv(
    isaid_helpers.f_graphable_sdc_rels_places, 
    index=False
)

print(
    isaid_helpers.f_graphable_sdc_rels_places, 
    "CREATED", 
    datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_graphable_sdc_rels_places))
)

data/graphable_table_sdc_places.csv CREATED 2021-07-07 14:46:57.604312


In [32]:
%%time
df_graphable_contacts = graphable_contacts_from_sdc(
    sdc_cache=sdc_cache,
    return_format="dataframe"
)

df_graphable_contacts.loc[df_graphable_contacts.rel_type == "METADATA_CONTACT"].to_csv(isaid_helpers.f_graphable_sdc_rels_poc, index=False)
print(
    isaid_helpers.f_graphable_sdc_rels_poc, 
    "CREATED", 
    datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_graphable_sdc_rels_poc))
)

df_graphable_contacts.loc[df_graphable_contacts.rel_type == "POINT_OF_CONTACT"].to_csv(isaid_helpers.f_graphable_sdc_rels_md, index=False)
print(
    isaid_helpers.f_graphable_sdc_rels_md, 
    "CREATED", 
    datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_graphable_sdc_rels_md))
)

df_graphable_contacts.loc[df_graphable_contacts.rel_type == "AUTHOR_OF"].to_csv(isaid_helpers.f_graphable_sdc_rels_author, index=False)
print(
    isaid_helpers.f_graphable_sdc_rels_author, 
    "CREATED", 
    datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_graphable_sdc_rels_author))
)

data/graphable_table_sdc_poc.csv CREATED 2021-07-07 14:41:11.961488
data/graphable_table_sdc_md.csv CREATED 2021-07-07 14:41:12.082196
data/graphable_table_sdc_author.csv CREATED 2021-07-07 14:41:12.255399
CPU times: user 544 ms, sys: 184 ms, total: 727 ms
Wall time: 779 ms


In [38]:
pd.read_csv(isaid_helpers.f_graphable_sdc_rels_places)

Unnamed: 0,sdc_internal_id,date_qualifier,reference,DefinedSubjectMatter_name,DefinedSubjectMatter_source,DefinedSubjectMatter_source_reference,DefinedSubjectMatter_concept_label,DefinedSubjectMatter_url,DefinedSubjectMatter_description
0,USGS:5cf01a85e4b0b51330e22aa6,2020-08-27T00:00:00,https://data.usgs.gov/datacatalog/data/USGS:5c...,Abbeville County,Wikidata US Counties,Wikidata county of state instances,US_COUNTY,http://www.wikidata.org/entity/Q306343,"county in South Carolina, United States"
1,USGS:5847137ee4b0f34b016ff271,2020-08-31T00:00:00,https://data.usgs.gov/datacatalog/data/USGS:58...,Abu,Wikidata Global Volcanos,https://www.wikidata.org/wiki/Q8072,NAMED_VOLCANO,http://www.wikidata.org/entity/Q334728,"mountain in Yamaguchi Prefecture, Japan"
2,USGS:5eb1ca8782cefae35a29c3d3,2020-08-19T00:00:00,https://data.usgs.gov/datacatalog/data/USGS:5e...,Acadia National Park,Wikidata US National Parks,https://www.wikidata.org/wiki/Q34918903,NATIONAL_PARK,http://www.wikidata.org/entity/Q337396,national park in the US state of Maine
3,USGS:5c018adae4b0815414cc70bc,2020-09-25T00:00:00,https://data.usgs.gov/datacatalog/data/USGS:5c...,Acadia National Park,Wikidata US National Parks,https://www.wikidata.org/wiki/Q34918903,NATIONAL_PARK,http://www.wikidata.org/entity/Q337396,national park in the US state of Maine
4,USGS:5b92cffce4b0702d0e80a2d5,2021-06-01T00:00:00,https://data.usgs.gov/datacatalog/data/USGS:5b...,Acadia National Park,Wikidata US National Parks,https://www.wikidata.org/wiki/Q34918903,NATIONAL_PARK,http://www.wikidata.org/entity/Q337396,national park in the US state of Maine
...,...,...,...,...,...,...,...,...,...
40173,USGS:5eb1ca8782cefae35a29c3d3,2020-08-19T00:00:00,https://data.usgs.gov/datacatalog/data/USGS:5e...,Zion National Park,Wikidata US National Parks,https://www.wikidata.org/wiki/Q34918903,NATIONAL_PARK,http://www.wikidata.org/entity/Q205325,"national park in Washington, Iron, and Kane co..."
40174,USGS:5b7d8290e4b045b1dc7bd758,2020-08-27T00:00:00,https://data.usgs.gov/datacatalog/data/USGS:5b...,Zion National Park,Wikidata US National Parks,https://www.wikidata.org/wiki/Q34918903,NATIONAL_PARK,http://www.wikidata.org/entity/Q205325,"national park in Washington, Iron, and Kane co..."
40175,USGS:5d113038e4b0941bde55058e,2020-08-27T00:00:00,https://data.usgs.gov/datacatalog/data/USGS:5d...,Zion National Park,Wikidata US National Parks,https://www.wikidata.org/wiki/Q34918903,NATIONAL_PARK,http://www.wikidata.org/entity/Q205325,"national park in Washington, Iron, and Kane co..."
40176,USGS:5b92cffce4b0702d0e80a2d5,2021-06-01T00:00:00,https://data.usgs.gov/datacatalog/data/USGS:5b...,Zion National Park,Wikidata US National Parks,https://www.wikidata.org/wiki/Q34918903,NATIONAL_PARK,http://www.wikidata.org/entity/Q205325,"national park in Washington, Iron, and Kane co..."
