In order to further examine the expertise terms from USGS Staff Profiles and start toward more robust modeling of scientific and technical expertise from these and other sources, it's useful to start aligning terms and concepts with more semantically robust sources. The USGS Thesaurus is suggested as a source for expertise keywords in the staff profile system, but the storage (or at least exposure) of those terms in that system do not include any type of identifier reference to indicate source. So, we need to run through a process to try and align as many of these keyword terms as we can with some other source. The USGS Thesaurus is a reasonable start, though it has some issues with semantically aligning with the concept of "expertise," is limited to concepts with a significant bias toward USGS ways of defining the world, and does not explicitly link to other knowledge systems.

This notebook runs through the process of finding reasonable alignment from user-provided expertise terms in the staff profile pages with USGS Thesaurus terms. I continue building a simple data structure that indicates the logical source for where our intelligence gathering system get a term, the term itself, and what it references.

In [1]:
import requests
from sqlite_utils import Database
from joblib import Parallel, delayed
import tqdm
import json

db = Database("usgs_profiles.db")

In [3]:
profile_terms = [
    {
        "profile": i["source_identifier"],
        "term": i["term"].lower()
    } for i in db["expertise_terms"].rows_where("term_source = 'USGS Staff Profiles'")
]

expertise_terms = list(set([i["term"] for i in profile_terms]))
expertise_terms.sort()

print("Total number of profile terms:", len(profile_terms))
print("Unique terms for lookup:", len(expertise_terms))

Total number of profile terms: 341
Unique terms for lookup: 260


In [4]:
usgs_thesaurus_search_url = "https://www2.usgs.gov/science/term-search.php?thcode=2"
usgs_thesaurus_term_url = "https://www2.usgs.gov/science/term.php"

def usgs_thesaurus_best(term):
    term_search = f"{usgs_thesaurus_search_url}&rel=contains&term={term}"
    r = requests.get(term_search).json()
    term_results = {
        "term": term
    }

    if len(r) == 0:
        term_results["usgs_thesaurus_match_method"] = None
        return term_results

    
    exact_match = next((i for i in r if i["label"].lower() == term or i["value"].lower() == term), None)
    
    if exact_match is not None:
        term_results["usgs_thesaurus_match_method"] = "exact"
        term_results["usgs_thesaurus_value"] = exact_match["value"]
        term_results["usgs_thesaurus_label"] = exact_match["label"]
        term_results["usgs_thesaurus_uri"] = f"{usgs_thesaurus_term_url}?code={exact_match['code']}"
        return term_results
    
    term_results["usgs_thesaurus_match_method"] = "multiple"
    term_results["usgs_thesaurus_possible_terms"] = [
        {
            "label": i["label"],
            "value": i["value"],
            "uri": f"{usgs_thesaurus_term_url}?code={i['code']}"
        } for i in r
    ]
    
    return term_results

In [5]:
term_alignment = list()

def accumulator(term):
    term_alignment.append(usgs_thesaurus_best(term))

In [6]:
Parallel(n_jobs=10, prefer="threads")(
    delayed(accumulator)
    (
        i
    ) for i in tqdm.tqdm(expertise_terms)
)

100%|██████████| 260/260 [00:14<00:00, 17.74it/s]


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [9]:
print("Terms with an exact match:", len([i for i in term_alignment if i["usgs_thesaurus_match_method"] == "exact"]))
print("Terms with possible matches:", len([i for i in term_alignment if i["usgs_thesaurus_match_method"] == "multiple"]))
print("Terms with no match:", len([i for i in term_alignment if i["usgs_thesaurus_match_method"] is None]))

Terms with an exact match: 98
Terms with possible matches: 12
Terms with no match: 150


In [10]:
update_package = list()
for term_match in [i for i in term_alignment if i["usgs_thesaurus_match_method"] == "exact"]:
    for source_id in [i["profile"] for i in profile_terms if i["term"] == term_match["term"]]:
        exact_match_terms = [term_match["usgs_thesaurus_label"], term_match["usgs_thesaurus_value"]]
        exact_match_terms = list(set(exact_match_terms))
        
        for the_term in exact_match_terms:
            add_term = {
                "term_source": "USGS Thesaurus Exact Match from Staff Profile",
                "source_identifier": source_id,
                "term": the_term,
            }
            add_term["identifier"] = ":".join(v for k,v in add_term.items())
            add_term["term_uri"] = term_match["usgs_thesaurus_uri"]

            update_package.append(add_term)
        
for term_match in [i for i in term_alignment if i["usgs_thesaurus_match_method"] == "multiple"]:
    for source_id in [i["profile"] for i in profile_terms if i["term"] == term_match["term"]]:
        suggested_terms = [i["value"] for i in term_match["usgs_thesaurus_possible_terms"]]
        suggested_terms.extend([i["label"] for i in term_match["usgs_thesaurus_possible_terms"]])
        suggested_terms = list(set(suggested_terms))
        
        for term_suggestion in term_match["usgs_thesaurus_possible_terms"]:
            add_term = {
                "term_source": "USGS Thesaurus Suggested Terms from Staff Profile",
                "source_identifier": source_id,
                "term": term_suggestion["value"],
            }
            add_term["identifier"] = ":".join(v for k,v in add_term.items())
            add_term["term_uri"] = term_suggestion['uri']

            update_package.append(add_term)

In [11]:
db["expertise_terms"].upsert_all(update_package, pk="identifier", alter=True)

<Table expertise_terms (term_source, source_identifier, term, identifier, usgs_thesaurus_match_method, usgs_thesaurus_value, usgs_thesaurus_label, usgs_thesaurus_uri, usgs_thesaurus_possible_terms, term_uri)>