In this notebook, we examine the vocabulary terms that metadata records declare as coming from the USGS Thesaurus, along with the geographic place names declared in that same metadata, to determine whether each term is actually found in an associated reference vocabulary. We write the terms that do match into data files that can be loaded into our graph, and then limit processing of related items to just those cases where we are matching defined and referenceable terms.

In [17]:
import pandas as pd
import sqlite3
import os
from joblib import Parallel, delayed
from tqdm import tqdm
import pickle
import isaid_helpers
import datetime
import requests
import click


We need to work through the entire SDC database to evaluate all terms claiming to be from the USGS Thesaurus and all supposed place name keywords. This code gets the raw SDC from our local cache and builds lists of unique terms in each category.

In [2]:
# Load the raw SDC cache. The file is opened in a context manager so the
# handle is closed as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code; only load a cache file
# that this project wrote itself.
with open(isaid_helpers.f_raw_sdc, "rb") as sdc_file:
    sdc_cache = pickle.load(sdc_file)

# Unique keywords that records declare under "usgsThesaurusKeyword", each
# wrapped as {"term": <keyword>} so evaluation results can be attached later.
declared_terms = list()
for item in (i for i in sdc_cache if "usgsThesaurusKeyword" in i):
    declared_terms.extend(item["usgsThesaurusKeyword"])
declared_terms = [{"term": i} for i in set(declared_terms)]

# Same treatment for the keywords declared under "placeKeyword".
declared_places = list()
for item in (i for i in sdc_cache if "placeKeyword" in i):
    declared_places.extend(item["placeKeyword"])
declared_places = [{"term": i} for i in set(declared_places)]


These two functions handle the parallel processing of all terms against the reference datasets, making a few logical choices to set up reference data that can be used to create entities in our graph that can be linked to.

In [7]:
term_eval = list()
def accumulator(term):
    """Classify one declared thesaurus keyword against ``all_terms``.

    Matching is a case-insensitive exact comparison between ``term["term"]``
    and each reference record's ``name``. ``term`` is updated in place and
    then appended to the module-level ``term_eval`` list; nothing is returned.

    Outcomes written onto ``term``:
      * no match: ``valid_term`` and ``usable_term`` are both False.
      * exactly one match: both flags are True and the matching record's
        fields are merged into ``term``.
      * several matches: ``possible_sources`` holds the matches whose
        ``thesaurus_id`` is not 2. If a match with ``thesaurus_id == 2``
        exists, its fields are merged and the term is usable; otherwise the
        term is valid but not usable, and ``number_results`` and
        ``thesaurus_names`` describe the ambiguity.

    Args:
        term: dict with a "term" key holding the declared keyword.
    """
    declared = term["term"]
    # Lowercase the keyword once rather than once per reference record. A
    # non-string keyword can never equal a name, so it simply matches nothing.
    wanted = declared.lower() if isinstance(declared, str) else None

    matching_terms = [
        i for i in all_terms
        # Skip reference rows whose name is missing or not a string instead of
        # raising, which would abort the whole parallel evaluation.
        if isinstance(i["name"], str) and i["name"].lower() == wanted
    ]

    if not matching_terms:
        term.update({
            "valid_term": False,
            "usable_term": False
        })

    elif len(matching_terms) == 1:
        term.update({
            "valid_term": True,
            "usable_term": True
        })
        term.update(matching_terms[0])

    else:
        # thesaurus_id 2 is the preferred source when a name appears in more
        # than one vocabulary (presumably the USGS Thesaurus itself; confirm
        # against the "thesaurus" table).
        thesaurus_match = next((i for i in matching_terms if i["thesaurus_id"] == 2), None)
        term.update({"possible_sources": [i for i in matching_terms if i["thesaurus_id"] != 2]})
        if thesaurus_match is not None:
            term.update({
                "valid_term": True,
                "usable_term": True
            })
            term.update(thesaurus_match)
        else:
            term.update({
                "valid_term": True,
                "usable_term": False,
                "number_results": len(matching_terms),
                "thesaurus_names": list(set([i["thesaurus_name"] for i in matching_terms]))
            })

    term_eval.append(term)

place_eval = list()
def accumulator_places(term):
    """Classify one declared place keyword against ``geo_names``.

    Matching is a case-insensitive exact comparison between ``term["term"]``
    and each reference record's ``name``. ``term`` is updated in place and
    then appended to the module-level ``place_eval`` list; nothing is
    returned.

    Outcomes written onto ``term``:
      * no match: ``valid_term`` and ``usable_term`` are both False.
      * exactly one match: both flags are True and the matching record's
        fields are merged into ``term``.
      * several matches: all of them are kept under ``possible_sources`` and
        the first one is merged into ``term``, which is marked usable.

    Args:
        term: dict with a "term" key holding the declared place keyword.
    """
    declared = term["term"]
    # Lowercase the keyword once rather than once per reference record. A
    # non-string keyword can never equal a name, so it simply matches nothing.
    wanted = declared.lower() if isinstance(declared, str) else None

    matching_terms = [
        i for i in geo_names
        # Skip reference rows whose name is missing or not a string instead of
        # raising, which would abort the whole parallel evaluation.
        if isinstance(i["name"], str) and i["name"].lower() == wanted
    ]

    if not matching_terms:
        term.update({
            "valid_term": False,
            "usable_term": False
        })

    elif len(matching_terms) == 1:
        term.update({
            "valid_term": True,
            "usable_term": True
        })
        term.update(matching_terms[0])

    else:
        # Ambiguous place names are resolved by taking the first match; the
        # full candidate list is retained for later inspection.
        term.update({"possible_sources": matching_terms})
        first_match = term["possible_sources"][0]
        term.update({
            "valid_term": True,
            "usable_term": True
        })
        term.update(first_match)

    place_eval.append(term)
        

In [None]:
# The "thesaurus" table lists one row per vocabulary; the loop below reads
# each row's tblname, name and tag columns.
con_thesaurus = sqlite3.connect("thesauri.db")
df_thesaurus = pd.read_sql_query("SELECT * from thesaurus", con_thesaurus)

# Flat list of term records from every vocabulary table, each tagged with the
# vocabulary it came from.
all_terms = list()

for index, row in df_thesaurus.iterrows():
    try:
        # Query through con_thesaurus, the connection opened above. No other
        # connection exists at this point in the notebook.
        df = pd.read_sql_query(f"SELECT * FROM {row.tblname}", con_thesaurus)
        # row["name"] is used rather than row.name, because the attribute form
        # returns the row's index label instead of the "name" column.
        df["thesaurus_name"] = row["name"]
        df["thesaurus_id"] = row["tag"]
        d = df.to_dict(orient="records")
        all_terms.extend(d)
    except Exception as exc:
        # Loading stays best-effort per table, but each failure is reported so
        # an empty all_terms can never go unnoticed.
        print(f"Skipping thesaurus table {row.get('tblname')!r}: {exc!r}")

In [None]:
try:
    # Evaluate every declared thesaurus term on a pool of 50 threads. Each
    # call appends its result to term_eval, and tqdm shows progress as tasks
    # are drawn from the generator.
    run_in_threads = Parallel(n_jobs=50, prefer="threads")
    run_in_threads(
        delayed(accumulator)(declared_term)
        for declared_term in tqdm(declared_terms)
    )
except Exception as error:
    print(error)

In [None]:
# Count the evaluated terms that could not be resolved to a usable reference.
unusable_terms = [evaluated for evaluated in term_eval if not evaluated["usable_term"]]
print("Terms declared as USGS Thesaurus but not in USGS Thesaurus:", len(unusable_terms))

In [None]:
def add_url(thesaurus_id, code):
    """Build the USGS Thesaurus "term-simple" page URL for one term.

    Args:
        thesaurus_id: identifier of the vocabulary the term belongs to; sent
            as the ``thcode`` query parameter.
        code: the term's code within that vocabulary; sent as the ``code``
            query parameter.

    Returns:
        The URL string, with a single ``?`` starting the query string.
    """
    return f"https://apps.usgs.gov/thesaurus/term-simple.php?thcode={thesaurus_id}&code={code}"

# Keep only the evaluated terms that resolved to a single usable reference,
# then attach a link to each term's page.
usable_term_records = [record for record in term_eval if record["usable_term"]]
df_usable_terms = pd.DataFrame(usable_term_records)
df_usable_terms["url"] = df_usable_terms.apply(
    lambda row: add_url(row["thesaurus_id"], row["code"]),
    axis=1,
)

df_usable_terms.head()

In [None]:
# Write the usable terms to the CSV path configured in isaid_helpers, then
# report the file's modification time as confirmation.
terms_csv_path = isaid_helpers.f_graphable_thesaurus_terms
df_usable_terms.to_csv(terms_csv_path, index=False)

terms_csv_written_at = datetime.datetime.fromtimestamp(os.path.getmtime(terms_csv_path))
print(terms_csv_path, "CREATED", terms_csv_written_at)

In [None]:
%%time
if click.confirm('Do you really really need to download the Common Geographic Areas DB from source?', default=True):
    cga_url = "https://apps.usgs.gov/thesaurus/cga/CommonGeographicAreas.db"
    # A (connect, read) timeout keeps a stalled server from hanging the cell,
    # and the context manager releases the streamed connection when done.
    with requests.get(cga_url, stream=True, timeout=(10, 120)) as r:
        if r.status_code == 200:
            with open(isaid_helpers.f_common_geo_areas, 'wb') as f:
                # Iterating the response directly yields 128-byte chunks;
                # 1 MiB chunks write the same bytes with far fewer iterations.
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        else:
            # Report the failure instead of silently skipping the download.
            print(f"Download failed with HTTP status {r.status_code}; {isaid_helpers.f_common_geo_areas} was not written")

In [4]:
# Open the SQLite database stored at isaid_helpers.f_common_geo_areas, the
# same path the download cell writes the Common Geographic Areas DB to.
con_geo_areas = sqlite3.connect(isaid_helpers.f_common_geo_areas)


In [5]:
# Read every row of the "geo" table and label it with its source vocabulary,
# then turn the frame into a list of per-row dicts for accumulator_places.
geo_query = "SELECT * from geo"
df_geo_names = pd.read_sql_query(geo_query, con_geo_areas).assign(
    thesaurus_name="Common geographic areas (USGS Thesaurus)"
)
geo_names = df_geo_names.to_dict(orient="records")

In [8]:
try:
    # Evaluate every declared place name on a pool of 50 threads. Each call
    # appends its result to place_eval, and tqdm shows progress as tasks are
    # drawn from the generator.
    run_in_threads = Parallel(n_jobs=50, prefer="threads")
    run_in_threads(
        delayed(accumulator_places)(declared_place)
        for declared_place in tqdm(declared_places)
    )
except Exception as error:
    print(error)

100%|██████████| 14857/14857 [03:26<00:00, 71.92it/s]


In [9]:
# Count the evaluated place names that could not be resolved to a usable reference.
unusable_places = [evaluated for evaluated in place_eval if not evaluated["usable_term"]]
print("Terms declared as 'places' but not in USGS Thesaurus' Common Geographic Areas:", len(unusable_places))

Terms declared as 'places' but not in USGS Thesaurus' Common Geographic Areas: 12147


In [19]:
# Keep only the place names that resolved to a usable reference record.
usable_place_records = [record for record in place_eval if record["usable_term"]]
df_usable_places = pd.DataFrame(usable_place_records)

# Write them to the CSV path configured in isaid_helpers, then report the
# file's modification time as confirmation.
places_csv_path = isaid_helpers.f_graphable_place_names
df_usable_places.to_csv(places_csv_path, index=False)

places_csv_written_at = datetime.datetime.fromtimestamp(os.path.getmtime(places_csv_path))
print(places_csv_path, "CREATED", places_csv_written_at)

graphable_table_usable_usgs_thesaurus_places.csv CREATED 2021-06-07 14:25:14.050873


In [20]:
# Display the usable place names as the cell's output. For multi-match terms
# the possible_sources column holds the full list of candidate records.
df_usable_places

Unnamed: 0,term,possible_sources,valid_term,usable_term,code,name,parent,scope,thesaurus_name
0,TOGO,"[{'code': 'fTO', 'name': 'Togo', 'parent': 'fL...",True,True,fTO,Togo,fLD50,country,Common geographic areas (USGS Thesaurus)
1,Ronceverte,,True,True,q38082NEE3,Ronceverte,q38082NE,"map quadrangle, 7.5 minute",Common geographic areas (USGS Thesaurus)
2,Arlington,"[{'code': 'f51013', 'name': 'Arlington', 'pare...",True,True,f51013,Arlington,fUS51,county,Common geographic areas (USGS Thesaurus)
3,New Hanover,,True,True,f37129,New Hanover,fUS37,county,Common geographic areas (USGS Thesaurus)
4,Moriches,,True,True,q41074NEB2,Moriches,q41074NE,"map quadrangle, 7.5 minute",Common geographic areas (USGS Thesaurus)
...,...,...,...,...,...,...,...,...,...
2705,Lake Winnipesaukee,,True,True,q44072NW,Lake Winnipesaukee,q44072,"map quadrangle, 30x60 minute",Common geographic areas (USGS Thesaurus)
2706,Fort Bragg,,True,True,q40124SWB1,Fort Bragg,q40124SW,"map quadrangle, 7.5 minute",Common geographic areas (USGS Thesaurus)
2707,AUSTRALIA,,True,True,fAS,Australia,fLD60,country,Common geographic areas (USGS Thesaurus)
2708,Santa Rosa,"[{'code': 'f12113', 'name': 'Santa Rosa', 'par...",True,True,f12113,Santa Rosa,fUS12,county,Common geographic areas (USGS Thesaurus)
