This notebook processes the USGS web inventory table of "Science Centers" to add or clarify claims in the GeoKB for these organizational entities. The table is an imperfect source because it is not authoritative, but it is the best public-facing listing of "USGS Centers." From each row we extract three things: the center name, which we link to an existing GeoKB item where one exists; the USGS Region, which we use to establish a "part of" claim; and the states/territories, which we record as "operating area" claims. Finally, we work back from the Regions to add "has part" claims for the Centers within each Region.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from wbmaker import WikibaseConnection

In [2]:
# Connection object used throughout the notebook for SPARQL reference queries
# (wb_ref_data), item retrieval/writes (wbi), and claim construction
# (models, datatypes, prop_lookup).
# NOTE(review): "GEOKB_CLOUD" presumably names a configured instance/credential
# profile resolved by wbmaker — confirm against the wbmaker documentation.
geokb = WikibaseConnection("GEOKB_CLOUD")

In [3]:
# Site root; prepended to the relative center links found in the listing table.
usgs_base_url = "https://www.usgs.gov"
# Public listing page that holds the Science Centers table.
usgs_sc_url = f"{usgs_base_url}/science/science-centers"

# Science Centers from Web Listing

In [14]:
science_centers = []

# Fail here, with a clear HTTP error, rather than later with a NameError on
# df_science_centers when the page could not be fetched. The timeout keeps a
# hung connection from blocking the kernel indefinitely.
r_sc = requests.get(usgs_sc_url, timeout=30)
r_sc.raise_for_status()

soup_sc = BeautifulSoup(r_sc.content, 'html.parser')
sc_table = soup_sc.find('table')
if sc_table is None:
    raise ValueError(f"No table found at {usgs_sc_url}; the page layout may have changed")

# Some markup omits an explicit <tbody>; fall back to the table's own rows.
table_body = sc_table.find('tbody')
if table_body is None:
    table_body = sc_table

for row in table_body.find_all('tr'):
    columns = row.find_all('td')
    # Header rows (<th> cells) and malformed rows lack the four expected
    # cells: name, director, region, states/territories.
    if len(columns) < 4:
        continue

    # Links are optional in practice; record None instead of crashing when a
    # cell has no anchor or the anchor has no href.
    sc_anchor = columns[0].find('a')
    sc_href = sc_anchor.get('href') if sc_anchor is not None else None
    region_anchor = columns[2].find('a')
    region_href = region_anchor.get('href') if region_anchor is not None else None

    # Normalize the free-text state list: treat " and " as a separator, drop
    # periods, and correct the known "Louisana" misspelling on the source page.
    states_text = (
        columns[3].text
        .replace(' and ', ', ')
        .replace('.', '')
        .replace('Louisana', 'Louisiana')
    )

    science_centers.append({
        "sc_name": columns[0].text.strip(),
        "sc_link": f"{usgs_base_url}{sc_href}" if sc_href else None,
        "sc_director": columns[1].text.strip(),
        "sc_region": columns[2].text.strip(),
        "sc_region_link": region_href,
        "states_territories": [i.strip() for i in states_text.split(",")]
    })

if not science_centers:
    raise ValueError(f"No science center rows parsed from {usgs_sc_url}")

df_science_centers = pd.DataFrame(science_centers)

# Names may carry a parenthetical acronym, e.g. "Some Center (ABC)": keep the
# acronym separately and use the text before the parenthesis as the label that
# is matched against GeoKB item labels.
df_science_centers["sc_name_acronym"] = df_science_centers["sc_name"].str.extract(r'\((.*?)\)')
df_science_centers["sc_label"] = df_science_centers["sc_name"].apply(lambda x: x.split('(')[0].strip())

# GeoKB References for Linking

In [5]:
def _qid_from_uri(entity_uri):
    """Return the trailing path segment of an entity URI (the QID)."""
    return entity_uri.split("/")[-1]


# URL-encoded SPARQL, one string fragment per line of the query. Decoded, it
# selects ?item ?itemLabel ?itemAltLabel ?url ?instance_ofLabel for every item
# reachable from wd:Q44210 via wdt:P62* (annotated in the query itself as
# transitive "part of" "USGS"), with wdt:P1 bound to ?instance_of and an
# optional wdt:P31 bound to ?url.
q_geokb_orgs = (
    "PREFIX%20wd%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fentity%2F%3E%0A"
    "PREFIX%20wdt%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A"
    "%0A"
    "SELECT%20%3Fitem%20%3FitemLabel%20%3FitemAltLabel%20%3Furl%20%3Finstance_ofLabel%20%23%20Aliases%20can%20contain%20alternate%20names%20and%20acronyms%20for%20organizations%0A"
    "WHERE%20%7B%0A"
    "%20%20%3Fitem%20wdt%3AP62*%20wd%3AQ44210%20.%20%23%20%22part%20of%22%20(transitive)%20%22USGS%22%0A"
    "%20%20%3Fitem%20wdt%3AP1%20%3Finstance_of%20.%0A"
    "%20%20OPTIONAL%20%7B%0A"
    "%20%20%20%20%3Fitem%20wdt%3AP31%20%3Furl%20.%0A"
    "%20%20%7D%0A"
    "%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20.%20%7D%0A"
    "%7D"
)
df_geokb_orgs = geokb.wb_ref_data(query=q_geokb_orgs)
df_geokb_orgs["qid"] = df_geokb_orgs["item"].map(_qid_from_uri)

# Decoded, this selects ?item ?itemLabel ?fips_alpha for every item that has a
# wdt:P13 value (bound to ?fips_alpha); used to resolve state/territory names.
q_states_territories = (
    "PREFIX%20wdt%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A"
    "%0A"
    "SELECT%20%3Fitem%20%3FitemLabel%20%3Ffips_alpha%0A"
    "WHERE%20%7B%0A"
    "%20%20%3Fitem%20wdt%3AP13%20%3Ffips_alpha%20.%20%0A"
    "%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20.%20%7D%0A"
    "%7D"
)
df_states_territories = geokb.wb_ref_data(query=q_states_territories)
df_states_territories["qid"] = df_states_territories["item"].map(_qid_from_uri)


# Prep Data

In [17]:
# Label -> QID lookups. The organization query yields one row per
# item/instance_of/url combination, so the same (qid, label) pair can appear
# several times; without de-duplication each left merge below would fan out
# and duplicate science center rows (and therefore duplicate writes).
org_label_lookup = df_geokb_orgs[["qid", "itemLabel"]].drop_duplicates()
state_label_lookup = df_states_territories[["qid", "itemLabel"]].drop_duplicates()

# Resolve each scraped center label to an existing GeoKB item (sc_qid).
df_science_centers_to_geokb = pd.merge(
    left=df_science_centers,
    right=org_label_lookup.rename(columns={"qid": "sc_qid", "itemLabel": "sc_label"}),
    how="left",
    on="sc_label"
)

# Resolve each center's region name to the region's GeoKB item (region_qid).
df_science_centers_to_geokb = pd.merge(
    left=df_science_centers_to_geokb,
    right=org_label_lookup.rename(columns={"qid": "region_qid", "itemLabel": "sc_region"}),
    how="left",
    on="sc_region"
)

# One row per (center, state/territory name).
sc_st_list = df_science_centers_to_geokb[["sc_qid","states_territories"]].explode("states_territories")

# Map state/territory names to QIDs, ignoring the empty strings left over from
# splitting the free-text list, then collapse back to one list per center.
# Names with no GeoKB match remain as NaN entries in the list.
sc_states_territories = pd.merge(
    left=sc_st_list[sc_st_list.states_territories.str.len() > 0],
    right=state_label_lookup.rename(columns={"qid": "state_territory_qid", "itemLabel": "states_territories"}),
    how="left",
    on="states_territories"
)[["sc_qid","state_territory_qid"]].groupby(by="sc_qid", as_index=False).agg(list)

df_science_centers_to_geokb = pd.merge(
    left=df_science_centers_to_geokb,
    right=sc_states_territories,
    how="left",
    on="sc_qid"
)

# Classify by label: Water Science Centers get Q50863, everything else Q50870.
df_science_centers_to_geokb["instance_of"] = df_science_centers_to_geokb["sc_label"].apply(lambda x: "Q50863" if "Water Science Center" in x else "Q50870")

# Preview

In [7]:
# Spot-check the merged frame before any writes: sc_qid, region_qid and
# state_territory_qid should be populated for centers that matched GeoKB items.
df_science_centers_to_geokb.head()

Unnamed: 0,sc_name,sc_link,sc_director,sc_region,sc_region_link,states_territories,sc_name_acronym,sc_label,sc_qid,region_qid,state_territory_qid,instance_of
0,Alaska Science Center,https://www.usgs.gov/centers/asc,Christian Zimmerman,Alaska Region,https://www.usgs.gov/regions/alaska,[Alaska],,Alaska Science Center,Q44211,Q44362,[Q254],Q50870
1,Arizona Water Science Center,https://www.usgs.gov/centers/az-water,James Leenhouts,Southwest Region,https://www.usgs.gov/regions/southwest,[Arizona],,Arizona Water Science Center,Q44219,Q44358,[Q247],Q50863
2,Astrogeology Science Center,https://www.usgs.gov/centers/astrogeology-scie...,Justin Hagerty,Southwest Region,https://www.usgs.gov/regions/southwest,[Arizona],,Astrogeology Science Center,Q44235,Q44358,[Q247],Q50870
3,California Water Science Center,https://www.usgs.gov/centers/ca-water,Anke Mueller-Solger,Southwest Region,https://www.usgs.gov/regions/southwest,[California],,California Water Science Center,Q44227,Q44358,[Q233],Q50863
4,Caribbean-Florida Water Science Center,https://www.usgs.gov/centers/car-fl-water,David Sumner,Southeast Region,https://www.usgs.gov/regions/southeast,"[Puerto Rico, Florida]",,Caribbean-Florida Water Science Center,Q44285,Q44343,"[Q26698, Q273]",Q50863


# Commit Center Claims

In [21]:
from wikibaseintegrator.wbi_enums import ActionIfExists, WikibaseDatePrecision

# Every claim written from this notebook cites the USGS listing page.
refs = geokb.models.References()
refs.add(
    geokb.datatypes.URL(
        prop_nr=geokb.prop_lookup['reference URL'],
        value=usgs_sc_url
    )
)

# Every claim is qualified with a year-precision "point in time" of 2023.
quals = geokb.models.Qualifiers()
quals.add(
    geokb.datatypes.Time(
        prop_nr=geokb.prop_lookup['point in time'],
        time='+2023-01-01T00:00:00Z',
        precision=WikibaseDatePrecision.YEAR
    )
)

for index, row in df_science_centers_to_geokb.iterrows():
    # Centers whose label did not match a GeoKB item carry NaN here; there is
    # no item to update, so report and move on instead of aborting the run
    # partway through the writes.
    if not isinstance(row["sc_qid"], str):
        print(row["sc_label"], "skipped: no matching GeoKB item")
        continue

    item = geokb.wbi.item.get(row['sc_qid'])

    # REPLACE_ALL is passed so that re-running the cell replaces existing
    # statements for the property rather than appending to them (see
    # wikibaseintegrator's ActionIfExists documentation).
    instance_of_claim = geokb.datatypes.Item(
        prop_nr=geokb.prop_lookup['instance of'],
        value=row["instance_of"],
        qualifiers=quals,
        references=refs
    )

    item.claims.add(
       claims=instance_of_claim,
       action_if_exists=ActionIfExists.REPLACE_ALL
    )

    # The region is NaN when its name did not match a GeoKB item; leave any
    # existing "part of" claim untouched in that case.
    if isinstance(row["region_qid"], str):
        part_of_claim = geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['part of'],
            value=row["region_qid"],
            qualifiers=quals,
            references=refs
        )

        item.claims.add(
            claims=part_of_claim,
            action_if_exists=ActionIfExists.REPLACE_ALL
        )

    # The state list is NaN (not a list) for centers with no usable
    # state/territory names, and may contain NaN entries for unmatched names.
    st_qids = row["state_territory_qid"]
    if not isinstance(st_qids, list):
        st_qids = []

    ao_claims = [
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['operating area'],
            value=st_qid,
            qualifiers=quals,
            references=refs
        )
        for st_qid in st_qids
        if isinstance(st_qid, str) and len(st_qid) > 0
    ]

    item.claims.add(
        claims=ao_claims,
        action_if_exists=ActionIfExists.REPLACE_ALL
    )

    response = item.write(
        summary="Built out Science Center claims from USGS web listing"
    )
    print(row["sc_label"], response.id)

Nebraska Water Science Center Q44268
Nevada Water Science Center Q44221
New England Water Science Center Q44220
New Jersey Water Science Center Q44240
New Mexico Water Science Center Q44286
New York Water Science Center Q44241
Northern Prairie Wildlife Research Center Q44256
Northern Rocky Mountain Science Center Q44222
Ohio-Kentucky-Indiana Water Science Center Q44345
Oklahoma-Texas Water Science Center Q44275
Oregon Water Science Center Q44242
Pacific Coastal and Marine Science Center Q44243
Pacific Island Ecosystems Research Center Q44317
Pacific Islands Water Science Center Q44282
Pennsylvania Water Science Center Q44270
Science and Decisions Center Q44296
South Atlantic Water Science Center Q44236
Southwest Biological Science Center Q44252
St. Petersburg Coastal and Marine Science Center Q44247
Upper Midwest Environmental Sciences Center Q44383
Upper Midwest Water Science Center Q44326
Utah Water Science Center Q44269
Virginia and West Virginia Water Science Center Q44266
Volcano 

Service unavailable (HTTP Code 504). Sleeping for 60 seconds.


Western Ecological Research Center Q44324
Western Fisheries Research Center Q44284
Western Geographic Science Center Q44245
Wetland and Aquatic Research Center Q44217
Woods Hole Coastal and Marine Science Center Q44230
Wyoming-Montana Water Science Center Q44290


# Regional "has part" Claims

In [23]:
# One row per region with the list of its science center QIDs. Rows where
# either QID is missing (labels that did not match a GeoKB item) are dropped so
# no claim is built with a NaN value, and repeated pairs are collapsed so each
# center is claimed once per region.
region_centers = (
    df_science_centers_to_geokb[["region_qid","sc_qid"]]
    .dropna(subset=["region_qid", "sc_qid"])
    .drop_duplicates()
    .groupby("region_qid", as_index=False)
    .agg(list)
)

for index, row in region_centers.iterrows():
    item = geokb.wbi.item.get(row["region_qid"])

    # Reuses the qualifiers and references built in the center-claims cell.
    has_part_claims = [
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['has part'],
            value=sc_qid,
            qualifiers=quals,
            references=refs
        )
        for sc_qid in row["sc_qid"]
    ]

    item.claims.add(
        claims=has_part_claims,
        action_if_exists=ActionIfExists.REPLACE_ALL
    )

    response = item.write(
        summary="Added Science Centers as has part claims to Region"
    )
    print(response.id)

Q44338
Q44343
Q44348
Q44357
Q44358
Q44362
Q44363
