This notebook uses U.S. Census data for U.S. states, accessed through the Microsoft Planetary Computer, to instantiate State and Territory items in the knowledgebase. It is a later iteration of work I developed earlier for this Wikibase instance using a somewhat different methodology and a different source. The main point of interest to build on here is a further evolution of the upsert process: we may often need to run back through a given source and build parts of items, or whole items, in a new way.

One dependency: the Legal/Statistical Area Description (LSAD) codes and their items must already have been created, using the notebook that handles them, before this notebook is run.

In [1]:
import planetary_computer
import pystac_client
import dask_geopandas
import pandas as pd

from wbmaker import WikibaseConnection

In [2]:
# Open the wbmaker connection object that every later cell uses for lookups,
# SPARQL queries and item writes.
# NOTE(review): "GEOKB_CLOUD" presumably names the target Wikibase
# configuration — confirm against wbmaker.
geokb = WikibaseConnection("GEOKB_CLOUD")

In [3]:
# Resolve the reference-source label to its item ID, then fetch that item so
# its claims can be read in the following cells.
SOURCE_ITEM_LABEL = 'U.S. State names and identifiers from U.S. Census via Microsoft Planetary Computer'
source_item_id = geokb.ref_lookup[SOURCE_ITEM_LABEL]
source_item = geokb.wbi.item.get(source_item_id)

In [4]:
# Pull the STAC connection details off the source item: the catalog URL is the
# main value of the first 'STAC Catalog URL' claim, while the collection and
# item names are stored as qualifiers on that same claim.
source_claims_json = source_item.claims.get_json()
stac_source = source_claims_json[geokb.prop_lookup['STAC Catalog URL']][0]
stac_qualifiers = stac_source['qualifiers']

stac_catalog_url = stac_source['mainsnak']['datavalue']['value']
stac_collection_name = stac_qualifiers[geokb.prop_lookup['STAC Collection Name']][0]['datavalue']['value']
stac_item_name = stac_qualifiers[geokb.prop_lookup['STAC Item Name']][0]['datavalue']['value']

In [5]:
# Open the STAC catalog (passing planetary_computer.sign_inplace as the
# modifier), then walk collection -> item -> "data" asset.
stac_catalog = pystac_client.Client.open(
    stac_catalog_url,
    modifier=planetary_computer.sign_inplace,
)
stac_collection = stac_catalog.get_collection(stac_collection_name)
stac_item = stac_collection.get_item(stac_item_name)
stac_asset = stac_item.assets["data"]

In [6]:
# Load the parquet asset lazily with dask-geopandas, using the storage options
# carried on the STAC asset itself.
parquet_storage_options = stac_asset.extra_fields["table:storage_options"]
lazy_states = dask_geopandas.read_parquet(
    stac_asset.href,
    storage_options=parquet_storage_options,
    calculate_divisions=True,
)

# Reproject to EPSG:4326 and materialize the frame in memory.
gdf_us_states = lazy_states.to_crs(epsg=4326).compute()

# Centroids are taken in the '+proj=cea' (cylindrical equal-area) projection
# and then transformed back to the frame's own CRS for storage.
cea_centroids = gdf_us_states.to_crs('+proj=cea').geometry.centroid
gdf_us_states['coordinates'] = cea_centroids.to_crs(gdf_us_states.crs)

In [7]:
# Fetch the LSAD reference data from the knowledgebase. The URL-encoded SPARQL
# below selects ?item and ?lsad for every item carrying a wdt:P90 value.
query_lsad = "PREFIX%20wdt%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Fitem%20%3Flsad%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP90%20%3Flsad%20.%0A%7D%0A"
df_lsad = geokb.wb_ref_data(query=query_lsad)

# The QID is the last path segment of the item URI.
df_lsad['lsad_qid'] = df_lsad['item'].map(lambda item_uri: item_uri.split('/')[-1])

# Left-join so every source row is kept; rows whose LSAD code has no matching
# item end up with a missing lsad_qid.
lsad_lookup = df_lsad[["lsad", "lsad_qid"]]
state_records = pd.merge(
    gdf_us_states,
    lsad_lookup,
    how="left",
    left_on="LSAD",
    right_on="lsad",
)


In [8]:
def instance_of_name(STATEFP):
    """Return the 'instance of' class label for a state FIPS code.

    STATEFP is converted with int(), so both numeric strings such as "06"
    and plain integers are accepted.

    Code 11 maps to 'U.S. federal district', any code above 56 maps to
    'U.S. Territory', and every other code maps to 'U.S. State'.
    """
    fips_number = int(STATEFP)

    if fips_number == 11:
        label = 'U.S. federal district'
    elif fips_number > 56:
        label = 'U.S. Territory'
    else:
        label = 'U.S. State'

    return label

# Derive the class label for each record from its FIPS code, then translate
# that label to the corresponding class QID via the connection's lookup.
state_records['instance_of_name'] = state_records['STATEFP'].apply(instance_of_name)
state_records['instance_of_qid'] = state_records['instance_of_name'].apply(
    lambda class_label: geokb.class_lookup[class_label]
)


In [9]:
def lookup_state(fips_alpha):
    """Query the knowledgebase for the item whose 'FIPS 5-2 alpha' value is fips_alpha.

    Returns whatever geokb.sparql_query produces for output="lookup".
    NOTE(review): the upsert loop treats a falsy result as "no existing item"
    and otherwise indexes the result by the FIPS alpha code — confirm the
    exact return shape against wbmaker.
    """
    substitutions = {
        "namespaces": geokb.sparql_namespaces(),
        "v_fips_alpha": fips_alpha,
        "p_fips_alpha": geokb.prop_lookup['FIPS 5-2 alpha'],
    }

    # The first triple pins the item by its literal code; the second binds the
    # same property to ?fips_alpha so it is returned alongside ?st.
    query_template = """
    %(namespaces)s

    SELECT ?st ?fips_alpha
    WHERE {
    ?st wdt:%(p_fips_alpha)s "%(v_fips_alpha)s" .
    ?st wdt:%(p_fips_alpha)s ?fips_alpha .
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
    }
    """

    return geokb.sparql_query(query=query_template % substitutions, output="lookup")


In [10]:
# Every claim written below cites the same source item as its data source, so
# the reference block is built once and shared.
references = geokb.models.References()
references.add(
    geokb.datatypes.Item(
        prop_nr=geokb.prop_lookup['data source'],
        value=source_item_id
    )
)

# Upsert one item per source record: reuse the existing item when the FIPS
# alpha lookup finds one, otherwise start a new item.
for _, row in state_records.iterrows():
    wb_item_id = lookup_state(row.STUSPS)
    is_new_item = not wb_item_id
    if is_new_item:
        wb_item = geokb.wbi.item.new()
    else:
        wb_item = geokb.wbi.item.get(wb_item_id[row.STUSPS])

    wb_item.labels.set('en', row.NAME)
    wb_item.aliases.set('en', row.STUSPS)
    wb_item.descriptions.set('en', f'a {row.instance_of_name}')

    claims = geokb.models.Claims()
    claims.add(
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['instance of'],
            value=row.instance_of_qid,
            references=references
        )
    )

    # External identifiers, as (property label, value) pairs in the order the
    # claims are added.
    external_ids = (
        ('FIPS 5-2 alpha', row.STUSPS),
        ('FIPS 5-2 numeric', str(row.STATEFP)),
        ('FIPS 10-4', f"US{row.STATEFP}"),
        ('ISO 3166-2 code', f"US-{row.STUSPS}"),
        ('TIGER GEOID', str(row.GEOID)),
        ('GNIS ID', str(row.STATENS)),
    )
    for prop_label, id_value in external_ids:
        claims.add(
            geokb.datatypes.ExternalID(
                prop_nr=geokb.prop_lookup[prop_label],
                value=id_value,
                references=references
            )
        )

    claims.add(
        geokb.datatypes.GlobeCoordinate(
            prop_nr=geokb.prop_lookup['coordinate location'],
            latitude=row.coordinates.y,
            longitude=row.coordinates.x,
            references=references
        )
    )
    # NOTE(review): lsad_qid comes from a left merge, so it is missing for any
    # record whose LSAD code had no matching item — confirm every row matched
    # before running this cell.
    claims.add(
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['Legal/Statistical Area Description'],
            value=row.lsad_qid,
            references=references
        )
    )

    wb_item.add_claims(claims=claims)

    # Report what actually happened: items that came from item.new() were
    # created on this write, not updated.
    action = "Created" if is_new_item else "Updated"
    response = wb_item.write(
        summary=f"{action} item from Census source",
        clear=True
    )
    print(f"{action.upper()}:", row.NAME, response.id)

UPDATED: Guam Q26696
UPDATED: Texas Q235
UPDATED: Wisconsin Q267
UPDATED: Rhode Island Q264
UPDATED: New York Q253
UPDATED: United States Virgin Islands Q26697
UPDATED: New Hampshire Q243
UPDATED: Minnesota Q239
UPDATED: Puerto Rico Q26698
UPDATED: Missouri Q244
UPDATED: North Carolina Q251
UPDATED: Michigan Q230
UPDATED: Louisiana Q231
UPDATED: Nebraska Q241
UPDATED: California Q233
UPDATED: Wyoming Q248
UPDATED: South Carolina Q275
UPDATED: Commonwealth of the Northern Mariana Islands Q26699
UPDATED: Kansas Q269
UPDATED: Delaware Q262
UPDATED: Alaska Q254
UPDATED: New Jersey Q240
UPDATED: North Dakota Q278
UPDATED: District of Columbia Q26700
UPDATED: Colorado Q256
UPDATED: Virginia Q249
UPDATED: Indiana Q272
UPDATED: Nevada Q255
UPDATED: New Mexico Q252
UPDATED: Alabama Q246
UPDATED: Tennessee Q279
UPDATED: Kentucky Q245
UPDATED: Oregon Q266
UPDATED: Mississippi Q238
UPDATED: Connecticut Q271
UPDATED: Georgia Q234
UPDATED: Utah Q236
UPDATED: Idaho Q268
UPDATED: Illinois Q242
UPDATED