The original version of this process used U.S. Census data for U.S. states from the Microsoft Planetary Computer (MPC) to instantiate State and Territory items in the knowledge base. I'm now updating it with some current thinking on this kind of process. Leveraging the MPC remains a highly efficient option for anything involving Census data, however, so I've kept the older approach below as notes for future reference.

In [1]:
from wbmaker import WikibaseConnection
import pandas as pd

# Open the connection object used for every GeoKB read and write in this notebook.
# NOTE(review): "GEOKB_CLOUD" presumably names the connection/credential profile
# that wbmaker loads — confirm against the wbmaker configuration.
geokb = WikibaseConnection("GEOKB_CLOUD")

In [2]:
def aggregator(x):
    """Collapse a group of values to a single string or a sorted list.

    Args:
        x: A pandas Series (or any object exposing ``unique()``) holding the
            values of one group.

    Returns:
        ``str`` of the only distinct value when the group has exactly one
        distinct value; otherwise a sorted list of the distinct values (an
        empty list for an empty group).
    """
    # Compute the distinct values once instead of up to three times.
    distinct = x.unique()
    if len(distinct) == 1:
        return str(distinct[0])
    # sorted() already returns a list, so no extra list() wrapper is needed.
    return sorted(distinct)

# GeoKB side: items that carry every identifier bound below, plus an optional
# existing "same as" value and the label of what each item is an instance of.
geokb_states_query = """
PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>

SELECT ?state ?stateLabel ?stateAltLabel ?stateDescription
?iso3166_2 ?fips_52_alpha ?fips_52_numeric ?fips_10_4 ?gnis_id
?same_as ?instance_ofLabel
WHERE {
  ?state wdt:P1 ?instance_of ;
         wdt:P12 ?iso3166_2 ;
         wdt:P13 ?fips_52_alpha ;
         wdt:P14 ?fips_52_numeric ;
         wdt:P47 ?fips_10_4 ;
         wdt:P21 ?gnis_id .
  OPTIONAL {
    ?state wdt:P84 ?same_as .
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
"""

# Wikidata side: items that carry the same set of identifier variables.
wd_state_query = """
SELECT ?state ?stateLabel ?stateAltLabel ?stateDescription
?iso3166_2 ?fips_52_alpha ?fips_52_numeric ?fips_10_4 ?gnis_id
WHERE {
  ?state wdt:P300 ?iso3166_2 ;
         wdt:P5086 ?fips_52_alpha ;
         wdt:P5087 ?fips_52_numeric ;
         wdt:P901 ?fips_10_4 ;
         wdt:P590 ?gnis_id .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
"""

# Run the GeoKB query against the connection's default endpoint.
geokb_states = geokb.sparql_query(geokb_states_query)
# The QID is the final path segment of the entity URI.
geokb_states['qid'] = [uri.split('/')[-1] for uri in geokb_states['state']]

# Run the Wikidata query against the public Wikidata SPARQL endpoint.
wd_states = geokb.sparql_query(
    endpoint="https://query.wikidata.org/sparql",
    query=wd_state_query
)

In [3]:
# Left-join the GeoKB states to the Wikidata states on the shared FIPS 10-4
# code, keeping the Wikidata entity URI under the name "wd_item".
geokb_columns = geokb_states[['qid','stateLabel','fips_10_4','same_as']]
wikidata_columns = wd_states[['state','fips_10_4']].rename(columns={'state':'wd_item'})
merged_states = pd.merge(
    left=geokb_columns,
    right=wikidata_columns,
    how='left',
    on='fips_10_4'
)

# Preview the GeoKB states that have no "same as" value yet.
missing_same_as = merged_states['same_as'].isna()
merged_states[missing_same_as].head()

Unnamed: 0,qid,stateLabel,fips_10_4,same_as,wd_item
0,Q230,Michigan,US26,,http://www.wikidata.org/entity/Q1166
1,Q231,Louisiana,US22,,http://www.wikidata.org/entity/Q1588
2,Q232,Oklahoma,US40,,http://www.wikidata.org/entity/Q1649
3,Q233,California,US06,,http://www.wikidata.org/entity/Q99
4,Q234,Georgia,US13,,http://www.wikidata.org/entity/Q1428


In [6]:
# Add a "same as" claim pointing at the matching Wikidata item to every GeoKB
# state that does not have one yet.
#
# merged_states comes from a left merge, so a GeoKB state with no Wikidata
# match has NaN in wd_item; those rows are skipped so that no URL claim is
# built from a missing value. Identical (qid, wd_item) rows are collapsed so
# each link is written only once.
needs_link = (
    merged_states['same_as'].isna()
    & merged_states['wd_item'].notna()
)
link_rows = merged_states[needs_link].drop_duplicates(subset=['qid', 'wd_item'])

for _, row in link_rows.iterrows():
    # Load the current GeoKB item so the new claim is added to its existing content.
    item = geokb.wbi.item.get(row['qid'])

    item.claims.add(
        geokb.datatypes.URL(
            prop_nr=geokb.prop_lookup['same as'],
            value=row['wd_item']
        )
    )

    response = item.write(
        summary="Added same as link to Wikidata item"
    )
    # Echo the written item's id as a simple progress log.
    print(response.id)

Q230
Q231
Q232
Q233
Q234
Q235
Q236
Q237
Q238
Q239
Q240
Q241
Q242
Q243
Q244
Q245
Q246
Q247
Q248
Q249
Q250
Q251
Q252
Q253
Q254
Q255
Q256
Q257
Q258
Q259
Q260
Q261
Q262
Q263
Q264
Q265
Q266
Q267
Q268
Q269
Q270
Q271
Q272
Q273
Q274
Q275
Q276
Q277
Q278
Q279
Q26700


# Older Approach

In [None]:
import planetary_computer
import pystac_client
import dask_geopandas
import pandas as pd

from wbmaker import WikibaseConnection

In [None]:
# Open the connection object used by the older, Census/STAC-based workflow below.
# NOTE(review): "GEOKB_CLOUD" presumably names the connection/credential profile
# that wbmaker loads — confirm against the wbmaker configuration.
geokb = WikibaseConnection("GEOKB_CLOUD")

In [None]:
# Resolve the reference item that describes the Census/Planetary Computer
# source by its label, then load the full item.
source_item_label = 'U.S. State names and identifiers from U.S. Census via Microsoft Planetary Computer'
source_item_id = geokb.ref_lookup[source_item_label]
source_item = geokb.wbi.item.get(source_item_id)

In [None]:
# Read the STAC connection details stored on the source item: the catalog URL
# is the claim's main value; the collection and item names are its qualifiers.
source_claims = source_item.claims.get_json()
stac_source = source_claims[geokb.prop_lookup['STAC Catalog URL']][0]  # first claim only
stac_catalog_url = stac_source['mainsnak']['datavalue']['value']

stac_qualifiers = stac_source['qualifiers']
stac_collection_name = stac_qualifiers[geokb.prop_lookup['STAC Collection Name']][0]['datavalue']['value']
stac_item_name = stac_qualifiers[geokb.prop_lookup['STAC Item Name']][0]['datavalue']['value']

In [None]:
# Open the STAC catalog (signing requests through planetary_computer) and walk
# catalog -> collection -> item -> "data" asset.
stac_catalog = pystac_client.Client.open(
    stac_catalog_url,
    modifier=planetary_computer.sign_inplace,
)
stac_collection = stac_catalog.get_collection(stac_collection_name)
stac_item = stac_collection.get_item(stac_item_name)
stac_asset = stac_item.assets["data"]

In [None]:
# Load the states parquet asset lazily, reproject to EPSG:4326, and
# materialize the result in memory.
lazy_states = dask_geopandas.read_parquet(
    stac_asset.href,
    storage_options=stac_asset.extra_fields["table:storage_options"],
    calculate_divisions=True,
)
gdf_us_states = lazy_states.to_crs(epsg=4326).compute()

# Compute each centroid in the cylindrical equal-area projection (+proj=cea),
# then convert the points back to the frame's own CRS.
equal_area_states = gdf_us_states.to_crs('+proj=cea')
equal_area_centroids = equal_area_states.geometry.centroid
gdf_us_states['coordinates'] = equal_area_centroids.to_crs(gdf_us_states.crs)

In [None]:
# Fetch the GeoKB items that carry an LSAD value. The URL-encoded query decodes
# to: SELECT ?item ?lsad WHERE { ?item wdt:P90 ?lsad . } with the GeoKB wdt prefix.
query_lsad = "PREFIX%20wdt%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Fitem%20%3Flsad%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP90%20%3Flsad%20.%0A%7D%0A"
df_lsad = geokb.wb_ref_data(query=query_lsad)
# The QID is the final path segment of the item URI.
df_lsad['lsad_qid'] = df_lsad['item'].apply(lambda uri: uri.split('/')[-1])

# Attach the LSAD QID to every state record; a left join keeps states whose
# LSAD value has no match.
lsad_lookup = df_lsad[["lsad","lsad_qid"]]
state_records = pd.merge(
    left=gdf_us_states,
    right=lsad_lookup,
    how="left",
    left_on="LSAD",
    right_on="lsad"
)


In [None]:
# Set the instance of for the item based on FIPS code
def instance_of_name(STATEFP):
    """Return the class label for a record based on its state FIPS code.

    Args:
        STATEFP: State FIPS code; anything ``int()`` accepts (e.g. ``"06"``).

    Returns:
        ``'U.S. federal district'`` for code 11, ``'U.S. Territory'`` for
        codes above 56, and ``'U.S. State'`` for everything else.

    Raises:
        ValueError: If ``STATEFP`` is a string that ``int()`` cannot parse.
    """
    fips_number = int(STATEFP)
    if fips_number == 11:
        label = 'U.S. federal district'
    elif fips_number > 56:
        label = 'U.S. Territory'
    else:
        label = 'U.S. State'
    return label

# Label every record from its FIPS code, then resolve each label through
# geokb.class_lookup to the value stored as instance_of_qid.
state_records['instance_of_name'] = state_records['STATEFP'].apply(instance_of_name)
state_records['instance_of_qid'] = state_records['instance_of_name'].apply(
    lambda class_name: geokb.class_lookup[class_name]
)


In [None]:
def lookup_state(fips_alpha):
    """Query GeoKB for the item whose FIPS 5-2 alpha code equals ``fips_alpha``.

    Args:
        fips_alpha: FIPS 5-2 alpha code to match; it is substituted into the
            query as a quoted literal.

    Returns:
        Whatever ``geokb.sparql_query`` produces for ``output="lookup"``.
        NOTE(review): the caller in this notebook treats it as a mapping keyed
        by the FIPS alpha code that is falsy when nothing matched — confirm
        against wbmaker.
    """
    template = """
    %(namespaces)s

    SELECT ?st ?fips_alpha
    WHERE {
    ?st wdt:%(p_fips_alpha)s "%(v_fips_alpha)s" .
    ?st wdt:%(p_fips_alpha)s ?fips_alpha .
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
    }
    """
    substitutions = {
        "namespaces": geokb.sparql_namespaces(),
        "v_fips_alpha": fips_alpha,
        "p_fips_alpha": geokb.prop_lookup['FIPS 5-2 alpha']
    }
    rendered_query = template % substitutions

    return geokb.sparql_query(query=rendered_query, output="lookup")


In [None]:
# A single reference block pointing at the source item is attached to every
# claim written below.
references = geokb.models.References()
source_reference = geokb.datatypes.Item(
    prop_nr=geokb.prop_lookup['data source'],
    value=source_item_id
)
references.add(source_reference)

for index, row in state_records.iterrows():
    # Reuse the existing GeoKB item for this FIPS alpha code when there is one;
    # otherwise start a new item.
    matches = lookup_state(row.STUSPS)
    if not matches:
        wb_item = geokb.wbi.item.new()
    else:
        wb_item = geokb.wbi.item.get(matches[row.STUSPS])

    wb_item.labels.set('en', row.NAME)
    wb_item.aliases.set('en', row.STUSPS)
    wb_item.descriptions.set('en', f'a {row.instance_of_name}')

    # (property label, value) pairs for the external-identifier claims, in the
    # order they are added to the item.
    external_ids = (
        ('FIPS 5-2 alpha', row.STUSPS),
        ('FIPS 5-2 numeric', str(row.STATEFP)),
        ('FIPS 10-4', f"US{row.STATEFP}"),
        ('ISO 3166-2 code', f"US-{row.STUSPS}"),
        ('TIGER GEOID', str(row.GEOID)),
        ('GNIS ID', str(row.STATENS)),
    )

    claims = geokb.models.Claims()
    claims.add(
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['instance of'],
            value=row.instance_of_qid,
            references=references
        )
    )
    for prop_label, id_value in external_ids:
        claims.add(
            geokb.datatypes.ExternalID(
                prop_nr=geokb.prop_lookup[prop_label],
                value=id_value,
                references=references
            )
        )
    # The centroid point stores longitude in x and latitude in y.
    claims.add(
        geokb.datatypes.GlobeCoordinate(
            prop_nr=geokb.prop_lookup['coordinate location'],
            latitude=row.coordinates.y,
            longitude=row.coordinates.x,
            references=references
        )
    )
    claims.add(
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['Legal/Statistical Area Description'],
            value=row.lsad_qid,
            references=references
        )
    )

    wb_item.add_claims(claims=claims)
    # NOTE(review): clear=True presumably wipes the item's existing content
    # before writing — confirm against the WikibaseIntegrator documentation.
    response = wb_item.write(
        summary="Updated item from Census source",
        clear=True
    )
    print("UPDATED:", row.NAME, response.id)