This notebook is a further reworking of the process, this time for U.S. counties, following the example I just set with U.S. states from the Microsoft Planetary Computer (MPC) source.

In [1]:
import planetary_computer
import pystac_client
import dask_geopandas
import pandas as pd

from wbmaker import WikibaseConnection

In [2]:
# Connection object used by every cell below: it supplies the lookups
# (ref_lookup, prop_lookup, class_lookup), the `wbi` client, the
# models/datatypes factories, and the `wb_ref_data` query helper.
# NOTE(review): 'EEW' presumably selects the eew-edgi.wikibase.cloud instance
# that the SPARQL queries below point at — confirm against wbmaker.
eew = WikibaseConnection('EEW')

In [3]:
# Resolve the Wikibase item describing the data source to process:
# label -> item id via ref_lookup, then fetch the item itself.
source_label = 'U.S. County names and identifiers from U.S. Census via Microsoft Planetary Computer'
source_item_id = eew.ref_lookup[source_label]
source_item = eew.wbi.item.get(source_item_id)

In [4]:
# Read the STAC connection details from the source item's first
# 'STAC catalog URL' statement: the URL is the statement's main value, while
# the collection and item names are taken from qualifiers on that statement.
source_claims_json = source_item.claims.get_json()
stac_source = source_claims_json[eew.prop_lookup['STAC catalog URL']][0]
stac_catalog_url = stac_source['mainsnak']['datavalue']['value']

stac_qualifiers = stac_source['qualifiers']
collection_qualifier = stac_qualifiers[eew.prop_lookup['STAC Collection Name']][0]
stac_collection_name = collection_qualifier['datavalue']['value']
item_qualifier = stac_qualifiers[eew.prop_lookup['STAC Item Name']][0]
stac_item_name = item_qualifier['datavalue']['value']

In [5]:
# Open the STAC catalog (with planetary_computer.sign_inplace as the
# modifier), then walk collection -> item -> "data" asset.
stac_catalog = pystac_client.Client.open(stac_catalog_url, modifier=planetary_computer.sign_inplace)
stac_collection = stac_catalog.get_collection(stac_collection_name)
stac_item = stac_collection.get_item(stac_item_name)
stac_asset = stac_item.assets["data"]

In [6]:
# Read the county parquet asset through dask_geopandas, reproject it to
# EPSG:4326 and materialize the result with .compute().
ddf_us_counties = dask_geopandas.read_parquet(
    stac_asset.href,
    storage_options=stac_asset.extra_fields["table:storage_options"],
    calculate_divisions=True,
)
gdf_us_counties = ddf_us_counties.to_crs(epsg=4326).compute()

# Centroids are taken in the '+proj=cea' projection and then converted back
# to the frame's own CRS before being stored as the 'coordinates' column.
cea_centroids = gdf_us_counties.to_crs('+proj=cea').geometry.centroid
gdf_us_counties['coordinates'] = cea_centroids.to_crs(gdf_us_counties.crs)

In [7]:
# Find county items that already exist in the Wikibase (items with
# P1 = Q655 that carry a P36 value, returned as ?county / ?GEOID) so that
# matching rows can later be updated instead of created.
query_county_geoid = "PREFIX%20wd%3A%20%3Chttps%3A%2F%2Feew-edgi.wikibase.cloud%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttps%3A%2F%2Feew-edgi.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Fcounty%20%3FGEOID%0AWHERE%20%7B%0A%20%20%3Fcounty%20wdt%3AP1%20wd%3AQ655%20.%0A%20%20%3Fcounty%20wdt%3AP36%20%3FGEOID%20.%0A%7D%0A"
df_geoid_lookup = eew.wb_ref_data(query=query_county_geoid)
# The QID is the last '/'-separated segment of the ?county value
df_geoid_lookup["county_qid"] = df_geoid_lookup["county"].map(lambda uri: uri.rsplit('/', 1)[-1])

# Left merge: rows without a match keep a missing county_qid
existing_county_qids = df_geoid_lookup[["GEOID", "county_qid"]]
gdf_us_counties = pd.merge(gdf_us_counties, existing_county_qids, how="left", on="GEOID")

# Force the QID for two specific GEOIDs, whatever the lookup returned
geoid_qid_overrides = {'48245': 'Q1995', '13113': 'Q3613'}
for override_geoid, override_qid in geoid_qid_overrides.items():
    gdf_us_counties['county_qid'] = gdf_us_counties['county_qid'].mask(
        gdf_us_counties['GEOID'] == override_geoid,
        override_qid,
    )

In [8]:
# Lookup QID for LSAD (items carrying a P54 value) and merge with source data
query_lsad = "PREFIX%20wdt%3A%20%3Chttps%3A%2F%2Feew-edgi.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Fitem%20%3FLSAD%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP54%20%3FLSAD%20.%0A%7D%0A"
df_lsad = eew.wb_ref_data(query=query_lsad)

# Keep only LSAD codes that occur in the county data. The explicit .copy()
# makes the filtered frame independent of the query result, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
df_lsad = df_lsad[df_lsad['LSAD'].isin(gdf_us_counties['LSAD'])].copy()

# The QID is the last '/'-separated segment of the ?item value
df_lsad['lsad_qid'] = df_lsad['item'].apply(lambda x: x.split('/')[-1])

# Item Q30120 is excluded from the lookup before merging.
# NOTE(review): presumably it shares an LSAD code with another item and
# would otherwise duplicate county rows in the merge — confirm.
df_lsad = df_lsad[df_lsad['lsad_qid'] != 'Q30120']

gdf_us_counties = pd.merge(
    left=gdf_us_counties,
    right=df_lsad[["LSAD", "lsad_qid"]],
    how="left",
    on="LSAD"
)

In [9]:
# Fetch every item that carries a P22 value (returned as ?STATEFP), derive
# its QID, and attach that QID to the counties by matching on STATEFP.
query_state_fips = "PREFIX%20wdt%3A%20%3Chttps%3A%2F%2Feew-edgi.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Fitem%20%3FSTATEFP%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP22%20%3FSTATEFP%20.%0A%7D%0A"
df_state_fips = eew.wb_ref_data(query=query_state_fips)
# The QID is the last '/'-separated segment of the ?item value
df_state_fips['state_qid'] = df_state_fips['item'].map(lambda uri: uri.rsplit('/', 1)[-1])

state_qid_columns = df_state_fips[["STATEFP", "state_qid"]]
gdf_us_counties = pd.merge(gdf_us_counties, state_qid_columns, how="left", on="STATEFP")

In [14]:
# Build the reference block once; it holds a single 'data source' statement
# pointing at the county source item.
source_reference = eew.datatypes.Item(
    prop_nr=eew.prop_lookup['data source'],
    value=eew.ref_lookup['U.S. County names and identifiers from U.S. Census via Microsoft Planetary Computer'],
)
references = eew.models.References()
references.add(source_reference)

# Create or update one Wikibase item per county row, attaching the shared
# `references` to every claim.
for index, row in gdf_us_counties.iterrows():
    # A string county_qid means an existing item was matched; anything else
    # (the left merges leave missing values) means a new item is created.
    if isinstance(row.county_qid, str):
        county_item = eew.wbi.item.get(row.county_qid)
        action = "Updated"
    else:
        county_item = eew.wbi.item.new()
        action = "Added"

    county_label = f"{row.NAMELSAD}, {row.STATE_NAME}"
    county_item.labels.set('en', county_label)
    county_item.descriptions.set('en', f"a county in {row.STATE_NAME}")
    county_item.aliases.set('en', [row.NAMELSAD, row.NAME, f"{row.NAMELSAD}, {row.STUSPS}"])

    claims = eew.models.Claims()

    # Item-valued statements: (property label, target QID)
    item_claims = (
        ('instance of', eew.class_lookup['U.S. County (or equivalent)']),
        ('Legal/Statistical Area Description', row.lsad_qid),
        ('U.S. state', row.state_qid),
    )
    for prop_label, target_qid in item_claims:
        claims.add(
            eew.datatypes.Item(
                prop_nr=eew.prop_lookup[prop_label],
                value=target_qid,
                references=references
            )
        )

    # External identifiers: (property label, raw value); values are written
    # as strings. GEOID feeds both 'FIPS 10-4' and 'TIGER GEOID'.
    external_id_claims = (
        ('GNIS ID', row.COUNTYNS),
        ('FIPS 10-4', row.GEOID),
        ('TIGER GEOID', row.GEOID),
    )
    for prop_label, identifier in external_id_claims:
        claims.add(
            eew.datatypes.ExternalID(
                prop_nr=eew.prop_lookup[prop_label],
                value=str(identifier),
                references=references
            )
        )

    # 'coordinates' is a point geometry: y is latitude, x is longitude
    claims.add(
        eew.datatypes.GlobeCoordinate(
            prop_nr=eew.prop_lookup['coordinate location'],
            latitude=row.coordinates.y,
            longitude=row.coordinates.x,
            references=references
        )
    )

    county_item.add_claims(claims=claims)

    try:
        response = county_item.write(
            summary=f"{action} county item from U.S. Census source",
            clear=True
        )
        print(action, county_label, response.id)
    except Exception as err:
        # Best-effort batch: keep going after a failed write, but report the
        # cause so the failure can be diagnosed without re-running the row.
        print("PROBLEM:", county_label, f"({type(err).__name__}: {err})")

Updated Logan County, Kentucky Q957
Updated Queens County, New York Q1190
Updated Hudson County, New Jersey Q3297
Updated Hunterdon County, New Jersey Q3298
Updated McCreary County, Kentucky Q960
Updated Pike County, Kentucky Q984
Updated Lawrence County, Kentucky Q950
Updated Cattaraugus County, New York Q1154
Updated Terrebonne Parish, Louisiana Q3780
Updated De Soto Parish, Louisiana Q3741
Updated Suwannee County, Florida Q866
Updated Orleans Parish, Louisiana Q3761
Updated Washington Parish, Louisiana Q3784
Updated Madison Parish, Louisiana Q3758
Updated Bradford County, Florida Q809
Updated East Carroll Parish, Louisiana Q3743
Updated Ulster County, New York Q1205
Updated East Feliciana Parish, Louisiana Q3744
Updated Bethel Census Area, Alaska Q1378
Updated Nome Census Area, Alaska Q1394
Updated Dorchester County, Maryland Q2250
Updated Allegany County, Maryland Q2242
Updated Cecil County, Maryland Q2248
Updated Carroll County, Maryland Q2247
Updated Kent County, Maryland Q2255
U