This version of the notebook takes the items I created earlier for some American Indian areas from the TIGER web table and aligns them with the Microsoft Planetary Computer (MPC) source of U.S. Census data. The GEOID values did not line up exactly between the two sources (the HTML table form of the records carried an "R" suffix), so existing items are matched on GNIS ID instead.

The Microsoft Planetary Computer (MPC) source is different in that it is a blend of American Indian, Alaska Native, and Native Hawaiian Tribes and other entities. Ultimately, I need to revisit this with other sources of information, including working out how these records connect to the way Tribes are represented in Wikidata.

In [1]:
import planetary_computer
import pystac_client
import dask_geopandas
import pandas as pd

from wbmaker import WikibaseConnection

In [39]:
# Connection object for the Wikibase instance registered as 'EEW'. Later cells read its
# prop_lookup / ref_lookup / class_lookup tables and use its wbi, models and datatypes members.
eew = WikibaseConnection('EEW')

In [3]:
# Select source item to process: resolve the dataset's label to its item ID,
# then fetch the full item so its claims can be read.
source_item_label = 'American Indian, Alaskan Native, and Native Hawaiian lands from U.S. Census'
source_item_id = eew.ref_lookup[source_item_label]
source_item = eew.wbi.item.get(source_item_id)

In [4]:
# Extract STAC connection details from source item.
# The first 'STAC catalog URL' claim carries the catalog URL as its main value and the
# collection / item names as qualifiers on that same claim.
source_claims_json = source_item.claims.get_json()
stac_source = source_claims_json[eew.prop_lookup['STAC catalog URL']][0]

stac_catalog_url = stac_source['mainsnak']['datavalue']['value']

stac_qualifiers = stac_source['qualifiers']
stac_collection_name = stac_qualifiers[eew.prop_lookup['STAC Collection Name']][0]['datavalue']['value']
stac_item_name = stac_qualifiers[eew.prop_lookup['STAC Item Name']][0]['datavalue']['value']

In [5]:
# Query STAC catalog for asset
stac_catalog = pystac_client.Client.open(
    stac_catalog_url,
    modifier=planetary_computer.sign_inplace,
)
stac_collection = stac_catalog.get_collection(stac_collection_name)

# get_item() returns None for an unknown item id; fail here with a clear message
# instead of an AttributeError on `.assets` below.
stac_item = stac_collection.get_item(stac_item_name)
if stac_item is None:
    raise LookupError(
        f"STAC item {stac_item_name!r} not found in collection {stac_collection_name!r}"
    )
stac_asset = stac_item.assets["data"]

In [6]:
# Read source data and compute coordinates
aiannh_lazy = dask_geopandas.read_parquet(
    stac_asset.href,
    storage_options=stac_asset.extra_fields["table:storage_options"],
    calculate_divisions=True,
)

# Reproject to EPSG:4326 and materialise the dask frame in memory
gdf_aiannh = aiannh_lazy.to_crs(epsg=4326).compute()

# Take centroids in the '+proj=cea' projection, then convert the points back to the frame's CRS
centroids_cea = gdf_aiannh.to_crs('+proj=cea').geometry.centroid
gdf_aiannh['coordinates'] = centroids_cea.to_crs(gdf_aiannh.crs)

# String form of AIANNHNS, used as the join key against GNIS IDs already in the Wikibase
gdf_aiannh['gnis_id'] = gdf_aiannh['AIANNHNS'].astype(str)

In [7]:
# Lookup existing GEOID records
# The query is passed URL-encoded; decoded it reads:
#   PREFIX wd: <https://eew-edgi.wikibase.cloud/entity/>
#   PREFIX wdt: <https://eew-edgi.wikibase.cloud/prop/direct/>
#   SELECT ?item ?itemLabel ?gnis_id ?geoid
#   WHERE {
#     ?item wdt:P1 wd:Q338 .
#     ?item wdt:P25 ?gnis_id .
#     ?item wdt:P36 ?geoid .
#     SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
#   }
query_gnis = "PREFIX%20wd%3A%20%3Chttps%3A%2F%2Feew-edgi.wikibase.cloud%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttps%3A%2F%2Feew-edgi.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Fitem%20%3FitemLabel%20%3Fgnis_id%20%3Fgeoid%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP1%20wd%3AQ338%20.%0A%20%20%3Fitem%20wdt%3AP25%20%3Fgnis_id%20.%0A%20%20%3Fitem%20wdt%3AP36%20%3Fgeoid%20.%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20%7D%0A%7D%0A"
df_gnis_lookup = eew.wb_ref_data(query=query_gnis)

# The QID is the last path segment of the item URI
df_gnis_lookup["aiannh_qid"] = df_gnis_lookup["item"].apply(lambda uri: uri.rsplit('/', 1)[-1])

# True for source rows whose GNIS ID already has an item in the Wikibase
has_wikibase_item = gdf_aiannh["gnis_id"].isin(df_gnis_lookup["gnis_id"])

# Rows that already exist: attach the QID, label and stored GEOID of the matching item
lookup_columns = ["gnis_id", "aiannh_qid", "itemLabel", "geoid"]
existing_aiannh = pd.merge(
    left=gdf_aiannh[has_wikibase_item],
    right=df_gnis_lookup[lookup_columns],
    how="left",
    on="gnis_id"
).reset_index(drop=True)

# Rows with no matching item: these become new items further down
new_aiannh = gdf_aiannh[~has_wikibase_item]

In [10]:
# Lookup QID for LSAD and merge with source data
# The query is passed URL-encoded; decoded it reads:
#   PREFIX wdt: <https://eew-edgi.wikibase.cloud/prop/direct/>
#   SELECT ?lsad ?lsadLabel ?LSAD
#   WHERE {
#     ?lsad wdt:P54 ?LSAD .
#     SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
#   }
query_lsad = "PREFIX%20wdt%3A%20%3Chttps%3A%2F%2Feew-edgi.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Flsad%20%3FlsadLabel%20%3FLSAD%0AWHERE%20%7B%0A%20%20%3Flsad%20wdt%3AP54%20%3FLSAD%20.%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20%7D%0A%7D"
df_lsad = eew.wb_ref_data(query=query_lsad)

# The QID is the last path segment of the item URI
df_lsad['lsad_qid'] = df_lsad['lsad'].apply(lambda uri: uri.rsplit('/', 1)[-1])

In [13]:
# Attach the LSAD item's QID and label to each existing record (joined on the LSAD code).
# This cell reassigns existing_aiannh from itself, so drop any lookup columns left by a
# previous run first; otherwise a re-run produces lsad_qid_x / lsad_qid_y and the
# `row.lsad_qid` access in the update loop breaks.
lsad_lookup_cols = ["lsad_qid", "lsadLabel"]
existing_aiannh = pd.merge(
    left=existing_aiannh.drop(columns=lsad_lookup_cols, errors="ignore"),
    right=df_lsad[["LSAD", *lsad_lookup_cols]],
    how="left",
    on="LSAD"
)

In [46]:
# Reference block attached to every claim written below: a single 'data source'
# statement pointing at the source dataset item.
references = eew.models.References()
data_source_reference = eew.datatypes.Item(
    prop_nr=eew.prop_lookup['data source'],
    value=eew.ref_lookup['American Indian, Alaskan Native, and Native Hawaiian lands from U.S. Census']
)
# Left as the final expression so the populated References object is displayed
references.add(data_source_reference)

<References @3ab3a0 _References__references=[<Reference @3a99f0 _Reference__hash=None _Reference__snaks=<Snaks @3a8a60 snaks={'P6': [<Snak @3a9030 _Snak__snaktype=<WikibaseSnakType.KNOWN_VALUE: 'value'> _Snak__property_number='P6' _Snak__hash=None _Snak__datavalue={'value': {'entity-type': 'item', 'numeric-id': 30255, 'id': 'Q30255'}, 'type': 'wikibase-entityid'} _Snak__datatype='wikibase-item'>]}> _Reference__snaks_order=[]>]>

In [27]:
# Update every item that already exists in the Wikibase with data from the MPC source:
# add NAMELSAD as an English alias and write TIGER GEOID and LSAD claims.
# NOTE: this cell writes to the Wikibase instance behind `eew`.
for _, row in existing_aiannh.iterrows():
    # The whole per-row body is inside the try so one bad record (e.g. an LSAD code with
    # no matching lsad_qid) is reported and skipped rather than aborting the run.
    try:
        item = eew.wbi.item.get(row.aiannh_qid)

        # An item without English aliases may have no "en" key in the aliases JSON,
        # so fall back to an empty list instead of raising KeyError.
        alias_list = [alias["value"] for alias in item.aliases.get_json().get("en", [])]
        if row.NAMELSAD not in alias_list:
            alias_list.append(row.NAMELSAD)
            item.aliases.set('en', alias_list)

        new_claims = eew.models.Claims()

        new_claims.add(
            eew.datatypes.ExternalID(
                prop_nr=eew.prop_lookup['TIGER GEOID'],
                # str() for consistency with the item-creation loop
                value=str(row.GEOID),
                references=references
            )
        )

        new_claims.add(
            eew.datatypes.Item(
                prop_nr=eew.prop_lookup['Legal/Statistical Area Description'],
                value=row.lsad_qid,
                references=references
            )
        )

        item.claims.add(claims=new_claims)

        response = item.write(
            summary="Updated item with information from MPC source",
            clear=True
        )
        print("UPDATED:", row.itemLabel, response.id)
    except Exception as e:
        # Best-effort batch: report which record failed and why, then continue
        print("PROBLEM:", row.itemLabel, f"({type(e).__name__}: {e})")

UPDATED: Chemehuevi Reservation Q376
UPDATED: Wells Colony Q636
UPDATED: Colville Reservation Q385
UPDATED: Tohono O'odham Nation Reservation Q615
UPDATED: Uintah and Ouray Reservation Q627
UPDATED: Guidiville Rancheria Q431
UPDATED: Immokalee Reservation Q443
UPDATED: Elk Valley Rancheria Q400
UPDATED: Tonawanda Reservation Q616
UPDATED: Pleasant Point Reservation Q533
UPDATED: Zuni Reservation Q652
UPDATED: Jamul Indian Village Q451
UPDATED: Las Vegas Indian Colony Q470
UPDATED: San Carlos Reservation Q564
UPDATED: Upper Lake Rancheria Q629
UPDATED: White Earth Reservation Q637
UPDATED: Jackson Rancheria Q449
UPDATED: Onondaga Nation Reservation Q518
UPDATED: Fort Pierce Reservation Q421
UPDATED: Navajo Nation Reservation Q506
UPDATED: Stewarts Point Rancheria Q601
UPDATED: Big Sandy Rancheria Q357
UPDATED: Annette Island Reserve Q344
UPDATED: Fort Sill Apache Indian Reservation Q422
UPDATED: Yurok Reservation Q650
UPDATED: Standing Rock Reservation Q599
UPDATED: Quartz Valley Reserv

In [28]:
# Attach the LSAD item's QID and label to each new record (joined on the LSAD code).
# This cell reassigns new_aiannh from itself, so drop any lookup columns left by a
# previous run first; otherwise a re-run produces lsad_qid_x / lsad_qid_y and the
# `row.lsad_qid` access in the creation loop breaks.
lsad_lookup_cols = ["lsad_qid", "lsadLabel"]
new_aiannh = pd.merge(
    left=new_aiannh.drop(columns=lsad_lookup_cols, errors="ignore"),
    right=df_lsad[["LSAD", *lsad_lookup_cols]],
    how="left",
    on="LSAD"
)

In [43]:
def set_instance_of(LSAD):
    """Return the class QID to use as 'instance of' for a record with the given LSAD code.

    LSAD "78" maps to the 'Hawaiian Home Land' class and "79" to the
    'Alaska Native Village' class; every other value maps to
    'American Indian Tribal Area'. Codes are compared as strings.
    """
    class_label_by_lsad = {
        "78": 'Hawaiian Home Land',
        "79": 'Alaska Native Village',
    }
    class_label = class_label_by_lsad.get(LSAD, 'American Indian Tribal Area')
    return eew.class_lookup[class_label]

# Classify each new record by its LSAD code; the creation loop reads this column
# for the 'instance of' claim.
new_aiannh['instance_of_qid'] = new_aiannh['LSAD'].map(set_instance_of)

In [47]:
# Preview the first rows to check the lsad_qid / lsadLabel / instance_of_qid columns
# before any items are created.
new_aiannh.head()

Unnamed: 0,AIANNHCE,AIANNHNS,AFFGEOID,GEOID,NAME,NAMELSAD,LSAD,ALAND,AWATER,geometry,coordinates,gnis_id,lsad_qid,lsadLabel,instance_of_qid
0,9515,2418775,2500000US9515,9515,Apache Choctaw,Apache Choctaw SDTSA,92,221751364,2632531,"POLYGON ((-93.77547 31.61936, -93.77411 31.619...",POINT (-93.66701 31.62824),2418775,Q30181,SDTSA (suffix),Q30257
1,9370,979494,2500000US9370,9370,Shinnecock,Shinnecock (state) Reservation,86,3494292,0,"POLYGON ((-72.44070 40.87749, -72.43870 40.879...",POINT (-72.43098 40.87373),979494,Q30175,Reservation (suffix),Q30257
2,9820,2418693,2500000US9820,9820,MaChis Lower Creek,MaChis Lower Creek SDTSA,92,1680767035,6816074,"MULTIPOLYGON (((-85.54654 31.21440, -85.54342 ...",POINT (-86.02243 31.33836),2418693,Q30181,SDTSA (suffix),Q30257
3,6125,2418774,2500000US6125,6125,Anvik,Anvik ANVSA,79,24578643,6308736,"POLYGON ((-160.24545 62.69478, -160.24517 62.6...",POINT (-160.21395 62.65408),2418774,Q30168,ANVSA (suffix),Q30256
4,6350,2418836,2500000US6350,6350,Circle,Circle ANVSA,79,274634016,1398608,"POLYGON ((-144.38284 65.73496, -144.37907 65.7...",POINT (-144.18135 65.79338),2418836,Q30168,ANVSA (suffix),Q30256


In [48]:
# Create a new Wikibase item for every source record that had no match on GNIS ID.
# NOTE: this cell writes to the Wikibase instance behind `eew`. It is not idempotent:
# re-running it without first re-running the GNIS lookup cell would create the same
# items a second time.

# English description per LSAD code; any other code gets the default description.
description_by_lsad = {
    '78': 'a Hawaiian Native Homeland',
    '79': 'an Alaska Native Village',
}
default_description = 'an American Indian Tribal area'

for _, row in new_aiannh.iterrows():
    # The whole per-row body is inside the try so one bad record (e.g. an LSAD code with
    # no matching lsad_qid) is reported and skipped rather than aborting the run part-way.
    try:
        item = eew.wbi.item.new()

        item.labels.set('en', row.NAMELSAD)
        item.aliases.set('en', row.NAME)
        item.descriptions.set('en', description_by_lsad.get(row.LSAD, default_description))

        item_claims = eew.models.Claims()

        item_claims.add(
            eew.datatypes.Item(
                prop_nr=eew.prop_lookup['instance of'],
                value=row.instance_of_qid,
                references=references
            )
        )

        item_claims.add(
            eew.datatypes.ExternalID(
                prop_nr=eew.prop_lookup['GNIS ID'],
                value=str(row.AIANNHNS),
                references=references
            )
        )

        item_claims.add(
            eew.datatypes.ExternalID(
                prop_nr=eew.prop_lookup['TIGER GEOID'],
                value=str(row.GEOID),
                references=references
            )
        )

        item_claims.add(
            eew.datatypes.Item(
                prop_nr=eew.prop_lookup['Legal/Statistical Area Description'],
                value=row.lsad_qid,
                references=references
            )
        )

        # `coordinates` holds the centroid point computed when the source data was read
        item_claims.add(
            eew.datatypes.GlobeCoordinate(
                prop_nr=eew.prop_lookup['coordinate location'],
                latitude=row.coordinates.y,
                longitude=row.coordinates.x,
                references=references
            )
        )

        item.claims.add(claims=item_claims)

        response = item.write(
            summary="Added AIANNH item with information from MPC source"
        )
        print("ADDED:", row.NAMELSAD, response.id)
    except Exception as e:
        # Best-effort batch: report which record failed and why, then continue
        print("PROBLEM:", row.NAMELSAD, f"({type(e).__name__}: {e})")


ADDED: Apache Choctaw SDTSA Q30258
ADDED: Shinnecock (state) Reservation Q30259
ADDED: MaChis Lower Creek SDTSA Q30260
ADDED: Anvik ANVSA Q30261
ADDED: Circle ANVSA Q30262
ADDED: Akhiok ANVSA Q30263
ADDED: Chevak ANVSA Q30264
ADDED: Selawik ANVSA Q30265
ADDED: Ambler ANVSA Q30266
ADDED: Scammon Bay ANVSA Q30267
ADDED: Kaohe-Olaa Hawaiian Home Land Q30268
ADDED: Kickapoo OTSA Q30269
ADDED: Tetlin ANVSA Q30270
ADDED: Kaltag ANVSA Q30271
ADDED: Paukukalo Hawaiian Home Land Q30272
ADDED: Haliwa-Saponi SDTSA Q30273
ADDED: Pawnee OTSA Q30274
ADDED: Choctaw OTSA Q30275
ADDED: Emmonak ANVSA Q30276
ADDED: Napaskiak ANVSA Q30277
ADDED: Nikolski ANVSA Q30278
ADDED: Rohnerville (Rancheria) Trust Land Q30279
ADDED: Akiak ANVSA Q30280
ADDED: St. Paul ANVSA Q30281
ADDED: Nome ANVSA Q30282
ADDED: Hydaburg ANVSA Q30283
ADDED: Chickaloon ANVSA Q30284
ADDED: Chickasaw OTSA Q30285
ADDED: Modoc OTSA Q30286
ADDED: White Mountain ANVSA Q30287
ADDED: King Cove ANVSA Q30288
ADDED: Upolu Hawaiian Home Land Q302