This notebook works through the process of adding records for U.S. states to the knowledgebase. It uses Wikidata as the source because Wikidata has a complete list that is reasonably stable at this point, apart from new properties being added to state records.

In [41]:
import os
import requests
import pandas as pd
import numpy as np

from functions import (
    sparql_query,
    kb_props,
    kb_datasources,
    valid_classes
)

from wikibaseintegrator.wbi_config import config as wbi_config
from wikibaseintegrator import WikibaseIntegrator, wbi_login
from wikibaseintegrator.models import Qualifiers, References, Reference, Claims
from wikibaseintegrator import datatypes
from wikibaseintegrator.wbi_helpers import execute_sparql_query

In [42]:
# Copy the Wikibase connection settings from the environment into the
# wikibaseintegrator configuration; the key names are the same on both sides.
for setting_name in ('MEDIAWIKI_API_URL', 'SPARQL_ENDPOINT_URL', 'WIKIBASE_URL'):
    wbi_config[setting_name] = os.environ[setting_name]

# User agent string for the bot, embedding the Wikibase URL it works against
wbi_config['USER_AGENT'] = f'EDJIBot/1.0 ({os.environ["WIKIBASE_URL"]})'

# Log in with the bot credentials from the environment and create the
# WikibaseIntegrator client that later cells use to create and fetch items
login_instance = wbi_login.Login(
    user=os.environ['BOT_NAME'],
    password=os.environ['BOT_PASS'],
)
wbi = WikibaseIntegrator(login=login_instance)

# Foundational properties, classifiers, and data sources

Every time we run a workflow to build out some concept in the knowledgebase, we need to pull together a reference of the fundamental properties (along with the specific definition information that drives how claims are built), the items that serve as classifiers (establishing "instance of" claims), and the data sources. As I work through each source several times, I'm still experimenting with the best way to document a source so that a link to that item, used as a reference on a claim, provides enough detail to fully understand where the claim came from.

In [34]:
# Load the lookups that later cells index by label (helpers come from the local `functions` module).
# `properties` is indexed by property label to get the value passed as `prop_nr`,
# e.g. properties['instance of']; `prop_item_definitions` is not used in the cells shown here.
prop_item_definitions, properties = kb_props()
# `classes` is indexed by classifier label to get the value of an item claim, e.g. classes['U.S. State'].
classes = valid_classes()
# Data source lookup — not referenced in the cells shown here; TODO confirm its structure.
datasources = kb_datasources()

In [39]:
# SPARQL for the Wikidata query service: every item that is an instance of (P31)
# Q35657 (the U.S. state class) and has an ISO 3166-2 code (P300), returned with
# its English label and description.
query_wd_states = """
prefix wd: <http://www.wikidata.org/entity/>
prefix wdt: <http://www.wikidata.org/prop/direct/>

SELECT ?item ?itemLabel ?itemDescription ?iso_code WHERE {
  ?item wdt:P300 ?iso_code .
  ?item wdt:P31 wd:Q35657 .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
"""

# Run the query so `wd_states` exists on a fresh kernel; the item-creation loop
# further down reads it and fails with a NameError if this call is skipped.
wd_states = sparql_query(
    endpoint='https://query.wikidata.org/sparql',
    query=query_wd_states,
    output='dataframe'
)

# The item-creation loop reads exactly these columns; fail here with a clear
# message rather than part-way through writing items.
expected_wd_columns = {'item', 'itemLabel', 'itemDescription', 'iso_code'}
assert expected_wd_columns <= set(wd_states.columns), \
    f"Wikidata result is missing columns: {expected_wd_columns - set(wd_states.columns)}"



In [None]:
# Wikidata rows whose ISO 3166-2 code does not yet appear in the knowledgebase listing
states_to_add = wd_states[~wd_states.iso_code.isin(edji_kb_states.iso_code)]

# Text of the caveat qualifier attached to every link back to Wikidata
wd_link_caveat_text = 'Not all properties from U.S. State records in Wikidata may be appropriate in this context'

for index, row in states_to_add.iterrows():
    print("PROCESSING:", row.itemLabel)

    # Start a fresh item carrying the English label and description from Wikidata
    item = wbi.item.new()
    item.labels.set('en', row.itemLabel)
    item.descriptions.set('en', row.itemDescription)

    # "instance of" claim pointing at the U.S. State classifier
    instance_of_claim = datatypes.Item(
        prop_nr=properties['instance of'],
        value=classes['U.S. State']
    )

    # ISO 3166-2 code as an external identifier
    iso_code_claim = datatypes.ExternalID(
        prop_nr=properties['ISO 3166-2 code'],
        value=row.iso_code
    )

    # Link to the Wikidata item (last path segment of the item URI), qualified with the caveat
    wd_link_qualifiers = Qualifiers()
    wd_link_caveat = datatypes.String(
        prop_nr=properties['caveat'],
        value=wd_link_caveat_text
    )
    wd_link_qualifiers.add(wd_link_caveat)
    wd_link_claim = datatypes.ExternalID(
        prop_nr=properties['related wikidata item'],
        value=row['item'].split("/")[-1],
        qualifiers=wd_link_qualifiers
    )

    # Attach the claims in the same order as before, then save the new item
    for new_claim in (instance_of_claim, iso_code_claim, wd_link_claim):
        item.claims.add(new_claim)

    item.write(summary="Adding new U.S. state/territory item derived from wikidata")

In [49]:
# Adds one "HTML Data Table" URL claim per state/territory to the Census TIGER
# data source item, each qualified with the county classification and the
# state/territory item it belongs to.
census_tiger_source_id = 'Q2205'

# Upper bound (seconds) on each URL check; requests waits indefinitely when no
# timeout is given, which would stall this cell on an unresponsive server
url_check_timeout = 30

# Build qualifier that indicates the primary entity in the source table
county_class_qualifier = datatypes.Item(
    prop_nr=properties['entity classification'],
    value=classes['U.S. County']
)

# Set up the reference for the inbound claims as the state listing pointing to county table links
claim_refs = References()
state_county_link_ref = datatypes.URL(
    prop_nr=properties['reference url'],
    value='https://tigerweb.geo.census.gov/tigerwebmain/TIGERweb_counties_current.html'
)
claim_refs.add(state_county_link_ref)

# Create container for the new claims
county_table_source_claims = []

for index, row in edji_kb_states.iterrows():
    print("PROCESSING", row.itemLabel)
    # Item id is the last path segment of the item URI
    item_id = row['item'].split("/")[-1]

    # Make the URL and check its validity
    county_table_url = f"https://tigerweb.geo.census.gov/tigerwebmain/Files/bas23/tigerweb_bas23_county_{row.fips_code.lower()}.html"
    # A timeout raises here and stops the cell before anything is written to the knowledgebase
    check_url = requests.head(county_table_url, timeout=url_check_timeout)
    if check_url.status_code != 200:
        # Tell us about any problems
        print(county_table_url, check_url.status_code)
    else:
        # Create the qualifiers that will specify the classification and the link to the state/territory
        claim_qualifiers = Qualifiers()
        claim_qualifiers.add(county_class_qualifier)
        state_item_qualifier = datatypes.Item(
            prop_nr=properties['location'],
            value=item_id
        )
        claim_qualifiers.add(state_item_qualifier)

        # Build the claim for this source
        county_table_source_claim = datatypes.URL(
            prop_nr=properties['HTML Data Table'],
            value=county_table_url,
            qualifiers=claim_qualifiers,
            references=claim_refs
        )

        # Add the claim to the series of claims
        county_table_source_claims.append(county_table_source_claim)

# Get the data source item and add the claims
census_tiger_source_item = wbi.item.get(census_tiger_source_id)
census_tiger_source_item.claims.add(county_table_source_claims)
census_tiger_source_item.write()

PROCESSING Northern Mariana Islands
PROCESSING Guam
PROCESSING American Samoa
PROCESSING Puerto Rico
PROCESSING United States Virgin Islands
PROCESSING Massachusetts
PROCESSING Illinois
PROCESSING Mississippi
PROCESSING New Mexico
PROCESSING California
PROCESSING Alabama
PROCESSING Maine
PROCESSING New Hampshire
PROCESSING Connecticut
PROCESSING Hawaii
PROCESSING Alaska
PROCESSING Florida
PROCESSING Arizona
PROCESSING Oregon
PROCESSING Utah
PROCESSING Michigan
PROCESSING North Dakota
PROCESSING South Dakota
PROCESSING Montana
PROCESSING Wyoming
PROCESSING Idaho
PROCESSING Washington
PROCESSING Nevada
PROCESSING Colorado
PROCESSING Virginia
PROCESSING West Virginia
PROCESSING New York
PROCESSING Rhode Island
PROCESSING Maryland
PROCESSING Delaware
PROCESSING Ohio
PROCESSING Pennsylvania
PROCESSING New Jersey
PROCESSING Indiana
PROCESSING Georgia
PROCESSING Texas
PROCESSING North Carolina
PROCESSING South Carolina
PROCESSING Tennessee
PROCESSING Minnesota
PROCESSING Wisconsin
PROCESSING 

<ItemEntity @603910 _BaseEntity__api=<wikibaseintegrator.wikibaseintegrator.WikibaseIntegrator object at 0x7fb3e88ed730>
	 _BaseEntity__title='Item:Q2205'
	 _BaseEntity__pageid=2225
	 _BaseEntity__lastrevid=2848
	 _BaseEntity__type='item'
	 _BaseEntity__id='Q2205'
	 _BaseEntity__claims=<Claims @8ed8e0 _Claims__claims={'P1': [<Item @deae50 _Claim__mainsnak=<Snak @deaeb0 _Snak__snaktype=<WikibaseSnakType.KNOWN_VALUE: 'value'> _Snak__property_number='P1' _Snak__hash='6056844c4a6d3699fbafab69805eed5ca36eab0e' _Snak__datavalue={'value': {'entity-type': 'item', 'numeric-id': 4, 'id': 'Q4'}, 'type': 'wikibase-entityid'} _Snak__datatype='wikibase-item'> _Claim__type='statement' _Claim__qualifiers=<Qualifiers @dea760 _Qualifiers__qualifiers={}> _Claim__qualifiers_order=[] _Claim__id='Q2205$4b92e7b3-41de-8f48-e85a-10349122c74a' _Claim__rank=<WikibaseRank.NORMAL: 'normal'> _Claim__removed=False _Claim__references=<References @deab50 _References__references=[]>>], 'P24': [<URL @dea640 _Claim__main