This notebook works through the process of adding records for U.S. States to the knowledgebase. I have reworked this a couple of times as I'm trying to build out a process for a "wikibase starter pack." I'm iterating through what information I need and what I can get from which sources. I'll be coming back to this after I work through a few more sources and refine my process.

In [1]:
import os
import requests
import pandas as pd
import numpy as np

from functions import (
    sparql_query,
    kb_props,
    kb_datasources,
    valid_classes
)

from wikibaseintegrator.wbi_config import config as wbi_config
from wikibaseintegrator import WikibaseIntegrator, wbi_login
from wikibaseintegrator.models import Qualifiers, References, Reference, Claims
from wikibaseintegrator import datatypes
from wikibaseintegrator.wbi_helpers import execute_sparql_query

In [2]:
# Point wikibaseintegrator at the target Wikibase; each setting is read from the
# environment variable of the same name so nothing instance-specific is hardcoded.
for setting in ('MEDIAWIKI_API_URL', 'SPARQL_ENDPOINT_URL', 'WIKIBASE_URL'):
    wbi_config[setting] = os.environ[setting]
wbi_config['USER_AGENT'] = f'EDJIBot/1.0 ({os.environ["WIKIBASE_URL"]})'

# Log in with the bot credentials and build the client used for all item reads/writes.
login_instance = wbi_login.Login(user=os.environ['BOT_NAME'], password=os.environ['BOT_PASS'])
wbi = WikibaseIntegrator(login=login_instance)

# Foundational properties, classifiers, and data sources

Every time we run a workflow to build out some concept in the knowledgebase, we need to pull together a reference of three things: the fundamental properties (along with the definition information that drives how claims are built), the items that serve as classifiers (establishing "instance of" claims), and the data sources. As I work through each source several times, I'm fiddling with the best way to document a source so that a link to that item in a reference from a claim provides enough detail to fully understand where the claim came from.

In [3]:
# Load the reference lookups used to build claims throughout this notebook.
# kb_props() returns two values; `properties` maps property labels to P-ids
# (see the displayed output of the next cell).
prop_item_definitions, properties = kb_props()
# `classes` maps classifier labels to the Q-ids used in "instance of" claims
# (see the displayed output of the next cell).
classes = valid_classes()
# NOTE(review): the structure of `datasources` is not shown in this notebook —
# confirm against the `functions` module before relying on it.
datasources = kb_datasources()

In [4]:
# Show both lookups (label -> identifier) so the IDs used below can be checked by eye.
display(properties, classes)

{'NAICS Sector Code': 'P6',
 'NAICS Subsector Code': 'P7',
 'NAICS Industry Group Code': 'P8',
 'NAICS Industry Code': 'P9',
 'NAICS National Industry Code': 'P10',
 'instance of': 'P1',
 'subclass of': 'P2',
 'SIC Code': 'P3',
 'reference url': 'P4',
 'data source': 'P5',
 'file format': 'P11',
 'item of this property': 'P12',
 'identifier length': 'P13',
 'formatter URL': 'P14',
 'equivalent property': 'P15',
 'related wikidata item': 'P16',
 'caveat': 'P17',
 'ISO 3166-1 alpha-2 code': 'P18',
 'ISO 3166-1 alpha-3 code': 'P19',
 'ISO 3166-1 numeric code': 'P20',
 'ISO 3166-2 code': 'P21',
 'country': 'P22',
 'location': 'P23',
 'HTML Data Table': 'P24',
 'entity classification': 'P26',
 'FIPS 5-2 alpha code': 'P27',
 'GNIS ID': 'P28',
 'coordinate location': 'P29',
 'FIPS 5-2 numeric code': 'P30'}

{'spatio-temporal activity': 'Q2',
 'data source': 'Q4',
 'file format': 'Q455',
 'geographic entity': 'Q2148',
 'industrial activity': 'Q3',
 'NAICS Sector': 'Q450',
 'NAICS Subsector': 'Q451',
 'NAICS Industry Group': 'Q452',
 'NAICS Industry': 'Q453',
 'NAICS Industry (national)': 'Q454',
 'artificial geographic entity': 'Q2149',
 'country': 'Q1897',
 'U.S. State': 'Q2150',
 'U.S. Territory': 'Q2158',
 'U.S. County': 'Q2206'}

### Current Records



In [12]:
# Select every item that is an instance of (P1) "U.S. State" (Q2150) or
# "U.S. Territory" (Q2158), together with whatever identifiers are already
# recorded on it. Each identifier is wrapped in OPTIONAL so items that lack a
# value are still returned.
# NOTE(review): ?iso_alpha_2 is bound to P21, which the properties lookup labels
# "ISO 3166-2 code" (P18 is "ISO 3166-1 alpha-2 code") — confirm which is intended.
query_kb_states_territories = """
PREFIX wd: <https://edji-knows.wikibase.cloud/entity/>
PREFIX wdt: <https://edji-knows.wikibase.cloud/prop/direct/>

SELECT ?st ?stLabel ?fips_alpha ?fips_num ?iso_alpha_2 
?iso_alpha_3 ?iso_num ?wd_item ?gnis_id ?coord_loc
WHERE {
    ?st wdt:P1 ?classifier .
    VALUES ?classifier { wd:Q2150 wd:Q2158 } .
    OPTIONAL { ?st wdt:P27 ?fips_alpha . }
    OPTIONAL { ?st wdt:P30 ?fips_num . }
    OPTIONAL { ?st wdt:P21 ?iso_alpha_2 . }
    OPTIONAL { ?st wdt:P19 ?iso_alpha_3 . }
    OPTIONAL { ?st wdt:P20 ?iso_num . }
    OPTIONAL { ?st wdt:P16 ?wd_item . }
    OPTIONAL { ?st wdt:P28 ?gnis_id . }
    OPTIONAL { ?st wdt:P29 ?coord_loc . }
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
"""

# Run the query against the knowledgebase endpoint; the result is requested as a
# dataframe whose columns are the SELECT variables (st, stLabel, fips_alpha, ...
# are read by later cells).
kb_states_territories = sparql_query(
    endpoint=os.environ['SPARQL_ENDPOINT_URL'],
    query=query_kb_states_territories,
    output="dataframe"
)

In [15]:
# Knowledgebase item that documents the U.S. Census TIGERweb source; it is also
# used as the "data source" reference value when claims are written later.
tiger_source_item_id = "Q2205"

# Fetch every "HTML Data Table" (P24) URL recorded on the source item. The item
# ID is interpolated so the query cannot drift away from the constant above.
tiger_source_query = """
PREFIX wd: <https://edji-knows.wikibase.cloud/entity/>
PREFIX wdt: <https://edji-knows.wikibase.cloud/prop/direct/>

SELECT ?statement
WHERE {
  wd:%s wdt:P24 ?statement.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
""" % tiger_source_item_id

tiger_source_urls = sparql_query(
    endpoint=os.environ['SPARQL_ENDPOINT_URL'],
    query=tiger_source_query,
    output="dict"
)

# The state-level table is identified by "_state_" in its URL; take the first match.
state_source_link = next(
    (record for record in tiger_source_urls if "_state_" in record["statement"]),
    None
)

# Fail here with a clear message instead of letting a None value surface later
# as an unrelated TypeError when the table is read.
if state_source_link is None:
    raise LookupError(
        f"No state-level HTML data table URL found on source item {tiger_source_item_id}"
    )
print(state_source_link)

{'statement': 'https://tigerweb.geo.census.gov/tigerwebmain/Files/bas23/tigerweb_bas23_state_us.html'}


In [8]:
# Read the identifier columns as text so leading zeros are preserved (a later
# step strips them from STATENS explicitly). read_html returns a list of tables.
tiger_id_columns = ('GEOID', 'STATE', 'STATENS')
tiger_state_table = pd.read_html(
    state_source_link['statement'],
    converters=dict.fromkeys(tiger_id_columns, str)
)

In [13]:
# Only the postal abbreviation (join key) and the values we intend to write are
# needed from the first table parsed out of the TIGER page.
tiger_columns = ["STUSAB", "GEOID", "STATENS", "CENTLAT", "CENTLON"]
tiger_state_subset = tiger_state_table[0][tiger_columns]

# Left join keeps every knowledgebase record, matched on its FIPS alpha code.
kb_merged_tiger = kb_states_territories.merge(
    tiger_state_subset,
    how="left",
    left_on="fips_alpha",
    right_on="STUSAB"
)

In [23]:
# Every claim written below cites the TIGER data source item as its reference.
tiger_refs = References()
ref_tiger = datatypes.Item(
    prop_nr=properties['data source'],
    value=tiger_source_item_id
)
tiger_refs.add(ref_tiger)

# Columns that come from the TIGER side of the merge and feed the claims.
tiger_value_columns = ["GEOID", "STATENS", "CENTLAT", "CENTLON"]

for index, row in kb_merged_tiger.iterrows():
    print("PROCESSING:", row.stLabel)
    print(row.st)

    # The merge is a left join, so a knowledgebase item with no matching TIGER
    # row carries NaN here. Skip it instead of crashing part-way through the
    # batch (NaN has no .lstrip) or writing empty values.
    if row[tiger_value_columns].isna().any():
        print("SKIPPING (no matching TIGER record):", row.fips_alpha)
        continue

    # row.st is the full entity URI; the item ID is its last path segment.
    item = wbi.item.get(row.st.split("/")[-1])

    print("ADDING FIPS NUMERIC CODE:", row.GEOID)
    fips_num_claim = datatypes.ExternalID(
        prop_nr=properties['FIPS 5-2 numeric code'],
        value=row.GEOID,
        references=tiger_refs
    )
    item.claims.add(fips_num_claim)

    # STATENS is read as zero-padded text; the stored GNIS ID drops the padding.
    gnis_id = row.STATENS.lstrip('0')
    print("ADDING GNIS ID:", gnis_id)
    gnis_id_claim = datatypes.ExternalID(
        prop_nr=properties['GNIS ID'],
        value=gnis_id,
        references=tiger_refs
    )
    item.claims.add(gnis_id_claim)

    print("ADDING COORDINATE LOCATION:", f"{row.CENTLAT}, {row.CENTLON}")
    coord_loc_claim = datatypes.GlobeCoordinate(
        prop_nr=properties['coordinate location'],
        latitude=row.CENTLAT,
        longitude=row.CENTLON,
        references=tiger_refs
    )
    item.claims.add(coord_loc_claim)

    item.write(summary="Added GNIS ID, FIPS numeric code, and coordinate location from U.S. Census data")
        

PROCESSING: Northern Mariana Islands
https://edji-knows.wikibase.cloud/entity/Q1942
ADDING FIPS NUMERIC CODE: 69
ADDING GNIS ID: 1779809
ADDING COORDINATE LOCATION: 16.7974379, 145.596871
PROCESSING: Guam
https://edji-knows.wikibase.cloud/entity/Q2026
ADDING FIPS NUMERIC CODE: 66
ADDING GNIS ID: 1802705
ADDING COORDINATE LOCATION: 13.4427156, 144.7693797
PROCESSING: American Samoa
https://edji-knows.wikibase.cloud/entity/Q2027
ADDING FIPS NUMERIC CODE: 60
ADDING GNIS ID: 1802701
ADDING COORDINATE LOCATION: -13.9638307, -170.0822667
PROCESSING: Puerto Rico
https://edji-knows.wikibase.cloud/entity/Q2132
ADDING FIPS NUMERIC CODE: 72
ADDING GNIS ID: 1779808
ADDING COORDINATE LOCATION: 18.2164715, -66.4147614
PROCESSING: United States Virgin Islands
https://edji-knows.wikibase.cloud/entity/Q2135
ADDING FIPS NUMERIC CODE: 78
ADDING GNIS ID: 1802710
ADDING COORDINATE LOCATION: 18.0593788, -64.8387617
PROCESSING: Massachusetts
https://edji-knows.wikibase.cloud/entity/Q2153
ADDING FIPS NUMERIC 

## Linking to Wikidata

I need to come back to this, as this particular code block is now defunct: it relies on `wd_states` and `edji_kb_states`, which are not defined earlier in this notebook, so it should not be run as-is. I will rework the process of building state/territory records to start from the TIGER data as my initiating source and then figure out related Wikidata items and what properties I want to leverage.

In [None]:
# NOTE(review): `wd_states` and `edji_kb_states` are not defined in the visible
# part of this notebook — this cell presumably depends on an earlier version.
# Only Wikidata records whose ISO code is not yet in the knowledgebase are created.
missing_from_kb = wd_states[~wd_states.iso_code.isin(edji_kb_states.iso_code)]

for index, wd_row in missing_from_kb.iterrows():
    print("PROCESSING:", wd_row.itemLabel)

    # Start a fresh item carrying the Wikidata label and description
    item = wbi.item.new()
    item.labels.set('en', wd_row.itemLabel)
    item.descriptions.set('en', wd_row.itemDescription)

    # Classifier claim
    instance_of_claim = datatypes.Item(
        prop_nr=properties['instance of'],
        value=classes['U.S. State']
    )

    # Identifier claim
    iso_code_claim = datatypes.ExternalID(
        prop_nr=properties['ISO 3166-2 code'],
        value=wd_row.iso_code
    )

    # Link back to Wikidata, qualified with a caveat about reusing its properties
    wd_link_qualifiers = Qualifiers()
    wd_link_qualifiers.add(
        datatypes.String(
            prop_nr=properties['caveat'],
            value='Not all properties from U.S. State records in Wikidata may be appropriate in this context'
        )
    )
    wikidata_link_claim = datatypes.ExternalID(
        prop_nr=properties['related wikidata item'],
        value=wd_row['item'].split("/")[-1],
        qualifiers=wd_link_qualifiers
    )

    # Attach the claims in the same order as before, then persist the item
    for new_claim in (instance_of_claim, iso_code_claim, wikidata_link_claim):
        item.claims.add(new_claim)

    item.write(summary="Adding new U.S. state/territory item derived from wikidata")

# Documenting Sources

I'm continuing to work up a way to document sources as completely and functionally as possible in the knowledgebase structure. I want to be able to link a "data source" property as a reference to an item and then have that item used directly in building the associated claim. I'm fiddling with that here, trying to come up with a reasonable approach.

I'm thinking that having a single item like this one for the U.S. Census TIGER data related to places makes sense. It is one overall logical source for information within which there are several different specific sources we need to process. From a knowledge graph perspective, having a single item to point to seems to make some sense. We achieve granularity by having multiple claims about the same concept (HTML data tables that can be read with code) associated with the one logical item. We could have multiple items with parent-child relationships, but this basically accomplishes the same thing other than not having a specific unique resolvable identifier for each very specific source. I'll have to play with this in practice and see what makes sense.

In [None]:
census_tiger_source_id = 'Q2205'

# Seconds to wait for each URL check; without a timeout a stalled connection
# would hang the cell indefinitely.
URL_CHECK_TIMEOUT = 10

# Build qualifier that indicates the primary entity in the source table
county_class_qualifier = datatypes.Item(
    prop_nr=properties['entity classification'],
    value=classes['U.S. County']
)

# Set up the reference for the inbound claims as the state listing pointing to county table links
claim_refs = References()
state_county_link_ref = datatypes.URL(
    prop_nr=properties['reference url'],
    value='https://tigerweb.geo.census.gov/tigerwebmain/TIGERweb_counties_current.html'
)
claim_refs.add(state_county_link_ref)

# Create container for the new claims
county_table_source_claims = []

# NOTE(review): `edji_kb_states` (with `item`, `itemLabel`, `fips_code` columns)
# is not defined in the visible part of this notebook — confirm where it comes
# from before running on a fresh kernel.
for index, row in edji_kb_states.iterrows():
    print("PROCESSING", row.itemLabel)
    # row['item'] is the full entity URI; the item ID is its last path segment
    item_id = row['item'].split("/")[-1]

    # Make the URL and check its validity
    county_table_url = f"https://tigerweb.geo.census.gov/tigerwebmain/Files/bas23/tigerweb_bas23_county_{row.fips_code.lower()}.html"
    try:
        check_url = requests.head(county_table_url, timeout=URL_CHECK_TIMEOUT)
    except requests.RequestException as exc:
        # A network failure on one URL should be reported, not abort the whole run
        print(county_table_url, "REQUEST FAILED:", exc)
        continue

    if check_url.status_code != 200:
        # Tell us about any problems
        print(county_table_url, check_url.status_code)
        continue

    # Create the qualifiers that will specify the classification and the link to the state/territory
    claim_qualifiers = Qualifiers()
    claim_qualifiers.add(county_class_qualifier)
    state_item_qualifier = datatypes.Item(
        prop_nr=properties['location'],
        value=item_id
    )
    claim_qualifiers.add(state_item_qualifier)

    # Build the claim for this source
    county_table_source_claim = datatypes.URL(
        prop_nr=properties['HTML Data Table'],
        value=county_table_url,
        qualifiers=claim_qualifiers,
        references=claim_refs
    )

    # Add the claim to the series of claims
    county_table_source_claims.append(county_table_source_claim)

# Get the data source item and add the claims; skip the write entirely when
# no URL validated so we never issue an empty edit.
if county_table_source_claims:
    census_tiger_source_item = wbi.item.get(census_tiger_source_id)
    # NOTE(review): confirm the default `action_if_exists` of Claims.add — if it
    # replaces existing statements for the property, other HTML Data Table URLs
    # already on this item (e.g. the state-level table) would be dropped.
    census_tiger_source_item.claims.add(county_table_source_claims)
    census_tiger_source_item.write(summary="Added county HTML data table links from U.S. Census TIGERweb")
else:
    print("No valid county table URLs found; nothing written")