This notebook works through several U.S. state-level listings of counties and other subdivisions, setting up logical data sources for processing into the knowledgebase.

In [1]:
import requests
from bs4 import BeautifulSoup
from wbmaker import WikibaseConnection
import swifter

import pandas as pd

In [2]:
# Create the wbmaker connection object for the 'eew' Wikibase instance; the cells below use it
# for property/classification lookups, SPARQL queries, label searches and item creation
eew = WikibaseConnection('eew')

In [4]:
# Load the reference tables from the Wikibase instance that the rest of the notebook relies on:
# the property listing and the classification reference
properties = eew.properties()
classification = eew.classification()

# Map each property label to its property identifier (pid) for quick lookups by name
property_lookup = dict(zip(properties['propertyLabel'], properties['pid']))

# Reference/Linking Sources

In [5]:
# Build a lookup from FIPS 5-2 alpha code to the QID of the item that carries it, so the alpha
# codes found in source-table URLs can be resolved to existing items
query_fips_alpha = """
%(namespaces)s

SELECT ?item ?itemLabel ?fips_52_alpha
WHERE
{
  ?item wdt:%(prop)s ?fips_52_alpha .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
""" % dict(
    namespaces=eew.sparql_namespaces(),
    prop=property_lookup['FIPS 5-2 alpha'],
)

fips_alpha_entities = eew.sparql_query(query=query_fips_alpha, output='dataframe')

# The QID is the last path segment of the item URI returned by the query
fips_alpha_entities['qid'] = fips_alpha_entities['item'].map(lambda uri: uri.split('/')[-1])
fips_alpha_lookup = dict(zip(fips_alpha_entities['fips_52_alpha'], fips_alpha_entities['qid']))

In [6]:
# A usable reference source for TIGER field names
tech_doc_reference = "https://tigerweb.geo.census.gov/tigerwebmain/TIGERweb_attribute_glossary.html"

# A function for scraping links to state-level files
def scrape_state_data_table_links(url, state_ref=fips_alpha_lookup, timeout=30):
    """Scrape a TIGERweb listing page for links to per-state/territory data tables.

    Every anchor whose ``href`` starts with ``Files/`` is treated as a link to a
    state-level table. The two-letter alpha code is taken from the last
    underscore-separated token of the linked file name (before the extension).

    Args:
        url: URL of the TIGERweb listing page to scrape.
        state_ref: Mapping of upper-case alpha code to the QID of the matching
            state/territory item. Defaults to ``fips_alpha_lookup`` as it was
            when this function was defined.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``'state or territory'``, ``'url'``,
        ``'state_alpha_code'`` and ``'state_territory_qid'``.

    Raises:
        requests.HTTPError: If the listing page responds with an error status.
        KeyError: If an alpha code found in a link is not present in ``state_ref``.
    """
    tiger_base_url = "https://tigerweb.geo.census.gov/tigerwebmain/"

    # Fail fast on a hung connection or an error page instead of silently returning no links
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, 'html.parser')

    table_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if not href.startswith('Files/'):
            continue

        # Collapse every run of whitespace (spaces, newlines, tabs) to a single space so
        # names that wrap across lines in the HTML come out clean
        name = ' '.join(link.text.split())

        # e.g. ".../tigerweb_bas23_county_al.html" -> "AL"
        state_alpha = href.split('/')[-1].split('.')[0].split('_')[-1].upper()

        table_links.append({
            'state or territory': name,
            'url': f'{tiger_base_url}{href}',
            'state_alpha_code': state_alpha,
            'state_territory_qid': state_ref[state_alpha]
        })

    return table_links

# Item Builder Class

Ultimately, I need to start nailing down schemas for all the different types of items and encoding those in the ShEx format used by Wikibase. I can then use that to drive process codes for building conformant items. For now, I'm separating out logical processes in notebooks where I can iteratively develop common schemas, starting to at least move things up to the level of functional logic that can be applied in more than one case.

This is an experiment in a class to build items where I've broken out the logical components of an item. We can build in a more sophisticated upsert-type process where we could run this on existing items, blending in new information with old or doing things like adding evidence to existing claims, but I need to figure out how the Wikibase API does some of that.

In [7]:
from wikibaseintegrator import datatypes, models, wikibaseintegrator

class BuildItem:
    """Assemble a new (unsaved) Wikibase item describing a TIGER data source.

    Constructing the class creates a fresh item on the supplied connection and
    populates it with a label/description plus claims for: instance of dataset,
    reference URL, entity classifier, technical documentation, the mapping of
    source fields to knowledgebase properties, and one HTML table URL per
    state/territory link. The populated item is available as ``self.item``;
    nothing is written to the Wikibase until the caller invokes ``item.write``.

    Args:
        wikibase_connection: Object whose ``item.new()`` creates the empty item.
        property_reference: Mapping of property label to property identifier.
        classification_reference: Mapping of classification label to item identifier.
        reference_url: URL recorded as the reference for sourced claims and as
            the item's own reference URL claim.
        technical_documentation_url: URL for the technical documentation claim.
        item_label: English label for the item.
        item_description: English description for the item.
        property_map: Sequence of ``(property label, source field name)`` pairs.
        table_links: Sequence of dicts with at least the keys ``'url'`` and
            ``'state_territory_qid'``.
        entity_classifier_label: Key in ``classification_reference`` naming the
            kind of entity the data source describes. Defaults to
            ``'U.S. Congressional District'``, the value that was previously
            hard-coded.

    Raises:
        KeyError: If a required label is missing from ``property_reference`` or
            ``classification_reference``.
    """

    def __init__(self,
        wikibase_connection: wikibaseintegrator.WikibaseIntegrator,
        property_reference: dict,
        classification_reference: dict,
        reference_url: str,
        technical_documentation_url: str,
        item_label: str,
        item_description: str,
        property_map: list,
        table_links: list,
        entity_classifier_label: str = 'U.S. Congressional District',
    ):
        # Resolve every property from the mapping passed in, not from notebook globals,
        # so the class works with whatever reference the caller supplies
        self.pid_ref_url = property_reference['reference URL']
        self.pid_instance_of = property_reference['instance of']
        self.pid_entity_classifier = property_reference['entity classifier']
        self.pid_technical_documentation = property_reference['technical documentation']
        self.pid_source_property = property_reference['source property']
        self.pid_prop_from_source = property_reference['property from data source']
        self.pid_applies_to_jurisdiction = property_reference['applies to jurisdiction']
        self.pid_html_table = property_reference['html table']

        self.class_dataset = classification_reference['dataset']
        self.class_entity_classifier = classification_reference[entity_classifier_label]

        self.item_label = item_label
        self.item_description = item_description
        self.reference_url = reference_url
        self.technical_documentation_url = technical_documentation_url
        self.property_map = property_map
        self.table_links = table_links
        self.property_reference = property_reference

        # Build item
        self.item = wikibase_connection.item.new()
        self.references = self.build_reference()
        self.set_label_desc()
        self.claim_item_classification()
        self.claim_reference_url()
        self.claim_datasource_classification()
        self.claim_tech_documentation()
        self.claim_source_property_mapping()
        self.claim_html_table()

    def build_reference(self):
        """Return a References object holding the reference URL, reused by the sourced claims."""
        refs = models.References()
        refs.add(
            datatypes.URL(
                prop_nr=self.pid_ref_url,
                value=self.reference_url
            )
        )

        return refs

    def set_label_desc(self):
        """Set the English label and description on the item."""
        self.item.labels.set('en', self.item_label)
        self.item.descriptions.set('en', self.item_description)

    def claim_item_classification(self):
        """Add the 'instance of' claim pointing at the dataset classification."""
        self.item.claims.add(
            datatypes.Item(
                prop_nr=self.pid_instance_of,
                value=self.class_dataset
            )
        )

    def claim_reference_url(self):
        """Add the reference URL as a claim on the item itself."""
        self.item.claims.add(
            datatypes.URL(
                prop_nr=self.pid_ref_url,
                value=self.reference_url
            )
        )

    def claim_datasource_classification(self):
        """Add the sourced 'entity classifier' claim for the kind of entity this source describes."""
        self.item.claims.add(
            datatypes.Item(
                prop_nr=self.pid_entity_classifier,
                value=self.class_entity_classifier,
                references=self.references
            )
        )

    def claim_tech_documentation(self):
        """Add the sourced technical documentation URL claim."""
        self.item.claims.add(
            datatypes.URL(
                prop_nr=self.pid_technical_documentation,
                value=self.technical_documentation_url,
                references=self.references
            )
        )

    def claim_source_property_mapping(self):
        """Add one 'property from data source' claim per entry in the property map.

        Each claim's value is the property identifier looked up for the entry's
        label, qualified with the source field name as a string.
        """
        source_property_claims = []
        for mapping in self.property_map:
            property_label, source_field = mapping[0], mapping[1]

            q = models.Qualifiers()
            q.add(
                datatypes.String(
                    prop_nr=self.pid_source_property,
                    value=source_field
                )
            )
            source_property_claims.append(
                datatypes.Property(
                    prop_nr=self.pid_prop_from_source,
                    value=self.property_reference[property_label],
                    qualifiers=q
                )
            )
        self.item.claims.add(source_property_claims)

    def claim_html_table(self):
        """Add one sourced 'html table' URL claim per table link.

        Each claim is qualified with the state/territory item it applies to.
        """
        html_table_claims = []
        for link in self.table_links:
            q = models.Qualifiers()
            q.add(
                datatypes.Item(
                    prop_nr=self.pid_applies_to_jurisdiction,
                    value=link['state_territory_qid']
                )
            )
            html_table_claims.append(
                datatypes.URL(
                    prop_nr=self.pid_html_table,
                    value=link['url'],
                    qualifiers=q,
                    references=self.references
                )
            )
        self.item.claims.add(html_table_claims)


# Counties and equivalent divisions

This section builds a data source item by getting links from a TIGER web listing of states/territories. The data tables on the other end of the links can be harvested to build our reference items.

In [8]:
# Set the URL for the county table listing by state and the title for the source item
tiger_current_county_url = "https://tigerweb.geo.census.gov/tigerwebmain/TIGERweb_counties_current.html"
state_county_source_label = 'TIGER data file source for U.S. Counties'
state_county_source_desc = 'data files from the U.S. Census Bureau listing U.S. County names and identifying information'


# Set the mapping of source fields to knowledgebase fields
# Each entry is (knowledgebase property label, TIGER source field name)
tiger_county_property_map = [
    ('label', 'NAME'),
    ('alias', 'BASENAME'),
    ('GNIS ID', 'COUNTYNS'),
    ('TIGER GEOID', 'GEOID'),
    ('FIPS 10-4', 'GEOID'),
    ('coordinate location', 'CENTLAT,CENTLON')
]


# Run the process to scrape links and determine if we need to run the script for building the source item
state_county_links = scrape_state_data_table_links(url=tiger_current_county_url)

# Only build the source item when no item with this label exists yet
check_item = eew.item_by_label(state_county_source_label)
build_item = not check_item

display(state_county_links[:3])
print()
print("BUILD ITEM:", build_item)
print()

if build_item:
    # NOTE(review): BuildItem classifies every source with the 'U.S. Congressional District'
    # entity classifier - confirm that is intended for the county source before writing
    county_item = BuildItem(
        wikibase_connection=eew.wbi,
        property_reference=property_lookup,
        classification_reference=classification,
        reference_url=tiger_current_county_url,
        technical_documentation_url=tech_doc_reference,
        item_label=state_county_source_label,
        item_description=state_county_source_desc,
        property_map=tiger_county_property_map,
        table_links=state_county_links,
    )

    # The write is left disabled here; enable it to actually create the item
    # county_item.item.write(
    #     summary="Added data source reference item for TIGER counties"
    # )

[{'state or territory': 'Alabama',
  'url': 'https://tigerweb.geo.census.gov/tigerwebmain/Files/bas23/tigerweb_bas23_county_al.html',
  'state_alpha_code': 'AL',
  'state_territory_qid': 'Q268'},
 {'state or territory': 'Illinois',
  'url': 'https://tigerweb.geo.census.gov/tigerwebmain/Files/bas23/tigerweb_bas23_county_il.html',
  'state_alpha_code': 'IL',
  'state_territory_qid': 'Q281'},
 {'state or territory': 'Montana',
  'url': 'https://tigerweb.geo.census.gov/tigerwebmain/Files/bas23/tigerweb_bas23_county_mt.html',
  'state_alpha_code': 'MT',
  'state_territory_qid': 'Q294'}]


BUILD ITEM: False



# National Congressional Districts

For the EEW case, information tied to political districts at Federal and State levels is quite important. U.S. Census TIGER data is also a reasonable starting point for the basic identification of districts in a given election period. Here, we build out a similar data source item for the 116th Congressional Districts.

In [9]:
# Set the mapping of source fields to knowledgebase fields
# Each entry is (knowledgebase property label, TIGER source field name)
tiger_cong_dist_property_map = [
    ('label', 'NAME'),
    ('TIGER GEOID', 'GEOID'),
    ('coordinate location', 'CENTLAT,CENTLON')
]


# Set the URL for the congressional district table listing by state and the title for the source item
# NOTE(review): the listing page name says cd116, while the recorded output of this cell shows
# links to cd118 tables - confirm which Congress this source item should describe
tiger_cong_dist_url = "https://tigerweb.geo.census.gov/tigerwebmain/TIGERweb_cd116_current.html"
cong_dist_source_label = 'TIGER data file source for U.S. Congressional Districts'
cong_dist_source_desc = 'data files from the U.S. Census Bureau listing U.S. Congressional District names and identifying information'


# Run the process to scrape links and determine if we need to run the script for building the source item
cong_dist_links = scrape_state_data_table_links(url=tiger_cong_dist_url)

# Only build the source item when no item with this label exists yet
check_item = eew.item_by_label(cong_dist_source_label)
build_item = not check_item

display(cong_dist_links[:3])
print()
print("BUILD ITEM:", build_item)
print()

if build_item:
    cong_dist_item = BuildItem(
        wikibase_connection=eew.wbi,
        property_reference=property_lookup,
        classification_reference=classification,
        reference_url=tiger_cong_dist_url,
        technical_documentation_url=tech_doc_reference,
        item_label=cong_dist_source_label,
        item_description=cong_dist_source_desc,
        property_map=tiger_cong_dist_property_map,
        table_links=cong_dist_links,
    )

    # Persist the new item to the Wikibase and report the identifier it was assigned
    item = cong_dist_item.item.write(
        summary="Added data source reference item for TIGER congressional districts"
    )

    print("CREATED ITEM", item.id)

[{'state or territory': 'Alabama',
  'url': 'https://tigerweb.geo.census.gov/tigerwebmain/Files/bas23/tigerweb_bas23_cd118_al.html',
  'state_alpha_code': 'AL',
  'state_territory_qid': 'Q268'},
 {'state or territory': 'Illinois',
  'url': 'https://tigerweb.geo.census.gov/tigerwebmain/Files/bas23/tigerweb_bas23_cd118_il.html',
  'state_alpha_code': 'IL',
  'state_territory_qid': 'Q281'},
 {'state or territory': 'Montana',
  'url': 'https://tigerweb.geo.census.gov/tigerwebmain/Files/bas23/tigerweb_bas23_cd118_mt.html',
  'state_alpha_code': 'MT',
  'state_territory_qid': 'Q294'}]


BUILD ITEM: True

CREATED ITEM Q664
