U.S. Census data uses a Legal/Statistical Area Description (LSAD) code to describe every unit in its system. In this notebook, I pull these codes in from a reference HTML table published online and create an item for each code so that the items can be used as a linked set of entities.

In [1]:
import pandas as pd
from wbmaker import WikibaseConnection

In [3]:
# Connection handle shared by every later cell: it supplies the lookup tables
# (ref_lookup, class_lookup, prop_lookup), item reads/writes via geokb.wbi,
# and the SPARQL helper wb_ref_data.
# NOTE(review): "GEOKB_CLOUD" presumably names a stored connection/credential
# configuration resolved by wbmaker — confirm against the wbmaker package.
geokb = WikibaseConnection("GEOKB_CLOUD")

In [4]:
# Resolve, by label, the three identifiers the rest of the notebook works with:
# the data source item, the class new items are an instance of, and the
# property that carries the LSAD code.
datasource_qid, instance_of_class, lsad_property = (
    geokb.ref_lookup['Legal/Statistical Area Description Codes and Definitions'],
    geokb.class_lookup['Legal/Statistical Area Description'],
    geokb.prop_lookup['LSAD'],
)

In [9]:
# Fetch the data source item as JSON and locate its "instance of" claim whose
# value is the "knowledgebase source" class; the references on that claim hold
# the links to the published source material.
ds = geokb.wbi.item.get(datasource_qid).get_json()
ks_claim = next(
    (
        claim
        for claim in ds["claims"][geokb.prop_lookup['instance of']]
        if claim["mainsnak"]["datavalue"]["value"]["id"] == geokb.class_lookup["knowledgebase source"]
    ),
    None,
)

# Invert the label -> property-number lookup once, instead of on every
# reference, so reference property numbers can be turned back into labels.
prop_label_by_nr = {prop_nr: label for label, prop_nr in geokb.prop_lookup.items()}

# Always bind ks_refs so the table-loading loop below works (and simply finds
# nothing) when the claim or its references are absent, rather than raising
# NameError.
ks_refs = []
if ks_claim is not None:
    for reference in ks_claim.get("references", []):
        # The first property in the reference's snaks determines its type.
        ref_type_prop_nr = next(iter(reference["snaks"]))
        ks_refs.append({
            # Fall back to the raw property number if it has no known label.
            "reference type": prop_label_by_nr.get(ref_type_prop_nr, ref_type_prop_nr),
            "reference links": [
                snak["datavalue"]["value"]
                for snak in reference["snaks"][ref_type_prop_nr]
            ],
        })

# Read every HTML table found at each link of the 'html table' references.
ds_tables = []
for ref in ks_refs:
    if ref["reference type"] == 'html table':
        for url in ref["reference links"]:
            ds_tables.extend(pd.read_html(url))

if not ds_tables:
    raise ValueError(
        f"No 'html table' reference tables could be loaded for data source {datasource_qid}"
    )

# The first table is used as the LSAD reference.
# NOTE(review): the item-creation cell compares "LSAD Description" against the
# literal string "nan", which relies on this str cast turning missing cells
# into "nan" — confirm that still holds for the installed pandas version.
lsad_reference = ds_tables[0].astype({
    "LSAD": "category",
    "LSAD Description": "str",
    "Associated Geographic Entity": "str"
})

In [20]:
# URL-encoded SPARQL query, split one fragment per line of the query text.
# Decoded, it selects every ?item together with its wdt:P90 value as ?lsad.
query_lsad = (
    "PREFIX%20wdt%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A"
    "%0A"
    "SELECT%20%3Fitem%20%3Flsad%0A"
    "WHERE%20%7B%0A"
    "%20%20%3Fitem%20wdt%3AP90%20%3Flsad%20.%0A"
    "%7D%0A"
)
df_lsad = geokb.wb_ref_data(query=query_lsad)

# Keep only the reference rows whose LSAD code is not among the query results.
missing_lsad = lsad_reference[~lsad_reference["LSAD"].isin(df_lsad["lsad"])]

In [23]:
# One shared reference block, pointing at the data source item, is attached to
# every claim created below.
references = geokb.models.References()
references.add(
    geokb.datatypes.Item(
        prop_nr=geokb.prop_lookup['data source'],
        value=datasource_qid
    )
)

# Shorter descriptions substituted for specific "Associated Geographic Entity"
# values; any other value is used as the description unchanged.
DESCRIPTION_OVERRIDES = {
    "Primary Metropolitan Statistical Area": "metropolitan area",
    "New England County Metropolitan Area": "New England county area",
}


def _is_missing(value):
    """Return True for a real missing value or for the literal string "nan"."""
    # Covers both outcomes of the upstream str cast: the text "nan", or a
    # preserved NaN/NA value.
    return pd.isna(value) or value == "nan"


# (LSAD code, error) pairs for writes that failed, kept for inspection/retry.
failed_lsad = []

for _, row in missing_lsad.iterrows():
    item = geokb.wbi.item.new()
    if _is_missing(row["LSAD Description"]):
        # Typo fix: the label previously read "legal/statistica area".
        item.labels.set("en", "general or unknown legal/statistical area")
        item.descriptions.set("en", "general LSAD category referring to any other classification; essentially unclassified")
    else:
        item.labels.set("en", row["LSAD Description"])
        associated_entity = row["Associated Geographic Entity"]
        # Leave the description unset rather than writing the text "nan".
        if not _is_missing(associated_entity):
            item.descriptions.set(
                "en",
                DESCRIPTION_OVERRIDES.get(associated_entity, associated_entity)
            )

    item.claims.add(
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['instance of'],
            value=instance_of_class,
            references=references
        )
    )

    item.claims.add(
        geokb.datatypes.ExternalID(
            prop_nr=lsad_property,
            value=row["LSAD"],
            references=references
        )
    )

    try:
        response = item.write(summary="Added LSAD item from source HTML table")
        print("ADDED:", row["LSAD"], response.id)
    except Exception as e:
        # Still best-effort (one bad row must not abort the batch), but report
        # and record the failure instead of discarding it silently.
        failed_lsad.append((row["LSAD"], e))
        print("FAILED:", row["LSAD"], repr(e))

ADDED: 88 Q26619
ADDED: 89 Q26620
ADDED: 90 Q26621
ADDED: 91 Q26622
ADDED: 92 Q26623
ADDED: 93 Q26624
ADDED: 94 Q26625
ADDED: 95 Q26626
ADDED: 96 Q26627
ADDED: 97 Q26628
ADDED: 98 Q26629
ADDED: 99 Q26630
ADDED: 9C Q26631
ADDED: 9D Q26632
ADDED: 9E Q26633
ADDED: 9F Q26634
ADDED: B1 Q26635
ADDED: B2 Q26636
ADDED: B3 Q26637
ADDED: B4 Q26638
ADDED: B5 Q26639
ADDED: B6 Q26640
ADDED: B7 Q26641
ADDED: B8 Q26642
ADDED: BG Q26643
ADDED: BI Q26644
ADDED: BK Q26645
ADDED: BL Q26646
ADDED: C1 Q26647
ADDED: C2 Q26648
ADDED: C3 Q26649
ADDED: C4 Q26650
ADDED: C5 Q26651
ADDED: CB Q26652
ADDED: CG Q26653
ADDED: CN Q26654
ADDED: CR Q26655
ADDED: CT Q26656
ADDED: IB Q26657
ADDED: IT Q26658
ADDED: L1 Q26659
ADDED: L2 Q26660
ADDED: L3 Q26661
ADDED: L4 Q26662
ADDED: L5 Q26663
ADDED: L6 Q26664
ADDED: L7 Q26665
ADDED: LL Q26666
ADDED: LU Q26667
ADDED: M0 Q26668
ADDED: M1 Q26669
ADDED: M2 Q26670
ADDED: M3 Q26671
ADDED: M4 Q26672
ADDED: M5 Q26673
ADDED: M6 Q26674
ADDED: M7 Q26675
ADDED: MB Q26676
ADDED: MG Q266

Service unavailable (HTTP Code 502). Sleeping for 60 seconds.


ADDED: MT Q26678
ADDED: P1 Q26679
ADDED: P5 Q26680
ADDED: T1 Q26681
ADDED: T2 Q26682
ADDED: T3 Q26683
ADDED: TA Q26684
ADDED: TB Q26685
ADDED: TC Q26686
ADDED: TZ Q26687
ADDED: UB Q26688
ADDED: UC Q26689
ADDED: UG Q26690
ADDED: V1 Q26691
ADDED: V2 Q26692
ADDED: Z3 Q26693
ADDED: Z5 Q26694
