I need to come back and revisit this notebook to smooth out the methodology for pulling reference place names into the graph. This was a quick task: I needed to add first-level administrative units (states, provinces, territories) for Canada and Mexico because we have enough references to them in the Pubs Warehouse metadata to make it worthwhile. The methods and sources I used here differ from the ones I used earlier for U.S. States and Territories, so I need to go back through and clean up a few things.

In [13]:
import requests
import pickle
import pandas as pd
from wbmaker import WikibaseConnection

In [2]:
# Open the connection used for every GeoKB read and write below.
# NOTE(review): 'GEOKB_CLOUD' is presumably the name of a stored
# configuration/credential set understood by wbmaker -- confirm.
geokb = WikibaseConnection('GEOKB_CLOUD')

# Load the Pubs Warehouse report dump into a DataFrame. A context manager is
# used so the file handle is closed as soon as the pickle has been read.
# NOTE: pickle.load can execute arbitrary code; only load dumps that were
# produced locally by a trusted process.
with open('data/pw_usgs_reports_dump.pickle', 'rb') as pickle_file:
    pw_dump = pd.DataFrame(pickle.load(pickle_file))

In [3]:
# URL-encoded SPARQL request against the GeoKB query service. Decoded, the
# query reads:
#
#   PREFIX wd: <https://geokb.wikibase.cloud/entity/>
#   PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>
#
#   SELECT ?item ?itemLabel
#   WHERE {
#     ?item wdt:P1 ?classes .
#     VALUES ?classes { wd:Q229 wd:Q25363 }
#     SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
#   }
#
# NOTE(review): Q229 / Q25363 are presumably the GeoKB classes for U.S.
# states and territories -- confirm against the GeoKB instance.
geokb_states_query_url = "https://geokb.wikibase.cloud/query/sparql?query=PREFIX%20wd%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Fitem%20%3FitemLabel%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP1%20%3Fclasses%20.%0A%20%20VALUES%20%3Fclasses%20%7B%20wd%3AQ229%20wd%3AQ25363%20%7D%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20.%20%7D%0A%7D"

geokb_states = geokb.url_sparql_query(
    sparql_url=geokb_states_query_url,
    output_format="dataframe",
)

# The 'item' column holds full entity URIs; keep only the trailing path
# segment (the part after the last '/') in a separate 'object' column.
geokb_states['object'] = geokb_states['item'].map(
    lambda entity_uri: entity_uri.rsplit('/', 1)[-1]
)

In [28]:
# URL-encoded SPARQL request against the public Wikidata query service.
# Decoded, the query reads:
#
#   SELECT ?item ?itemLabel ?countryLabel ?iso_code ?instance_ofLabel
#   WHERE {
#     ?item wdt:P300 ?iso_code .
#     ?item wdt:P17 ?country .
#     VALUES ?country { wd:Q96 wd:Q16 }
#     ?item wdt:P31 ?instance_of .
#     VALUES ?instance_of { wd:Q11828004 wd:Q9357527 wd:Q15149663 }
#     SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
#   }
#
# Only items whose ?instance_of is one of the three listed classes are
# returned, so anything classified differently in Wikidata is absent from
# this result.
wd_3166_2_query_url = "https://query.wikidata.org/sparql?query=SELECT%20%3Fitem%20%3FitemLabel%20%3FcountryLabel%20%3Fiso_code%20%3Finstance_ofLabel%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP300%20%3Fiso_code%20.%0A%20%20%3Fitem%20wdt%3AP17%20%3Fcountry%20.%0A%20%20VALUES%20%3Fcountry%20%7B%20wd%3AQ96%20wd%3AQ16%20%7D%0A%20%20%3Fitem%20wdt%3AP31%20%3Finstance_of%20.%0A%20%20VALUES%20%3Finstance_of%20%7B%20wd%3AQ11828004%20wd%3AQ9357527%20wd%3AQ15149663%20%7D%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22%5BAUTO_LANGUAGE%5D%2Cen%22.%20%7D%0A%7D"

wd_3166_2 = geokb.url_sparql_query(
    sparql_url=wd_3166_2_query_url,
    output_format="dataframe",
)

In [24]:
# Fields kept from each GeoNames child record; everything else is discarded.
geonames_props = [
    "geonameId",
    "countryCode",
    "name",
    "toponymName",
    "adminCodes1"
]

# GeoNames IDs of the parent country records, keyed by country code.
geonames_ids = {
    "CA": "6251999",
    "MX": "3996063"
}

# GeoNames account name sent with every request.
username = 'skybristol'

# Fetch the direct children of each country record, one DataFrame per country.
country_dfs = []
for geonames_id in geonames_ids.values():
    response = requests.get(
        'http://api.geonames.org/childrenJSON',
        params={'geonameId': geonames_id, 'username': username},
        timeout=30,  # do not hang the notebook if the service is unresponsive
    )
    response.raise_for_status()

    payload = response.json()
    # Fail with the service's own payload (e.g. a quota or auth message)
    # instead of an opaque KeyError when the expected list is missing.
    if 'geonames' not in payload:
        raise RuntimeError(
            f"GeoNames returned no 'geonames' list for geonameId={geonames_id}: {payload}"
        )

    country_dfs.append(pd.DataFrame(payload['geonames'])[geonames_props])

# ignore_index avoids repeated 0..n index labels from the per-country frames.
geonames_states = pd.concat(country_dfs, ignore_index=True)

# Build the full ISO 3166-2 code ("<country>-<subdivision>") so the records
# can be matched on a shared 'iso_code' column. Raises KeyError if a record's
# adminCodes1 has no 'ISO3166_2' entry.
geonames_states["iso_code"] = geonames_states.apply(
    lambda rec: '-'.join([rec['countryCode'], rec['adminCodes1']['ISO3166_2']]),
    axis=1,
)

In [33]:
# Lookup from a class label (as it appears in an 'instance_ofLabel' value) to
# the QID of the matching class item in GeoKB.
geokb_class_mapping = {
    'province of Canada': 'Q138360',
    'territory of Canada': 'Q138361',
    'state of Mexico': 'Q138362',
}

# Lookup from a two-letter country code to the QID of the country item in
# GeoKB.
geokb_country_mapping = {
    'CA': 'Q157',
    'MX': 'Q177',
}

In [34]:
# Pair each GeoNames record with its Wikidata counterpart via the shared
# ISO 3166-2 code. The nested 'adminCodes1' dict is dropped first because it
# has already been flattened into 'iso_code'.
# NOTE(review): this is an inner join, so GeoNames records with no matching
# Wikidata row are dropped silently, and a Wikidata item returned more than
# once for the same code would yield duplicate rows -- worth checking before
# writing items.
geonames_side = geonames_states.drop(columns=['adminCodes1'])
wikidata_side = wd_3166_2[['iso_code', 'item', 'instance_ofLabel']]
state_items = geonames_side.merge(wikidata_side, how="inner", on="iso_code")

# Resolve the GeoKB QIDs needed for the claims. Both lookups raise KeyError
# on a value that is missing from the corresponding mapping.
state_items['country_qid'] = state_items['countryCode'].map(
    lambda country_code: geokb_country_mapping[country_code]
)
state_items['instance_of_qid'] = state_items['instance_ofLabel'].map(
    lambda class_label: geokb_class_mapping[class_label]
)

In [40]:
# Create one new GeoKB item per row of state_items and write it immediately.
# NOTE: this loop always creates new items and never checks for an existing
# item with the same ISO 3166-2 code, so re-running the cell creates
# duplicates.
for _, row in state_items.iterrows():
    item = geokb.wbi.item.new()

    item.labels.set('en', row['name'])
    item.descriptions.set('en', row['instance_ofLabel'])

    # toponymName may carry several '/'-separated variants; keep those that
    # differ from the label, then add a "<name>, <country code>" form. The
    # list is therefore never empty, so no emptiness guard is needed.
    aliases = [
        variant
        for variant in row['toponymName'].split('/')
        if variant != row['name']
    ]
    aliases.append(f"{row['name']}, {row['countryCode']}")
    item.aliases.set('en', aliases)

    # Classification of the unit
    item.claims.add(
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['instance of'],
            value=row['instance_of_qid']
        )
    )

    # Country the unit belongs to
    item.claims.add(
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['country'],
            value=row['country_qid']
        )
    )

    # ISO 3166-2 code used to match the two sources
    item.claims.add(
        geokb.datatypes.ExternalID(
            prop_nr=geokb.prop_lookup['ISO 3166-2 code'],
            value=row['iso_code']
        )
    )

    # Link back to the matched Wikidata entity (row['item'] comes from the
    # Wikidata query result)
    item.claims.add(
        geokb.datatypes.URL(
            prop_nr=geokb.prop_lookup['same as'],
            value=row['item']
        )
    )

    response = item.write(
        summary="Added first-level administrative unit for country from geonames and wikidata source material"
    )
    # Log the newly assigned ID next to the name as a record of what was created
    print(response.id, row['name'])


Q138363 Alberta
Q138364 British Columbia
Q138365 Manitoba
Q138366 New Brunswick
Q138367 Newfoundland and Labrador
Q138368 Northwest Territories
Q138369 Nova Scotia
Q138370 Nunavut
Q138371 Ontario
Q138372 Prince Edward Island
Q138373 Quebec
Q138374 Saskatchewan
Q138375 Yukon
Q138376 Aguascalientes
Q138377 Baja California
Q138378 Baja California Sur
Q138379 Campeche
Q138380 Chiapas
Q138381 Chihuahua
Q138382 Coahuila
Q138383 Colima
Q138384 Durango
Q138385 Guanajuato
Q138386 Guerrero
Q138387 Hidalgo
Q138388 Jalisco
Q138389 Michoacán
Q138390 Morelos
Q138391 México
Q138392 Nayarit
Q138393 Nuevo León
Q138394 Oaxaca
Q138395 Puebla
Q138396 Querétaro
Q138397 Quintana Roo
Q138398 San Luis Potosí
Q138399 Sinaloa
Q138400 Sonora
Q138401 Tabasco
Q138402 Tamaulipas
Q138403 Tlaxcala
Q138404 Veracruz
Q138405 Yucatán
Q138406 Zacatecas
