This notebook runs the process that reads USGS Staff Profile information cached in the Item_talk pages associated with the items representing people and derives linkages to contribute to fleshing out the person graph. Eventually, this basic logic should be put together as a microservice that can operate on a single profile document, doing all of the things indicated in that content so that we end up with something like this workflow:

scrape profile (based on retrieved date and specified periodicity) > post to cache (if new) > process claims

### Note
I still have a number of issues with "ghosts" in the Wikibase - items that show up in SPARQL that should not be there and claims that don't show up in SPARQL that should be in place. There seems to be something going on between the the API process that is processing data and the data store behing Blazegraph (the Wikibase component that provides the triple store). See https://phabricator.wikimedia.org/T343034

In [84]:
from wbmaker import WikibaseConnection
from joblib import Parallel, delayed
from tqdm import tqdm
import yaml
import pandas as pd
from fuzzywuzzy import process
from dateutil import parser
from datetime import datetime

geokb = WikibaseConnection('GEOKB_CLOUD')

# Cached Profiles

In [2]:
query_persons = """
PREFIX wd: <https://geokb.wikibase.cloud/entity/>
PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>

SELECT ?item ?itemLabel ?profile_url
WHERE {
  ?item wdt:P1 wd:Q3 ;
        wdt:P31 ?profile_url .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
"""

person_profiles = geokb.sparql_query(query_persons)
person_profiles['qid'] = person_profiles['item'].apply(lambda x: x.split('/')[-1])
person_profiles['profile_cache'] = person_profiles['qid'].apply(lambda x: f"Item_talk:{x}")

In [3]:
def fetch_profile_cache(page_title, site):
    try:
        page = site.pages[page_title]
        cache_data = yaml.safe_load(page.text())
        return {'profile_cache': page_title, 'profile_data': cache_data, 'error': None}
    except Exception as e:
        return {'profile_cache': page_title, 'profile_data': None, 'error': str(e)}
    
cached_profiles = Parallel(n_jobs=-1, prefer='threads')(delayed(fetch_profile_cache)(page, geokb.mw_site) for page in tqdm(person_profiles['profile_cache'].to_list()))

100%|██████████| 6349/6349 [08:44<00:00, 12.10it/s]


# Split out profile information

In [87]:
temp = pd.DataFrame(cached_profiles)
df_split = temp['profile_data'].apply(lambda x: pd.Series(x))
df_concat = pd.concat([temp.drop(columns='profile_data'), df_split], axis=1)

df_split_profile_data = df_concat['usgs_staff_profile'].apply(lambda x: pd.Series(x))
df_profile_data = pd.concat([df_concat.drop(columns=['usgs_staff_profile','error']), df_split_profile_data], axis=1)

df_split_meta = df_profile_data['meta'].apply(lambda x: pd.Series(x))
df_split_profile = df_profile_data['profile'].apply(lambda x: pd.Series(x))
df_profile_data = pd.concat([df_profile_data.drop(columns=['meta']), df_split_meta], axis=1)
df_profile_data = pd.concat([df_profile_data.drop(columns=['profile']), df_split_profile], axis=1)

df_profile_data['qid'] = df_profile_data['profile_cache'].apply(lambda x: x.split(':')[-1])

  df_split = temp['profile_data'].apply(lambda x: pd.Series(x))
  df_split = temp['profile_data'].apply(lambda x: pd.Series(x))


# Occupations from GeoKB

In [31]:
query_occupations = """
PREFIX wd: <https://geokb.wikibase.cloud/entity/>
PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>

SELECT ?item ?itemLabel ?item_alt_label
WHERE {
  ?item wdt:P2* ?classes .
  VALUES ?classes {wd:Q159568 wd:Q159617}
  OPTIONAL {
    ?item skos:altLabel ?item_alt_label .
    FILTER (lang(?item_alt_label)='en')
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

occupations = geokb.sparql_query(query_occupations)
occupations['qid'] = occupations['item'].apply(lambda x: x.split('/')[-1])
occupations_lookup = occupations[['qid','itemLabel']].drop_duplicates().set_index('itemLabel')['qid'].to_dict() 
occupations_lookup.update(occupations[occupations['item_alt_label'].notnull()].set_index('item_alt_label')['qid'].to_dict())

# Match titles to occupations

In [73]:
# Match occupation titles to QIDs
# Check spelling first, then fuzzy match
def match_occupation(title, occupations=occupations_lookup):
    if title in occupations:
        return occupations[title]

    match = process.extractOne(title, list(occupations.keys()))
    if match[1] >= 80:
        return occupations[match[0]]
    else:
        return None

In [88]:
titles = df_profile_data[['qid','title','url','timestamp']].dropna().reset_index(drop=True)
titles['date_qualifier'] = titles['timestamp'].apply(lambda x: parser.parse(x).strftime('+%Y-%m-%dT00:00:00Z') if x else None)

titles['occupation_qid'] = titles['title'].apply(match_occupation)

## Supervisor Role

In [90]:
titles['supervisor'] = titles['title'].str.contains('super', case=False)

## Research Grade

In [95]:
titles['rge'] = titles['title'].str.contains('Research ')

# Add occuption claims

In [100]:
processed_qids = []
with open('../data/fixes.txt', 'r') as f:
    for line in f:
        processed_qids.append(line.strip())

In [103]:
print(len(titles))
process_items = titles[
    (titles['occupation_qid'].notnull()) | (titles['supervisor'] == True)
    &
    (~titles['qid'].isin(processed_qids))
]
print(len(process_items))

6192
6075


In [104]:
for index, row in process_items.iterrows():
    try:
        item = geokb.wbi.item.get(row['qid'])
    except Exception as e:
        print(row['qid'], str(e))
        continue

    reference = geokb.models.References()
    reference.add(
        geokb.datatypes.URL(
            prop_nr=geokb.prop_lookup['reference URL'],
            value=row['url']
        )
    )

    qualifier = geokb.models.Qualifiers()
    qualifier.add(
        geokb.datatypes.Time(
            prop_nr=geokb.prop_lookup['point in time'],
            time=row['date_qualifier']
        )
    )

    occupation_claims = []
    if row['occupation_qid']:
        occupation_claims.append(
            geokb.datatypes.Item(
                prop_nr=geokb.prop_lookup['occupation'],
                value=row['occupation_qid'],
                references=reference,
                qualifiers=qualifier
            )
        )

    if row['supervisor']:
        occupation_claims.append(
            geokb.datatypes.Item(
                prop_nr=geokb.prop_lookup['occupation'],
                value="Q159624",
                references=reference,
                qualifiers=qualifier
            )
        )

    item.claims.add(occupation_claims, action_if_exists=geokb.action_if_exists.REPLACE_ALL)

    if row['rge']:
        item.claims.add(
            geokb.datatypes.Item(
                prop_nr="P153",
                value="Q159626",
                references=reference,
                qualifiers=qualifier
            )
        )

    try:
        response = item.write(
            summary='Added occupation claim matched from personnel title in USGS staff profile'
        )
        print(response.id)
    except Exception as e:
        print(row['qid'], str(e))


Q44385
Q44386
Q44387
Q44389
Q44392
Q44393
Q44394
Q44395
Q44396
Q44397
Q44398
Q44399
Q44401
Q44402
Q44403
Q44405
Q44406
Q44407
Q44408
Q44409
Q44410
Q44412
Q44413
Q44414
Q44416
Q44417
Q44418
Q44419
Q44422
Q44423
Q44424
Q44426
Q44427
Q44428
Q44429
Q44431
Q44432
Q44434
Q44436
Q44437
Q44438
Q44439
Q44440
Q44441
Q44442
Q44443
Q44444
Q44445
Q44447
Q44448
Q44449
Q44452
Q44453
Q44454
Q44455
Q44456
Q44457
Q44459
Q44460
Q44461
Q44462
Q44466
Q44467
Q44469
Q44470
Q44472
Q44473
Q44474
Q44475
Q44476
Q44480
Q44481
Q44484
Q44485
Q44486
Q44489
Q44490
Q44491
Q44492
Q44493
Q44497
Q44499
Q44500
Q44502
Q44505
Q44506
Q44507
Q44508
Q44511
Q44512
Q44515
Q44517
Q44519
Q44483
Q44504
Q44509
Q44518
Q44520
Q44521
Q44522
Q44523
Q44525
Q44526
Q44527
Q44528
Q44529
Q44530
Q44531
Q44532
Q44533
Q44534
Q44535
Q44536
Q44538
Q44539
Q44540
Q44541
Q44542
Q44543
Q44544
Q44545
Q44546
Q44547
Q44548
Q44549
Q44550
Q44551
Q44553
Q44554
Q44555
Q44556
Q44557
Q44561
Q44562
Q44563
Q44564
Q44565
Q44568
Q44569
Q44570
Q44571
Q44572
Q44574

Error while writing to the Wikibase instance
Traceback (most recent call last):
  File "/Users/sbristol/mambaforge-pypy3/envs/geokb/lib/python3.11/site-packages/wikibaseintegrator/entities/baseentity.py", line 244, in _write
    json_result: dict = edit_entity(data=data, id=entity_id, type=self.type, summary=summary, clear=clear, is_bot=is_bot, allow_anonymous=allow_anonymous,
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sbristol/mambaforge-pypy3/envs/geokb/lib/python3.11/site-packages/wikibaseintegrator/wbi_helpers.py", line 335, in edit_entity
    return mediawiki_api_call_helper(data=params, is_bot=is_bot, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sbristol/mambaforge-pypy3/envs/geokb/lib/python3.11/site-packages/wikibaseintegrator/wbi_helpers.py", line 217, in mediawiki_api_call_helper
    return mediaw

Q46358 '[[Item:Q44365|Q44365]] not found'
Q46359
Q46360
Q46361
Q46362
Q46363
Q46365
Q46368
Q46370
Q46372
Q46373
Q46374
Q46375
Q46376
Q46377
Q46378
Q46379
Q46382
Q46385
Q46386
Q46387
Q46388
Q46389
Q46391
Q44400
Q44552
Q45130
Q45683
Q45776
Q46004
Q46195
Q46720
Q47087
Q48012
Q48075
Q48310
Q48391
Q48414
Q48440
Q48444
Q48651
Q49810
Q50004
Q49718
Q48747
Q45729
Q46318
Q46381
Q49774
Q44807
Q44859
Q45873
Q46312
Q47457
Q48547
Q49186
Q49605
Q49803
Q49359
Q50215
Q50358
Q44415
Q44421
Q44425
Q44446
Q44451
Q44514
Q44516
Q44573
Q44581
Q44586
Q44587
Q44638
Q44654
Q44656
Q44681
Q44688
Q44696
Q44707
Q44727
Q44737
Q44742
Q44764
Q44778
Q44792
Q44805
Q44814
Q44821
Q44827
Q44835
Q44860
Q44868
Q44875
Q44881
Q44914
Q44915
Q44922
Q44932
Q44933
Q44964
Q44972
Q44977
Q44987
Q45001
Q45010
Q45029
Q45040
Q45042
Q45047
Q45070
Q45136
Q45158
Q45163
Q45174
Q45188
Q45196
Q45217
Q45218
Q45226
Q45231
Q45232
Q45258
Q45259
Q45290
Q45294
Q45304
Q45308
Q45368
Q45378
Q45384
Q45399
Q45419
Q45424
Q45450
Q45463
Q45468
Q45474
Q45490