This process was initially something of a mess. I had to work through a number of duplicates and other issues before I could finally add the few hundred additional person items indicated by the Staff Profile inventory. I will revisit this once I have a proper baseline established.

In [221]:
import os
import mwclient
import requests
import pandas as pd
import isaid
from joblib import Parallel, delayed
from tqdm import tqdm
import yaml

from wbmaker import WikibaseConnection
# Shared GeoKB connection object; later cells use its `models`, `datatypes`,
# `prop_lookup`, and `wbi` attributes to build and write items.
# NOTE(review): 'GEOKB_CLOUD' presumably selects the target instance/credentials — confirm in wbmaker.
geokb = WikibaseConnection('GEOKB_CLOUD')


In [16]:
# QID of the source item; its Item_talk page is read further down as a
# comma-separated list of staff profile names.
source_item = "Q44323"

In [121]:
# Create a secure connection to the Wikibase so we can write to it
def create_authenticated_site(user_name, password, host='geokb.wikibase.cloud'):
    """Open an HTTPS mwclient session against a Wikibase host and log in.

    Args:
        user_name: Account (bot) user name passed to ``Site.login``.
        password: Password passed to ``Site.login``.
        host: Host name of the MediaWiki/Wikibase site. Defaults to the GeoKB
            instance, so existing two-argument calls behave as before.

    Returns:
        The ``mwclient.Site`` object after ``login`` has been called on it.
    """
    site = mwclient.Site(host, path='/w/', scheme='https')
    site.login(user_name, password)

    return site

def df_from_sparql(json_results):
    """Flatten a SPARQL JSON result document into a DataFrame.

    Args:
        json_results: Parsed SPARQL JSON results, i.e. a dict with
            ``head.vars`` (list of variable names) and ``results.bindings``
            (list of dicts mapping a variable name to ``{'value': ...}``).

    Returns:
        A DataFrame with one row per binding and one column per variable in
        ``head.vars``. Variables that are unbound in a row are ``None``.
        The columns are present even when there are no bindings.
    """
    var_names = json_results['head']['vars']

    data_records = [
        {
            var_name: binding[var_name]['value'] if var_name in binding else None
            for var_name in var_names
        }
        for binding in json_results['results']['bindings']
    ]

    # Passing the column list keeps the expected columns on an empty result set,
    # so callers indexing e.g. df['item'] do not hit a KeyError.
    return pd.DataFrame(data_records, columns=var_names)

In [222]:
# Establish GeoKB Wikibase site connection
mw_site = create_authenticated_site(os.environ['WB_BOT_GEOKB_CLOUD'], os.environ['WB_BOT_PASS_GEOKB_CLOUD'])

# The talk page of the source item is read as one comma-separated string of profile names
source_page = mw_site.pages[f"Item_talk:{source_item}"]
# Strip surrounding whitespace/newlines and drop empty entries so the names compare
# cleanly against profile names already in GeoKB and no blank name gets scraped.
profile_list = [name.strip() for name in source_page.text().split(',') if name.strip()]


In [184]:
# Pull every person item (P1 = Q3) with its optional profile URL, ORCID, email,
# and the retrieved/status-code qualifiers on the profile URL statement.
query_person_profiles = """
PREFIX wd: <https://geokb.wikibase.cloud/entity/>
PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>
PREFIX p: <https://geokb.wikibase.cloud/prop/>
PREFIX ps: <https://geokb.wikibase.cloud/prop/statement/>
PREFIX pq: <https://geokb.wikibase.cloud/prop/qualifier/>

SELECT ?item ?itemLabel ?profile_url ?retrieved ?status_code ?orcid ?email
WHERE {
  ?item wdt:P1 wd:Q3 .
  OPTIONAL {
    ?item wdt:P31 ?profile_url .
  }
  OPTIONAL {
    ?item wdt:P106 ?orcid .
  }
  OPTIONAL {
    ?item wdt:P109 ?email .
  }
  OPTIONAL {
    ?item p:P31 ?ref_url_statement .
    ?ref_url_statement pq:P151 ?status_code ;
                       pq:P139 ?retrieved .
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

r = requests.get(
    'https://geokb.wikibase.cloud/query/sparql',
    params = {'query': query_person_profiles, 'format': 'json'},
    timeout=120  # do not hang the kernel indefinitely if the endpoint stalls
)
# Surface HTTP errors here rather than as a confusing JSON/KeyError further down
r.raise_for_status()

df_geokb_profiles = df_from_sparql(r.json())
# QID is the last path segment of the entity URI
df_geokb_profiles['qid'] = df_geokb_profiles['item'].apply(lambda x: x.split('/')[-1])
# Profile name is the last path segment of the profile URL (None when no URL is recorded)
df_geokb_profiles['profile_name'] = df_geokb_profiles['profile_url'].apply(lambda x: x.split('/')[-1] if x else None)
# Keep only the part after the last ':' (drops a "mailto:" style prefix)
df_geokb_profiles['email'] = df_geokb_profiles['email'].apply(lambda x: x.split(':')[-1] if x else None)

# ORCID -> QID for people who already have an ORCID in GeoKB
orcid_lookup = df_geokb_profiles[df_geokb_profiles['orcid'].notnull()].set_index('orcid')['qid'].to_dict()

df_geokb_profiles.head()

Unnamed: 0,item,itemLabel,profile_url,retrieved,status_code,orcid,email,qid,profile_name
0,https://geokb.wikibase.cloud/entity/Q44897,John Karl Bohlke,https://www.usgs.gov/staff-profiles/john-karl-...,2023-09-30T00:00:00Z,200,0000-0001-5693-6455,jkbohlke@usgs.gov,Q44897,john-karl-bohlke
1,https://geokb.wikibase.cloud/entity/Q44898,Carol Bolden,https://www.usgs.gov/staff-profiles/carol-bolden,2023-09-30T00:00:00Z,200,,cbolden@usgs.gov,Q44898,carol-bolden
2,https://geokb.wikibase.cloud/entity/Q44900,Wallace Bolen,https://www.usgs.gov/staff-profiles/wallace-bolen,2023-09-30T00:00:00Z,200,,wbolen@usgs.gov,Q44900,wallace-bolen
3,https://geokb.wikibase.cloud/entity/Q44902,"Scott Bonar, PhD",https://www.usgs.gov/staff-profiles/scott-bonar,2023-09-30T00:00:00Z,200,0000-0003-3532-4067,sbonar@usgs.gov,Q44902,scott-bonar
4,https://geokb.wikibase.cloud/entity/Q44904,Michael Bonds,https://www.usgs.gov/staff-profiles/michael-bonds,2023-09-30T00:00:00Z,200,0000-0002-9454-1195,mbonds@usgs.gov,Q44904,michael-bonds


In [185]:
# People items that have no staff profile URL recorded
missing_url = df_geokb_profiles[df_geokb_profiles['profile_url'].isna()].reset_index(drop=True)
print("PEOPLE IN GEOKB", len(df_geokb_profiles))
print("PEOPLE MISSING PROFILES", len(missing_url))
print()
# Build the membership set once instead of materialising the full column as a
# list for every inventory entry.
known_profile_names = set(df_geokb_profiles['profile_name'])
# Inventory names that match no profile name already in GeoKB
missing_profile_names = [i for i in profile_list if i not in known_profile_names]
print("POTENTIALLY NEW PEOPLE", len(missing_profile_names))
display(missing_profile_names[:5])

PEOPLE IN GEOKB 10753
PEOPLE MISSING PROFILES 4802

POTENTIALLY NEW PEOPLE 404


['tim-clements',
 'sabrina-n-martinez',
 'seth-siefken',
 'cayla-shirley',
 'michael-utecht']

# Scrape Profiles

In [18]:
# Scrape every candidate profile on a thread pool, with a progress bar over the inputs
scrape_tasks = (
    delayed(isaid.staff_profile_scrape)(profile_name)
    for profile_name in tqdm(missing_profile_names)
)
missing_profiles = Parallel(n_jobs=-1, prefer='threads')(scrape_tasks)


100%|██████████| 564/564 [01:13<00:00,  7.68it/s]


In [152]:
# Retry inventory names that have no scraped record yet. The already-scraped names
# are taken from missing_profiles itself (last path segment of each record's URL),
# so this cell runs on a fresh kernel without needing df_missing_profiles, which
# is only built in a later cell.
scraped_profile_names = {doc['meta']['url'].split('/')[-1] for doc in missing_profiles}
for profile_name in [i for i in missing_profile_names if i not in scraped_profile_names]:
    missing_profiles.append(isaid.staff_profile_scrape(profile_name))

In [197]:
# One row per scraped document: profile fields merged with scrape metadata
df_missing_profiles = pd.DataFrame([{**d['profile'], **d['meta']} for d in missing_profiles])
df_missing_profiles['profile_name'] = df_missing_profiles['url'].apply(lambda x: x.split('/')[-1])
# Keep a profile only if it is in the inventory gap, its ORCID is not already
# attached to a GeoKB person, and it has an email or an ORCID.
# The same profile can be scraped more than once (e.g. by the retry cell); keep a
# single row per profile name so the write loop does not create duplicate items.
df_missing_profiles = df_missing_profiles[
    (df_missing_profiles['profile_name'].isin(missing_profile_names))
    &
    (~df_missing_profiles['orcid'].isin(orcid_lookup))
    &
    (
        (df_missing_profiles['email'].notnull())
        |
        (df_missing_profiles['orcid'].notnull())
    )
].drop_duplicates(subset='profile_name').reset_index(drop=True)

df_missing_profiles.head()

Unnamed: 0,name,name_qualifier,title,organization_name,organization_link,email,orcid,intro_statements,expertise_terms,professional_experience,education,affiliations,honors,abstracts,personal_statement,url,timestamp,status_code,profile_name
0,Tim Clements,,Scientist,Earthquake Hazards Program,https://www.usgs.gov/programs/earthquake-hazards,tclements@usgs.gov,,[Tim Clements],[],[],[],[],[],[],,https://www.usgs.gov/staff-profiles/tim-clements,2023-10-02T16:15:28.619274,200,tim-clements
1,Sabrina N Martinez,,Geologist,Landslide Hazards Program,https://www.usgs.gov/programs/landslide-hazards,snmartinez@usgs.gov,0000-0002-1812-5990,[Sabrina Martinez joined the team at the USGS ...,[Landslides],"[2020 - Present: Geologist, USGS Geologic Haza...","[2017-2019: Tulane University, M.S., Earth and...",[],[],[],Sabrina uses remotely sensed data to better un...,https://www.usgs.gov/staff-profiles/sabrina-n-...,2023-10-02T16:15:29.585371,200,sabrina-n-martinez
2,Seth Siefken,,Hydrologist - Civil Engineering,Wyoming-Montana Water Science Center,https://www.usgs.gov/centers/wyoming-montana-w...,ssiefken@usgs.gov,0000-0001-5502-7903,[Seth Siefken is a civil engineer based in Hel...,[],[],"[M.S. Civil Engineering, Colorado State Univer...",[],[],[],,https://www.usgs.gov/staff-profiles/seth-siefken,2023-10-02T16:15:27.059574,200,seth-siefken
3,Cayla Shirley,,Hydrologic Technician,Oregon Water Science Center,https://www.usgs.gov/centers/oregon-water-scie...,cshirley@usgs.gov,,[Cayla is a hydrologic technician at the Orego...,[],[],[],[],[],[],,https://www.usgs.gov/staff-profiles/cayla-shirley,2023-10-02T16:15:28.912974,200,cayla-shirley
4,Michael Utecht,,IT Specialist,Upper Midwest Environmental Sciences Center,https://www.usgs.gov/centers/upper-midwest-env...,mutecht@usgs.gov,,[Michael Utecht],[],[],[],[],[],[],,https://www.usgs.gov/staff-profiles/michael-ut...,2023-10-02T16:15:28.647856,200,michael-utecht


In [215]:
# Items reachable from Q44210 by following P62 zero or more times, with English alt labels
org_query = """
PREFIX wd: <https://geokb.wikibase.cloud/entity/>
PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>

SELECT ?item ?itemLabel ?item_alt_label
WHERE {
  ?item wdt:P62* wd:Q44210 .
  OPTIONAL {
    ?item skos:altLabel ?item_alt_label .
    FILTER (lang(?item_alt_label)='en')
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
"""

r_orgs = requests.get(
    'https://geokb.wikibase.cloud/query/sparql',
    params = {'query': org_query, 'format': 'json'},
    timeout=120  # do not hang the kernel indefinitely if the endpoint stalls
)
# Surface HTTP errors here rather than as a confusing JSON/KeyError further down
r_orgs.raise_for_status()

df_geokb_orgs = df_from_sparql(r_orgs.json())
df_geokb_orgs['qid'] = df_geokb_orgs['item'].apply(lambda x: x.split('/')[-1])
# Name -> QID lookup keyed by label, then extended with alt labels
org_lookup = df_geokb_orgs.set_index('itemLabel')['qid'].to_dict()
org_lookup.update(df_geokb_orgs[df_geokb_orgs['item_alt_label'].notnull()].set_index('item_alt_label')['qid'].to_dict())

# Manual name -> QID entries added on top of the query results
org_lookup['Hydrologic Instrumentation Facility (HIF)'] = 'Q44352'
org_lookup['Caribbean-Florida Water Science Center (CFWSC)'] = 'Q44285'
org_lookup['South Atlantic Water Science Center (SAWSC)'] = 'Q44236'
org_lookup['Reston Stable Isotope Laboratory (RSIL)'] = 'Q44350'
org_lookup['Michigan Bacteriological Research Laboratory'] = 'Q50911'
org_lookup['Science Data Management'] = 'Q44223'


In [216]:
# Resolve each scraped organization name to its GeoKB QID; unmatched names become None
df_missing_profiles['org_qid'] = df_missing_profiles['organization_name'].apply(
    lambda org_name: org_lookup.get(org_name)
)

In [224]:
def _has_value(value):
    """Return True when a scraped field is present (not None/NaN) and non-empty."""
    if pd.isna(value):
        return False
    return bool(value)


# Reference attached to every claim: the source item used for this inventory
# (same QID as the `source_item` constant defined near the top of the notebook).
references = geokb.models.References()
references.add(
    geokb.datatypes.Item(
        prop_nr=geokb.prop_lookup['data source'],
        value=source_item
    )
)

# Qualifiers for the profile URL claim.
# NOTE(review): the retrieved date and the '200' status are fixed values, not the
# per-row `timestamp` / `status_code` from the scrape — confirm that is intended.
profile_qualifers = geokb.models.Qualifiers()
profile_qualifers.add(
    geokb.datatypes.Time(
        prop_nr=geokb.prop_lookup['retrieved'],
        time='+2023-10-03T00:00:00Z',
    )
)
profile_qualifers.add(
    geokb.datatypes.String(
        prop_nr=geokb.prop_lookup['status code'],
        value='200'
    )
)

# Point-in-time qualifier shared by the employer/affiliation/ORCID/email claims
pit_qualifier = geokb.models.Qualifiers()
pit_qualifier.add(
    geokb.datatypes.Time(
        prop_nr=geokb.prop_lookup['point in time'],
        time='+2023-10-03T00:00:00Z',
    )
)

# NOTE(review): 'Tim Clements' is excluded by name; presumably that item was
# already handled separately — confirm before re-running.
for index, row in df_missing_profiles[df_missing_profiles['name'] != 'Tim Clements'].iterrows():
    item = geokb.wbi.item.new()

    item.labels.set('en', row['name'])
    item.descriptions.set('en', f"{row['title']} at the {row['organization_name']}")

    item.claims.add(
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['instance of'],
            value="Q3",
            references=references
        )
    )

    item.claims.add(
        geokb.datatypes.URL(
            prop_nr=geokb.prop_lookup['reference URL'],
            value=row['url'],
            references=references,
            qualifiers=profile_qualifers
        )
    )

    item.claims.add(
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['employer'],
            value="Q44210",
            references=references,
            qualifiers=pit_qualifier
        )
    )

    # org_qid is None when the organization name had no match in org_lookup;
    # only add the affiliation claim when there is an actual QID to point at.
    if _has_value(row['org_qid']):
        item.claims.add(
            geokb.datatypes.Item(
                prop_nr=geokb.prop_lookup['affiliation'],
                value=row['org_qid'],
                references=references,
                qualifiers=pit_qualifier
            )
        )
    else:
        print("WARNING: no organization match for", row['name'], "-", row['organization_name'])

    # A plain truthiness check would let NaN through (NaN is truthy), so test explicitly
    if _has_value(row['orcid']):
        item.claims.add(
            geokb.datatypes.ExternalID(
                prop_nr=geokb.prop_lookup['ORCID iD'],
                value=row['orcid'],
                references=references,
                qualifiers=pit_qualifier
            )
        )

    if _has_value(row['email']):
        item.claims.add(
            geokb.datatypes.URL(
                prop_nr=geokb.prop_lookup['email address'],
                value=f"mailto:{row['email']}",
                references=references,
                qualifiers=pit_qualifier
            )
        )

    try:
        response = item.write(
            summary="Added person item from staff profile inventory"
        )
        new_qid = response.id
        print(row['name'], new_qid)
    except Exception as e:
        # Best effort: report which person failed and move on to the next row
        print("ERROR", row['name'], str(e))
        new_qid=False

    if new_qid:
        # Cache the full scraped document as YAML on the new item's talk page
        profile_doc = next((i for i in missing_profiles if i['meta']['url'] == row['url']), None)
        if profile_doc is None:
            print("ERROR: Profile not found")
        else:
            profile_doc = {
                "usgs_staff_profile": profile_doc
            }
            talk_page = mw_site.pages[f"Item_talk:{new_qid}"]
            talk_page.save(yaml.dump(profile_doc), summary='Update cached profile metadata')
            print("Cached profile to talk page")
    

Sabrina N Martinez Q159659
Cached profile to talk page
Seth Siefken Q159660
Cached profile to talk page
Cayla Shirley Q159661
Cached profile to talk page
Michael Utecht Q159662
Cached profile to talk page
Morgan Van Aken Q159663
Cached profile to talk page
Courtney Kramer Q159664
Cached profile to talk page
Travis Hiett Q159665
Cached profile to talk page
Jessica N Middleton Q159666
Cached profile to talk page
Eden F. Zickler Q159667
Cached profile to talk page
Kenan Matterson Q159668
Cached profile to talk page
Harold R Myers Q159669
Cached profile to talk page
Cayla Carlson Q159670
Cached profile to talk page
Caden P Brege Q159671
Cached profile to talk page
Jaycee Favela Q159672
Cached profile to talk page
Gary L Rowe, Jr., PhD Q159673
Cached profile to talk page
Jennie L Ridgley Q159674
Cached profile to talk page
Kelli Baxstrom Q159675
Cached profile to talk page
Melvin Bower Q159676
Cached profile to talk page
Gregor-Fausto Siegmund, Ph.D. Q159677
Cached profile to talk page
Zach