In [15]:
import os
import mwclient
import requests
import pandas as pd
import isaid
from joblib import Parallel, delayed
from tqdm import tqdm


In [16]:
source_item = "Q44323"

In [6]:
# Create a secure connection to the Wikibase so we can write to it
def create_authenticated_site(user_name, password):
    site = mwclient.Site('geokb.wikibase.cloud', path='/w/', scheme='https')
    site.login(user_name, password)

    return site

def df_from_sparql(json_results):
    data_records = []
    var_names = json_results['head']['vars']

    for record in json_results['results']['bindings']:
        data_record = {}
        for var_name in var_names:
            data_record[var_name] = record[var_name]['value'] if var_name in record else None
        data_records.append(data_record)

    return pd.DataFrame(data_records)

In [4]:
# Establish GeoKB Wikibase site connection
mw_site = create_authenticated_site(os.environ['WB_BOT_GEOKB_CLOUD'], os.environ['WB_BOT_PASS_GEOKB_CLOUD'])  

source_page = mw_site.pages[f"Item_talk:{source_item}"]
profile_list = source_page.text().split(',')


In [11]:
query_person_profiles = """
PREFIX wd: <https://geokb.wikibase.cloud/entity/>
PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>
PREFIX p: <https://geokb.wikibase.cloud/prop/>
PREFIX ps: <https://geokb.wikibase.cloud/prop/statement/>
PREFIX pq: <https://geokb.wikibase.cloud/prop/qualifier/>

SELECT ?item ?itemLabel ?profile_url ?retrieved ?status_code ?orcid ?email
WHERE {
  ?item wdt:P1 wd:Q3 ;
        wdt:P31 ?profile_url ;
        p:P31 ?ref_url_statement .
  OPTIONAL {
    ?item wdt:P106 ?orcid ;
          wdt:P109 ?email .
  }
  OPTIONAL {
    ?ref_url_statement pq:P151 ?status_code ;
                       pq:P139 ?retrieved .
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

r = requests.get(
    'https://geokb.wikibase.cloud/query/sparql',
    params = {'query': query_person_profiles, 'format': 'json'}
)

In [13]:
df_geokb_profiles = df_from_sparql(r.json())
df_geokb_profiles['qid'] = df_geokb_profiles['item'].apply(lambda x: x.split('/')[-1])
df_geokb_profiles['profile_name'] = df_geokb_profiles['profile_url'].apply(lambda x: x.split('/')[-1])
df_geokb_profiles['email'] = df_geokb_profiles['email'].apply(lambda x: x.split(':')[-1] if x else None)
df_geokb_profiles.head()

Unnamed: 0,item,itemLabel,profile_url,retrieved,status_code,orcid,email,qid,profile_name
0,https://geokb.wikibase.cloud/entity/Q46517,Jane M. Hammarstrom,https://www.usgs.gov/staff-profiles/jane-m-ham...,2023-09-30T00:00:00Z,200,0000-0003-2742-3460,jhammars@usgs.gov,Q46517,jane-m-hammarstrom
1,https://geokb.wikibase.cloud/entity/Q44385,Brad Aagaard,https://www.usgs.gov/staff-profiles/brad-aagaard,2023-09-30T00:00:00Z,200,0000-0002-8795-9833,baagaard@usgs.gov,Q44385,brad-aagaard
2,https://geokb.wikibase.cloud/entity/Q44386,Lowell Abbadini,https://www.usgs.gov/staff-profiles/lowell-abb...,2023-09-30T00:00:00Z,200,,,Q44386,lowell-abbadini
3,https://geokb.wikibase.cloud/entity/Q44387,Justin Abel,https://www.usgs.gov/staff-profiles/justin-abel,2023-09-30T00:00:00Z,200,,,Q44387,justin-abel
4,https://geokb.wikibase.cloud/entity/Q44389,Kate Ackerman,https://www.usgs.gov/staff-profiles/kate-ackerman,2023-09-30T00:00:00Z,200,0000-0003-3925-721X,kackerman@usgs.gov,Q44389,kate-ackerman


In [17]:
missing_profile_names = [i for i in profile_list if i not in df_geokb_profiles['profile_name'].tolist()]
print(len(missing_profile_names))
display(missing_profile_names[:5])

564


['darryl-a-hoppe',
 'rick-l-wessels',
 'tim-clements',
 'michelle-bouchard',
 'sabrina-n-martinez']

In [18]:
missing_profiles = Parallel(n_jobs=-1, prefer='threads')(delayed(isaid.staff_profile_scrape)(i) for i in tqdm(missing_profile_names))


100%|██████████| 564/564 [01:13<00:00,  7.68it/s]


In [20]:
df_missing_profiles = pd.DataFrame([i['profile'] for i in missing_profiles])

In [24]:
pd.merge(
    left=df_geokb_profiles[df_geokb_profiles['orcid'].notnull()],
    right=df_missing_profiles[df_missing_profiles['orcid'].notnull()],
    how='inner',
    on='orcid'
)

Unnamed: 0,item,itemLabel,profile_url,retrieved,status_code,orcid,email_x,qid,profile_name,name,...,organization_link,email_y,intro_statements,expertise_terms,professional_experience,education,affiliations,honors,abstracts,personal_statement
0,https://geokb.wikibase.cloud/entity/Q45265,Theodore R Castro-Santos,https://www.usgs.gov/staff-profiles/theodore-r...,2023-09-30T00:00:00Z,200,0000-0003-2575-9120,tcastrosantos@usgs.gov,Q45265,theodore-r-castro-santos,Theodore Castro-Santos,...,https://www.usgs.gov/centers/eesc,tcastrosantos@usgs.gov,[Research Ecologist at the Eastern Ecological ...,"[Telemetry, Animal Behavior, Movement Ecology,...","[US Geological Survey (1995-present), U.S. Fis...",[PhD Organismic and Evolutionary Biology; Univ...,"[American Fisheries Society, Society for Integ...",[],[],Human activities have caused extensive fragmen...
1,https://geokb.wikibase.cloud/entity/Q45814,Carolyn Driedger,https://www.usgs.gov/staff-profiles/carolyn-dr...,2023-09-30T00:00:00Z,200,0000-0002-4011-4112,driedger@usgs.gov,Q45814,carolyn-driedger,Carolyn (Driedger) Mastin,...,https://www.usgs.gov/observatories/cvo,cmastin@usgs.gov,[Addressing volcano hazards effectively entail...,"[communication, volcano hazards, volcanic acti...",[USGS Professional History USGS Cascades Volca...,[M.S. Shippensburg State University of Pennsyl...,"[American Geophysical Union (AGU), Geological ...",[Department of Interior Meritorious Service Aw...,"[""Mount St. Helens Revisited: Lives Changed, L...",My science career began with research on glaci...
2,https://geokb.wikibase.cloud/entity/Q46754,Glenn A Hodgkins,https://www.usgs.gov/staff-profiles/glenn-a-ho...,2023-09-30T00:00:00Z,200,0000-0002-4916-5565,gahodgki@usgs.gov,Q46754,glenn-a-hodgkins,Glenn Hodgkins,...,https://www.usgs.gov/centers/new-england-water...,gahodgki@usgs.gov,[Glenn Hodgkins is a Research Hydrologist with...,"[Streamflow, Groundwater, Trends, Variability]","[Research Hydrologist, U.S. Geological Survey,...","[M.S. Engineering, Purdue University, 1995, B....",[],[],[],Glenn's work in recent years has focused on st...
3,https://geokb.wikibase.cloud/entity/Q48257,"Trevor P Needham, PhD",https://www.usgs.gov/staff-profiles/trevor-p-n...,2023-09-30T00:00:00Z,200,0000-0001-9356-4216,tneedham@usgs.gov,Q48257,trevor-p-needham-0,"Trevor P Needham, PhD",...,https://www.usgs.gov/centers/md-de-dc-water,tneedham@usgs.gov,"[Trevor Needham, PhD is a Hydrologist employed...",[Bioremediation],"[Hydrologist, U.S. Geological Survey MD-DE-DC ...","[PhD: University of Maryland Baltimore County,...",[],[],"[Needham, Trevor. Fate and Transport of PCBs i...",
4,https://geokb.wikibase.cloud/entity/Q49316,"Dorothy Sifuentes, PhD",https://www.usgs.gov/staff-profiles/dorothy-si...,2023-09-30T00:00:00Z,200,0000-0001-7540-2766,dsifuentes@usgs.gov,Q49316,dorothy-sifuentes,"Dorothy F Sifuentes, PhD",...,https://www.usgs.gov/centers/cfwsc,dsifuentes@usgs.gov,[],"[groundwater flow, saltwater intrusion, ground...",[],[],[],[],[],"Professional ExperienceU.S.G.S., Supervisory H..."
5,https://geokb.wikibase.cloud/entity/Q49595,Meryl Storb,https://www.usgs.gov/staff-profiles/meryl-storb,2023-09-30T00:00:00Z,200,0000-0002-4346-5022,mstorb@usgs.gov,Q49595,meryl-storb,Meryl B Storb,...,https://www.usgs.gov/centers/wyoming-montana-w...,mstorb@usgs.gov,[Meryl Storb is hydrologist with the USGS WY-M...,"[WRTDS, Trends, Loads, Solute transport, Bioge...","[2016 – present Hydrologist, U.S. Geological S...","[Ph.D. Candidate, Ecology and Environmental Sc...",[],[],[],Meryl is also a PhD candidate in the Payn Wate...
6,https://geokb.wikibase.cloud/entity/Q48390,Adam Oliphant,https://www.usgs.gov/staff-profiles/adam-oliphant,2023-09-30T00:00:00Z,200,0000-0001-8622-7932,aoliphant@usgs.gov,Q48390,adam-oliphant,Adam J. Oliphant,...,https://www.usgs.gov/centers/western-geographi...,aoliphant@usgs.gov,[Adam Oliphant is a geographer with the USGS b...,"[remote sensing, geospatial analysis, forest r...",[2015 - present - Geographer with USGS Western...,[M.S. in Forestry with an emphasis in Remote S...,[],[],[],He is part of the Western Geographic Science C...
7,https://geokb.wikibase.cloud/entity/Q47868,Brendan A. McCarthy,https://www.usgs.gov/staff-profiles/brendan-a-...,2023-09-30T00:00:00Z,200,0000-0003-4993-021X,bmccarthy@usgs.gov,Q47868,brendan-a-mccarthy,Brendan McCarthy,...,https://www.usgs.gov/centers/new-england-water...,bmccarthy@usgs.gov,[Brendan McCarthy is a Hydrologist in the New ...,"[Hydrology, GIS, Python, R for Statistics]","[Hydrologist, U.S. Geological Survey, New Engl...","[M.S. Hydrogeology, Stony Brook University, 20...",[],[],[],Brendan's work at the USGS New England Water S...
8,https://geokb.wikibase.cloud/entity/Q54112,John A Engott,https://www.usgs.gov/staff-profiles/john-engott,2023-09-30T00:00:00Z,200,0000-0003-1889-4519,jaengott@usgs.gov,Q54112,john-engott,John A Engott,...,https://www.usgs.gov/centers/california-water-...,jaengott@usgs.gov,[John A Engott - California Water Science Center],[],[],[],[],[],[],
9,https://geokb.wikibase.cloud/entity/Q54115,A. Kate Souders,https://www.usgs.gov/staff-profiles/kate-souders,2023-09-30T00:00:00Z,200,0000-0002-1367-8924,asouders@usgs.gov,Q54115,kate-souders,A. Kate Souders,...,https://www.usgs.gov/centers/gggsc,asouders@usgs.gov,[Kate Souders is a Research Geologist with the...,[geology],[],[],[],[],[],


In [22]:
df_missing_profiles[
    (df_missing_profiles['email'].isin(df_geokb_profiles['email'])) | (df_missing_profiles['orcid'].isin(df_geokb_profiles['orcid']))
]

Unnamed: 0,name,name_qualifier,title,organization_name,organization_link,email,orcid,intro_statements,expertise_terms,professional_experience,education,affiliations,honors,abstracts,personal_statement
0,Darryl A. Hoppe,,Geologist,"Geology, Energy & Minerals Science Center",https://www.usgs.gov/centers/geology-energy-an...,dhoppe@usgs.gov,0000-0003-3369-5577,[Darryl Hoppe is a Geologist with the USGS Geo...,[Geochemistry],"[2018 - Present: Geologist, U.S. Geological Su...",[],[],[],[],
2,Tim Clements,,Scientist,Earthquake Hazards Program,https://www.usgs.gov/programs/earthquake-hazards,tclements@usgs.gov,,[Tim Clements],[],[],[],[],[],[],
6,Cayla Shirley,,Hydrologic Technician,Oregon Water Science Center,https://www.usgs.gov/centers/oregon-water-scie...,cshirley@usgs.gov,,[Cayla is a hydrologic technician at the Orego...,[],[],[],[],[],[],
7,Carolyn (Driedger) Mastin,,Emeritus/USGS-CVO Outreach Coordinator (1995-2...,Cascades Volcano Observatory,https://www.usgs.gov/observatories/cvo,cmastin@usgs.gov,0000-0002-4011-4112,[Addressing volcano hazards effectively entail...,"[communication, volcano hazards, volcanic acti...",[USGS Professional History USGS Cascades Volca...,[M.S. Shippensburg State University of Pennsyl...,"[American Geophysical Union (AGU), Geological ...",[Department of Interior Meritorious Service Aw...,"[""Mount St. Helens Revisited: Lives Changed, L...",My science career began with research on glaci...
8,Michael Utecht,,IT Specialist,Upper Midwest Environmental Sciences Center,https://www.usgs.gov/centers/upper-midwest-env...,mutecht@usgs.gov,,[Michael Utecht],[],[],[],[],[],[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558,Megan A Evans,,Public Affairs Specialist,Eastern Ecological Science Center,https://www.usgs.gov/centers/eesc,meganevans@usgs.gov,,[Megan Evans is a Public Affairs Specialist wi...,"[Communications, Outreach, Photography, Social...","[Owner, Megan Evans Photography, Social Media ...","[University of Colorado at Boulder. BS, Journa...",[],[],[],Megan brings more than 25 years of professiona...
559,Alma C Schrage,,Biological Technician,Great Lakes Science Center,https://www.usgs.gov/centers/great-lakes-scien...,aschrage@usgs.gov,,[Alma Schrage is a Biological Technician based...,[],[],[],[],[],[],
560,Robert A Williams,,Scientist Emeritus,Earthquake Hazards Program,https://www.usgs.gov/programs/earthquake-hazards,rawilliams@usgs.gov,,[Rob Williams is a Scientist Emeritus in the E...,[],[],[],[],[],[],
561,Harold Cameron,,Hydrologic Technician,Pennsylvania Water Science Center,https://www.usgs.gov/centers/pennsylvania-wate...,hcameron@usgs.gov,,[Harold Cameron is a Hydrologic Technician wit...,[],[],[],[],[],[],
