This notebook scrapes the paginated USGS staff profiles interface to assemble a current inventory as structured data, and writes that inventory to the MediaWiki discussion page for the item representing that source in the GeoKB. The inventory is then processed by a separate algorithm that deals with the content.

After figuring out that there is a limit on the size of the data I can push, I stripped this process way back to simply scrape all of the unique profile URLs (and really just the name part of each profile URL). I only need these pointers over time to figure out which profiles the GeoKB already knows about and which profiles still need to be pulled and added.

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
from joblib import Parallel, delayed
from tqdm import tqdm
import os
import mwclient

In [2]:
# Essential inputs for this run; these could be looked up dynamically, but are fixed here.
# QID of the GeoKB item whose talk page ("Item_talk:<QID>") receives the inventory
source_item = "Q44323"
# Base URL of the paginated staff profile listing; "page" query strings are appended to it
profile_inventory_url = "https://www.usgs.gov/connect/staff-profiles"

In [3]:
# We need to get the range of pages to scrape
def last_page():
    """Return the page number of the last staff profile inventory page.

    Fetches ``profile_inventory_url``, locates the pager anchor titled
    "Go to last page" and reads the ``page`` query parameter from its href.

    Returns:
        int | None: the last page number, or None when the anchor is
        missing, has no href, or its href carries no ``page`` parameter.
    """
    r_profile_inventory = requests.get(profile_inventory_url)
    soup_profile_inventory = BeautifulSoup(r_profile_inventory.content, 'html.parser')

    # find() returns None when no matching anchor exists, so check the tag
    # before reading its href instead of subscripting the result directly
    last_page_anchor = soup_profile_inventory.find('a', {'title': 'Go to last page'})
    if last_page_anchor is None:
        return None

    last_page_link = last_page_anchor.get('href')
    if not last_page_link:
        return None

    last_page_url = "".join([profile_inventory_url, last_page_link])
    parsed_url = urlparse(last_page_url)
    query_params = parse_qs(parsed_url.query)
    # parse_qs maps each parameter to a list of values; take the first one
    last_page_num = query_params.get("page")
    if last_page_num:
        return int(last_page_num[0])

    return None

# Scrape the basic profile name string from links on the page
def profiles_from_inventory_page(page_num):
    """Return the profile name strings linked from one inventory page.

    Requests ``profile_inventory_url`` with ``?page=<page_num>`` and collects
    the last path segment of every link whose href starts with
    ``/staff-profiles/`` inside the ``views-element-container`` div.

    Args:
        page_num: page number placed in the ``page`` query parameter.

    Returns:
        list[str]: profile name strings found on the page (duplicates are
        possible). An empty list is returned when the response status is
        not 200 or the container div is absent, so the result can always be
        combined with other pages' results.
    """
    url = "?".join([profile_inventory_url, f"page={str(page_num)}"])
    r_inventory = requests.get(url)
    if r_inventory.status_code != 200:
        return []

    soup = BeautifulSoup(r_inventory.content, 'html.parser')
    container = soup.find('div', {'class': 'views-element-container'})
    # find() returns None when the div is missing; avoid calling find_all on None
    if container is None:
        return []

    profile_links = container.find_all(
        'a',
        href=lambda href: href.startswith('/staff-profiles/') if href else False
    )
    return [link['href'].split('/')[-1] for link in profile_links]

# Combine all lists returned in parallel processing into one with unique values
def inventory_list(inventories):
    """Merge per-page result lists into a single list of unique values.

    Args:
        inventories: iterable of per-page lists. Entries that are None or
            empty (for example a page whose scrape produced nothing) are
            skipped rather than raising a TypeError.

    Returns:
        list: the unique values across all pages, in no particular order.
    """
    unique_records = set()
    for page_records in inventories:
        if page_records:
            unique_records.update(page_records)

    return list(unique_records)

# Create a secure connection to the Wikibase so we can write to it
def create_authenticated_site(user_name, password):
    """Connect to geokb.wikibase.cloud over HTTPS and log in.

    Args:
        user_name: account name passed to the site's login call.
        password: password passed to the site's login call.

    Returns:
        The mwclient Site object after login has been called on it.
    """
    wikibase = mwclient.Site(
        'geokb.wikibase.cloud',
        scheme='https',
        path='/w/',
    )
    wikibase.login(user_name, password)
    return wikibase

# Write out the inventory list to the Wikibase
def write_inventory(qid, site, profile_list):
    """Save the inventory as comma-separated text on the item's talk page.

    Args:
        qid: item identifier; the target page title is "Item_talk:<qid>".
        site: object whose ``pages`` mapping yields a page with ``save``.
        profile_list: strings to join with commas as the page text.

    Returns:
        True when the save call completes, otherwise the Exception instance
        that was raised while looking up or saving the page (it is returned,
        not re-raised).
    """
    talk_page_title = "Item_talk:{}".format(qid)

    try:
        talk_page = site.pages[talk_page_title]
        talk_page.save(
            ','.join(profile_list),
            summary='Added cache of USGS staff profile inventory',
        )
    except Exception as error:
        return error
    else:
        return True

In [4]:
# get the last page of the inventory
last_page_num = last_page()
if last_page_num is None:
    # last_page() can return None when no page number could be parsed;
    # stop here with a clear message instead of failing on `None + 1` below
    raise RuntimeError("Could not determine the last staff profile inventory page")

# Scrape the inventory pages as fast as possible
inventories = Parallel(n_jobs=-1, prefer='threads')(delayed(profiles_from_inventory_page)(i) for i in tqdm(range(last_page_num+1)))

# Combine the per-page results into one list of unique profile names.
# The result gets its own name: assigning it back to `inventory_list` would
# replace the function with a list and break any re-run of this cell.
profile_names = inventory_list(inventories)

# Establish GeoKB Wikibase site connection
mw_site = create_authenticated_site(os.environ['WB_BOT_GEOKB_CLOUD'], os.environ['WB_BOT_PASS_GEOKB_CLOUD'])

# Write the inventory list to the source page
write_inventory(source_item, mw_site, profile_names)

100%|██████████| 490/490 [00:56<00:00,  8.67it/s]


True