With 177K+ records in the Pubs Warehouse (PW) catalog, we want to establish a baseline representation of these records in the GeoKB and then run regular updates for new and refreshed metadata. This notebook has two parts:
* Cache raw data as pickle files via a loop against the PW web service
* Pre-process raw data into an efficient structure we can work from

# Retrieve raw metadata

In [None]:
%%time
import requests
import pickle

page_num = 1
num_records = 0

# Walk the paged PW service, caching each page of raw records as its own
# pickle file, until a request fails or a page comes back without records.
while True:
    api = f"https://pubs.er.usgs.gov/pubs-services/publication/?page_size=1000&page_number={page_num}"
    # The timeout keeps a stalled connection from hanging the harvest forever;
    # requests raises a Timeout exception instead, which stops the cell.
    pw = requests.get(api, timeout=120)

    if pw.status_code != 200:
        # Still stop on any non-OK response, but say so: otherwise a failed
        # request looks exactly like reaching the end of the catalog.
        print("STOPPED AT PAGE NUMBER:", page_num, "| HTTP STATUS:", pw.status_code)
        break

    pw_records = pw.json()

    # A page without records ends the loop (presumably we are past the last page)
    if not pw_records["records"]:
        break

    # Context manager so every page file is flushed and closed right away
    with open(f"./data/pwdump/page_{page_num}.pickle", "wb") as page_file:
        pickle.dump(pw_records["records"], page_file)

    num_records += len(pw_records["records"])
    page_num += 1
    print(
        "PAGE NUMBER:",
        pw_records['pageNumber'],
        "| PAGE ROW START:",
        pw_records['pageRowStart'],
        "| RECORDS CACHED:",
        num_records,
        "| REMAINING RECORDS:",
        int(pw_records['recordCount']) - num_records
    )


# Pre-process PW Catalog

The PW Catalog packages some information in ways that add content we can't do anything with (e.g., internal identifiers) or that are otherwise difficult to deal with. Starting from the raw metadata that we dumped to a collection of pickle files, we pull out the parts we can use in building a knowledge representation, run a couple of validation steps, and build a minimal transformation that includes the following:
* Core text string fields that have some utility in building the representation in the GeoKB
* Collection of links that provide some clues on underlying content
* Contributors that have ORCIDs (the only ones we are processing at this time)
* Valid GeoJSON feature collections that we may use in the future

In [1]:
import os
import pickle
import pandas as pd
import json
import geojson


In [2]:
folder_path = "./data/pwdump"

# Collect the record dictionaries of every cached page into one flat list
all_dicts = []
for file_name in os.listdir(folder_path):
    # Anything in the folder that is not a pickle file is ignored
    if not file_name.endswith(".pickle"):
        continue
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'rb') as file:
        # Each pickle file holds a list of dictionaries
        data = pickle.load(file)
    all_dicts.extend(data)

# One row per record, one column per key found in the records
pw_dump = pd.DataFrame(all_dicts)

In [6]:
# Columns written to the core table: raw PW fields plus the derived columns
# (pub_type, pub_subtype, series_title, part_of, superseded_by, cost_centers,
# pub_rel) that are added to pw_dump below.
pw_core_props = [
    "indexId",
    "doi",
    "publisher",
    "lastModifiedDate",
    "displayToPublicDate",
    "publishedDate",
    "revisedDate",
    "publicationYear",
    "title",
    "docAbstract",
    "tableOfContents",
    "usgsCitation",
    "country",
    "state",
    "county",
    "city",
    "otherGeospatial",
    "ipdsId",
    "pub_type",
    "pub_subtype",
    "series_title",
    "part_of",
    "superseded_by",
    "cost_centers",
    "programNote",
    "pub_rel",
    "numberOfPages"
]

# Nested objects are reduced to the single value we keep; anything that is not
# a dictionary (e.g. a record without the field) becomes None.
pw_dump['pub_type'] = pw_dump['publicationType'].apply(lambda x: x['text'] if isinstance(x, dict) else None)
pw_dump['pub_subtype'] = pw_dump['publicationSubtype'].apply(lambda x: x['text'] if isinstance(x, dict) else None)
pw_dump['series_title'] = pw_dump['seriesTitle'].apply(lambda x: x['text'] if isinstance(x, dict) else None)
pw_dump['part_of'] = pw_dump['isPartOf'].apply(lambda x: x['indexId'] if isinstance(x, dict) else None)
pw_dump['superseded_by'] = pw_dump['supersededBy'].apply(lambda x: x['indexId'] if isinstance(x, dict) else None)

# List-valued fields get the same kind of guard as the nested objects above:
# a value that is not a list (e.g. a record without the field) yields an empty
# list instead of raising while being iterated.
pw_dump['cost_centers'] = pw_dump['costCenters'].apply(
    lambda x: [i['text'] for i in x] if isinstance(x, list) else []
)

# Each interaction is flattened to "subjectIndexId:predicate:objectIndexId"
pw_dump['pub_rel'] = pw_dump['interactions'].apply(
    lambda x: [':'.join([i['subject']['indexId'], i['predicate'], i['object']['indexId']]) for i in x]
    if isinstance(x, list) else []
)

pw_core = pw_dump[pw_core_props].reset_index(drop=True)

pw_core.to_parquet('./data/pw_cache/pw_core.parquet')

In [7]:
# Keep only publications that actually list links. A missing value has nothing
# to explode, and an empty list would explode to a NaN row that the extraction
# lambdas below cannot index into.
has_links = pw_dump['links'].apply(lambda x: isinstance(x, list) and len(x) > 0)

# One row per (publication, link)
pw_links = pw_dump[has_links][['indexId','links']].reset_index(drop=True).explode('links')
pw_links['link_type'] = pw_links['links'].apply(lambda x: x['type']['text'])
pw_links['link_url'] = pw_links['links'].apply(lambda x: x['url'])
pw_links.drop(columns="links", inplace=True)

pw_links.to_parquet('./data/pw_cache/pw_links.parquet')

In [10]:
# Show the contributors structure of the first publication that has one
with_contributors = pw_dump.loc[pw_dump['contributors'].notnull(), ['indexId', 'contributors']]
with_contributors.iloc[0]['contributors']

{'authors': [{'text': 'Lee, Willis T.',
   'contributorId': 87524,
   'corporation': False,
   'usgs': True,
   'family': 'Lee',
   'given': 'Willis T.',
   'affiliations': [],
   'preferred': False,
   'id': 221485,
   'contributorType': {'id': 1, 'text': 'Authors'},
   'rank': 1}]}

In [8]:
def parse_contributors(contributors: dict) -> list:
    """Extract the ORCID-identified, non-corporate contributors of one publication.

    Args:
        contributors: Mapping of role key (e.g. ``"authors"``) to a list of
            contributor dictionaries.

    Returns:
        A list with one dictionary per qualifying contributor, holding:
        ``orcid`` (the last path segment of the ORCID URL), ``usgs`` (the
        contributor's ``usgs`` value, ``None`` when absent), ``pub_role`` (the
        role key the contributor was listed under) and ``usgs_affiliations``
        (the ``text`` of every affiliation flagged ``usgs``; ``None`` when the
        contributor lists no affiliations at all).
    """
    orcid_prefix = 'https://orcid.org/'
    pub_contributors = []
    for role, contrib_list in contributors.items():
        # A role that maps to None is treated like a role without members
        for i in contrib_list or []:
            orcid = i.get("orcid")
            # Only people are kept, and only when the ORCID is a full orcid.org
            # URL; a missing or null ORCID is skipped rather than raising.
            if i.get("corporation") or not isinstance(orcid, str) or not orcid.startswith(orcid_prefix):
                continue

            affiliations = None
            if i.get("affiliations"):
                affiliations = [x["text"] for x in i["affiliations"] if x.get("usgs")]

            pub_contributors.append({
                "orcid": orcid.split("/")[-1],
                "usgs": i.get("usgs"),
                "pub_role": role,
                "usgs_affiliations": affiliations
            })

    return pub_contributors

# Start from the publications that carry any contributors value
pw_contributors = pw_dump[pw_dump['contributors'].notnull()][['indexId','contributors']].reset_index(drop=True)

# Swap the raw contributors structure for the parsed ORCID-bearing contributors
pw_contributors['orcid_contributors'] = pw_contributors['contributors'].apply(parse_contributors)
pw_contributors = pw_contributors.drop(columns="contributors")

# Publications whose parsed list is empty are dropped
has_orcid_contributors = pw_contributors['orcid_contributors'].str.len() > 0
pw_contributors = pw_contributors[has_orcid_contributors]

# One row per (publication, contributor)
pw_contributors = pw_contributors.explode('orcid_contributors').reset_index(drop=True)

# Spread each contributor dictionary into its own columns next to indexId
contributor_columns = pw_contributors['orcid_contributors'].apply(pd.Series)
pw_contributors = pd.concat(
    [pw_contributors.drop(columns='orcid_contributors'), contributor_columns],
    axis=1,
)

pw_contributors.to_parquet('./data/pw_cache/pw_contributors.parquet')


In [34]:
def _creators_for_role(contributors_by_role, role_column, creator_type):
    """Build the creator rows (one per named person) for a single contributor role.

    Args:
        contributors_by_role: DataFrame with an ``indexId`` column and, when any
            publication has that role, a column named ``role_column`` holding
            lists of contributor dictionaries.
        role_column: Name of the role column to unpack (e.g. ``"authors"``).
        creator_type: Value written to the ``creatorType`` column.

    Returns:
        DataFrame with columns ``indexId``, ``creatorType``, ``firstName`` and
        ``lastName``; rows lacking any of these values are dropped. The frame
        is empty when no publication has the role.
    """
    no_creators = pd.DataFrame(columns=['indexId', 'creatorType', 'firstName', 'lastName'])

    # When no publication in the batch has this role, json_normalize never
    # produced the column; selecting it would raise a KeyError.
    if role_column not in contributors_by_role.columns:
        return no_creators

    role_frame = contributors_by_role[['indexId', role_column]].dropna()
    # An empty role list explodes to a NaN row that json_normalize cannot
    # handle, so those rows are removed before normalizing.
    role_frame = role_frame.explode(role_column).dropna(subset=[role_column]).reset_index(drop=True)
    if role_frame.empty:
        return no_creators

    role_frame = pd.concat(
        [role_frame.drop(columns=role_column), pd.json_normalize(role_frame[role_column])],
        axis=1,
    )
    role_frame['creatorType'] = creator_type

    # If no member of the role has a given or family name, the column does not
    # exist after normalizing; add it empty so the selection below still works
    # (the dropna then removes those rows, as it does for individual gaps).
    for name_column in ('given', 'family'):
        if name_column not in role_frame.columns:
            role_frame[name_column] = None

    return (
        role_frame[['indexId', 'creatorType', 'given', 'family']]
        .dropna()
        .rename(columns={'given': 'firstName', 'family': 'lastName'})
    )


pw_contributors_raw = pw_dump[pw_dump['contributors'].notnull()][['indexId','contributors']].reset_index(drop=True)

# One column per contributor role found in the data, next to indexId
pw_contributors_all = pd.concat([pw_contributors_raw.drop(columns="contributors"), pd.json_normalize(pw_contributors_raw['contributors'])], axis=1)

pw_authors = _creators_for_role(pw_contributors_all, "authors", "author")
pw_editors = _creators_for_role(pw_contributors_all, "editors", "editor")
pw_compilers = _creators_for_role(pw_contributors_all, "compilers", "compiler")

pd.concat([pw_authors, pw_editors, pw_compilers]).reset_index(drop=True).to_parquet('./data/pw_cache/pw_creators.parquet')


In [7]:
def check_geojson(geojson_string):
    """Return a re-serialized GeoJSON string when the input validates, else None.

    Args:
        geojson_string: JSON text expected to describe a GeoJSON object.

    Returns:
        ``geojson.dumps`` of the parsed object when it reports ``is_valid``;
        ``None`` when the text cannot be parsed, when the parsed value is not
        an object exposing ``is_valid`` (e.g. a plain dict or list), or when
        the object reports itself invalid.
    """
    try:
        check_obj = geojson.loads(geojson_string)
    except (TypeError, ValueError):
        # Malformed JSON, or content the geojson classes refuse to build from,
        # counts as invalid instead of aborting the whole apply()
        return None

    # Parsed JSON that was not turned into a geojson object has no is_valid
    if getattr(check_obj, "is_valid", False):
        return geojson.dumps(check_obj)
    return None

# Publications that carry a geographic extent
pw_geo = pw_dump[pw_dump['geographicExtents'].notnull()][['indexId','geographicExtents']].reset_index(drop=True)

# check_geojson parses and re-serializes the text itself, so the raw extent
# string is handed to it directly instead of first making a json.loads /
# json.dumps round trip.
pw_geo['geojson'] = pw_geo['geographicExtents'].apply(check_geojson)
pw_geo.drop(columns="geographicExtents", inplace=True)

# Rows for which check_geojson returned None are left out
pw_geo.dropna(subset="geojson", inplace=True)

pw_geo.to_parquet('./data/pw_cache/pw_geo.parquet')