This notebook was used to pull together items that represent publications in the GeoKB and build them out into a Zotero group library for ongoing work. It's a bit ugly because I was working through some issues with how USGS Pubs Warehouse landing pages currently advertise their metadata: I had to go back into the library, update some existing items, and then build out new items.

I'll likely revisit this in the future to turn it into a more general process that works against the live USGS Pubs Warehouse API (at least until the schema.org situation is worked out for DOI resolution). It makes sense to have a logical two-way process between Zotero and the GeoKB: Zotero can be used to annotate and mark up content, and those annotations can be exploited to encode more connections into the GeoKB.

In [11]:
import os
from urllib.parse import quote

import pandas as pd
import requests
from pyzotero import zotero
from wbmaker import WikibaseConnection

In [2]:
# Open the GeoKB Wikibase connection used for the SPARQL query later in this notebook.
# NOTE(review): "GEOKB_CLOUD" is presumably the name of the connection/credential
# configuration that wbmaker looks up — confirm against the wbmaker package.
geokb = WikibaseConnection("GEOKB_CLOUD")

In [149]:
# Cached Pubs Warehouse core records, trimmed to the fields used to build Zotero items.
pw_core = pd.read_parquet('./data/pw_core.parquet').loc[
    :,
    [
        'indexId',
        'doi',
        'publicationYear',
        'title',
        'pub_type',
        'pub_subtype',
        'series_title',
        'numberOfPages',
    ],
]

# Cached creator rows; the code below reads indexId, creatorType, firstName and lastName.
pw_creators = pd.read_parquet('./data/pw_creators.parquet')

def create_dict(row):
    """Build a Zotero-style creator record from one creator row.

    Args:
        row: Mapping-like object (a DataFrame row when used with
            ``DataFrame.apply(..., axis=1)``) exposing ``creatorType``,
            ``firstName`` and ``lastName``.

    Returns:
        dict: Exactly those three keys, in that order, with the row's values.
    """
    creator_fields = ('creatorType', 'firstName', 'lastName')
    return {field: row[field] for field in creator_fields}

# Collapse the per-creator rows into one row per publication (indexId): each row's
# creator fields become a dict, the raw columns are dropped, and the remaining
# columns are gathered into lists per indexId.
pw_creators = (
    pw_creators
    .assign(creators=pw_creators.apply(create_dict, axis=1))
    .drop(columns=['creatorType', 'firstName', 'lastName'])
    .groupby('indexId', as_index=False)
    .agg(list)
)

def chunk_list(long_list, chunk_size):
    """Lazily split a sequence into consecutive slices.

    Args:
        long_list: Sliceable sequence that supports ``len()``.
        chunk_size: Maximum number of elements per yielded slice.

    Yields:
        Slices of ``long_list`` in order; each holds ``chunk_size`` elements
        except possibly the last, which holds the remainder.
    """
    chunk_starts = range(0, len(long_list), chunk_size)
    yield from (long_list[start:start + chunk_size] for start in chunk_starts)

In [109]:
# Client for the Zotero group library that this notebook reads from and writes to.
# The API key is taken from the environment so it never appears in the notebook.
mra_library = zotero.Zotero(
    library_id="5137613",
    library_type='group',
    api_key=os.environ['ZOTERO_API_KEY'],
)

In [110]:
# User-Agent sent with every Crossref request; it identifies the project and a contact address.
user_agent = "GeoKB/0.1 (https://geokb.wikibase.cloud/; mailto:sbristol@usgs.gov) GeoKB-code/0.4"

def crossref_metadata(doi, timeout=30):
    """Fetch the Crossref work record for a DOI.

    Args:
        doi: DOI to look up. It is converted to ``str`` and percent-encoded
            ("/" is kept) before being placed in the URL path.
        timeout: Seconds to wait for Crossref before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The ``"message"`` member of the JSON response when Crossref answers
        with HTTP 200, otherwise ``None``.

    Raises:
        requests.RequestException: On connection failures or when ``timeout``
            is exceeded.
    """
    # Encode characters such as "#", "?", "<" or spaces that are legal in DOIs
    # but would otherwise truncate or corrupt the request path.
    url = f"https://api.crossref.org/works/{quote(str(doi))}"
    response = requests.get(url, headers={"User-Agent": user_agent}, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.json()["message"]

In [111]:
# Retrieve the items of the group library. Per the pyzotero documentation,
# `everything()` keeps requesting follow-on result pages so the full set is
# returned rather than a single page.
mra_zotero_items = mra_library.everything(mra_library.items())

In [120]:
# Tabulate the `data` payload of every non-attachment Zotero item, derive a DOI
# column, and discard fields that are not carried forward. Some of the dropped
# names (callNumber, seriesTitle, pages) are re-created from Pubs Warehouse data
# when the update records are assembled.
df_mra_zotero_items = (
    pd.DataFrame([
        record['data']
        for record in mra_zotero_items
        if record['data']['itemType'] != 'attachment'
    ])
    # The DOI is read out of the free-text `extra` field by stripping the
    # "DOI: " label (this assumes `extra` holds nothing else — TODO confirm).
    .assign(DOI=lambda items: items['extra'].apply(lambda extra: extra.replace('DOI: ', '')))
    .drop(columns=[
        'extra',
        'dateAdded',
        'dateModified',
        'rights',
        'tags',
        'collections',
        'language',
        'accessDate',
        'archive',
        'archiveLocation',
        'relations',
        'callNumber',
        'seriesTitle',
        'place',
        'shortTitle',
        'abstractNote',
        'reportNumber',
        'reportType',
        'institution',
        'pages'
    ])
)

In [144]:
# Ask Zotero for the item template of the 'journalArticle' type; it is kept for
# reference and displayed in a later cell to show which fields that type carries.
journal_article_template = mra_library.item_template('journalArticle')

In [146]:
# Build the update payload for items already in Zotero:
#   1. attach Pubs Warehouse core fields by DOI (columns renamed to Zotero field names),
#   2. replace the Zotero creators with the Pubs Warehouse creators for the same indexId,
#   3. retype every item as a journalArticle and fill the gaps left by the left joins.
item_updates = (
    df_mra_zotero_items
    .merge(
        pw_core.rename(columns={
            'doi': 'DOI',
            'indexId': 'callNumber',
            'series_title': 'seriesTitle',
            'title': 'pw_title',
            'numberOfPages': 'pages'
        }),
        how="left",
        on="DOI"
    )
    .drop(columns=['creators'])
    .merge(
        pw_creators.rename(columns={'indexId': 'callNumber'}),
        how="left",
        on="callNumber"
    )
    .assign(
        itemType='journalArticle',
        libraryCatalog="USGS Publications Warehouse (pubs.usgs.gov)",
        pages=lambda frame: frame['pages'].fillna(''),
        # Rows without creators come out of the join as NaN; route them through ''
        # so the length check can turn them into an empty list.
        creators=lambda frame: frame['creators'].fillna('').apply(
            lambda names: names if len(names) != 0 else []
        ),
    )
    .drop(columns=['publicationYear', 'pw_title', 'pub_type', 'pub_subtype'])
)

# One plain dict per item, the shape handed to the Zotero client.
mra_library_updates = item_updates.to_dict(orient="records")

In [150]:
# Push the updates to Zotero 50 records at a time and echo each call's result.
# NOTE(review): 50 is presumably the Zotero Web API's per-request write limit — confirm.
for update_list in chunk_list(mra_library_updates, 50):
    update_result = mra_library.update_items(update_list)
    print(update_result)

True
True
True


In [151]:
# Query the GeoKB SPARQL endpoint for the publication items of interest, with
# their DOI and Pubs Warehouse index ID where present. The URL-encoded query
# below decodes to:
#
#   PREFIX wd: <https://geokb.wikibase.cloud/entity/>
#   PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>
#
#   SELECT ?item ?itemLabel ?doi ?indexId
#   WHERE {
#     ?item wdt:P1 wd:Q152682 .
#     OPTIONAL {
#       ?item wdt:P74 ?doi .
#     }
#     OPTIONAL {
#       ?item wdt:P114 ?indexId .
#     }
#     SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
#   }
#
# Both ?doi and ?indexId are OPTIONAL, so either can be missing for a given item.
# NOTE(review): what P1 / Q152682 denote is not shown here — presumably
# "instance of" a mineral-resource-assessment class; confirm in the GeoKB.
mra_items = geokb.url_sparql_query(
    sparql_url="https://geokb.wikibase.cloud/query/sparql?query=PREFIX%20wd%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Fitem%20%3FitemLabel%20%3Fdoi%20%3FindexId%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP1%20wd%3AQ152682%20.%0A%20%20OPTIONAL%20%7B%0A%20%20%20%20%3Fitem%20wdt%3AP74%20%3Fdoi%20.%0A%20%20%7D%0A%20%20OPTIONAL%20%7B%0A%20%20%20%20%3Fitem%20wdt%3AP114%20%3FindexId%20.%0A%20%20%7D%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20.%20%7D%0A%7D",
    output_format="dataframe"
)

In [158]:
# Display the journalArticle template so the available field names can be checked
# against the columns assembled for the Zotero items.
journal_article_template

{'itemType': 'journalArticle',
 'title': '',
 'creators': [{'creatorType': 'author', 'firstName': '', 'lastName': ''}],
 'abstractNote': '',
 'publicationTitle': '',
 'volume': '',
 'issue': '',
 'pages': '',
 'date': '',
 'series': '',
 'seriesTitle': '',
 'seriesText': '',
 'journalAbbreviation': '',
 'language': '',
 'DOI': '',
 'ISSN': '',
 'shortTitle': '',
 'url': '',
 'accessDate': '',
 'archive': '',
 'archiveLocation': '',
 'libraryCatalog': '',
 'callNumber': '',
 'rights': '',
 'extra': '',
 'tags': [],
 'collections': [],
 'relations': {}}

In [166]:
# DOIs that the GeoKB knows about but that are not yet in the Zotero library.
# Null DOIs are dropped before the membership test: the SPARQL query makes ?doi
# OPTIONAL, and Series.isin treats NaN as matching NaN, so keeping nulls here
# would select every pw_core record that has no DOI at all.
dois_missing_from_zotero = mra_items.loc[
    ~mra_items['doi'].isin(df_mra_zotero_items['DOI']),
    'doi'
].dropna()

# Pubs Warehouse records for those DOIs, reshaped to Zotero field names.
new_zotero_items = (
    pw_core[pw_core['doi'].isin(dois_missing_from_zotero)]
    .reset_index(drop=True)
    .rename(
        columns={
            'indexId': 'callNumber',
            'doi': 'DOI',
            'publicationYear': 'date',
            'series_title': 'seriesTitle',
            'numberOfPages': 'pages'
        }
    )
    .drop(columns=['pub_type', 'pub_subtype'])
)
new_zotero_items['itemType'] = 'journalArticle'
new_zotero_items['libraryCatalog'] = "USGS Publications Warehouse (pubs.usgs.gov)"

# Attach the aggregated creator lists by Pubs Warehouse index ID.
new_zotero_items = pd.merge(
    left=new_zotero_items,
    right=pw_creators.rename(columns={'indexId': 'callNumber'}),
    how="left",
    on="callNumber"
)

# Fill gaps left by the source data and the left join: a missing page count
# becomes '', and missing creators become an empty list (via '' so that the
# length check works on NaN rows).
new_zotero_items['pages'] = new_zotero_items['pages'].fillna('')
new_zotero_items['creators'] = new_zotero_items['creators'].fillna('')
new_zotero_items['creators'] = new_zotero_items['creators'].apply(lambda x: [] if len(x) == 0 else x)

# Link each item to its Pubs Warehouse landing page.
new_zotero_items['url'] = new_zotero_items['callNumber'].apply(lambda x: f"https://pubs.er.usgs.gov/publication/{x}")


In [169]:
# Create the new Zotero items 50 records at a time and echo the API's response
# for each batch.
# NOTE(review): 50 is presumably the Zotero Web API's per-request write limit — confirm.
new_item_records = new_zotero_items.to_dict(orient="records")
for batch in chunk_list(new_item_records, 50):
    creation_report = mra_library.create_items(batch)
    print(creation_report)

{'successful': {'0': {'key': '8RPK5HQV', 'version': 190, 'library': {'type': 'group', 'id': 5137613, 'name': 'Mineral Resource Assessments', 'links': {'alternate': {'href': 'https://www.zotero.org/groups/mineral_resource_assessments', 'type': 'text/html'}}}, 'links': {'self': {'href': 'https://api.zotero.org/groups/5137613/items/8RPK5HQV', 'type': 'application/json'}, 'alternate': {'href': 'https://www.zotero.org/groups/mineral_resource_assessments/items/8RPK5HQV', 'type': 'text/html'}}, 'meta': {'createdByUser': {'id': 1119084, 'username': 'skybristol', 'name': 'Sky Bristol', 'links': {'alternate': {'href': 'https://www.zotero.org/skybristol', 'type': 'text/html'}}}, 'creatorSummary': 'Van Gosen et al.', 'parsedDate': '1996', 'numChildren': 0}, 'data': {'key': '8RPK5HQV', 'version': 190, 'itemType': 'journalArticle', 'title': 'Mineral resource assessment of the Custer National Forest in the Pryor Mountains, Carbon County, south-central Montana', 'creators': [{'creatorType': 'author', 