This notebook pulls in a representation of a collection of historic reports from the U.S. Bureau of Mines housed in ScienceBase (as part of ReSciColl). We are building these into the GeoKB for use in NLP/LLM/RAG pipelines.

In [1]:
from wbmaker import WikibaseConnection
import requests

# Shared handle on the GeoKB Wikibase. The cells below use it to build items
# (geokb.wbi, geokb.datatypes, geokb.prop_lookup, geokb.snaktypes) and to run
# SPARQL queries (geokb.sparql_query).
# NOTE(review): 'GEOKB_CLOUD' presumably names the connection/credential
# configuration that wbmaker resolves -- confirm against the wbmaker docs.
geokb = WikibaseConnection('GEOKB_CLOUD')

In [2]:
def year_to_iso(year):
    """Format a year as a timestamp string for January 1 of that year.

    Args:
        year: A year given as an int or a string (e.g. ``1985`` or ``"1985"``).
            Date strings that begin with a four-digit year, such as
            ``"1985-06"`` or ``"1985-06-15"``, are reduced to that year so the
            result stays well-formed instead of embedding the whole date.

    Returns:
        A string of the form ``"+YYYY-01-01T00:00:00Z"``. Input that does not
        start with a four-digit year is interpolated as given (with surrounding
        whitespace stripped), matching the previous behavior.
    """
    import re  # local import so this cell does not depend on the import cell

    year_text = str(year).strip()
    # Only accept exactly four leading digits; "19850" must not become "1985".
    leading_year = re.match(r"(\d{4})(?!\d)", year_text)
    if leading_year:
        year_text = leading_year.group(1)
    return f"+{year_text}-01-01T00:00:00Z"


In [3]:
# Harvest every item in the ScienceBase folder, following the "nextlink" URL
# returned with each page until no further page is offered.
sb_items = []

sb_query = "https://www.sciencebase.gov/catalog/items?&max=500&folderId=620e7840d34e6c7e83baa741&fields=title,files,dates,contacts,spatial,distributionLinks&format=json"

while True:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    sb_response = requests.get(sb_query, timeout=60)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    sb_response.raise_for_status()
    r = sb_response.json()

    # A page may omit "items" or return it empty; both add nothing.
    sb_items.extend(r.get("items") or [])

    next_url = (r.get("nextlink") or {}).get("url")
    if not next_url:
        break
    sb_query = next_url

In [6]:
# Maps publisher names as they appear in ScienceBase contacts to the GeoKB item
# id used for the publisher claim. Names mapped to the sentinel 'unknown' are
# written as an unknown-value publisher claim by the import loop.
publishers = {
    # Both spellings of the Bureau of Mines resolve to the same item.
    **dict.fromkeys(('U.S. Bureau of Mines', 'Bureau of Mines'), 'Q164042'),
    'Wahler Associates': 'unknown',
    'Bureau of Land Management': 'Q158218',
    'U.S. Bureau of Economic Analysis': 'Q164043',
    'Johnson, Fredrick L.': 'unknown',
    'Barker, J.C.': 'unknown',
}

In [15]:
# Running log of (ScienceBase item id, GeoKB item id) pairs for records already
# written. It lives in its own cell so the import loop can be re-run after a
# failure and skip what is already done; re-running THIS cell resets the log.
sb_items_processed = []

In [None]:
# Build one GeoKB item per ScienceBase record and write it to the Wikibase.
# Each successful write is logged in sb_items_processed, so the cell can be
# re-run after a failure and will skip records that were already written.

BOM_DOCUMENT_CLASS_QID = 'Q164044'  # "instance of" target for every item created here
MAX_LABEL_LENGTH = 250  # label length limit named in the Wikibase error message handled below

# Set of already-written ScienceBase ids for constant-time lookup; it is kept in
# step with sb_items_processed as new items are written.
processed_sb_ids = {processed[0] for processed in sb_items_processed}

for sb_item in sb_items:
    if sb_item['id'] in processed_sb_ids:
        continue

    title = sb_item['title']
    title_too_long = len(title) > MAX_LABEL_LENGTH

    item = geokb.wbi.item.new()
    # Over-long titles are truncated to the limit (ellipsis included) for the label.
    item_label = f"{title[:MAX_LABEL_LENGTH - 3]}..." if title_too_long else title
    item.labels.set('en', item_label)
    item.descriptions.set('en', 'scanned historic document from a U.S. Bureau of Mines collection on ScienceBase')

    # NOTE(review): this source-URL value is built but never attached to any
    # claim as a reference -- confirm whether references were intended.
    item_ref = geokb.datatypes.URL(
        prop_nr=geokb.prop_lookup['source URL'],
        value=f"{sb_item['link']['url']}?format=json"
    )

    item.claims.add(
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['instance of'],
            value=BOM_DOCUMENT_CLASS_QID
        )
    )

    # Preserve the complete title as a statement when the label had to be cut.
    if title_too_long:
        item.claims.add(
            geokb.datatypes.String(
                prop_nr=geokb.prop_lookup['title'],
                value=title
            )
        )

    item.claims.add(
        geokb.datatypes.URL(
            prop_nr=geokb.prop_lookup['metadata URL'],
            value=sb_item['link']['url'],
            qualifiers=[
                geokb.datatypes.String(
                    prop_nr=geokb.prop_lookup['MIME type'],
                    value='text/html'
                ),
                geokb.datatypes.String(
                    prop_nr=geokb.prop_lookup['MIME type'],
                    value='application/json'
                )
            ]
        )
    )

    # Records without a "dates" list (or without a Publication entry) simply get
    # no publication date instead of raising KeyError.
    pub_date = next(
        (
            sb_date.get('dateString')
            for sb_date in sb_item.get('dates') or []
            if sb_date.get('type') == 'Publication'
        ),
        None
    )
    if pub_date:
        item.claims.add(
            geokb.datatypes.Time(
                prop_nr=geokb.prop_lookup['publication date'],
                time=year_to_iso(pub_date),
                precision=9  # Wikibase time precision 9 = year
            )
        )

    sb_contacts = sb_item.get('contacts') or []

    publisher_claims = []
    for sb_publisher_name in (c['name'] for c in sb_contacts if c.get('type') == 'Publisher' and c.get('name')):
        publisher_id = publishers.get(sb_publisher_name, 'unknown')
        if publisher_id == 'unknown':
            # No GeoKB item for this publisher: record an unknown-value statement.
            publisher_claim = geokb.datatypes.Item(
                prop_nr=geokb.prop_lookup['publisher'],
                snaktype=geokb.snaktypes.UNKNOWN_VALUE
            )
        else:
            publisher_claim = geokb.datatypes.Item(
                prop_nr=geokb.prop_lookup['publisher'],
                value=publisher_id
            )
        publisher_claims.append(publisher_claim)
    if publisher_claims:
        item.claims.add(publisher_claims)

    author_claims = [
        geokb.datatypes.String(
            prop_nr=geokb.prop_lookup['author (name only)'],
            value=sb_contact['name']
        )
        for sb_contact in sb_contacts
        if sb_contact.get('type') == 'Author' and sb_contact.get('name')
    ]
    if author_claims:
        item.claims.add(author_claims)

    content_url_claims = []
    for sb_file in sb_item.get('files') or []:
        if not sb_file.get('url'):
            continue
        # Only attach the qualifiers ScienceBase actually supplied, so a file
        # entry lacking a content type or checksum does not abort the whole run.
        file_qualifiers = []
        if sb_file.get('contentType'):
            file_qualifiers.append(
                geokb.datatypes.String(
                    prop_nr=geokb.prop_lookup['MIME type'],
                    value=sb_file['contentType']
                )
            )
        checksum_value = (sb_file.get('checksum') or {}).get('value')
        if checksum_value:
            file_qualifiers.append(
                geokb.datatypes.String(
                    prop_nr=geokb.prop_lookup['checksum'],
                    value=checksum_value
                )
            )
        content_url_claims.append(
            geokb.datatypes.URL(
                prop_nr=geokb.prop_lookup['content URL'],
                value=sb_file['url'],
                qualifiers=file_qualifiers
            )
        )
    if content_url_claims:
        item.claims.add(content_url_claims)

    # The point is read as [longitude, latitude]; skip records whose spatial
    # block has no usable representational point.
    rep_point = (sb_item.get('spatial') or {}).get('representationalPoint')
    if rep_point and len(rep_point) >= 2:
        item.claims.add(
            geokb.datatypes.GlobeCoordinate(
                prop_nr=geokb.prop_lookup['coordinate location'],
                longitude=rep_point[0],
                latitude=rep_point[1]
            )
        )

    try:
        response = item.write(summary="Added Bureau of Mines document representation from ScienceBase")
    except Exception as e:
        # Best effort: report the failure (with its cause) and move on to the
        # next record; the failed record stays unlogged and is retried on re-run.
        print("PROBLEM:", sb_item['id'], repr(e))
        if 'Label must be no more than 250 characters long' in str(e):
            print(sb_item['title'])
        continue

    print(response.id)
    sb_items_processed.append((sb_item['id'], response.id))
    processed_sb_ids.add(sb_item['id'])
    

# Duplicate Checksum Problem
I identified some cases where we have duplicate files in the collection and asked for someone to work on these. The following query and report show the problem.

In [14]:
# SPARQL report with one row per content-URL statement on items that have
# P1 = Q164044 (the class assigned to every item created by the import loop).
# Each row carries the item's metadata URL plus the content URL's MIME-type and
# checksum qualifiers. Both qualifiers are required patterns (no OPTIONAL), so
# content-URL statements missing either one are left out of the result.
# NOTE(review): property ids are hardcoded here; judging by the variable names,
# P141 = metadata URL, P136 = content URL, P65 = MIME type, P197 = checksum --
# TODO confirm against geokb.prop_lookup.
query_bom_report = """
PREFIX wd: <https://geokb.wikibase.cloud/entity/>
PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>
PREFIX p: <https://geokb.wikibase.cloud/prop/>
PREFIX ps: <https://geokb.wikibase.cloud/prop/statement/>
PREFIX pq: <https://geokb.wikibase.cloud/prop/qualifier/>

SELECT ?report ?content_url ?meta_url ?mime_type ?checksum
WHERE {
  ?report wdt:P1 wd:Q164044 ;
          wdt:P141 ?meta_url ;
          wdt:P136 ?content_url ;
          p:P136 ?content_url_statement .
  ?content_url_statement ps:P136 ?content_url ;
                         pq:P65 ?mime_type ;
                         pq:P197 ?checksum .
}
"""

# The result is used as a tabular object below (.head(), column indexing) --
# presumably a pandas DataFrame; confirm against wbmaker.
bom_reports = geokb.sparql_query(query_bom_report)
bom_reports.head()

Unnamed: 0,report,content_url,meta_url,mime_type,checksum
0,https://geokb.wikibase.cloud/entity/Q164045,https://www.sciencebase.gov/catalog/file/get/6...,https://www.sciencebase.gov/catalog/item/63d2f...,application/pdf,90486a3efe033d39978e7c017af71f72
1,https://geokb.wikibase.cloud/entity/Q164046,https://www.sciencebase.gov/catalog/file/get/6...,https://www.sciencebase.gov/catalog/item/63d2f...,application/pdf,80fa511e233193b6ce3901c6fe9a3be3
2,https://geokb.wikibase.cloud/entity/Q164046,https://www.sciencebase.gov/catalog/file/get/6...,https://www.sciencebase.gov/catalog/item/63d2f...,image/geotiff,d2b5cf06d7888e5792523272c4a4d0e2
3,https://geokb.wikibase.cloud/entity/Q164046,https://www.sciencebase.gov/catalog/file/get/6...,https://www.sciencebase.gov/catalog/item/63d2f...,image/geotiff,cf8177374d1a1a035ebd035ba3bf56b1
4,https://geokb.wikibase.cloud/entity/Q164047,https://www.sciencebase.gov/catalog/file/get/6...,https://www.sciencebase.gov/catalog/item/63d2f...,application/pdf,f117f23cb92f8637c433bb165c64cc5b


In [17]:
# List every content file whose checksum occurs more than once in the report,
# grouped by checksum so that the duplicates appear next to each other.
has_duplicate_checksum = bom_reports['checksum'].duplicated(keep=False)
duplicate_files = bom_reports[has_duplicate_checksum].sort_values('checksum')

for meta_url, checksum, content_url in zip(
    duplicate_files['meta_url'],
    duplicate_files['checksum'],
    duplicate_files['content_url'],
):
    print(meta_url)
    print(checksum)
    print(content_url)
    print()

https://www.sciencebase.gov/catalog/item/64069124d34e76f5f75e2f20
2299e5dcf317feff1bbddaff2be3f39d
https://www.sciencebase.gov/catalog/file/get/64069124d34e76f5f75e2f20?f=__disk__93%2F18%2F4d%2F93184d093eff0e410d88125c59f7408dcadcecea

https://www.sciencebase.gov/catalog/item/6406a387d34e76f5f75e2f52
2299e5dcf317feff1bbddaff2be3f39d
https://www.sciencebase.gov/catalog/file/get/6406a387d34e76f5f75e2f52?f=__disk__7e%2F9c%2F4b%2F7e9c4bdfd72a55da30207bfc6cac98e1104abdc1

https://www.sciencebase.gov/catalog/item/6414d6e2d34eb496d1ceb6df
587c0f70ef1c67c14603fbd8b39a5fd8
https://www.sciencebase.gov/catalog/file/get/6414d6e2d34eb496d1ceb6df?f=__disk__d6%2Fbd%2Fb2%2Fd6bdb215243cd4af4630c0b9ab385f0f3c25d786

https://www.sciencebase.gov/catalog/item/6414d6e2d34eb496d1ceb6df
587c0f70ef1c67c14603fbd8b39a5fd8
https://www.sciencebase.gov/catalog/file/get/6414d6e2d34eb496d1ceb6df?f=__disk__ba%2F88%2F31%2Fba88319577ed4ea542b9315c510d646016512a94

https://www.sciencebase.gov/catalog/item/6406b7e3d34e76f