The USGS website has a unique listing for "observatories," an organizational concept that doesn't really exist anywhere else. Right now, these are only the volcano observatories, but we could possibly see other types of observatories in the future. Since this is how these particular organizational units are presented to the public, and we both can and need to pull them in, this notebook processes the observatories page into GeoKB items.

In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import openai
from wbmaker import WikibaseConnection

In [2]:
# Connection object used for every GeoKB read and write in the cells below.
# NOTE(review): "GEOKB_CLOUD" is presumably the name of a connection/credential
# profile that wbmaker resolves - confirm against the wbmaker package.
geokb = WikibaseConnection("GEOKB_CLOUD")

In [3]:
# Root of the public USGS website; the relative links scraped from the
# listing page are prefixed with this to form absolute URLs.
usgs_base_url = "https://www.usgs.gov"

# Listing page that enumerates the observatories.
usgs_obs_url = f"{usgs_base_url}/science/observatories"

In [8]:
def _request_summary(prompt):
    """Send *prompt* as a single user message to gpt-3.5-turbo and return the reply text."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": prompt
            },
        ]
    )

    return response["choices"][0]["message"]["content"]


def summarize_description(office_name, long_desc, max_length=250):
    """Return a description of at most ``max_length`` characters.

    The text is returned untouched when it already fits. Otherwise the chat
    model is asked up to two times for a shorter version, and if both replies
    are still too long the original text is cut to the limit.

    Args:
        office_name: Name of the office; used only in the second prompt.
        long_desc: Full description text to shorten.
        max_length: Maximum number of characters allowed in the result.

    Returns:
        ``long_desc`` itself, a model-written summary, or a truncated
        ``long_desc``; never longer than ``max_length``.
    """
    # Already short enough: no API call needed.
    if len(long_desc) <= max_length:
        return long_desc

    # First attempt: summarize the actual text.
    summary = _request_summary(
        f"Provide a concise summary ({max_length} characters or less) of the following text: {long_desc}"
    )
    if len(summary) <= max_length:
        return summary

    # Second attempt: this prompt supplies only the office name, not the text,
    # so the reply is based on whatever the model knows about that office.
    summary = _request_summary(
        f"Provide a concise summary of less than {max_length} characters for the USGS {office_name}."
    )
    if len(summary) <= max_length:
        return summary

    # Last resort: hard truncation, using the full allowance that the length
    # checks above already accept.
    return long_desc[:max_length]

In [5]:
# Scrape the observatories listing page into a list of
# {"label", "link", "description"} records.
observatories = []
# A timeout keeps the cell from hanging indefinitely if the site stops responding.
r = requests.get(usgs_obs_url, timeout=30)
if r.status_code == 200:
    soup = BeautifulSoup(r.content, 'html.parser')
    for item in soup.find_all('div', {'class': 'grid-col-10'}):
        heading = item.find('h4')
        anchor = heading.find('a') if heading is not None else None
        intro = item.find('div', {'class': 'field-intro'})

        # Skip any matching container that lacks the heading link or the
        # intro text instead of failing on a None lookup.
        if anchor is None or intro is None or not anchor.get('href'):
            continue

        observatories.append({
            "label": heading.text.strip(),
            # hrefs are used as site-relative paths, so prefix the site root
            "link": f"{usgs_base_url}{anchor['href']}",
            "description": intro.text.strip()
        })
else:
    # Report the failure rather than silently continuing with an empty list.
    print(f"Could not retrieve {usgs_obs_url}: HTTP {r.status_code}")

In [6]:
# Load the scraped records into a DataFrame (one row per scraped entry, with
# label/link/description columns) and display it as the cell output.
df_observatories = pd.DataFrame(observatories)
df_observatories

Unnamed: 0,label,link,description
0,Alaska Volcano Observatory,https://www.usgs.gov/observatories/avo,The Alaska Volcano Observatory (AVO) is a join...
1,California Volcano Observatory,https://www.usgs.gov/california-volcano-observ...,As a part of the U.S. Geological Survey's Volc...
2,Cascades Volcano Observatory,https://www.usgs.gov/cascades-volcano-observatory,The U.S. Geological Survey's Cascades Volcano ...
3,Hawaiian Volcano Observatory,https://www.usgs.gov/observatories/hvo,HVO monitors earthquakes and the active volcan...
4,Yellowstone Volcano Observatory,https://www.usgs.gov/yellowstone-volcano-obser...,The Yellowstone Volcano Observatory (YVO) is a...


In [10]:
# Add a "summary" column by running each row's label and description through
# summarize_description.
df_observatories["summary"] = df_observatories.apply(
    lambda row: summarize_description(row["label"], row["description"]),
    axis=1,
)

In [12]:
# URL-encoded SPARQL query, split at each encoded newline (%0A) so it can be
# read line by line; the decoded text of each piece is shown alongside.
q_geokb_orgs = (
    # PREFIX wd: <https://geokb.wikibase.cloud/entity/>
    "PREFIX%20wd%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fentity%2F%3E%0A"
    # PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>
    "PREFIX%20wdt%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A"
    # (blank line)
    "%0A"
    # SELECT ?item ?itemLabel ?item_alt_label
    "SELECT%20%3Fitem%20%3FitemLabel%20%3Fitem_alt_label%0A"
    # WHERE {
    "WHERE%20%7B%0A"
    #   ?item wdt:P62* wd:Q44210 .
    "%20%20%3Fitem%20wdt%3AP62*%20wd%3AQ44210%20.%0A"
    #   ?item wdt:P1 ?instance_of .
    "%20%20%3Fitem%20wdt%3AP1%20%3Finstance_of%20.%0A"
    #   OPTIONAL {
    "%20%20OPTIONAL%20%7B%0A"
    #     ?item skos:altLabel ?item_alt_label .
    "%20%20%20%20%3Fitem%20skos%3AaltLabel%20%3Fitem_alt_label%20.%0A"
    #     FILTER (lang(?item_alt_label)='en')
    "%20%20%20%20FILTER%20(lang(%3Fitem_alt_label)%3D'en')%0A"
    #   }
    "%20%20%7D%0A"
    #   SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
    "%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20.%20%7D%0A"
    # }
    "%7D"
)
# Run the organization query and keep the result as a DataFrame.
df_geokb_orgs = geokb.wb_ref_data(query=q_geokb_orgs)
# The QID is taken as the last "/"-separated segment of the item value.
df_geokb_orgs["qid"] = df_geokb_orgs["item"].apply(lambda x: x.split("/")[-1])

# Name -> QID lookup with one row per (qid, name), where a name is either the
# primary label or an English alternate label.
# The query is not DISTINCT and also binds each item's P1 values, so an item
# with several P1 values comes back as repeated rows. Duplicates are dropped
# over the combined table (labels and alt labels alike) so that a repeated
# name cannot multiply rows when this lookup is merged onto other data.
org_name_lookup = pd.concat(
    [
        df_geokb_orgs[["qid","itemLabel"]],
        df_geokb_orgs[df_geokb_orgs.item_alt_label.notnull()][["qid","item_alt_label"]].rename(columns={"item_alt_label": "itemLabel"})
    ]
).drop_duplicates().reset_index(drop=True)

In [14]:
# Attach the GeoKB QID to each scraped observatory by matching its label
# against the organization name lookup. A left join keeps every observatory,
# leaving qid empty where no name matches.
df_observatories_geokb = df_observatories.merge(
    org_name_lookup.rename(columns={"itemLabel": "label"}),
    on="label",
    how="left",
)

In [18]:
# Display the merged table to check the qid matched to each observatory.
df_observatories_geokb

Unnamed: 0,label,link,description,summary,qid
0,Alaska Volcano Observatory,https://www.usgs.gov/observatories/avo,The Alaska Volcano Observatory (AVO) is a join...,"AVO is a collaborative program between USGS, U...",Q44361
1,California Volcano Observatory,https://www.usgs.gov/california-volcano-observ...,As a part of the U.S. Geological Survey's Volc...,The California Volcano Observatory seeks to re...,Q44336
2,Cascades Volcano Observatory,https://www.usgs.gov/cascades-volcano-observatory,The U.S. Geological Survey's Cascades Volcano ...,The U.S. Geological Survey's Cascades Volcano ...,Q44333
3,Hawaiian Volcano Observatory,https://www.usgs.gov/observatories/hvo,HVO monitors earthquakes and the active volcan...,Hawaii's HVO team monitors volcanoes for hazar...,Q44349
4,Yellowstone Volcano Observatory,https://www.usgs.gov/yellowstone-volcano-obser...,The Yellowstone Volcano Observatory (YVO) is a...,The USGS Yellowstone Volcano Observatory monit...,Q44373


In [20]:
from wikibaseintegrator.wbi_enums import ActionIfExists, WikibaseDatePrecision

# Reference attached to the statements written later: the URL of the
# observatories listing page the data was scraped from.
refs = geokb.models.References()
listing_page_url = geokb.datatypes.URL(
    value=usgs_obs_url,
    prop_nr=geokb.prop_lookup['reference URL'],
)
refs.add(listing_page_url)

# Qualifier attached to the same statements: a year-precision "point in time"
# of 2023.
quals = geokb.models.Qualifiers()
year_2023 = geokb.datatypes.Time(
    time='+2023-01-01T00:00:00Z',
    precision=WikibaseDatePrecision.YEAR,
    prop_nr=geokb.prop_lookup['point in time'],
)
quals.add(year_2023)



<Qualifiers @4d76a0 _Qualifiers__qualifiers={'P110': [<Snak @4d76d0 _Snak__snaktype=<WikibaseSnakType.KNOWN_VALUE: 'value'> _Snak__property_number='P110' _Snak__hash=None _Snak__datavalue={'value': {'time': '+2023-01-01T00:00:00Z', 'before': 0, 'after': 0, 'precision': 9, 'timezone': 0, 'calendarmodel': 'http://www.wikidata.org/entity/Q1985727'}, 'type': 'time'} _Snak__datatype='time'>]}>

In [21]:
# Refresh each matched observatory item in GeoKB: set its English description
# to the generated summary and replace its "instance of", "part of" and
# "reference URL" statements, each carrying the shared qualifier and reference.
for _, row in df_observatories_geokb.iterrows():
    # The left merge leaves qid empty for observatories with no matching GeoKB
    # name; report and skip those instead of failing part-way through the run
    # after earlier items have already been written.
    if pd.isna(row["qid"]):
        print(row["label"], "skipped: no matching GeoKB item")
        continue

    item = geokb.wbi.item.get(row["qid"])
    item.descriptions.set('en', row["summary"])

    # NOTE(review): Q50878 is presumably the "observatory" class item - confirm.
    instance_of_claim = geokb.datatypes.Item(
        prop_nr=geokb.prop_lookup['instance of'],
        value='Q50878',
        qualifiers=quals,
        references=refs
    )

    # NOTE(review): Q44354 is presumably the parent organization - confirm.
    part_of_claim = geokb.datatypes.Item(
        prop_nr=geokb.prop_lookup['part of'],
        value='Q44354',
        qualifiers=quals,
        references=refs
    )

    # The observatory's own web page, as scraped from the listing.
    ref_url_claim = geokb.datatypes.URL(
        prop_nr=geokb.prop_lookup['reference URL'],
        value=row["link"],
        qualifiers=quals,
        references=refs
    )

    # REPLACE_ALL is applied per claim, in the same order as before.
    for claim in (instance_of_claim, part_of_claim, ref_url_claim):
        item.claims.add(
            claims=claim,
            action_if_exists=ActionIfExists.REPLACE_ALL
        )

    response = item.write(
        summary="Refreshed observatory item with new information from web scrape"
    )
    print(row["label"], response.id)


Alaska Volcano Observatory Q44361
California Volcano Observatory Q44336
Cascades Volcano Observatory Q44333
Hawaiian Volcano Observatory Q44349
Yellowstone Volcano Observatory Q44373
