The USGS website provides a listing of [science support offices](https://www.usgs.gov/about/organization/science-support-offices) as part of its description of the USGS organizational structure. This notebook scrapes that page and links each office to the general organization items previously brought into the GeoKB from processing Staff Profiles, where the organizational units are indicated by name/URL in the personnel inventory.

I'm continuing to experiment here with an OpenAI-based method of working up descriptive information suitable for placement into the GeoKB. I've started running into some errors from the OpenAI API that I can't quite figure out, so for now I've added an error trap that skips the summarization step whenever the API call fails.

In [9]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import openai

from wbmaker import WikibaseConnection

In [18]:
# Root of the USGS web site; scraped hrefs are resolved against it.
usgs_base_url = "https://www.usgs.gov"
# Listing page for the science support offices, built from the root above.
usgs_support_url = f"{usgs_base_url}/about/organization/science-support-offices"

# Connection object used for every GeoKB read and write in this notebook.
geokb = WikibaseConnection("GEOKB_CLOUD")


In [19]:
def scrape_support_orgs():
    """Scrape the USGS science support offices listing into a DataFrame.

    Fetches ``usgs_support_url`` and reads every ``div.grid-col-10`` element,
    taking the office name and link from its ``h4`` heading and the
    description from its ``div.field-intro`` child.

    Returns:
        pandas.DataFrame with the columns ``office_name``, ``office_link``
        and ``office_description``. The frame is empty (but still carries
        those columns, so later merges on ``office_name`` keep working) when
        the page does not answer with HTTP 200 or holds no complete entries.

    Raises:
        requests.exceptions.RequestException: if the request itself fails,
            including when the timeout expires.
    """
    # Local imports keep this cell self-contained.
    from urllib.parse import urljoin
    import warnings

    columns = ["office_name", "office_link", "office_description"]
    support_offices = []

    # A timeout keeps the notebook from hanging on an unresponsive server.
    r = requests.get(usgs_support_url, timeout=30)
    if r.status_code != 200:
        warnings.warn(
            f"Support office page returned HTTP {r.status_code}; "
            "no offices were scraped"
        )
        return pd.DataFrame(support_offices, columns=columns)

    soup = BeautifulSoup(r.content, 'html.parser')
    for item in soup.find_all('div', {'class': 'grid-col-10'}):
        heading = item.find('h4')
        anchor = heading.find('a') if heading is not None else None
        intro = item.find('div', {'class': 'field-intro'})

        # Skip any matching div that lacks the heading link or intro text
        # instead of failing on a None lookup.
        if anchor is None or intro is None or not anchor.get('href'):
            continue

        support_offices.append({
            "office_name": heading.text.strip(),
            # urljoin handles site-relative and already-absolute hrefs alike.
            "office_link": urljoin(usgs_base_url, anchor['href']),
            "office_description": intro.text.strip()
        })

    return pd.DataFrame(support_offices, columns=columns)

def summarize_description(office_description, max_length=250, model="gpt-3.5-turbo"):
    """Return a description no longer than ``max_length`` characters.

    Text that already fits is returned unchanged without calling the API.
    Longer text is sent to the OpenAI chat completion endpoint with a request
    for a summary; if the reply is empty or still too long, the original
    text truncated to ``max_length`` characters is returned instead.

    Args:
        office_description: The text to shorten.
        max_length: Maximum number of characters in the returned string.
            NOTE(review): 250 presumably matches the GeoKB description
            length limit — confirm.
        model: Name of the chat model passed to the OpenAI API.

    Returns:
        A string of at most ``max_length`` characters.

    Raises:
        Any exception raised by the OpenAI call is propagated to the caller.
    """
    # Text at exactly the limit already fits, so no API call is needed.
    if len(office_description) <= max_length:
        return office_description

    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": f"Provide a concise summary with less than {max_length} characters of the following text: {office_description}"
            },
        ]
    )

    # Strip surrounding whitespace so it does not count against the limit.
    summary = response["choices"][0]["message"]["content"].strip()

    # The model does not reliably honor the length request, so fall back to
    # plain truncation when the reply is unusable.
    if not summary or len(summary) > max_length:
        return office_description[:max_length]

    return summary

In [4]:
# Run the scrape and display the resulting table of support offices.
df_support_offices = scrape_support_orgs()
df_support_offices

Unnamed: 0,office_name,office_link,office_description
0,Administration,https://www.usgs.gov/administration,Our office establishes policies and coordinate...
1,Associate Chief Information Officer,https://www.usgs.gov/associate-chief-informati...,The Office of the Associate Chief Information ...
2,"Office of Budget, Planning, and Integration (BPI)",https://www.usgs.gov/bpi,"The Office of Budget, Planning, and Integratio..."
3,Communications and Publishing,https://www.usgs.gov/communications-and-publis...,The Office of Communications and Publishing (O...
4,USGS Freedom of Information Act (FOIA) Office,https://www.usgs.gov/foia,The Freedom of Information Act (FOIA) is a Fed...
5,Human Capital,https://www.usgs.gov/human-capital,"The USGS Human Capital Office is a cohesive, c..."
6,International Programs,https://www.usgs.gov/international-programs,The USGS Office of International Programs help...
7,Office of Science Quality and Integrity,https://www.usgs.gov/office-of-science-quality...,The Office of Science Quality and Integrity (O...


In [16]:
# URL-encoded SPARQL query: selects every item reachable from Q44210 through
# zero or more P62 links that also has a P1 value, returning its label and,
# where present, its English alternate labels.
q_geokb_orgs = "PREFIX%20wd%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Fitem%20%3FitemLabel%20%3Fitem_alt_label%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP62*%20wd%3AQ44210%20.%0A%20%20%3Fitem%20wdt%3AP1%20%3Finstance_of%20.%0A%20%20OPTIONAL%20%7B%0A%20%20%20%20%3Fitem%20skos%3AaltLabel%20%3Fitem_alt_label%20.%0A%20%20%20%20FILTER%20(lang(%3Fitem_alt_label)%3D'en')%0A%20%20%7D%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20.%20%7D%0A%7D"
df_geokb_orgs = geokb.wb_ref_data(query=q_geokb_orgs)
# The entity identifier is the last path segment of the item URI.
df_geokb_orgs["qid"] = df_geokb_orgs["item"].apply(lambda x: x.split("/")[-1])

# One row per (qid, name) pair, where a name is either the label or an
# alternate label. The query has no DISTINCT and joins on ?instance_of, so
# the same pair can come back several times; de-duplicate the combined
# lookup so a later merge on name does not multiply office rows.
org_name_lookup = pd.concat(
    [
        df_geokb_orgs[["qid","itemLabel"]],
        df_geokb_orgs[df_geokb_orgs.item_alt_label.notnull()][["qid","item_alt_label"]].rename(columns={"item_alt_label": "itemLabel"})
    ]
).drop_duplicates().reset_index(drop=True)

In [17]:
# Left-join the scraped offices to the GeoKB name lookup on the office name,
# so every scraped office is kept whether or not a GeoKB item matched.
df_support_offices_geokb = df_support_offices.merge(
    org_name_lookup.rename(columns={"itemLabel": "office_name"}),
    how="left",
    on="office_name",
)

In [20]:
# Shared containers reused on the claims written for every office.
refs = geokb.models.References()
quals = geokb.models.Qualifiers()

# Reference: the listing page the information was scraped from.
refs.add(geokb.datatypes.URL(
    prop_nr=geokb.prop_lookup['reference URL'],
    value=usgs_support_url,
))

# Qualifier: point in time, recorded at year precision.
quals.add(geokb.datatypes.Time(
    prop_nr=geokb.prop_lookup['point in time'],
    time='+2023-01-01T00:00:00Z',
    precision=geokb.date_precision.YEAR,
))

<Qualifiers @ed2090 _Qualifiers__qualifiers={'P110': [<Snak @6f31d0 _Snak__snaktype=<WikibaseSnakType.KNOWN_VALUE: 'value'> _Snak__property_number='P110' _Snak__hash=None _Snak__datavalue={'value': {'time': '+2023-01-01T00:00:00Z', 'before': 0, 'after': 0, 'precision': 9, 'timezone': 0, 'calendarmodel': 'http://www.wikidata.org/entity/Q1985727'}, 'type': 'time'} _Snak__datatype='time'>]}>

In [21]:
# Display the merged table to check which qid was joined to each office.
df_support_offices_geokb

Unnamed: 0,office_name,office_link,office_description,qid
0,Administration,https://www.usgs.gov/administration,Our office establishes policies and coordinate...,Q44328
1,Associate Chief Information Officer,https://www.usgs.gov/associate-chief-informati...,The Office of the Associate Chief Information ...,Q44340
2,"Office of Budget, Planning, and Integration (BPI)",https://www.usgs.gov/bpi,"The Office of Budget, Planning, and Integratio...",Q44325
3,Communications and Publishing,https://www.usgs.gov/communications-and-publis...,The Office of Communications and Publishing (O...,Q44335
4,USGS Freedom of Information Act (FOIA) Office,https://www.usgs.gov/foia,The Freedom of Information Act (FOIA) is a Fed...,Q44376
5,Human Capital,https://www.usgs.gov/human-capital,"The USGS Human Capital Office is a cohesive, c...",Q44353
6,International Programs,https://www.usgs.gov/international-programs,The USGS Office of International Programs help...,Q44299
7,Office of Science Quality and Integrity,https://www.usgs.gov/office-of-science-quality...,The Office of Science Quality and Integrity (O...,Q44215


In [23]:
# Update each matched GeoKB item with a description and three claims, then
# write it back.
for _, row in df_support_offices_geokb.iterrows():
    # The left merge leaves qid null for offices whose name matched no GeoKB
    # label; there is no item to fetch for those, so report and move on
    # rather than aborting the whole run.
    if pd.isna(row["qid"]):
        print("NO GEOKB ITEM FOUND FOR", row["office_name"])
        continue

    item = geokb.wbi.item.get(row["qid"])

    # Best effort: a failed summary must not block the claim updates below.
    try:
        item.descriptions.set('en', summarize_description(row["office_description"]))
    except Exception as e:
        print("COULD NOT GET SUMMARY DESCRIPTION")
        print(e)

    instance_of_claim = geokb.datatypes.Item(
        prop_nr=geokb.prop_lookup['instance of'],
        value='Q50877',
        references=refs
    )

    item.claims.add(
        claims=instance_of_claim,
        action_if_exists=geokb.action_if_exists.REPLACE_ALL
    )

    part_of_claim = geokb.datatypes.Item(
        prop_nr=geokb.prop_lookup['part of'],
        value='Q44210',
        qualifiers=quals,
        references=refs
    )

    item.claims.add(
        claims=part_of_claim,
        action_if_exists=geokb.action_if_exists.REPLACE_ALL
    )

    # NOTE(review): the office's own page is stored as a statement under the
    # 'reference URL' property — confirm this is the intended property.
    ref_url_claim = geokb.datatypes.URL(
        prop_nr=geokb.prop_lookup['reference URL'],
        value=row["office_link"],
        qualifiers=quals,
        references=refs
    )

    item.claims.add(
        claims=ref_url_claim,
        action_if_exists=geokb.action_if_exists.REPLACE_ALL
    )

    response = item.write(
        summary="Refreshed science support office item with new information from web scrape"
    )
    print(row["office_name"], response.id)


COULD NOT GET SUMMARY DESCRIPTION
The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 6de63370191d465bf5131ed8efce92f9 in your message.) {
  "error": {
    "message": "The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 6de63370191d465bf5131ed8efce92f9 in your message.)",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 6de63370191d465bf5131ed8efce92f9 in your message.)', 'type': 'server_error', 'param': None, 'code