The USGS website presents an incomplete listing of USGS Laboratories as another aspect of the USGS organizational structure. As with the "observatories" concept, there is a stricter internal definition of a lab as it relates to the Quality Management System, but the public-facing web presentation is still useful for organizing entities in the GeoKB. This notebook uses a similar web scraping routine to pull useful information on the labs.

In contrast to the state/territory listing for USGS Regions and Science Centers, the state listing for labs does not represent an "operating area" concept. Rather, it is the state in which a lab is located, so we use the "located in the administrative territorial entity" property in this case.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from wbmaker import WikibaseConnection

In [29]:
# Shared GeoKB connection used by every later cell for queries, property
# lookups, datatypes and item reads/writes. "GEOKB_CLOUD" is the name handed to
# wbmaker -- presumably it selects the stored instance/credentials; confirm
# against the wbmaker documentation.
geokb = WikibaseConnection("GEOKB_CLOUD")

In [3]:
# Root of the public USGS website; links scraped from the site are resolved
# against it.
usgs_base_url = "https://www.usgs.gov"

# Public page that lists USGS laboratories (the page this notebook scrapes and
# cites as the reference for every claim it writes).
usgs_lab_url = f"{usgs_base_url}/science/laboratories"

# Labs from web listing

In [4]:
def scrape_usgs_labs(timeout: float = 30) -> pd.DataFrame:
    """Scrape the USGS laboratories listing into a DataFrame.

    Fetches ``usgs_lab_url``, reads the first ``<table>`` on the page and
    turns every data row into one record.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A DataFrame with one row per listed lab and the columns ``label``,
        ``reference_url``, ``sc_label``, ``sc_url``, ``states_territories``
        and ``acronym``. ``acronym`` holds the first parenthesised token found
        in the scraped name (missing when there is none) and that token is
        removed from ``label``. The frame is empty (but has all columns) when
        the table contains no data rows.

    Raises:
        requests.HTTPError: If the page does not return a success status.
        ValueError: If the page contains no ``<table>`` element.
    """
    # Local import so the function carries its own dependency.
    from urllib.parse import urljoin

    def _clean(cell):
        # Collapse internal whitespace: the page wraps long names across
        # lines, which otherwise leaves newlines/tabs inside the label.
        return " ".join(cell.text.split())

    def _link(cell):
        # Resolve the cell's first link against the site root; urljoin copes
        # with root-relative, relative and already-absolute hrefs.
        anchor = cell.find('a', href=True)
        if anchor is None:
            return None
        return urljoin(usgs_base_url, anchor['href'])

    r_labs = requests.get(usgs_lab_url, timeout=timeout)
    # Fail loudly instead of silently returning None on a bad response.
    r_labs.raise_for_status()

    soup_labs = BeautifulSoup(r_labs.content, 'html.parser')
    labs_table = soup_labs.find('table')
    if labs_table is None:
        raise ValueError(f"No laboratory table found at {usgs_lab_url}")

    # html.parser only yields a <tbody> when the markup has one, so fall back
    # to the table itself.
    table_body = labs_table.find('tbody')
    if table_body is None:
        table_body = labs_table

    labs = []
    for row in table_body.find_all('tr'):
        columns = row.find_all('td')
        if len(columns) < 3:
            # Header rows (<th> only) or malformed rows carry no lab record.
            continue

        sc_label = _clean(columns[1])
        labs.append({
            "label": _clean(columns[0]),
            "reference_url": _link(columns[0]),
            "sc_label": sc_label if sc_label != '' else None,
            "sc_url": _link(columns[1]) if sc_label != '' else None,
            "states_territories": _clean(columns[2])
        })

    # Explicit columns keep the frame well-formed when no rows were found.
    df_labs = pd.DataFrame(
        labs,
        columns=["label", "reference_url", "sc_label", "sc_url", "states_territories"]
    )

    # First "(...)" token in the name is treated as the lab's acronym.
    df_labs["acronym"] = df_labs["label"].str.extract(r'\((.*?)\)', expand=False)
    # Strip the acronym out of the label and re-collapse the whitespace left
    # behind; a list is used (not DataFrame.apply) so an empty frame works too.
    df_labs["label"] = [
        " ".join(label.replace(f"({acronym})", "").split()) if isinstance(acronym, str) else label
        for label, acronym in zip(df_labs["label"], df_labs["acronym"])
    ]

    return df_labs

In [5]:
# Scrape the public lab listing once; the merge cells below build on this frame.
df_labs = scrape_usgs_labs()

# GeoKB References for Linking

In [6]:
# Both queries are stored URL-encoded. Decoded, the organization query reads:
#
#   PREFIX wd: <https://geokb.wikibase.cloud/entity/>
#   PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>
#
#   SELECT ?item ?itemLabel ?item_alt_label ?url ?instance_ofLabel
#   WHERE {
#     ?item wdt:P62* wd:Q44210 .
#     ?item wdt:P1 ?instance_of .
#     OPTIONAL {
#       ?item skos:altLabel ?item_alt_label .
#       FILTER (lang(?item_alt_label)='en')
#     }
#     OPTIONAL {
#       ?item wdt:P31 ?url .
#     }
#     SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
#   }
#
# i.e. every item reachable from Q44210 through zero or more P62 hops, with its
# English alt labels and P31 value when present.
q_geokb_orgs = "PREFIX%20wd%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Fitem%20%3FitemLabel%20%3Fitem_alt_label%20%3Furl%20%3Finstance_ofLabel%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP62*%20wd%3AQ44210%20.%0A%20%20%3Fitem%20wdt%3AP1%20%3Finstance_of%20.%0A%20%20OPTIONAL%20%7B%0A%20%20%20%20%3Fitem%20skos%3AaltLabel%20%3Fitem_alt_label%20.%0A%20%20%20%20FILTER%20(lang(%3Fitem_alt_label)%3D'en')%0A%20%20%7D%0A%20%20OPTIONAL%20%7B%0A%20%20%20%20%3Fitem%20wdt%3AP31%20%3Furl%20.%0A%20%20%7D%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20.%20%7D%0A%7D"
df_geokb_orgs = geokb.wb_ref_data(query=q_geokb_orgs)
# The QID is the last path segment of the item's entity URI.
df_geokb_orgs["qid"] = [uri.rsplit("/", 1)[-1] for uri in df_geokb_orgs["item"]]

# Decoded, the state/territory query reads:
#
#   PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>
#
#   SELECT ?item ?itemLabel ?fips_alpha
#   WHERE {
#     ?item wdt:P13 ?fips_alpha .
#     SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
#   }
#
# i.e. every item that has a P13 value, together with its English label.
q_states_territories = "PREFIX%20wdt%3A%20%3Chttps%3A%2F%2Fgeokb.wikibase.cloud%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Fitem%20%3FitemLabel%20%3Ffips_alpha%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP13%20%3Ffips_alpha%20.%20%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20.%20%7D%0A%7D"
df_states_territories = geokb.wb_ref_data(query=q_states_territories)
df_states_territories["qid"] = [uri.rsplit("/", 1)[-1] for uri in df_states_territories["item"]]


In [7]:
# Name -> QID lookup for existing GeoKB organizations, matching on either an
# English alt label or the primary label.
has_alt_label = df_geokb_orgs["item_alt_label"].notnull()
org_name_lookup = pd.concat([
    df_geokb_orgs.loc[has_alt_label, ["qid", "item_alt_label"]].rename(columns={"item_alt_label": "label"}),
    df_geokb_orgs[["qid", "itemLabel"]].rename(columns={"itemLabel": "label"})
]).drop_duplicates()
# drop_duplicates: the organization result set has one row per combination of
# alt label / URL / instance-of value, so the same (qid, label) pair can repeat.
# Repeated pairs would multiply rows in any left merge that consumes this
# lookup. Only fully identical rows are removed, so no mapping is lost.

# URL -> QID lookup for organizations that carry a URL, de-duplicated for the
# same reason.
org_url_lookup = df_geokb_orgs.loc[df_geokb_orgs["url"].notnull(), ["qid", "url"]].drop_duplicates()

# Prep Data

In [8]:
# Lookup tables renamed so their key columns line up with the scraped frame and
# their QID columns do not collide with each other.
sc_url_to_qid = org_url_lookup.rename(columns={'qid': 'sc_qid', 'url': 'sc_url'})
state_name_to_qid = df_states_territories[["qid","itemLabel"]].rename(
    columns={'qid': 'loc_qid', 'itemLabel': 'states_territories'}
)

# Left merges keep every scraped lab; unmatched labs get missing values in the
# corresponding QID column.
#   qid     - existing GeoKB item whose label/alt label equals the lab label
#   sc_qid  - GeoKB item whose URL equals the lab's science center URL
#   loc_qid - GeoKB item whose label equals the lab's state/territory text
df_labs_to_geokb = (
    df_labs
    .merge(org_name_lookup, how="left", on="label")
    .merge(sc_url_to_qid, how="left", on="sc_url")
    .merge(state_name_to_qid, how="left", on="states_territories")
)

In [9]:
# Reference shared by every claim written below: the lab listing page itself.
lab_listing_reference = geokb.datatypes.URL(
    prop_nr=geokb.prop_lookup['reference URL'],
    value=usgs_lab_url
)
lab_source_refs = geokb.models.References()
lab_source_refs.add(lab_listing_reference)

# Qualifier shared by every claim written below: "point in time" 2023 at year
# precision.
# NOTE(review): despite the name `now_quals`, the year is hard-coded rather
# than taken from the current date -- update it when re-running later.
listing_point_in_time = geokb.datatypes.Time(
    prop_nr=geokb.prop_lookup['point in time'],
    time='+2023-01-01T00:00:00Z',
    precision=geokb.date_precision.YEAR
)
now_quals = geokb.models.Qualifiers()
now_quals.add(listing_point_in_time)


<Qualifiers @632310 _Qualifiers__qualifiers={'P110': [<Snak @ced390 _Snak__snaktype=<WikibaseSnakType.KNOWN_VALUE: 'value'> _Snak__property_number='P110' _Snak__hash=None _Snak__datavalue={'value': {'time': '+2023-01-01T00:00:00Z', 'before': 0, 'after': 0, 'precision': 9, 'timezone': 0, 'calendarmodel': 'http://www.wikidata.org/entity/Q1985727'}, 'type': 'time'} _Snak__datatype='time'>]}>

# Preview

In [11]:
# Eyeball the first merged rows (qid / sc_qid / loc_qid matches) before writing
# anything to GeoKB.
df_labs_to_geokb.head()

Unnamed: 0,label,reference_url,sc_label,sc_url,states_territories,acronym,qid,sc_qid,loc_qid
0,Algal and Other Environmental Toxins Laboratory,https://www.usgs.gov/programs/environmental-he...,,,Kansas,,,,Q269
1,Aquatic Experimental Laboratory,https://www.usgs.gov/centers/fort-collins-scie...,Fort Collins Science Center,https://www.usgs.gov/centers/fort,Colorado,AXL,,Q44213,Q256
2,Behavioral Toxicology Laboratory,https://www.usgs.gov/programs/environmental-he...,,,Missouri,,,,Q244
3,Bioactive Chemicals Research Laboratory,https://www.usgs.gov/programs/environmental-he...,,,Colorado,,,,Q256
4,Bird Banding Laboratory,https://www.usgs.gov/labs/bird-banding-laboratory,Patuxent Wildlife Research Center,https://www.usgs.gov/centers/pwrc,Maryland,,,Q50879,Q270


# Commit Lab Claims and Build New Items

In [16]:
# Lab label -> GeoKB QID for labs that are already recorded. The write loop
# skips labels found here and adds an entry for every item it writes
# (presumably these five came from an earlier partial run -- confirm).
_labs_already_recorded = (
    ("Algal and Other Environmental Toxins Laboratory", "Q50881"),
    ("Aquatic Experimental Laboratory", "Q50882"),
    ("Behavioral Toxicology Laboratory", "Q50883"),
    ("Bioactive Chemicals Research Laboratory", "Q50884"),
    ("Bird Banding Laboratory", "Q50885"),
)
labs_added = dict(_labs_already_recorded)

In [17]:
def _add_lab_claim(item, datatype, prop_name, value):
    """Add one `prop_name` statement to `item`, replacing existing ones.

    The statement carries the shared point-in-time qualifier and the lab
    listing reference.
    """
    item.claims.add(
        claims=datatype(
            prop_nr=geokb.prop_lookup[prop_name],
            value=value,
            qualifiers=now_quals,
            references=lab_source_refs
        ),
        action_if_exists=geokb.action_if_exists.REPLACE_ALL
    )


# Write one GeoKB item per lab that is not yet recorded in `labs_added`.
# The filter is evaluated once, before the loop starts.
# NOTE(review): `labs_added` is keyed by label, so two listing rows with the
# same label are both written and the later QID overwrites the earlier one in
# the dict -- confirm whether such rows are distinct labs.
# NOTE(review): REPLACE_ALL also applies to pre-existing items fetched by QID;
# verify that replacing their current statements for these properties is
# intended.
labs_to_write = df_labs_to_geokb[~df_labs_to_geokb.label.isin(labs_added.keys())]

for lab in labs_to_write.itertuples(index=False):
    # A string qid means the label matched an existing item; missing values
    # are not strings, so those labs get a brand-new item.
    item = geokb.wbi.item.get(lab.qid) if isinstance(lab.qid, str) else geokb.wbi.item.new()

    item.labels.set('en', lab.label)
    item.descriptions.set('en', f'a USGS laboratory located in {lab.states_territories}')

    if isinstance(lab.acronym, str):
        item.aliases.set('en', lab.acronym)

    # Every lab is an instance of Q50880 and points at its own web page.
    _add_lab_claim(item, geokb.datatypes.Item, 'instance of', "Q50880")
    _add_lab_claim(item, geokb.datatypes.URL, 'reference URL', lab.reference_url)

    # Optional links, present only when the merge found a matching item.
    if isinstance(lab.sc_qid, str):
        _add_lab_claim(item, geokb.datatypes.Item, 'part of', lab.sc_qid)

    if isinstance(lab.loc_qid, str):
        _add_lab_claim(
            item,
            geokb.datatypes.Item,
            'located in the administrative territorial entity',
            lab.loc_qid
        )

    response = item.write(
        summary="Built out USGS lab claims from USGS lab listing"
    )
    # Record the QID so a re-run skips this lab and later cells can link to it.
    labs_added[lab.label] = response.id
    print(lab.label, response.id)

Brine Research Instrumentation and Experimental
			 Laboratory Q50886
Calcareous Laboratory Q50887
Carbonate Aquifer Characterization Laboratory Q50888
Chemistry Laboratory Q50889
Columbia River Research Laboratory Q50890
Coral Microbial Ecology Laboratory Q50891
Denver Microbeam Laboratory Q50892
Diagnostic Microbiology Laboratory Q50893
Diagnostic Parasitology Laboratory Q50894
Diagnostic Virology Laboratory Q50895
Earth Systems Biogeochemistry Laboratory Q50896
Eastern Energy and Environmental Laboratory Q50897
Environmental and Public Health Microbiology Laboratory Q50898
Environmental Chemistry Laboratory Q50899
Fish Health Program Q50900
Functional and Molecular Bioassay Laboratory Q50901
High-Content Screening Laboratory Q50902
Hydrologic Instrumentation Facility Q44352
Integrated Water Chemistry Assessment Laboratory Q50903
Integrated Water Chemistry Assessment Laboratory Q50904
Kentucky Sediment Laboratory Q50905
Klamath Falls Field Station Q50906
Laboratory for Infectious Dis

# Science Center "has part" Claims

In [31]:
# Distinct (lab label, science center QID) pairs for labs that matched a
# science center.
labs_with_center = df_labs_to_geokb.loc[
    df_labs_to_geokb.sc_qid.notnull(), ["label","sc_qid"]
].drop_duplicates()

# Lab label -> lab QID, taken from the running record of written labs.
lab_qid_by_label = pd.DataFrame(list(labs_added.items()), columns=["label","lab_qid"])

# One row per science center with the list of its lab QIDs.
# NOTE(review): the left merge keeps labs that have no entry in `labs_added`,
# which would put a missing value into the list -- confirm every lab was
# written before running the next cell.
df_center_labs = (
    labs_with_center
    .merge(lab_qid_by_label, how="left", on="label")
    .drop(columns=["label"])
    .groupby("sc_qid", as_index=False)
    .agg(list)
)

df_center_labs

Unnamed: 0,sc_qid,lab_qid
0,Q44213,[Q50882]
1,Q44227,[Q50917]
2,Q44230,[Q50934]
3,Q44241,[Q50929]
4,Q44243,[Q50918]
5,Q44244,"[Q50886, Q50897, Q50926, Q50927, Q50928]"
6,Q44247,"[Q50891, Q50932]"
7,Q44250,"[Q50923, Q50933]"
8,Q44258,"[Q50887, Q50909, Q50924]"
9,Q44284,"[Q50890, Q50900, Q50906, Q50910]"


In [None]:
# Give each science center one "has part" statement per lab, each carrying the
# shared point-in-time qualifier and lab listing reference.
# NOTE(review): REPLACE_ALL replaces the center's existing "has part"
# statements with this list -- verify no non-lab parts would be lost.
for center in df_center_labs.itertuples(index=False):
    item = geokb.wbi.item.get(center.sc_qid)

    has_part_claims = [
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['has part'],
            value=lab_qid,
            qualifiers=now_quals,
            references=lab_source_refs
        )
        for lab_qid in center.lab_qid
    ]

    item.claims.add(
        claims=has_part_claims,
        action_if_exists=geokb.action_if_exists.REPLACE_ALL
    )

    response = item.write(
        summary="Added labs as has part claims to Science Center"
    )
    print(response.id)