In [1]:
import toyplot
import requests
import pandas as pd

In [2]:
"""
GBIF can be used to find specimen collection records, or other types of observation data, stored in museum type databases. 
The website offers a convenient way to request taxa by name, and to select specific research criteria. 
"""

'\nGBIF can be used to find specimen collection records, or other types of observation data, stored in museum type databases. \nThe website offers a convenient way to request taxa by name, and to select specific research criteria. \n'

In [3]:
# store the base url as a string variable (HTTPS, like the later requests)
baseurl = "https://api.gbif.org/v1/occurrence/search?"

In [4]:
# store an endpoint request (base url + query string) as a string variable
search_url = "https://api.gbif.org/v1/occurrence/search?q=Bombus"

The requests package works a bit like an automated web browser. We've used requests briefly in the past, but now we'll start to use it more effectively. The main function we will call is .get(), which sends a GET command (a form of HTTP method that the web is built on) to the web address and returns a Response class object. We will then access attributes and functions of the Response instance to see if our request worked, and to parse the resulting text from it. Let's try this on our search_url string defined above.

In [5]:
# create a Response instance from a request; the timeout stops the cell
# from hanging forever if the server never answers
response = requests.get(search_url, timeout=30)

# raise an HTTPError if the request failed (returns None if it worked)
response.raise_for_status()

# show the status code as the cell output (200 = worked; other codes = not).
# It has to be the last expression in the cell to be displayed.
response.status_code

In [6]:
# preview the first 500 characters of the raw .text string (JSON) returned
# by the GBIF API query
response.text[:500]

'{"offset":0,"limit":20,"endOfRecords":false,"count":3924237,"results":[{"key":1792076088,"datasetKey":"4fa7b334-ce0d-4e88-aaae-2e0c138d049e","publishingOrgKey":"e2e717bf-551a-4917-bdc9-4fa0f342c530","installationKey":"7182d304-b0a2-404b-baba-2086a325c221","hostingOrganizationKey":"e2e717bf-551a-4917-bdc9-4fa0f342c530","publishingCountry":"EC","protocol":"DWC_ARCHIVE","lastCrawled":"2024-09-27T13:35:39.907+00:00","lastParsed":"2025-02-05T01:03:29.507+00:00","crawlId":20,"extensions":{},"basisOfRe'

In [7]:
# alternatively, parse the JSON body of the response into a dictionary
rdict = response.json()

# list the top-level keys to see how the payload is organised
list(rdict)

['offset', 'limit', 'endOfRecords', 'count', 'results', 'facets']

In [8]:
# total number of records that match this query (not just those returned)
rdict["count"]

3924237

In [9]:
# the limit: the maximum number of records returned by this request
rdict["limit"]

20

In [10]:
# the offset: the position of the first returned record within all matches
rdict["offset"]

0

In [11]:
# the first returned record; each record is itself a dictionary
rdict["results"][0]

{'key': 1792076088,
 'datasetKey': '4fa7b334-ce0d-4e88-aaae-2e0c138d049e',
 'publishingOrgKey': 'e2e717bf-551a-4917-bdc9-4fa0f342c530',
 'installationKey': '7182d304-b0a2-404b-baba-2086a325c221',
 'hostingOrganizationKey': 'e2e717bf-551a-4917-bdc9-4fa0f342c530',
 'publishingCountry': 'EC',
 'protocol': 'DWC_ARCHIVE',
 'lastCrawled': '2024-09-27T13:35:39.907+00:00',
 'lastParsed': '2025-02-05T01:03:29.507+00:00',
 'crawlId': 20,
 'extensions': {},
 'basisOfRecord': 'HUMAN_OBSERVATION',
 'individualCount': 1,
 'occurrenceStatus': 'PRESENT',
 'taxonKey': 5228583,
 'kingdomKey': 1,
 'phylumKey': 44,
 'classKey': 212,
 'orderKey': 1448,
 'familyKey': 5289,
 'genusKey': 2476913,
 'speciesKey': 5228583,
 'acceptedTaxonKey': 5228583,
 'scientificName': 'Chaetocercus bombus Gould, 1871',
 'acceptedScientificName': 'Chaetocercus bombus Gould, 1871',
 'kingdom': 'Animalia',
 'phylum': 'Chordata',
 'order': 'Apodiformes',
 'family': 'Trochilidae',
 'genus': 'Chaetocercus',
 'species': 'Chaetocercu

In [14]:
# flatten the list of record dictionaries into a dataframe
sdf = pd.json_normalize(rdict['results'])
# show the column names (call sdf.head() instead to see the first few rows)
sdf.columns

Index(['key', 'datasetKey', 'publishingOrgKey', 'installationKey',
       'hostingOrganizationKey', 'publishingCountry', 'protocol',
       'lastCrawled', 'lastParsed', 'crawlId', 'basisOfRecord',
       'individualCount', 'occurrenceStatus', 'taxonKey', 'kingdomKey',
       'phylumKey', 'classKey', 'orderKey', 'familyKey', 'genusKey',
       'speciesKey', 'acceptedTaxonKey', 'scientificName',
       'acceptedScientificName', 'kingdom', 'phylum', 'order', 'family',
       'genus', 'species', 'genericName', 'specificEpithet', 'taxonRank',
       'taxonomicStatus', 'iucnRedListCategory', 'decimalLatitude',
       'decimalLongitude', 'continent', 'stateProvince', 'year', 'month',
       'day', 'eventDate', 'startDayOfYear', 'endDayOfYear', 'issues',
       'lastInterpreted', 'license', 'isSequenced', 'identifiers', 'media',
       'facts', 'relations', 'isInCluster', 'recordedBy', 'geodeticDatum',
       'class', 'countryCode', 'recordedByIDs', 'identifiedByIDs',
       'gbifRegion', 'cou

In [15]:
# here we create the same urlpath using params
# NOTE: 'q' is a fuzzy text search, so the hits are not guaranteed to have
# Bombus as their genus; filtering by genusKey is needed for that.
response = requests.get(
    url="https://api.gbif.org/v1/occurrence/search/",
    params={"q": "Bombus"}
)

# show url path
print(response.url)

https://api.gbif.org/v1/occurrence/search/?q=Bombus


If you looked closely at the results above you may have noticed that the records returned are not actually all for organisms in the genus Bombus. Instead results include things like Chaetocercus bombus and other organisms that happen to have "bombus" in their names.

This is why it's important to look closely at your data. Looking back at the documentation we can see that the 'q=something' search parameter returns a fuzzy hit to anything that has the query in its data. If we instead want to restrict to the genus Bombus we need to find the genusKey for Bombus. This can be found using the 'species' endpoint in the API. So let's take a side track to find this. Note we are searching a different baseurl now, to look in the 'species' path instead of the 'occurrence' path.

The results below provide unique identifiers that are more reliable for searching the database. We will use the genusKey=1340278 for our next search of the occurrence database.

In [16]:
# get taxonomy info for the genus Bombus
res = requests.get(
    url="https://api.gbif.org/v1/species/match/",
    # use the 'genus' parameter (rather than the fuzzy 'q') to look up
    # Bombus specifically as a genus name
    params={"genus": "Bombus"},
)
# the returned record includes the genusKey, the unique identifier for Bombus
res.json()

{'usageKey': 1340278,
 'scientificName': 'Bombus Latreille, 1802',
 'canonicalName': 'Bombus',
 'rank': 'GENUS',
 'status': 'ACCEPTED',
 'confidence': 94,
 'matchType': 'EXACT',
 'kingdom': 'Animalia',
 'phylum': 'Arthropoda',
 'order': 'Hymenoptera',
 'family': 'Apidae',
 'genus': 'Bombus',
 'kingdomKey': 1,
 'phylumKey': 54,
 'classKey': 216,
 'orderKey': 1457,
 'familyKey': 4334,
 'genusKey': 1340278,
 'synonym': False,
 'class': 'Insecta'}

Below I show the URL for when we add the requirement that a record have coordinate data, and for when we add additional arguments to set the offset and the limit for the number of records returned. The maximum number of records returned per request (the limit) is 300. To retrieve more than that you need to increment the offset and make further requests. You can see that the URL is simply appending additional queries to the end after the ? symbol to build more complex queries.

In [17]:
# add requirement that the record have coordinate data
res = requests.get(
    url="https://api.gbif.org/v1/occurrence/search/",
    params={
        "genusKey": 1340278,  # the Bombus genusKey found above
        "hasCoordinate": "true",  # only records that have coordinate data
    }
)
# show the URL that was built (res.json() would return the records instead)
res.url

'https://api.gbif.org/v1/occurrence/search/?genusKey=1340278&hasCoordinate=true'

In [18]:
# request 20 records starting at offset 100 (i.e., records 100-119)
res = requests.get(
    url="https://api.gbif.org/v1/occurrence/search/",
    params={
        "genusKey": 1340278, 
        "hasCoordinate": "true",
        "offset": 100,  # position of the first record to return
        "limit": 20,  # number of records to return
    }
)
res.url

'https://api.gbif.org/v1/occurrence/search/?genusKey=1340278&hasCoordinate=true&offset=100&limit=20'

If we wanted to collect all records for a given search then we need to increment the "offset" argument until we reach the end of the records. Each batch is returned as a list of dictionaries, so we can just join all of those lists together and return them. That sounds a bit complex, so let's do it in two parts: first we'll write a function to fulfill a single request, and then a function to call many requests.

In [19]:
def get_single_batch(genusKey, year, offset=0, limit=20, country="US", timeout=30):
    """
    Fetch one batch of GBIF occurrence records and return the parsed JSON.

    Only records that have coordinate data ("hasCoordinate": "true") and
    that match the given country are requested.

    Parameters
    ----------
    genusKey : int
        GBIF key of the genus to search for (e.g., 1340278 for Bombus).
    year : str or int
        Year, or year interval written as "start,end" (e.g., "1990,2020").
    offset : int
        Position of the first record to return.
    limit : int
        Number of records to request, starting at offset.
    country : str
        Country code sent as the 'country' query parameter.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    dict
        The JSON response converted to a dictionary; the records are under
        the 'results' key and 'endOfRecords' tells if this was the last batch.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    res = requests.get(
        url="https://api.gbif.org/v1/occurrence/search/",
        params={
            "genusKey": genusKey,
            "year": year,
            "offset": offset,
            "limit": limit,
            "hasCoordinate": "true",
            "country": country,
        },
        # without a timeout a stalled connection would hang the notebook
        timeout=timeout,
    )
    # fail loudly here rather than handing an error payload to the caller
    res.raise_for_status()
    return res.json()

In [20]:
# test single batch function, using the Bombus genusKey found above
jdata = get_single_batch(
    genusKey=1340278,
    year="1990,2020",
    offset=0,
    limit=20,
)

# how many results were fetched?
print(len(jdata["results"]))

20


In [21]:
# did we reach the end of the records? (False means more batches remain)
jdata["endOfRecords"]

False

In [22]:
def get_all_records(genusKey, year, batch_size=300):
    """
    Fetch every record for a search by requesting batch after batch.

    Calls get_single_batch() at increasing offset positions until the
    response has 'endOfRecords' set to true, printing a dot after each
    intermediate batch to show progress.

    Parameters
    ----------
    genusKey : int
        GBIF key of the genus to search for.
    year : str or int
        Year, or year interval written as "start,end" (e.g., "1900,1902").
    batch_size : int
        Number of records to request per call (300 is the max limit).

    Returns
    -------
    list of dict
        All fetched records joined into a single list.
    """
    # for storing results
    alldata = []

    # continue until we call 'break'
    offset = 0
    while True:

        # get JSON data for a batch
        jdata = get_single_batch(genusKey, year, offset, batch_size)
        batch = jdata["results"]

        # add this batch of data to the growing list
        alldata.extend(batch)

        # stop when the end of the records is reached; an empty batch also
        # stops the loop so that it cannot keep requesting forever
        if jdata["endOfRecords"] or not batch:
            print(f'Done. Found {len(alldata)} records')
            break

        # move the offset to the start of the next batch
        offset += batch_size

        # print a dot on each rep to show progress
        print('.', end='')

    return alldata

In [23]:
# call function to search over all offset values until end, for the
# Bombus genusKey (1340278) and the year interval "1900,1902".
# THIS MAY TAKE A FEW MINUTES TO RUN
jdata = get_all_records(1340278, "1900,1902")

....Done. Found 1223 records


In [24]:
# convert the list of record dictionaries to a data frame
df = pd.json_normalize(jdata)

In [25]:
# the dataframe has many keys (columns); run `list(df.columns)` in its own
# cell to see them all, since only a cell's last expression is displayed.

# view just the columns we're interested in for now.
sdf = df[["species", "year", "decimalLatitude", "decimalLongitude"]]
sdf.head()

Unnamed: 0,species,year,decimalLatitude,decimalLongitude
0,Bombus pensylvanicus,1902,39.98,-82.98
1,Bombus vosnesenskii,1902,38.18741,-122.5208
2,Bombus vosnesenskii,1902,37.3996,-121.7997
3,Bombus rufocinctus,1902,37.397444,-121.802593
4,Bombus californicus,1902,38.50222,-122.26528
