# Search BioOracle STAC Catalog for Temperature Zarr Assets
This script searches a static STAC catalog for temperature data and retrieves Zarr assets.

In [4]:
library(httr)
library(jsonlite)
library(glue)
library(dplyr)
library(lubridate)
library(tidyr)

Here is the endpoint of the static STAC catalog. It is a JSON file that contains links to other internal (nested) catalogs and collections.
Be sure to read the STAC documentation to understand the STAC specification:
https://github.com/radiantearth/stac-spec/tree/master

In [5]:
# Root document of the static STAC catalog.
stac_endpoint_url <- "https://s3.waw3-1.cloudferro.com/emodnet/bio_oracle/stac/catalog.json"
# Base URL of the catalog tree: the endpoint with its final path segment
# ('catalog.json') stripped off.
stac_root <- sub("/[^/]*$", "", stac_endpoint_url)

Retrieve the JSON data from the STAC endpoint.

In [6]:
# Download the root catalog document.
response <- GET(stac_endpoint_url)
# Fail here, with the HTTP status in the message, if the request did not
# succeed; otherwise the error body would be handed to the JSON parser.
stop_for_status(response)
# Parse the response body (read as UTF-8 text) into R lists / data frames.
json_data <- fromJSON(content(response, as = "text", encoding = "UTF-8"))

Here we filter the links based on the keyword 'temperature' in the title. Since this STAC makes use of internal catalogs, we will need to loop through the catalogs to find the collections and items.

In [7]:
# Keyword used to pick child catalogs out of the root catalog's link table.
catalog_selector <- "temperature"

# Row mask over the link table: TRUE where the link title contains the keyword
# (case-insensitive). Rows without a title never match.
title_matches <- grepl(
  catalog_selector,
  json_data$links$title,
  ignore.case = TRUE
)
selected_catalogs_titles <- json_data$links[title_matches, ]
# `selected_catalogs` holds the same title-matched rows. The pattern has to be
# applied to the title column: grep() on the whole data frame matches columns,
# and those column positions are not valid row indices.
selected_catalogs <- selected_catalogs_titles
print(selected_catalogs_titles$title)

# One URL per matching title: <stac_root>/<title>/catalog.json
catalog_links <- glue("{stac_root}/{selected_catalogs_titles$title}/catalog.json")

print(catalog_links)

[1] "airtemperature"   "oceantemperature"
https://s3.waw3-1.cloudferro.com/emodnet/bio_oracle/stac/airtemperature/catalog.json
https://s3.waw3-1.cloudferro.com/emodnet/bio_oracle/stac/oceantemperature/catalog.json


Loop through and print each catalog link.

In [8]:
# Walk the catalog links by position and print them one at a time.
for (link_idx in seq_along(catalog_links)) {
  single_link <- catalog_links[link_idx]
  print(single_link)
}

https://s3.waw3-1.cloudferro.com/emodnet/bio_oracle/stac/airtemperature/catalog.json
https://s3.waw3-1.cloudferro.com/emodnet/bio_oracle/stac/oceantemperature/catalog.json


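Loop through each temperature catalog link, download its JSON, and print it. The `links` table of each catalog lists its child collections; the cells below follow those links from the 'oceantemperature' catalog to get all the items in the 'thetao_mean' collection.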

In [9]:
# For every selected catalog: show its URL, download it, and dump the parsed
# JSON so the structure of the internal catalog can be inspected.
for (i in seq_along(catalog_links)) {
  current_link <- catalog_links[i]
  print(current_link)

  internal_catalog_response <- GET(current_link)
  internal_catalog_text <- content(
    internal_catalog_response,
    as = "text",
    encoding = "UTF-8"
  )
  internal_catalog_json <- fromJSON(internal_catalog_text)

  print(internal_catalog_json)
}

https://s3.waw3-1.cloudferro.com/emodnet/bio_oracle/stac/airtemperature/catalog.json
$type
[1] "Catalog"

$id
[1] "airtemperature"

$stac_version
[1] "1.0.0"

$description
[1] "Variable Family airtemperature"

$links
     rel                                                href             type
1   root                                     ../catalog.json application/json
2  child ./emodnet-bio_oracle_airtemperature/collection.json application/json
3 parent                                     ../catalog.json application/json
                      title
1                Bio-Oracle
2 Bio-Oracle AirTemperature
3                Bio-Oracle

$title
[1] "airtemperature"

https://s3.waw3-1.cloudferro.com/emodnet/bio_oracle/stac/oceantemperature/catalog.json
$type
[1] "Catalog"

$id
[1] "oceantemperature"

$stac_version
[1] "1.0.0"

$description
[1] "Variable Family oceantemperature"

$links
     rel                                                  href             type
1   root                  

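Here is a function to get the collections from the 'oceantemperature' internal catalog that is inside this STAC catalog. It returns the directory name of each collection (taken from the catalog's `collection.json` links), not the full URLs.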

In [11]:
#' List the collection directories referenced by the matching catalog(s)
#'
#' Downloads every catalog whose URL matches `catalog_pattern`, looks for
#' links pointing at a `collection.json`, and returns the directory part of
#' each such link with any leading "./" removed. Each name is also printed.
#'
#' @param catalog_links Character (or glue) vector of catalog.json URLs.
#' @param catalog_pattern Case-insensitive regular expression selecting which
#'   catalog URLs are opened. Defaults to "oceantemperature".
#' @return A list with one character string per collection directory; an
#'   empty list when no catalog URL matches or no collection link is found.
get_catalog_links <- function(catalog_links,
                              catalog_pattern = "oceantemperature") {
  catalog_links <- as.character(catalog_links)
  matching_catalogs <- catalog_links[
    grepl(catalog_pattern, catalog_links, ignore.case = TRUE)
  ]

  collections_per_catalog <- lapply(matching_catalogs, function(cat_link) {
    cat_response <- GET(cat_link)
    # Report a failed download as an HTTP error rather than a JSON parse error.
    stop_for_status(cat_response)
    cat_json <- fromJSON(
      content(cat_response, as = "text", encoding = "UTF-8")
    )

    # as.character() turns a missing href column (NULL) into character(0).
    hrefs <- as.character(cat_json$links$href)
    # The dot is escaped so that only a literal "collection.json" matches.
    is_collection <- grepl("collection\\.json", hrefs, ignore.case = TRUE)
    collections <- sub("^\\./", "", dirname(hrefs[is_collection]))

    for (collection in collections) {
      print(collection)
    }
    collections
  })

  as.list(as.character(unlist(collections_per_catalog)))
}

This function gets the STAC items inside the collections whose name contains 'thetao_mean' and returns their links as a data frame. These STAC items contain the metadata for the actual data that we are interested in.

In [13]:
#' Collect the item links of the matching collection(s)
#'
#' For every collection name matching `collection_pattern`, downloads
#' `<catalog dir>/<collection>/collection.json`, keeps the links whose `rel`
#' contains "item", and turns each href into a full URL below the collection
#' directory. Each item URL is printed as it is found.
#'
#' @param selected_collections List (or character vector) of collection
#'   directory names.
#' @param catalog_links Character (or glue) vector of catalog.json URLs; the
#'   first one matching `catalog_pattern` supplies the base directory.
#' @param collection_pattern Case-insensitive regular expression selecting
#'   the collections to open. Defaults to "thetao_mean".
#' @param catalog_pattern Case-insensitive regular expression selecting the
#'   parent catalog. Defaults to "oceantemperature".
#' @return A data frame with a single character column `item_link`; zero rows
#'   when nothing matches.
get_collection_items <- function(selected_collections, catalog_links,
                                 collection_pattern = "thetao_mean",
                                 catalog_pattern = "oceantemperature") {
  collections <- as.character(unlist(selected_collections))
  collections <- collections[
    grepl(collection_pattern, collections, ignore.case = TRUE)
  ]
  if (length(collections) == 0) {
    return(data.frame(item_link = character(), stringsAsFactors = FALSE))
  }

  # The parent catalog does not depend on the collection, so resolve it once.
  catalog_links <- as.character(catalog_links)
  cat_link <- catalog_links[
    grepl(catalog_pattern, catalog_links, ignore.case = TRUE)
  ][1]
  if (is.na(cat_link)) {
    stop("No catalog link matches '", catalog_pattern, "'", call. = FALSE)
  }
  catalog_dir <- dirname(cat_link)

  links_per_collection <- lapply(collections, function(collection) {
    collection_dir <- paste0(catalog_dir, "/", collection)
    collection_response <- GET(paste0(collection_dir, "/collection.json"))
    # Report a failed download as an HTTP error rather than a JSON parse error.
    stop_for_status(collection_response)
    collection_json <- fromJSON(
      content(collection_response, as = "text", encoding = "UTF-8")
    )

    # A collection without a links table simply has no items.
    collection_links <- collection_json$links
    item_hrefs <- character()
    if (is.data.frame(collection_links)) {
      is_item <- grepl("item", collection_links$rel, ignore.case = TRUE)
      item_hrefs <- as.character(collection_links$href[is_item])
    }

    if (length(item_hrefs) == 0) {
      print(glue("No matches found for collection: {collection}"))
      return(character())
    }

    # Item hrefs are relative to the collection directory ("./<item>/...").
    item_links <- paste0(collection_dir, "/", sub("^\\./", "", item_hrefs))
    for (item_link in item_links) {
      print(item_link)
    }
    item_links
  })

  # as.character() keeps the column present even when no link was collected.
  data.frame(
    item_link = as.character(unlist(links_per_collection)),
    stringsAsFactors = FALSE
  )
}

Here we execute the functions to get the links to the collections and the items inside the collections.

In [14]:
# Step 1: names of the collections found in the ocean-temperature catalog.
selected_collections <- get_catalog_links(catalog_links = catalog_links)
# Step 2: links to the items of the matching collection(s), as a data frame.
selected_collection_items <- get_collection_items(
  selected_collections = selected_collections,
  catalog_links = catalog_links
)
print(selected_collection_items)

[1] "emodnet-bio_oracle_oceantemperature"
[1] "emodnet-thetao_mean"
[1] "https://s3.waw3-1.cloudferro.com/emodnet/bio_oracle/stac/oceantemperature/emodnet-thetao_mean/thetao_mean__depthSurf_baseline_-179.975_-89.975_179.975_89.975_2000-01-01 00:00:00+00:00_2010-01-01 00:00:00+00:00/thetao_mean__depthSurf_baseline_-179.975_-89.975_179.975_89.975_2000-01-01 00:00:00+00:00_2010-01-01 00:00:00+00:00.json"
[1] "https://s3.waw3-1.cloudferro.com/emodnet/bio_oracle/stac/oceantemperature/emodnet-thetao_mean/thetao_mean__depthSurf_ssp119_-179.975_-89.975_179.975_89.975_2020-01-01 00:00:00+00:00_2090-01-01 00:00:00+00:00/thetao_mean__depthSurf_ssp119_-179.975_-89.975_179.975_89.975_2020-01-01 00:00:00+00:00_2090-01-01 00:00:00+00:00.json"
[1] "https://s3.waw3-1.cloudferro.com/emodnet/bio_oracle/stac/oceantemperature/emodnet-thetao_mean/thetao_mean__depthSurf_ssp126_-179.975_-89.975_179.975_89.975_2020-01-01 00:00:00+00:00_2090-01-01 00:00:00+00:00/thetao_mean__depthSurf_ssp126_-179.975_-89.975_17

This function fetches the JSON data for each item in the selected collection items DataFrame.

In [19]:
#' Download and parse the JSON document of every selected item
#'
#' @param selected_items_df Data frame with a column `item_link` holding one
#'   item URL per row.
#' @return An unnamed list with one parsed JSON document per row, in row
#'   order; an empty list when there are no rows.
fetch_item_jsons <- function(selected_items_df) {
  item_links <- as.character(selected_items_df$item_link)
  lapply(item_links, function(item_link) {
    # The item URLs contain spaces, so percent-encode them before requesting.
    item_response <- GET(URLencode(item_link))
    # Report a failed download as an HTTP error rather than a JSON parse error.
    stop_for_status(item_response)
    fromJSON(content(item_response, as = "text", encoding = "UTF-8"))
  })
}

Finally, we fetch and display the JSONs for the selected items.

In [20]:
# Download the JSON document behind every selected item link, then show them.
item_jsons <- fetch_item_jsons(
  selected_items_df = selected_collection_items
)
print(item_jsons)

[[1]]
[[1]]$type
[1] "Feature"

[[1]]$stac_version
[1] "1.0.0"

[[1]]$id
[1] "thetao_mean__depthSurf_baseline_-179.975_-89.975_179.975_89.975_2000-01-01 00:00:00+00:00_2010-01-01 00:00:00+00:00"

[[1]]$properties
[[1]]$properties$provider
        name             role
1 Bio-Oracle resourceProvider

[[1]]$properties$`data rights and restrictions`
[1] "CC-BY-4.0"

[[1]]$properties$`proj:epsg`
[1] 4326

[[1]]$properties$start_datetime
[1] "2000-01-01T00:00:00Z"

[[1]]$properties$end_datetime
[1] "2010-01-01T00:00:00Z"

[[1]]$properties$native_variable
[1] "thetao_mean"

[[1]]$properties$datetime
NULL


[[1]]$geometry
[[1]]$geometry$type
[1] "Polygon"

[[1]]$geometry$coordinates
, , 1

         [,1]     [,2]    [,3]    [,4]     [,5]
[1,] -179.975 -179.975 179.975 179.975 -179.975

, , 2

        [,1]   [,2]   [,3]    [,4]    [,5]
[1,] -89.975 89.975 89.975 -89.975 -89.975



[[1]]$links
         rel                  href             type       title
1       root ../../../catalog.json appli