In [None]:
!pip install pystac xarray pandas pystac-client

In [25]:
import pystac
import xarray as xr
import pandas as pd
from datetime import datetime, date
from pystac_client import Client
import numpy as np
import copernicusmarine
from copernicusmarine.core_functions import custom_open_zarr



### Open the root catalog

Use pystac-client to connect to a STAC endpoint (https://catalog.dive.edito.eu/).

In [7]:
# STAC API root URL
URL = 'https://catalog.dive.edito.eu'
# Alternative endpoint:
# URL = 'https://s3.waw3-1.cloudferro.com/emodnet/bio_oracle/stac/catalog.json'

# Custom HTTP headers sent with each request. Client.open expects a mapping of
# header name -> value, so the "no extra headers" case is an empty dict.
headers = {}

cat = Client.open(URL, headers=headers)
cat

### Browse the Catalog
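Navigate through the root catalog to find sub-catalogs and collections of interest. The cell below lists every collection and prints the first item of each collection whose ID contains `chlorophyll`.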


In [3]:
# Fetch every collection once and keep the list for the cells that follow.
collections = list(cat.get_all_collections())
print(f'Found {len(collections)} collections')

# For each collection whose ID mentions chlorophyll, show the collection ID
# followed by the ID and assets of its first item only.
chlorophyll_collections = (c for c in collections if 'chlorophyll' in c.id)
for chl_collection in chlorophyll_collections:
    print(chl_collection.id)
    first_item = next(iter(chl_collection.get_all_items()), None)
    if first_item is not None:
        print(first_item.id)
        print(first_item.assets)

/home/samwork/miniforge3/envs/tools/lib/python3.9/site-packages/pystac_client/client.py:440: FallbackToPystac: Falling back to pystac. This might be slow.
  self._warn_about_fallback("COLLECTIONS", "FEATURES")


Found 9690 collections
emodnet-deepest_values_of_water_body_chlorophyll_a
6a886250-2dff-53e6-8bcb-a99d6ff851b6
{'native_asset': <Asset href=https://s3.waw3-1.cloudferro.com/emodnet/emodnet_native/emodnet_chemistry/water_body_chlorophyll_a/water_body_chlorophyll_a_masked_using_relative_error_threshold_0.3_northeast_atlantic_ocean/Water_body_chlorophyll-a.4Danl.nc>, 'metadata_csw': <Asset href=https://emodnet.ec.europa.eu/geonetwork/emodnet/eng/csw?request=GetRecordById&service=CSW&version=2.0.2&elementSetName=full&id=a3461fb1-d209-440e-a49f-7acff7731395>, 'metadata_xml': <Asset href=https://emodnet.ec.europa.eu/geonetwork/srv/api/records/a3461fb1-d209-440e-a49f-7acff7731395/formatters/xml>, 'Zarr': <Asset href=https://s3.waw3-1.cloudferro.com/emodnet/emodnet_arco/emodnet_chemistry/water_body_chlorophyll_a/deepest_values_of_water_body_chlorophyll_a_northeast_atlantic_ocean/Water_body_chlorophyll-a.4Danl.zarr>, 'wms': <Asset href=https://ec.oceanbrowser.net/emodnet/Python/web/wms?service=

### Search for a Collection
Identify a collection based on your variables (e.g., temperature, salinity). You can filter by collection metadata like keywords or spatial/temporal bounds.

In [8]:
all_items = []
# NOTE(review): this list is not used by the filter below, and 'elevation' has
# never been part of the match -- confirm whether it should be.
collection_selection = ['oxygen', 'habitat', 'elevation', 'temperature']

# Substrings matched against each collection ID.
id_keywords = ('oxygen', 'habitat', 'temperature')
# Fixed column set so the DataFrame has the expected columns even with no matches.
item_columns = ['Collection ID', 'Item ID', 'Item bounds',
                'item_starttime', 'item_endtime', 'Assets']

for collection in collections:
    if not any(keyword in collection.id for keyword in id_keywords):
        continue
    try:
        for item in collection.get_all_items():
            # item.geometry is already a plain GeoJSON dict; it has no
            # .to_dict() method, so it is stored as-is.
            all_items.append({
                'Collection ID': collection.id,
                'Item ID': item.id,
                'Item bounds': item.geometry,
                # .get() keeps items that lack an explicit time range (value
                # becomes None) instead of aborting the rest of the collection.
                'item_starttime': item.properties.get('start_datetime'),
                'item_endtime': item.properties.get('end_datetime'),
                'Assets': item.assets,
            })
    except Exception as e:
        # Best effort: a collection whose items cannot be read is reported and
        # skipped so that the remaining collections are still processed.
        print(e)
        print(f'Error with {collection.id}')

oxygen_habitat_temperature_items_df = pd.DataFrame(all_items, columns=item_columns)
oxygen_habitat_temperature_items_df.head()

'list' object has no attribute 'items'
Error with climate_forecast-sea_water_temperature
'list' object has no attribute 'items'
Error with climate_forecast-sea_water_temperature


Unnamed: 0,Collection ID,Item ID,Item bounds,item_starttime,item_endtime,Assets
0,climate_forecast-air_temperature,8ebf3d6d-1a6f-58f8-beb3-e6ff6eae3c90,"{'type': 'Polygon', 'coordinates': [[[-52.9000...",2024-08-27T00:00:00.000000Z,2024-09-26T03:28:00.000000Z,{'arco-time-series': <Asset href=https://s3.wa...
1,climate_forecast-air_temperature,33857744-4f8c-5eb2-acc7-3cf15e6026b7,"{'type': 'Polygon', 'coordinates': [[[-52.9000...",2024-08-27T00:00:00.000000Z,2024-09-26T03:28:00.000000Z,{'arco-geo-series': <Asset href=https://s3.waw...
2,climate_forecast-air_temperature,139f5e61-86d9-562a-b24d-d940bb0b25c7,"{'type': 'Polygon', 'coordinates': [[[-14.4, -...",2024-08-27T00:00:00.000000Z,2024-09-26T02:16:00.000000Z,{'arco-time-series': <Asset href=https://s3.wa...
3,climate_forecast-air_temperature,962e103b-9fec-5b95-ab3a-9f8633c03e38,"{'type': 'Polygon', 'coordinates': [[[-14.4, -...",2024-08-27T00:00:00.000000Z,2024-09-26T02:16:00.000000Z,{'arco-geo-series': <Asset href=https://s3.waw...
4,climate_forecast-air_temperature,99b5c8e6-696f-5512-9f59-2f6e53caf374,"{'type': 'Polygon', 'coordinates': [[[-171, -3...",2024-08-27T00:00:00.000000Z,2024-09-26T03:16:00.000000Z,{'arco-time-series': <Asset href=https://s3.wa...


### Select a Collection and Fetch Items
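Choose a collection and list the available items (datasets), filtered by date range and geographic region. The cell below applies the date-range filter only: an item is kept when its start and end times both fall strictly inside the requested window.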

In [9]:
def filter_items_by_time(items_df, start_date, end_date):
    """
    Return the items whose time range lies strictly inside a time window.

    Parameters
    ----------
    items_df : pandas.DataFrame
        Must contain the columns 'item_starttime' and 'item_endtime', holding
        values that ``pd.to_datetime`` can parse.
    start_date, end_date
        Window bounds, compared directly against the parsed columns. An item
        is kept only if ``item_starttime > start_date`` and
        ``item_endtime < end_date`` (both comparisons are strict).

    Returns
    -------
    pandas.DataFrame
        A new frame, sorted by 'item_starttime', with both time columns
        converted to datetimes. The input frame is left unmodified.
    """
    # Build the parsed columns on a new frame so that the caller's DataFrame
    # is not changed as a side effect of filtering.
    parsed_df = items_df.assign(
        item_starttime=pd.to_datetime(items_df['item_starttime']),
        item_endtime=pd.to_datetime(items_df['item_endtime']),
    ).sort_values(by='item_starttime')

    starts_after = parsed_df['item_starttime'] > start_date
    ends_before = parsed_df['item_endtime'] < end_date
    return parsed_df[starts_after & ends_before]

# Time window of interest; only items lying entirely inside it are kept.
start_date, end_date = '2000-01-01', '2030-12-31'

time_df = filter_items_by_time(
    items_df=oxygen_habitat_temperature_items_df,
    start_date=start_date,
    end_date=end_date,
)
time_df.head()

Unnamed: 0,Collection ID,Item ID,Item bounds,item_starttime,item_endtime,Assets
58,climate_forecast-fractional_saturation_of_oxyg...,98c8594e-d438-5b9b-a02a-e911480e3c1a,"{'type': 'Polygon', 'coordinates': [[[-179.999...",2001-01-04 16:47:52+00:00,2023-06-30 23:59:11+00:00,{'arco-geo-series': <Asset href=https://s3.waw...
57,climate_forecast-fractional_saturation_of_oxyg...,f8ee5c99-4e1f-52d1-97f3-20bc5a7f9d9b,"{'type': 'Polygon', 'coordinates': [[[-179.999...",2001-01-04 16:47:52+00:00,2023-06-30 23:59:11+00:00,{'arco-time-series': <Asset href=https://s3.wa...
995,climate_forecast-fractional_saturation_of_oxyg...,98c8594e-d438-5b9b-a02a-e911480e3c1a,"{'type': 'Polygon', 'coordinates': [[[-179.999...",2001-01-04 16:47:52+00:00,2023-06-30 23:59:11+00:00,{'arco-geo-series': <Asset href=https://s3.waw...
15,climate_forecast-air_temperature,f88e7a43-a70f-587d-bb81-922f1ef629c8,"{'type': 'Polygon', 'coordinates': [[[-179.999...",2001-01-04 16:47:52+00:00,2023-06-30 23:59:11+00:00,{'arco-geo-series': <Asset href=https://s3.waw...
14,climate_forecast-air_temperature,650d1f3a-88a6-50c6-ac6f-a509030a4679,"{'type': 'Polygon', 'coordinates': [[[-179.999...",2001-01-04 16:47:52+00:00,2023-06-30 23:59:11+00:00,{'arco-time-series': <Asset href=https://s3.wa...


### Look for Cloud-Optimized Assets
From each item, find and extract cloud-optimized assets (like Zarr or Parquet) that can be processed further.

In [33]:
# Href suffixes that mark an asset as cloud-optimized (Zarr store or Parquet file).
cloud_optimized_suffixes = ('.zarr', '.zarr/', '.parquet')

# One record per cloud-optimized asset, carrying its parent item's metadata.
all_items_assets = []
for _, row in time_df.iterrows():
    for asset_key, asset in row['Assets'].items():
        # str.endswith accepts a tuple and matches if any suffix matches.
        if asset.href.endswith(cloud_optimized_suffixes):
            all_items_assets.append({
                'Collection ID': row['Collection ID'],
                'Item ID': row['Item ID'],
                'Data Start': row['item_starttime'],
                'Data End': row['item_endtime'],
                'Bounds': row['Item bounds'],
                'Asset Key': asset_key,
                'Asset Href': asset.href,
            })

# Create a DataFrame for assets and persist it for later use.
assets_df = pd.DataFrame(all_items_assets)
assets_df.to_csv('temperature_oxygen_habitat_arco_assets.csv')

# Keep the preview as the last expression so the notebook actually displays it.
assets_df.head()