In [11]:
import pandas as pd
import numpy as np
import json
import fsspec
import xarray as xr

from pystac_client import Client
import planetary_computer as pc

WQ_PATH = "water_quality_training_dataset.csv"

df = pd.read_csv(WQ_PATH)
df.columns = [c.strip() for c in df.columns]

# Parse Sample Date robustly (your file looks like day-month-year)
df["Sample Date"] = pd.to_datetime(df["Sample Date"], errors="coerce", dayfirst=True)

# Keep only 2011â€“2015 (inclusive)
df = df[df["Sample Date"].between("2011-01-01", "2015-12-31")].copy()

# Ensure numeric lat/lon
df["Latitude"]  = pd.to_numeric(df["Latitude"], errors="coerce")
df["Longitude"] = pd.to_numeric(df["Longitude"], errors="coerce")
df = df.dropna(subset=["Latitude", "Longitude", "Sample Date"]).reset_index(drop=True)

print(df.shape)
df.head()

(9319, 6)


Unnamed: 0,Latitude,Longitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus
0,-28.760833,17.730278,2011-01-02,128.912,555.0,10.0
1,-26.861111,28.884722,2011-01-03,74.72,162.9,163.0
2,-26.45,28.085833,2011-01-03,89.254,573.0,80.0
3,-27.671111,27.236944,2011-01-03,82.0,203.6,101.0
4,-27.356667,27.286389,2011-01-03,56.1,145.1,151.0


In [12]:
STAC_URL = "https://planetarycomputer.microsoft.com/api/stac/v1"
COLLECTION = "deltares-water-availability"

client = Client.open(STAC_URL)

search = client.search(collections=[COLLECTION])
items = list(search.get_items())
print("items found:", len(items))
print("example item ids:", [it.id for it in items[:5]])



items found: 5
example item ids: ['CHIRPS', 'NLDAS', 'EOBS', 'BOM', 'ERA5']


In [13]:
def choose_item(items, keyword="era5"):
    kw = keyword.lower()
    for it in items:
        blob = json.dumps(it.to_dict()).lower()
        if kw in blob:
            return it
    return items[0]

item = choose_item(items, keyword="era5")
item = pc.sign(item)  # IMPORTANT: signs the assets for access

print("chosen item:", item.id)
print("assets:", list(item.assets.keys()))

chosen item: ERA5
assets: ['data', 'index']


In [14]:
index_href = item.assets["index"].href  # signed already by pc.sign(item)

with fsspec.open(index_href, "r") as f:
    ref = json.load(f)

print("ref keys:", ref.keys())
print("refs entries:", len(ref.get("refs", {})))

ref keys: dict_keys(['version', 'templates', 'refs'])
refs entries: 223612


In [15]:
def sign_kerchunk_refs(ref_dict):
    refs = ref_dict.get("refs", {})
    for k, v in refs.items():
        # Kerchunk refs often look like: [URL, offset, length]
        if isinstance(v, list) and len(v) >= 1 and isinstance(v[0], str) and v[0].startswith("https://"):
            refs[k][0] = pc.sign(v[0])
        # Sometimes refs can be plain strings
        elif isinstance(v, str) and v.startswith("https://"):
            refs[k] = pc.sign(v)
    return ref_dict

ref = sign_kerchunk_refs(ref)

In [16]:
def make_ref_mapper(ref_dict):
    # Try the common modern signature first
    try:
        fs = fsspec.filesystem(
            "reference",
            fo=ref_dict,
            remote_protocol="https",
            target_protocol="https",
        )
        return fs.get_mapper("")
    except TypeError:
        # Fallback for older signatures
        fs = fsspec.filesystem(
            "reference",
            fo=ref_dict,
            target_protocol="https",
        )
        return fs.get_mapper("")

mapper = make_ref_mapper(ref)
print("mapper ready")

mapper ready


In [18]:
try:
    ds = xr.open_zarr(mapper, consolidated=False)
except Exception as e:
    print("open_zarr failed:", repr(e))
    # xarray>=2025 sometimes expects the generic "store" engine in some setups
    ds = xr.open_dataset(mapper, engine="store")

print(ds)
print("dims:", ds.dims)
print("data_vars (first 20):", list(ds.data_vars)[:20])

open_zarr failed: TypeError("ReferenceNotReachable.__init__() missing 1 required positional argument: 'target'")


AssertionError: 