# Metadata extraction - teachers only
This file is only used by the teachers to prepare metadata, like the time periods covered by individual datasets etc. This serves to avoid using very time consuming `dask` and `stack` commands to crawl through the data cube and extract those information individually.

Crawl through the individual datasets on the data cubes **or** the respective http JSON files on the "explorer" websites, and extract all relevant information to be used for updating the `measurements.csv` file.
- Create one `measurements_<data-cube-name>.csv`, for the Swiss, African, and Australian data cubes (<data-cube-name>)
- Update specifically the `time_start` and `time_end` arguments. These are crucial in the `config_tool.ipynb` routine

## Workflow
1. Read `measurements.csv` as `pd.DataFrame`
2. Connect to relevant datacube / or use directly the `http` link with JSON files
3. Identify all available dataset names (`catalogue`)
4. Identify the **explorer** link to the individual datasets
5. Use `http` to crawl through the JSON files using `request` on the **explorer** webiste (not using dash/stack - way too slow)
7. Identify relevant `attributes` (on the Swiss Data Cube (SDC) this is: "https://explorer.swissdatacube.org/stac/collections/<"dataset name">/extent/temporal/interval/0/<"0 = start, or 1 = end">
8. Update fields `time_start` and `time_end` in `measurements_<data-cube-name>` pd.DataFrame
9. Write `measurements_<data-cube-name>.csv`

In [1]:
# 1
import pandas as pd
import requests
import copy

df_meas_base = pd.read_csv('data/measurements_SwissDC_2024.csv')

In [None]:
#
# SWISS Data Cube
#

In [3]:
dc_i = "SwissDC"
expl_i = "http://explorer.swissdatacube.org:8080/stac/collections"
# 1. Read measurements file
df = pd.read_csv("data/measurements_SwissDC_2024.csv")

# 2. Fetch STAC collections metadata
url = DCs_expl[i]
response = requests.get(url)
response.raise_for_status()
data = response.json()

# 3. Extract collection id, start date, and end date
records = []
for col in data.get("collections", []):
    cid = col.get("id")
    temporal = col.get("extent", {}).get("temporal", {}).get("interval", [[None, None]])[0]
    start = temporal[0]
    end = temporal[1]
    # The SwissDC does not provide metadata on spatial resoltion, so no point in trying to extract this here
    records.append({"product": cid, "time_start": start, "time_end": end})

meta_df = pd.DataFrame(records)
# print(meta_df)

# 4. Clean date format (keep YYYY or YYYY-MM if possible)
def format_date(d):
    if d is None:
        return None
    d = d.split("T")[0]  # drop time component
    # remove trailing "-01" if only year is available, or keep YYYY-MM. All seem to work in the config tool.
    return d

meta_df["time_start"] = meta_df["time_start"].apply(format_date)
meta_df["time_end"] = meta_df["time_end"].apply(format_date)

# 5. Merge with your measurements file
merged = df.merge(meta_df, on="product", how="left", suffixes=("", "_new"))

# # 6. Overwrite only if new info available
# for col in ["time_start", "time_end"]:
#     merged[col] = merged[f"{col}_new"].combine_first(merged[col])

# 6. Overwrite existing values if new info available (not None)
# But skip arealstatistik and corinelc_europe - not valid dates inside!
skip_products = ["arealstatistik", "corinelc_europe"]
mask = ~merged["product"].isin(skip_products)
for col in ["time_start", "time_end"]:
    merged.loc[mask,col] = merged.loc[mask,f"{col}_new"].where(merged.loc[mask,f"{col}_new"].notna(), merged.loc[mask,col])


# 7. Drop temporary columns and save
merged = merged.drop(columns=["time_start_new", "time_end_new"])
merged.to_csv(f"measurements_{dc_i}.csv", index=False)

print(f"Updated metadata saved to 'measurements_{dc_i}.csv'")

✅ Updated metadata saved to 'measurements_SwissDC.csv'


In [None]:
#
# African Data Cube
#

In [4]:
import requests
import pandas as pd

collections_url = "https://explorer.digitalearth.africa/stac/collections"
collections = requests.get(collections_url).json()["collections"]


In [7]:
col = collections[0]
# col
info = {
    "id": col["id"],
    "description": col["description"],
    "extent_spatial": col["extent"]["spatial"]["bbox"][0],
    "extent_temporal": col["extent"]["temporal"]["interval"][0],
}

In [9]:
_measurement = info['id']
_description = info['description']
_extent = info['extent_spatial']

'alos_palsar_mosaic'

In [46]:
from pyproj import CRS, Transformer

# get the items link instead of the child
items_url = next(l["href"] for l in col["links"] if l["rel"] == "items")

# now request the first feature from that items collection
features = requests.get(items_url).json()["features"]
feature = features[0]

epsg = feature['properties']['proj:epsg']
transform = feature["properties"]["proj:transform"]

crs = CRS.from_epsg(epsg)
print(crs)

# # extract transform and resolution
# transform = feature["properties"]["proj:transform"]
# res_deg = transform[0]
# res_m = abs(res_deg * 111320)
# info["resolution_m"] = res_m
# print(feature)

EPSG:4326


In [45]:
feature.keys()
epsg = feature['properties']['proj:epsg']

4326

In [15]:
res_m

24.73777777777779

In [16]:
# with pystac
from pystac_client import Client

# Connect to Digital Earth Africa's STAC endpoint
catalog = Client.open("https://explorer.digitalearth.africa/stac")

# Inspect available collections
collections = list(catalog.get_collections())
print([c.id for c in collections])

# Select one collection
collection_id = "alos_palsar_mosaic"
collection = catalog.get_collection(collection_id)
print(collection.description)

# Search for items in the collection (you can add bbox, datetime, or query filters)
search = catalog.search(collections=[collection_id], max_items=5)
items = list(search.items())

# Work with the first item
item = items[0]
print(f"Item ID: {item.id}")
print(f"Projection transform: {item.properties['proj:transform']}")

# Compute approximate spatial resolution in meters
res_deg = item.properties["proj:transform"][0]
res_m = abs(res_deg * 111320)
print(f"Resolution ≈ {res_m:.1f} m")


/Users/pohle/miniconda3/envs/pystac_example/lib/python3.11/site-packages/pystac_client/client.py:799: MissingLink: No link with rel='data' could be found on this Client.
  href = self._get_href("data", data_link, "collections")


['alos_palsar_mosaic', 'cci_landcover', 'cgls_landcover', 'cgls_lwq100_2019_2024', 'cgls_lwq100_2024_nrt', 'cgls_lwq300_2002_2012', 'cgls_lwq300_2016_2024', 'cgls_lwq300_2024_nrt', 'crop_mask', 'crop_mask_central', 'crop_mask_eastern', 'crop_mask_indian_ocean', 'crop_mask_northern', 'crop_mask_sahel', 'crop_mask_southeast', 'crop_mask_southern', 'crop_mask_western', 'dem_cop_30', 'dem_cop_90', 'dem_srtm', 'dem_srtm_deriv', 'esa_worldcereal_activecropland', 'esa_worldcereal_maize_active', 'esa_worldcereal_maize_irrigation', 'esa_worldcereal_maize_main', 'esa_worldcereal_temporarycrops', 'esa_worldcereal_wintercereals', 'esa_worldcereal_wintercereals_irrigation', 'esa_worldcover', 'esa_worldcover_2020', 'esa_worldcover_2021', 'fc_ls', 'fc_ls_summary_annual', 'gm_ls5_ls7_annual', 'gm_ls5_ls7_annual_lowres', 'gm_ls8_annual', 'gm_ls8_annual_lowres', 'gm_ls8_ls9_annual', 'gm_ls8_ls9_annual_lowres', 'gm_s2_annual', 'gm_s2_annual_lowres', 'gm_s2_rolling', 'gm_s2_semiannual', 'gm_s2_semiannual_

In [27]:
item = items[0]
print(f"Item ID: {item.id}")
print(f"Projection transform: {item.properties['proj:epsg']}")


Item ID: 003d41a0-3cd0-54c5-83b8-53c74fc9ae10


KeyError: 'proj:epsg'

In [33]:
epsg

In [32]:
from pyproj import CRS, Transformer

# Select one collection
collection_id = "ndvi_anomaly"
collection = catalog.get_collection(collection_id)
print(collection.description)

# Search for items in the collection (you can add bbox, datetime, or query filters)
search = catalog.search(collections=[collection_id], max_items=5)
items = list(search.items())

# Work with the first item
item = items[0]
print(f"Item ID: {item.id}")
print(f"Projection transform: {item.properties['proj:transform']}")

# Compute approximate spatial resolution in meters
# projection = item.properties["proj:transform"]
epsg = item.properties.get("proj:epsg")
res_deg = item.properties.get("proj:transform")#[0]



crs = CRS.from_epsg(epsg)
print(crs)

# res_deg = transform[0]
# if epsg == 4326:
#     # Approximate meters per degree at equator
#     res_m = abs(res_deg * 111320)
# else:
#     # In projected CRS, resolution may already be in meters
#     res_m = abs(res_deg)

# print(f"Approximate resolution: {res_m:.2f} meters")

# res_m = abs(res_deg[0] * 111320)
# print(f"Resolution ≈ {res_m:.1f} m")

Monthly NDVI Anomalies produced by Digital Earth Africa.
Item ID: 003d41a0-3cd0-54c5-83b8-53c74fc9ae10
Projection transform: [30.0, 0.0, 3168000.0, 0.0, -30.0, 2304000.0, 0.0, 0.0, 1.0]


CRSError: Invalid projection: EPSG:None: (Internal Proj Error: proj_create: crs not found: EPSG:None)

In [22]:
projection

[30.0, 0.0, 3168000.0, 0.0, -30.0, 2304000.0, 0.0, 0.0, 1.0]

In [13]:
child_url = next(l["href"] for l in col["links"] if l["rel"] == "child")
feature = requests.get(child_url).json()["features"][0]
transform = feature["properties"]["proj:transform"]
res_deg = transform[0]
res_m = res_deg * 111320  # ≈ 25 m for ALOS mosaics
info["resolution_m"] = res_m


KeyError: 'features'

In [10]:
info

{'id': 'alos_palsar_mosaic',
 'description': 'ALOS/PALSAR and ALOS-2/PALSAR-2 annual mosaic tiles generated for use in the Data Cube - 25m pixel spacing, WGS84. These tiles are derived from the orignal JAXA mosaics with conversion to GeoTIFF.',
 'extent_spatial': [-20.000000000000004,
  -35.019721543030116,
  55.000000000000014,
  39.99999999999999],
 'extent_temporal': ['2007-01-01T00:00:00Z', '2022-01-01T00:00:00Z']}

In [3]:
import requests
import pandas as pd
import rasterio
from rasterio.warp import transform_bounds

# --- INPUT CSV (Swiss DC reference) ---
df = pd.read_csv("../data/measurements_SwissDC_2024.csv")

# --- AFRICAN DATA CUBE STAC URLs ---
collections_url = "https://explorer.digitalearth.africa/stac/collections"
search_url = "https://explorer.digitalearth.africa/stac/search"

# Fetch all collections
collections = requests.get(collections_url).json()["collections"]

# --- FUNCTION: compute resolution from GeoTIFF using rasterio ---
def get_resolution_from_asset(asset_url):
    """
    Compute approximate resolution in meters from a GeoTIFF asset using rasterio.
    Returns None if asset cannot be accessed.
    """
    try:
        with rasterio.Env():
            with rasterio.open(asset_url) as src:
                if src.crs.is_geographic:
                    lon_center = (src.bounds.left + src.bounds.right) / 2
                    lat_center = (src.bounds.bottom + src.bounds.top) / 2
                    utm_zone = int((lon_center + 180) / 6) + 1
                    utm_crs = f"+proj=utm +zone={utm_zone} +datum=WGS84 +units=m +no_defs"
                    bounds_utm = transform_bounds(src.crs, utm_crs, *src.bounds)
                    width_m = bounds_utm[2] - bounds_utm[0]
                    height_m = bounds_utm[3] - bounds_utm[1]
                    res_x = width_m / src.width
                    res_y = height_m / src.height
                else:
                    res_x, res_y = src.res
                res_m = (abs(res_x) + abs(res_y)) / 2
                return int(round(res_m))
    except Exception as e:
        print("Warning: cannot read", asset_url, ":", e)
        return None

# --- MAIN LOOP ---
rows = []

for col in collections:
    col_id = col["id"]
    print(col_id)

    # Fetch first item for collection
    params = {"collection": col_id, "limit": 1}
    response = requests.get(search_url, params=params).json()
    features = response.get("features")
    if not features:
        continue

    item = features[0]
    props = item["properties"]
    assets = item.get("assets", {})

    for meas_name, asset in assets.items():
        asset_url = asset.get("href")
        resolution = get_resolution_from_asset(asset_url) if asset_url else None

        # dtype and units
        dtype = "int16" if asset.get("type","").lower().endswith("geotiff") else ""
        units = "category" if dtype == "int16" else ""

        # Aliases from eo:bands if asset was readable
        eo_bands = asset.get("eo:bands") or []
        if eo_bands and resolution is not None:
            aliases_list = [b.get("name","") or b.get("description","") or "" for b in eo_bands]
            aliases = str(aliases_list)
        else:
            aliases = ""

        row = {
            "product": props.get("odc:product"),
            "measurement": meas_name,
            "name": meas_name,
            "dtype": dtype,
            "units": units,
            "nodata": 0,
            "aliases": aliases,
            "flags_definition": "",
            "resolution": resolution,
            "time_start": props.get("start_datetime"),
            "time_end": props.get("end_datetime")
        }
        rows.append(row)

# Create DataFrame
df_new = pd.DataFrame(rows)

# Skip unwanted products
skip_products = ["arealstatistik", "corinelc_europe"]
df_new = df_new[~df_new["product"].isin(skip_products)]

# --- SAVE CSV ---
df_new.to_csv("measurements_AfricanDC.csv", index=False)
print("measurements_AfricanDC.csv created with", len(df_new), "rows")


alos_palsar_mosaic
cci_landcover
cgls_landcover


KeyboardInterrupt: 