# Metadata extraction - teachers only
This file is used only by the teachers to prepare metadata, such as the time periods covered by the individual datasets. It avoids having to run very time-consuming `dask` and `stack` commands that crawl through the data cube to extract this information for each dataset individually.

Crawl through the individual datasets on the data cubes **or** the respective http JSON files on the "explorer" websites, and extract all relevant information to be used for updating the `measurements.csv` file.
- Create one `measurements_<data-cube-name>.csv`, for the Swiss, African, and Australian data cubes (<data-cube-name>)
- Update specifically the `time_start` and `time_end` arguments. These are crucial in the `config_tool.ipynb` routine

## Workflow
1. Read `measurements.csv` as `pd.DataFrame`
2. Connect to the relevant data cube, or directly use the `http` link to the JSON files
3. Identify all available dataset names (`catalogue`)
4. Identify the **explorer** link to the individual datasets
5. Use `http` to crawl through the JSON files using `requests` on the **explorer** website (not using `dask`/`stack` - way too slow)
7. Identify relevant `attributes` (on the Swiss Data Cube (SDC) this is: `https://explorer.swissdatacube.org/stac/collections/<dataset name>/extent/temporal/interval/0/<0 = start, or 1 = end>`)
8. Update fields `time_start` and `time_end` in `measurements_<data-cube-name>` pd.DataFrame
9. Write `measurements_<data-cube-name>.csv`

In [1]:
# 1
import pandas as pd
import requests
import copy

# Load the Swiss Data Cube measurements table (2024 file) into a DataFrame.
df_meas_base = pd.read_csv('data/measurements_SwissDC_2024.csv')

In [None]:
#
# SWISS Data Cube
#

In [3]:
dc_i = "SwissDC"
expl_i = "http://explorer.swissdatacube.org:8080/stac/collections"

# 1. Read measurements file
df = pd.read_csv("data/measurements_SwissDC_2024.csv")

# 2. Fetch STAC collections metadata
# The endpoint is the `expl_i` URL defined above; the former `DCs_expl[i]`
# lookup referenced two names that are never defined in this notebook and
# therefore raised a NameError.
url = expl_i
# An explicit timeout keeps the cell from hanging forever on a dead server.
response = requests.get(url, timeout=60)
response.raise_for_status()
data = response.json()

# 3. Extract collection id, start date, and end date
records = []
for col in data.get("collections", []):
    cid = col.get("id")
    # Tolerate missing, null or empty extent information: such collections
    # simply get no start/end date instead of aborting the whole crawl.
    extent = col.get("extent") or {}
    temporal_extent = extent.get("temporal") or {}
    intervals = temporal_extent.get("interval") or [[None, None]]
    temporal = intervals[0] or [None, None]
    start = temporal[0]
    end = temporal[1]
    # The SwissDC does not provide metadata on spatial resolution, so no point in trying to extract this here
    records.append({"product": cid, "time_start": start, "time_end": end})

# Naming the columns explicitly keeps them present even when the server
# returned no collections, so later column lookups and the merge still work.
meta_df = pd.DataFrame(records, columns=["product", "time_start", "time_end"])

# 4. Clean date format (keep only the calendar-date part of each timestamp)
def format_date(d):
    """Return the date portion of a timestamp string.

    Everything from the first "T" onwards (the time component) is dropped;
    a string without a "T" is returned unchanged. No further shortening to
    YYYY or YYYY-MM is performed.

    Parameters
    ----------
    d : str or None
        Timestamp such as "2020-01-31T00:00:00Z", or None for a missing bound.

    Returns
    -------
    str or None
        The text before the first "T", or None when `d` is None.
    """
    if d is not None:
        date_part, _, _ = d.partition("T")
        return date_part
    return None

# Strip the time component from both interval bounds of the crawled metadata
for bound in ("time_start", "time_end"):
    meta_df[bound] = meta_df[bound].apply(format_date)

# 5. Merge with your measurements file
# Left join: every measurement row is kept; crawled dates arrive in the
# "<column>_new" helper columns.
merged = df.merge(meta_df, on="product", how="left", suffixes=("", "_new"))

# 6. Overwrite existing values if new info available (not None)
# But skip arealstatistik and corinelc_europe - not valid dates inside!
skip_products = ["arealstatistik", "corinelc_europe"]
mask = ~merged["product"].isin(skip_products)
for bound in ("time_start", "time_end"):
    crawled = merged.loc[mask, f"{bound}_new"]
    existing = merged.loc[mask, bound]
    # Take the crawled value where there is one, otherwise keep the old value.
    merged.loc[mask, bound] = crawled.where(crawled.notna(), existing)

# 7. Drop temporary columns and save
helper_columns = [f"{bound}_new" for bound in ("time_start", "time_end")]
merged = merged.drop(columns=helper_columns)
out_name = f"measurements_{dc_i}.csv"
merged.to_csv(out_name, index=False)

print(f"Updated metadata saved to '{out_name}'")

✅ Updated metadata saved to 'measurements_SwissDC.csv'


In [None]:
#
# African Data Cube
#

In [54]:
import requests
import pandas as pd
import rasterio
from rasterio.warp import transform_bounds

# --- INPUT CSV (Swiss DC reference) ---
df = pd.read_csv("data/measurements_SwissDC_2024.csv")

# --- AFRICAN DATA CUBE STAC URLs ---
collections_url = "https://explorer.digitalearth.africa/stac/collections"
search_url = "https://explorer.digitalearth.africa/stac/search"

# Fetch all collections
# Check the HTTP status before parsing (same handling as the Swiss Data Cube
# cell) so a server error surfaces as a clear HTTPError rather than an
# obscure JSON/KeyError. The timeout keeps the cell from hanging forever.
collections_response = requests.get(collections_url, timeout=60)
collections_response.raise_for_status()
collections = collections_response.json()["collections"]

# --- FUNCTION: compute resolution from GeoTIFF using rasterio ---
def get_resolution_from_asset(asset_url):
    """
    Estimate the pixel size of a raster asset as a whole number.

    The asset is opened with rasterio. For a geographic CRS the raster bounds
    are reprojected to the UTM zone of the central longitude, and the extent
    is divided by the pixel count per axis. For any other CRS the native
    pixel size (`src.res`) is used as is - presumably in metres; this is not
    checked. The result is the mean of the absolute x and y pixel sizes,
    rounded to the nearest integer.

    Parameters
    ----------
    asset_url : str
        Path or URL handed to `rasterio.open`.

    Returns
    -------
    int or None
        Rounded mean pixel size, or None when anything fails while opening or
        reading the asset (a warning is printed in that case).
    """
    try:
        with rasterio.Env(), rasterio.open(asset_url) as src:
            if not src.crs.is_geographic:
                res_x, res_y = src.res
            else:
                bounds = src.bounds
                mid_lon = (bounds.left + bounds.right) / 2
                # UTM zones are 6 degrees wide, numbered from 180 degrees west.
                zone = int((mid_lon + 180) / 6) + 1
                utm_crs = f"+proj=utm +zone={zone} +datum=WGS84 +units=m +no_defs"
                west, south, east, north = transform_bounds(src.crs, utm_crs, *bounds)
                res_x = (east - west) / src.width
                res_y = (north - south) / src.height
            mean_res = (abs(res_x) + abs(res_y)) / 2
            return int(round(mean_res))
    except Exception as e:
        # Best effort: an unreadable asset must not abort the whole crawl.
        print("Warning: cannot read", asset_url, ":", e)
        return None

# --- MAIN LOOP ---
# For every collection, fetch one item and emit one CSV row per asset of
# that item.

# Column order of the output CSV. Passing it to the DataFrame constructor
# guarantees the columns exist even when no rows were collected.
csv_columns = [
    "product",
    "measurement",
    "name",
    "dtype",
    "units",
    "nodata",
    "aliases",
    "flags_definition",
    "resolution",
    "time_start",
    "time_end",
]
rows = []

for col in collections:
    col_id = col["id"]
    print(col_id)

    # Fetch first item for collection
    # NOTE(review): the STAC API Item Search spec names this query parameter
    # "collections" (plural) - confirm the explorer really filters on the
    # singular "collection" used here.
    params = {"collection": col_id, "limit": 1}
    try:
        search_response = requests.get(search_url, params=params, timeout=60)
        search_response.raise_for_status()
        response = search_response.json()
    except (requests.RequestException, ValueError) as e:
        # Best effort: one failing collection must not abort the whole crawl.
        print("Warning: cannot query items for", col_id, ":", e)
        continue
    features = response.get("features")
    if not features:
        continue

    item = features[0]
    props = item["properties"]
    assets = item.get("assets", {})

    for meas_name, asset in assets.items():
        asset_url = asset.get("href")
        resolution = get_resolution_from_asset(asset_url) if asset_url else None

        # dtype and units
        # Heuristic only: the values are guessed from the asset media type,
        # not read from the data. `or ""` guards against an explicit null.
        asset_type = (asset.get("type") or "").lower()
        dtype = "int16" if asset_type.endswith("geotiff") else ""
        units = "category" if dtype == "int16" else ""

        # Aliases from eo:bands if asset was readable
        eo_bands = asset.get("eo:bands") or []
        if eo_bands and resolution is not None:
            aliases_list = [b.get("name", "") or b.get("description", "") or "" for b in eo_bands]
            aliases = str(aliases_list)
        else:
            aliases = ""

        # NOTE(review): time_start/time_end come from the single fetched
        # item, not from the collection's temporal extent - confirm this is
        # the intended meaning of these columns.
        row = {
            "product": props.get("odc:product"),
            "measurement": meas_name,
            "name": meas_name,
            "dtype": dtype,
            "units": units,
            "nodata": 0,
            "aliases": aliases,
            "flags_definition": "",
            "resolution": resolution,
            "time_start": props.get("start_datetime"),
            "time_end": props.get("end_datetime"),
        }
        rows.append(row)

# Create DataFrame
df_new = pd.DataFrame(rows, columns=csv_columns)

# Skip unwanted products
# NOTE(review): these are the product names skipped in the Swiss Data Cube
# cell - verify that they are relevant for the African cube at all.
skip_products = ["arealstatistik", "corinelc_europe"]
df_new = df_new[~df_new["product"].isin(skip_products)]

# --- SAVE CSV ---
df_new.to_csv("measurements_AfricanDC.csv", index=False)
print("measurements_AfricanDC.csv created with", len(df_new), "rows")


alos_palsar_mosaic
cci_landcover
cgls_landcover
cgls_lwq100_2019_2024
cgls_lwq100_2024_nrt
cgls_lwq300_2002_2012
cgls_lwq300_2016_2024
cgls_lwq300_2024_nrt
crop_mask
crop_mask_central
crop_mask_eastern
crop_mask_indian_ocean
crop_mask_northern
crop_mask_sahel
crop_mask_southeast
crop_mask_southern
crop_mask_western
dem_cop_30
dem_cop_90
dem_srtm
dem_srtm_deriv
esa_worldcereal_activecropland
esa_worldcereal_maize_active
esa_worldcereal_maize_irrigation
esa_worldcereal_maize_main
esa_worldcereal_temporarycrops
esa_worldcereal_wintercereals
esa_worldcereal_wintercereals_irrigation
esa_worldcover
esa_worldcover_2020
esa_worldcover_2021
fc_ls
fc_ls_summary_annual
gm_ls5_ls7_annual
gm_ls5_ls7_annual_lowres
gm_ls8_annual
gm_ls8_annual_lowres
gm_ls8_ls9_annual
gm_ls8_ls9_annual_lowres
gm_s2_annual
gm_s2_annual_lowres
gm_s2_rolling
gm_s2_semiannual
gm_s2_semiannual_lowres
gmw
io_lulc
io_lulc_v2
isda_soil_bedrock_depth


  dataset = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)


isda_soil_bulk_density
isda_soil_carbon_total
isda_soil_clay_content
isda_soil_sand_content
isda_soil_silt_content
iwmi_blue_et_monthly
iwmi_evaporation_monthly
iwmi_green_et_monthly
iwmi_interception_monthly
iwmi_transpiration_monthly
jers_sar_mosaic
landsat_c2l2_ar
ls5_sr
ls5_st
ls7_sr
ls7_st
ls8_sr
ls8_st
ls9_sr
ls9_st
maxar_morocco_earthquake
maxar_morocco_earthquake_4bands
maxar_morocco_earthquake_8bands
nasadem
ndvi_anomaly
ndvi_climatology_ls
pc_s2_annual
rainfall_chirps_daily
rainfall_chirps_monthly
s1_monthly_mosaic
s1_rtc
s2_l2a
s2_l2a_c1
s3_ol_2_wfr_nrt
s3_olci_l2_lfr
s3_olci_l2_wfr
s3_syn_2_vg1
s5p_tropomi_l2_aer_ai
s5p_tropomi_l2_ch4
s5p_tropomi_l2_cloud
s5p_tropomi_l2_co
s5p_tropomi_l2_hcho
s5p_tropomi_l2_no2
s5p_tropomi_l2_o3
s5p_tropomi_l2_so2
wapor_soil_moisture
wofs_ls
wofs_ls_summary_alltime
wofs_ls_summary_annual
wsf_2015
wsf_2019
wsf_evolution
measurements_AfricanDC.csv created with 670 rows
