In [2]:
#!/usr/bin/env python3
# Exact EASE-Grid 2.0 sampling (EPSG:6933) for SMAP L4 soil moisture (45N+ monthly subset)
# - Reprojects sites to EPSG:6933 (EASE-Grid 2.0 Global, 9 km)
# - Computes (row, col) using a projection-anchored formula (no hard-coded XMIN/YMAX)
# - Indexes sm_surface / sm_rootzone directly
# - Writes monthly CSV for 2000–2023

import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
from pyproj import Transformer

# -------------------- CONFIG --------------------
SHAPEFILE   = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_sites.shp"
H5_DIR      = "/explore/nobackup/people/spotter5/anna_v/v2/L4_SM_NRv11-4_40N+_soil_moisture"
OUT_CSV     = "/explore/nobackup/people/spotter5/anna_v/v2/soil_moisture_by_site_monthly_2000_2023.csv"

START_YEAR  = 2000
END_YEAR    = 2023

# EASE-Grid 2.0 Global (EPSG:6933) constants for 9-km product
PIX_SIZE_M    = 9000.0          # meters
GLOBAL_NCOLS  = 3856            # confirmed by your arrays (width)
SUBSET_NROWS  = 236             # confirmed by your arrays (height 90N→45N)
FILLVALUE     = -9999.0
# ------------------------------------------------

def build_filename(year: int, month: int) -> str:
    return f"L4_SM_NRv11-4_40N+_soil_moisture_Y{year:04d}M{month:02d}.h5"

def read_arrays(h5_path: str):
    """Open H5 and return (sm_surface, sm_rootzone) as float32 numpy arrays with FILLVALUE set to NaN."""
    ds = xr.open_dataset(h5_path, engine="h5netcdf", phony_dims="sort")
    sm_surf = ds["sm_surface"].values.astype("float32")
    sm_root = ds["sm_rootzone"].values.astype("float32")
    sm_surf[sm_surf == FILLVALUE] = np.nan
    sm_root[sm_root == FILLVALUE] = np.nan
    # Sanity shape check
    nrows, ncols = sm_surf.shape
    if (nrows, ncols) != (SUBSET_NROWS, GLOBAL_NCOLS):
        raise ValueError(f"Unexpected array shape {sm_surf.shape} in {os.path.basename(h5_path)}; "
                         f"expected ({SUBSET_NROWS}, {GLOBAL_NCOLS}).")
    return sm_surf, sm_root

def make_transformer():
    # lon/lat -> EPSG:6933
    return Transformer.from_crs("EPSG:4326", "EPSG:6933", always_xy=True)

def compute_row_anchor(transformer):
    """
    Compute projected Y at 90N to anchor the TOP of the north-hemisphere grid.
    Row index is measured down from this anchor in PIX_SIZE_M increments.
    """
    x90, y90 = transformer.transform(0.0, 90.0)
    x45, y45 = transformer.transform(0.0, 45.0)
    est_rows_45N = (y90 - y45) / PIX_SIZE_M
    print(f"[INFO] Projected span 90N→45N ≈ {est_rows_45N:.2f} rows (expected ~{SUBSET_NROWS})")
    return y90  # top anchor (row 0 at 90N)

def lonlat_to_epsg6933(transformer, lon, lat):
    return transformer.transform(lon, lat)

def xy_to_colrow_from_anchors(x_m, y_m, y_top_anchor):
    """
    Column (centered): col = floor(x/PIX + GLOBAL_NCOLS/2)
    Row anchored to y at 90N: row_global_from_90 = floor((y_top_anchor - y)/PIX)
    For the 45N+ subset, local rows are the first SUBSET_NROWS starting at 90N.
    """
    col = int(np.floor(x_m / PIX_SIZE_M + GLOBAL_NCOLS / 2.0))
    row_global_from_90 = int(np.floor((y_top_anchor - y_m) / PIX_SIZE_M))
    row_local = row_global_from_90  # subset starts right at 90N
    return col, row_local

def main():
    # Load sites; keep only rows with geometry
    gdf = gpd.read_file(SHAPEFILE)
    gdf = gdf[gdf.geometry.notnull()].copy()
    if gdf.crs is None:
        gdf = gdf.set_crs("EPSG:4326")

    # Normalize site id column
    if "site_refer" in gdf.columns and "site_reference" not in gdf.columns:
        gdf = gdf.rename(columns={"site_refer": "site_reference"})
    if "site_reference" not in gdf.columns:
        gdf["site_reference"] = [f"site_{i}" for i in range(len(gdf))]

    # Prepare transformer and compute top (90N) y-anchor
    tf = make_transformer()
    y_top = compute_row_anchor(tf)

    # Precompute site indices (allow NaN for out-of-range)
    site_index_info = []
    for _, row in gdf.iterrows():
        lon, lat = row.geometry.x, row.geometry.y

        # Sites south of 45N are outside subset by definition
        if lat < 45.0:
            site_index_info.append({
                "site_reference": row["site_reference"],
                "usable": False,
                "reason": "lat<45N",
                "col": np.nan, "row_local": np.nan
            })
            continue

        x_m, y_m = lonlat_to_epsg6933(tf, lon, lat)
        col, row_local = xy_to_colrow_from_anchors(x_m, y_m, y_top)
        usable = (0 <= col < GLOBAL_NCOLS) and (0 <= row_local < SUBSET_NROWS)

        site_index_info.append({
            "site_reference": row["site_reference"],
            "usable": bool(usable),
            "reason": None if usable else "out_of_bounds",
            "col": col if usable else np.nan,
            "row_local": row_local if usable else np.nan
        })

    site_idx_df = pd.DataFrame(site_index_info)

    # Ensure numeric dtype (avoids float indexing issues)
    site_idx_df["row_local"] = pd.to_numeric(site_idx_df["row_local"], errors="coerce")
    site_idx_df["col"]       = pd.to_numeric(site_idx_df["col"], errors="coerce")

    n_total = len(site_idx_df)
    n_use = int(site_idx_df["usable"].sum())
    n_bad = n_total - n_use
    print(f"[INFO] Sites usable in 40N+ subset: {n_use}/{n_total} "
          f"(excluded south of 40N or out_of_bounds: {n_bad})")
    if n_use == 0:
        print("[WARN] No sites landed in the 45N+ strip. Check site latitudes/CRS.")

    records = []

    for year in range(START_YEAR, END_YEAR + 1):
        for month in range(1, 13):
            fpath = os.path.join(H5_DIR, build_filename(year, month))
            if not os.path.exists(fpath):
                print(f"[WARN] Missing file: {fpath}")
                continue

            try:
                sm_surface, sm_rootzone = read_arrays(fpath)
            except Exception as e:
                print(f"[ERROR] {fpath}: {e}")
                continue

            # Sample each site using direct numpy indexing
            for _, r in site_idx_df.iterrows():
                if (not r["usable"]) or pd.isna(r["row_local"]) or pd.isna(r["col"]):
                    val_s = np.nan
                    val_r = np.nan
                else:
                    rs = int(r["row_local"])
                    cs = int(r["col"])
                    if 0 <= rs < SUBSET_NROWS and 0 <= cs < GLOBAL_NCOLS:
                        val_s = sm_surface[rs, cs]
                        val_r = sm_rootzone[rs, cs]
                    else:
                        val_s = np.nan
                        val_r = np.nan

                records.append({
                    "site_reference": r["site_reference"],
                    "year": year,
                    "month": month,
                    "sm_surface": float(val_s) if np.isfinite(val_s) else np.nan,
                    "sm_rootzone": float(val_r) if np.isfinite(val_r) else np.nan
                })

    out_df = pd.DataFrame.from_records(records)
    out_df.sort_values(["site_reference", "year", "month"], inplace=True)
    out_df.to_csv(OUT_CSV, index=False)
    print(f"Saved: {OUT_CSV}  (rows={len(out_df)})")

if __name__ == "__main__":
    main()


[INFO] Projected span 90N→45N ≈ 240.24 rows (expected ~236)
[INFO] Sites usable in 40N+ subset: 2143/2144 (excluded south of 40N or out_of_bounds: 1)
[ERROR] /explore/nobackup/people/spotter5/anna_v/v2/L4_SM_NRv11-4_40N+_soil_moisture/L4_SM_NRv11-4_40N+_soil_moisture_Y2000M01.h5: Unexpected array shape (289, 3856) in L4_SM_NRv11-4_40N+_soil_moisture_Y2000M01.h5; expected (236, 3856).
[ERROR] /explore/nobackup/people/spotter5/anna_v/v2/L4_SM_NRv11-4_40N+_soil_moisture/L4_SM_NRv11-4_40N+_soil_moisture_Y2000M02.h5: Unexpected array shape (289, 3856) in L4_SM_NRv11-4_40N+_soil_moisture_Y2000M02.h5; expected (236, 3856).
[ERROR] /explore/nobackup/people/spotter5/anna_v/v2/L4_SM_NRv11-4_40N+_soil_moisture/L4_SM_NRv11-4_40N+_soil_moisture_Y2000M03.h5: Unexpected array shape (289, 3856) in L4_SM_NRv11-4_40N+_soil_moisture_Y2000M03.h5; expected (236, 3856).
[ERROR] /explore/nobackup/people/spotter5/anna_v/v2/L4_SM_NRv11-4_40N+_soil_moisture/L4_SM_NRv11-4_40N+_soil_moisture_Y2000M04.h5: Unexpect

KeyError: 'site_reference'

In [16]:
df = pd.read_csv('/explore/nobackup/people/spotter5/anna_v/v2/soil_moisture_by_site_monthly_2000_2023.csv')

df

Unnamed: 0,site_reference,year,month,sm_surface,sm_rootzone
0,APEX Beta_Active Margin_AMCH1_agg_chamber,2000,1,0.538579,0.848821
1,APEX Beta_Active Margin_AMCH1_agg_chamber,2000,2,0.538579,0.848813
2,APEX Beta_Active Margin_AMCH1_agg_chamber,2000,3,0.538579,0.848805
3,APEX Beta_Active Margin_AMCH1_agg_chamber,2000,4,0.639189,0.881813
4,APEX Beta_Active Margin_AMCH1_agg_chamber,2000,5,0.660515,0.897082
...,...,...,...,...,...
617467,Zotino_RU-Zot_tower,2023,8,0.226507,0.151379
617468,Zotino_RU-Zot_tower,2023,9,0.208362,0.137487
617469,Zotino_RU-Zot_tower,2023,10,0.234460,0.144890
617470,Zotino_RU-Zot_tower,2023,11,0.238286,0.152914


In [13]:
df['sm_surface'].unique()

array([nan])