This script takes the predictor rasters, adds a band called burn fraction derived from the fire CCI-style annual burn-date rasters, and saves the new TIFF files. The outputs will eventually be used to make a Parquet file that is the training data frame. This version does it with MODIS.

In [None]:
import os
import re
import numpy as np
import rasterio as rio
from rasterio.warp import reproject, Resampling
from pathlib import Path
from tqdm import tqdm
from calendar import monthrange
from datetime import date

# ---------------- PATHS ----------------
# Input predictor stacks; the main loop globs this directory for "cems_e5l_mcd_*.tif".
CEMS_DIR  = "/explore/nobackup/people/spotter5/clelland_fire_ml/training_e5l_cems_mcd"
# Annual burn-date rasters, one "<year>.tif" per year.
# NOTE(review): the constant is named FIRECCI_* but the path points at an
# "mcd64a1_us" directory -- confirm which product these rasters actually hold.
FIRECCI_ANNUAL_DIR = "/explore/nobackup/people/spotter5/clelland_fire_ml/mcd64a1_us"   # DOY rasters: YYYY.tif
# Destination for the "<input name>_with_fraction.tif" outputs.
OUT_DIR   = "/explore/nobackup/people/spotter5/clelland_fire_ml/training_e5l_cems_mcd_with_fraction"

# Create the output directory up front; a no-op if it already exists.
os.makedirs(OUT_DIR, exist_ok=True)

# -------------- BANDS TO KEEP --------------
# Band names to carry over, in output order. Names are compared after
# normalization (lower-case, alphanumerics only): an exact match is tried
# first, then a substring match in either direction.
WANTED = [
    "DEM",
    "slope",
    "aspect",                     # will match 'aspect' or 'aspectrad' etc.
    "b1",                         # land cover (categorical, will be cast to float32 in output)
    "relative_humidity",
    "total_precipitation_sum",
    "temperature_2m",
    "temperature_2m_min",
    "temperature_2m_max",
    "build_up_index",
    "drought_code",
    "duff_moisture_code",
    "fine_fuel_moisture_code",
    "fire_weather_index",
    # NOTE(review): a band called 'initial_spread_index' would NOT match this
    # entry: 'initialspreadindex' is not a contiguous substring of
    # 'initialfirespreadindex' (nor the reverse), so the substring fallback
    # fails. Add that spelling explicitly if some files use it.
    "initial_fire_spread_index",
]

def _norm(s: str) -> str:
    return re.sub(r"[^a-z0-9]", "", s.lower())

# Normalized form of every WANTED entry, kept index-aligned with WANTED so the
# two lists can be iterated together.
WANTED_NORM = [_norm(x) for x in WANTED]

# -------------- HELPERS --------------
# Matches "cems_e5l_mcd_<YYYY>_<M or MM>.tif" at the end of a file name (case-insensitive).
name_re = re.compile(r"cems_e5l_mcd_(\d{4})_(\d{1,2})\.tif$", re.IGNORECASE)

def parse_year_month(path: Path):
    """Return ``(year, month)`` parsed from ``path.name``, or ``None`` if the name does not match."""
    hit = name_re.search(path.name)
    if hit is None:
        return None
    year_txt, month_txt = hit.groups()
    return int(year_txt), int(month_txt)

def doy_range_for_month(year: int, month: int):
    """Return the inclusive (first, last) day-of-year of *month* in *year*.

    Leap years are handled by the calendar/datetime helpers, so February and
    every later month shift by one day when *year* is a leap year.
    """
    start_doy = date(year, month, 1).timetuple().tm_yday
    days_in_month = monthrange(year, month)[1]
    return start_doy, start_doy + days_in_month - 1

def map_band_indices_by_name(ds: rio.DatasetReader):
    """Map normalized band descriptions to 1-based band indices.

    Bands whose description is ``None`` are keyed as ``B<index>`` (normalized).
    When two bands normalize to the same key, the later band index wins.

    Returns:
        ``(mapping, descs)`` where ``descs`` is ``ds.descriptions`` unchanged.
    """
    descs = ds.descriptions
    mapping = {
        _norm(f"B{pos}" if label is None else label): pos
        for pos, label in enumerate(descs, start=1)
    }
    return mapping, descs

# Cache annual burn-date arrays so the twelve monthly files of a year share a
# single read. The cache is bounded: the annual rasters are full resolution,
# and keeping every year alive for the whole run grows memory without limit.
_firecci_cache = {}
# Files are processed in sorted (year-grouped) order, so a couple of slots give
# the same hit rate as an unbounded cache.
_FIRECCI_CACHE_MAX_YEARS = 2

def get_firecci_year_arr(year: int):
    """Return ``(arr, info)`` for the annual burn-date raster of *year*.

    ``arr`` is band 1 of ``<FIRECCI_ANNUAL_DIR>/<year>.tif`` and ``info`` is a
    dict holding the raster's ``crs``, ``transform``, ``nodata``, ``height``
    and ``width``. Results are memoized; the oldest cached year is evicted
    once more than ``_FIRECCI_CACHE_MAX_YEARS`` years would be held.

    Raises:
        FileNotFoundError: if the annual raster for *year* does not exist.
    """
    cached = _firecci_cache.get(year)
    if cached is not None:
        return cached

    src_path = Path(FIRECCI_ANNUAL_DIR) / f"{year}.tif"
    if not src_path.exists():
        raise FileNotFoundError(f"Missing FireCCI annual DOY: {src_path}")

    with rio.open(src_path) as src:
        arr = src.read(1)         # DOY or 0/NoData
        info = {
            "crs": src.crs,
            "transform": src.transform,
            "nodata": src.nodata,
            "height": src.height,
            "width": src.width,
        }

    # Dicts preserve insertion order, so the first key is the oldest entry.
    while len(_firecci_cache) >= _FIRECCI_CACHE_MAX_YEARS:
        _firecci_cache.pop(next(iter(_firecci_cache)))

    _firecci_cache[year] = (arr, info)
    return _firecci_cache[year]

def monthly_fraction_from_annual(year: int, month: int, tmpl_ds: rio.DatasetReader) -> np.ndarray:
    """Compute the burned fraction for one month on the grid of *tmpl_ds*.

    The annual burn-date raster for *year* is turned into a uint8 mask
    (1 = burn date inside the month, 0 = otherwise, 255 = nodata) which is then
    warped onto the template grid with average resampling, so each output cell
    holds the mean of the 0/1 values that contributed to it.

    Args:
        year: Year of the annual burn-date raster to use.
        month: Calendar month (1-12) whose day-of-year range is selected.
        tmpl_ds: Open dataset supplying the target ``height``, ``width``,
            ``transform`` and ``crs``.

    Returns:
        float32 array of shape ``(tmpl_ds.height, tmpl_ds.width)``; cells that
        receive no value from the warp keep the initial NaN.

    Raises:
        FileNotFoundError: propagated from ``get_firecci_year_arr`` when the
            annual raster is missing.
    """
    arr, info = get_firecci_year_arr(year)
    doy1, doy2 = doy_range_for_month(year, month)
    src_nodata = info["nodata"]

    # Work on the raster's native dtype. Casting an integer raster to float32
    # only to test for NaN copies the full-resolution array on every call, and
    # NaN can only occur in floating-point data anyway.
    if np.issubdtype(arr.dtype, np.floating):
        is_nodata = np.isnan(arr)
    else:
        is_nodata = np.zeros(arr.shape, dtype=bool)
    if src_nodata is not None and not np.isnan(src_nodata):
        is_nodata |= (arr == src_nodata)

    # Burned this month: valid burn date within [doy1, doy2]. doy1 is always
    # >= 1, so this already excludes zero and negative values.
    # NOTE(review): negative values that are not the declared nodata are
    # counted as valid unburned pixels -- confirm that matches the product's
    # special codes.
    burned_this_month = (arr >= doy1) & (arr <= doy2) & ~is_nodata

    # Build mask
    mask = np.zeros(arr.shape, dtype=np.uint8)  # default 0 (unburned)
    mask[burned_this_month] = 1                 # burned in this month
    mask[is_nodata] = 255                       # nodata -> 255 so it is ignored by averaging

    # Reproject/aggregate to template grid
    frac = np.full((tmpl_ds.height, tmpl_ds.width), np.nan, dtype=np.float32)
    reproject(
        source=mask,
        destination=frac,
        src_transform=info["transform"],
        src_crs=info["crs"],
        src_nodata=255,                 # tell reproject to ignore 255
        dst_transform=tmpl_ds.transform,
        dst_crs=tmpl_ds.crs,
        dst_nodata=np.nan,              # keep NaNs in memory; nodata is not set on write
        resampling=Resampling.average,  # 0/1 average -> monthly burned fraction
        # An integer thread count is required here: 0 does not mean "all
        # CPUs" (GDAL treats counts <= 1 as single-threaded), so ask for the
        # real core count explicitly.
        num_threads=os.cpu_count() or 1,
    )
    return frac  # 0..1, NaN where no contributing data


# -------------- MAIN --------------
def _select_bands(band_map, src_name):
    """Pick the band index for every WANTED name, in WANTED order.

    An exact normalized-name match is preferred. Otherwise the first band whose
    normalized name contains (or is contained in) the wanted name is used, but
    only if that band is not already assigned and is not the exact match of
    another wanted name. Without that guard one source band (e.g.
    'temperature_2m') could be written several times under different labels
    (e.g. also as 'temperature_2m_min').

    Returns:
        ``(indices, names)``: 1-based band indices and the matching WANTED names.
    """
    exact = {w: band_map[w] for w in WANTED_NORM if w in band_map}
    claimed = set(exact.values())
    indices, names = [], []
    for want_norm, want_orig in zip(WANTED_NORM, WANTED):
        idx = exact.get(want_norm)
        if idx is None:
            # partial (handles 'aspect' vs 'aspectrad', etc.)
            for k_norm, cand in band_map.items():
                if cand in claimed:
                    continue
                if want_norm in k_norm or k_norm in want_norm:
                    idx = cand
                    break
        if idx is None:
            print(f"[WARN] {src_name}: could not find band like '{want_orig}'")
            continue
        claimed.add(idx)
        indices.append(idx)
        names.append(want_orig)
    return indices, names


tifs = sorted(Path(CEMS_DIR).glob("cems_e5l_mcd_*.tif"))
if not tifs:
    print(f"No CEMS files found in {CEMS_DIR}")

for src_path in tqdm(tifs):
    ym = parse_year_month(src_path)
    if ym is None:
        print(f"[SKIP name] {src_path.name}")
        continue
    year, month = ym

    out_path = Path(OUT_DIR) / src_path.name.replace(".tif", "_with_fraction.tif")
    if out_path.exists():
        print(f"[SKIP exists] {out_path.name}")
        continue

    with rio.open(src_path) as ds:
        band_map, _ = map_band_indices_by_name(ds)

        # Select bands in the requested order (best-effort name matching)
        keep_indices, keep_names = _select_bands(band_map, src_path.name)
        if not keep_indices:
            print(f"[SKIP no-bands] {src_path.name}")
            continue

        # Read selected bands (cast to float32 for a single dtype stack).
        # NOTE(review): any nodata sentinel in the source bands is kept as an
        # ordinary value while the nodata tag is dropped below -- confirm the
        # inputs already use NaN for missing data.
        data_list = [ds.read(bi).astype(np.float32) for bi in keep_indices]

        # Build monthly fraction from annual DOY and reproject/aggregate to this grid
        frac = monthly_fraction_from_annual(year, month, ds).astype(np.float32, copy=False)
        data_list.append(frac)

        out_arr = np.stack(data_list, axis=0)  # (bands, H, W)

        # Profile for output
        profile = ds.profile.copy()
        profile.update(
            driver="GTiff",  # explicit: the temporary file name below has no .tif suffix
            dtype="float32",
            count=out_arr.shape[0],
            compress="LZW",
            tiled=True,
            blockxsize=256,
            blockysize=256,
            BIGTIFF="IF_SAFER",
            nodata=None,  # leave NaNs; many stacks prefer this for float data
        )

        # Write to a temporary name and rename only once the file is complete.
        # Writing straight to out_path would leave a truncated TIFF behind if
        # the run is interrupted, and the "[SKIP exists]" check above would
        # then treat that broken file as finished on the next run.
        tmp_path = out_path.with_name(out_path.name + ".part")
        try:
            with rio.open(tmp_path, "w", **profile) as dst:
                dst.write(out_arr)
                for i, nm in enumerate(keep_names + ["fraction"], start=1):
                    dst.set_band_description(i, nm)
            os.replace(tmp_path, out_path)
        finally:
            # Only present here if the write or rename failed.
            if tmp_path.exists():
                tmp_path.unlink()

    print(f"[OK] {out_path.name}  (kept {len(keep_names)} bands + fraction)")


  0%|          | 1/275 [00:27<2:04:40, 27.30s/it]

[OK] cems_e5l_mcd_2001_1_with_fraction.tif  (kept 15 bands + fraction)


  1%|          | 2/275 [00:45<1:40:16, 22.04s/it]

[OK] cems_e5l_mcd_2001_10_with_fraction.tif  (kept 15 bands + fraction)


  1%|          | 3/275 [01:03<1:32:11, 20.34s/it]

[OK] cems_e5l_mcd_2001_11_with_fraction.tif  (kept 15 bands + fraction)


  1%|▏         | 4/275 [01:22<1:28:16, 19.54s/it]

[OK] cems_e5l_mcd_2001_12_with_fraction.tif  (kept 15 bands + fraction)


  2%|▏         | 5/275 [01:40<1:25:34, 19.02s/it]

[OK] cems_e5l_mcd_2001_2_with_fraction.tif  (kept 15 bands + fraction)


  2%|▏         | 6/275 [01:58<1:23:39, 18.66s/it]

[OK] cems_e5l_mcd_2001_3_with_fraction.tif  (kept 15 bands + fraction)


  3%|▎         | 7/275 [02:16<1:22:34, 18.49s/it]

[OK] cems_e5l_mcd_2001_4_with_fraction.tif  (kept 15 bands + fraction)


  3%|▎         | 8/275 [02:34<1:21:46, 18.37s/it]

[OK] cems_e5l_mcd_2001_5_with_fraction.tif  (kept 15 bands + fraction)


  3%|▎         | 9/275 [02:53<1:21:30, 18.39s/it]

[OK] cems_e5l_mcd_2001_6_with_fraction.tif  (kept 15 bands + fraction)


  4%|▎         | 10/275 [03:11<1:20:45, 18.28s/it]

[OK] cems_e5l_mcd_2001_7_with_fraction.tif  (kept 15 bands + fraction)


  4%|▍         | 11/275 [03:29<1:20:22, 18.27s/it]

[OK] cems_e5l_mcd_2001_8_with_fraction.tif  (kept 15 bands + fraction)


  4%|▍         | 12/275 [03:47<1:20:04, 18.27s/it]

[OK] cems_e5l_mcd_2001_9_with_fraction.tif  (kept 15 bands + fraction)


  5%|▍         | 13/275 [04:13<1:30:21, 20.69s/it]

[OK] cems_e5l_mcd_2002_1_with_fraction.tif  (kept 15 bands + fraction)


  5%|▌         | 14/275 [04:32<1:26:56, 19.99s/it]

[OK] cems_e5l_mcd_2002_10_with_fraction.tif  (kept 15 bands + fraction)


  5%|▌         | 15/275 [04:50<1:24:48, 19.57s/it]

[OK] cems_e5l_mcd_2002_11_with_fraction.tif  (kept 15 bands + fraction)


  6%|▌         | 16/275 [05:09<1:22:57, 19.22s/it]

[OK] cems_e5l_mcd_2002_12_with_fraction.tif  (kept 15 bands + fraction)


  6%|▌         | 17/275 [05:27<1:21:29, 18.95s/it]

[OK] cems_e5l_mcd_2002_2_with_fraction.tif  (kept 15 bands + fraction)


  7%|▋         | 18/275 [05:45<1:20:31, 18.80s/it]

[OK] cems_e5l_mcd_2002_3_with_fraction.tif  (kept 15 bands + fraction)


  7%|▋         | 19/275 [06:04<1:19:41, 18.68s/it]

[OK] cems_e5l_mcd_2002_4_with_fraction.tif  (kept 15 bands + fraction)


  7%|▋         | 20/275 [06:22<1:19:01, 18.59s/it]

[OK] cems_e5l_mcd_2002_5_with_fraction.tif  (kept 15 bands + fraction)


  8%|▊         | 21/275 [06:40<1:18:06, 18.45s/it]

[OK] cems_e5l_mcd_2002_6_with_fraction.tif  (kept 15 bands + fraction)


  8%|▊         | 22/275 [06:59<1:17:38, 18.41s/it]

[OK] cems_e5l_mcd_2002_7_with_fraction.tif  (kept 15 bands + fraction)


  8%|▊         | 23/275 [07:17<1:17:19, 18.41s/it]

[OK] cems_e5l_mcd_2002_8_with_fraction.tif  (kept 15 bands + fraction)


  9%|▊         | 24/275 [07:36<1:17:06, 18.43s/it]

[OK] cems_e5l_mcd_2002_9_with_fraction.tif  (kept 15 bands + fraction)


  9%|▉         | 25/275 [08:04<1:28:53, 21.34s/it]

[OK] cems_e5l_mcd_2003_1_with_fraction.tif  (kept 15 bands + fraction)


  9%|▉         | 26/275 [08:22<1:24:55, 20.46s/it]

[OK] cems_e5l_mcd_2003_10_with_fraction.tif  (kept 15 bands + fraction)


 10%|▉         | 27/275 [08:41<1:21:59, 19.84s/it]

[OK] cems_e5l_mcd_2003_11_with_fraction.tif  (kept 15 bands + fraction)


 10%|█         | 28/275 [08:59<1:19:41, 19.36s/it]

[OK] cems_e5l_mcd_2003_12_with_fraction.tif  (kept 15 bands + fraction)


 11%|█         | 29/275 [09:17<1:18:00, 19.03s/it]

[OK] cems_e5l_mcd_2003_2_with_fraction.tif  (kept 15 bands + fraction)


 11%|█         | 30/275 [09:35<1:16:53, 18.83s/it]

[OK] cems_e5l_mcd_2003_3_with_fraction.tif  (kept 15 bands + fraction)


 11%|█▏        | 31/275 [09:54<1:16:11, 18.73s/it]

[OK] cems_e5l_mcd_2003_4_with_fraction.tif  (kept 15 bands + fraction)


 12%|█▏        | 32/275 [10:12<1:15:20, 18.60s/it]

[OK] cems_e5l_mcd_2003_5_with_fraction.tif  (kept 15 bands + fraction)


 12%|█▏        | 33/275 [10:30<1:14:38, 18.51s/it]

[OK] cems_e5l_mcd_2003_6_with_fraction.tif  (kept 15 bands + fraction)


In [9]:
't'

't'

Make parquet file

In [None]:
import os
import re
import numpy as np
import pandas as pd
import rasterio as rio
from rasterio.windows import Window
from rasterio.warp import transform as rio_transform
from pathlib import Path
from tqdm import tqdm

import pyarrow as pa
import pyarrow.parquet as pq

# ================== CONFIG ==================
# Directory holding the "*_with_fraction.tif" stacks to be flattened.
IN_DIR   = "/explore/nobackup/people/spotter5/clelland_fire_ml/training_e5l_cems_mcd_with_fraction"
# Root of the partitioned Parquet dataset (year=/month= sub-directories).
OUT_DATASET_DIR = "/explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_with_fraction_dataset_mcd"
# Create the dataset root up front; a no-op if it already exists.
os.makedirs(OUT_DATASET_DIR, exist_ok=True)

# If your TIFF CRS is projected (e.g., EPSG:3413) but you want true lat/lon degrees:
# when True, pixel-center coordinates are transformed to EPSG:4326 before being
# stored in the longitude/latitude columns.
REPROJECT_TO_EPSG4326 = True

# Band names are taken from descriptions in the TIFFs
# (We'll sanitize to safe column names and ensure uniqueness.)
def sanitize_names(names):
    """Turn raw band descriptions into safe, unique column names.

    Each name is stripped, every character outside ``[A-Za-z0-9_]`` becomes an
    underscore, runs of underscores collapse to one, and leading/trailing
    underscores are removed. Missing or empty names become ``"band"``.
    Repeated names get a numeric suffix (``x``, ``x_2``, ``x_3``, ...).

    The suffix is advanced until the result is unused, so a generated name can
    never collide with a literal one: ``["a", "a", "a_2"]`` yields
    ``["a", "a_2", "a_2_2"]`` rather than a duplicated ``"a_2"``.

    Args:
        names: Iterable of band descriptions; items may be ``None``.

    Returns:
        List of unique column names, one per input item, in input order.
    """
    counts = {}   # base name -> highest suffix handed out so far
    used = set()  # every name already emitted
    out = []
    for n in names:
        raw = "" if n is None else str(n).strip()
        base = re.sub(r"[^a-zA-Z0-9_]", "_", raw)
        base = re.sub(r"_+", "_", base).strip("_") or "band"

        candidate = base
        while candidate in used:
            counts[base] = counts.get(base, 1) + 1
            candidate = f"{base}_{counts[base]}"

        used.add(candidate)
        out.append(candidate)
    return out

# Finds "cems_e5l_mcd_<YYYY>_<M or MM>" anywhere in a file name (case-insensitive).
name_re = re.compile(r"cems_e5l_mcd_(\d{4})_(\d{1,2})", re.IGNORECASE)
def parse_year_month(fname: str):
    """Return ``(year, month)`` parsed from *fname*, or ``(None, None)`` when the pattern is absent."""
    hit = name_re.search(fname)
    return tuple(int(part) for part in hit.groups()) if hit else (None, None)

def windows_for_dataset(ds: rio.DatasetReader):
    """Yield read windows covering *ds*, preferring its native block layout.

    The block windows of band 1 are used when they can be listed; otherwise the
    raster is covered with 1024x1024 tiles (clipped at the right/bottom edges).

    The native windows are collected before anything is yielded. If listing
    them failed part-way through a lazy loop, the tiled fallback would start
    again at (0, 0) and emit areas that had already been yielded.
    """
    try:
        native = [win for _, win in ds.block_windows(1)]
    except Exception:
        # Best-effort: any failure to enumerate blocks falls back to fixed tiles.
        native = None

    if native is not None:
        yield from native
        return

    tile = 1024
    for row_off in range(0, ds.height, tile):
        height = min(tile, ds.height - row_off)
        for col_off in range(0, ds.width, tile):
            width = min(tile, ds.width - col_off)
            yield Window(col_off, row_off, width, height)

def append_chunk_to_dataset(df: pd.DataFrame, root: str):
    """Write *df* into the Parquet dataset at *root*, partitioned by ``year`` and ``month``.

    The frame is converted to an Arrow table without its index and handed to
    ``pq.write_to_dataset``, which places the rows under ``year=YYYY/month=MM``
    directories. Dictionary encoding is turned off.
    """
    write_options = {
        "root_path": root,
        "partition_cols": ["year", "month"],  # directories year=YYYY/month=MM
        "use_dictionary": False,
    }
    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    pq.write_to_dataset(arrow_table, **write_options)

# ================== MAIN ==================
import shutil  # cell-local: only used to clear partitions left by an interrupted run

# Windows are buffered up to roughly this many rows before each Parquet write,
# so the dataset is not fragmented into one small file per raster block.
ROWS_PER_WRITE = 1_000_000
# Empty marker file created in a partition once its TIFF was written completely.
# The leading underscore follows the convention for non-data files inside
# Parquet datasets (NOTE(review): confirm your reader ignores such files).
DONE_MARKER = "_SUCCESS"


def _flush_frames(frames, root):
    """Write the buffered window frames as one chunk and empty the buffer."""
    if frames:
        append_chunk_to_dataset(pd.concat(frames, ignore_index=True), root)
        frames.clear()


tifs = sorted(Path(IN_DIR).glob("cems_e5l_mcd_*with_fraction.tif"))
if not tifs:
    tifs = sorted(Path(IN_DIR).glob("cems_e5l_mcd_*.tif"))

if not tifs:
    raise FileNotFoundError(f"No CEMS TIFFs found in {IN_DIR}")

# Optional: track a canonical column order from the first file for consistency
canonical_cols = None
# Partitions written during this run; a second TIFF for the same year/month
# appends to them instead of being skipped.
built_this_run = set()

for tif in tqdm(tifs, desc="Building partitioned Parquet dataset"):
    year, month = parse_year_month(tif.name)
    if year is None:
        print(f"[SKIP] {tif.name} (no YYYY_M in name)")
        continue

    part_dir = Path(OUT_DATASET_DIR) / f"year={year}" / f"month={month}"
    done_flag = part_dir / DONE_MARKER

    with rio.open(tif) as ds:
        # Band names
        band_names = list(ds.descriptions)
        if not any(band_names):
            band_names = [f"B{i}" for i in range(1, ds.count + 1)]
        safe_names = sanitize_names(band_names)

        # Establish canonical column order (bands first) once. This is done
        # before the skip check so the order does not depend on which
        # partitions happen to exist already.
        if canonical_cols is None:
            canonical_cols = safe_names + ["longitude", "latitude", "year", "month"]

        # Appending again to an existing partition would duplicate every row,
        # so finished partitions are skipped and unfinished ones (no marker,
        # e.g. after an interrupted run) are rebuilt from scratch.
        if (year, month) in built_this_run:
            if done_flag.exists():
                done_flag.unlink()  # partition is being extended; not complete until re-marked
        else:
            if done_flag.exists():
                print(f"[SKIP exists] {tif.name} (year={year}, month={month} already complete)")
                continue
            if part_dir.exists():
                print(f"[REBUILD] {tif.name}: removing incomplete partition {part_dir}")
                shutil.rmtree(part_dir)

        # The CRS does not change between windows; decide once per file.
        needs_reproject = (
            REPROJECT_TO_EPSG4326
            and ds.crs is not None
            and ds.crs.to_string().upper() not in ("EPSG:4326", "OGC:CRS84")
        )
        band_indexes = list(range(1, ds.count + 1))

        pending, pending_rows = [], 0

        # Process in windows; flush to Parquet whenever the buffer is large enough
        for win in windows_for_dataset(ds):
            data = ds.read(indexes=band_indexes, window=win).astype(np.float32)  # (bands, h, w)
            h, w_cols = data.shape[1], data.shape[2]

            # Row/col grid for this window
            rows = np.arange(int(win.row_off), int(win.row_off + h))
            cols = np.arange(int(win.col_off), int(win.col_off + w_cols))
            rr, cc = np.meshgrid(rows, cols, indexing="ij")

            # Pixel center coords in native CRS
            xs, ys = rio.transform.xy(ds.transform, rr, cc, offset="center")
            xs = np.asarray(xs, dtype=np.float64).reshape(-1)
            ys = np.asarray(ys, dtype=np.float64).reshape(-1)

            # Reproject to lon/lat if desired
            if needs_reproject:
                # transform expects 1-D arrays; returns lists
                lons, lats = rio_transform(ds.crs, "EPSG:4326", xs, ys)
                lons = np.array(lons, dtype=np.float64)
                lats = np.array(lats, dtype=np.float64)
            else:
                # Already geographic or CRS unknown; treat xs=longitude, ys=latitude as-is
                lons, lats = xs, ys

            # Flatten bands to rows
            arr2d = data.reshape(ds.count, -1).T  # (h*w, bands)
            df = pd.DataFrame(arr2d, columns=safe_names)

            # Add coords + partitions
            df["longitude"] = lons
            df["latitude"]  = lats
            df["year"]      = year
            df["month"]     = month

            # Align column order to canonical set (in case bands vary)
            for col in canonical_cols:
                if col not in df.columns:
                    df[col] = np.nan  # ensure schema stability
            df = df[canonical_cols]

            pending.append(df)
            pending_rows += len(df)
            if pending_rows >= ROWS_PER_WRITE:
                _flush_frames(pending, OUT_DATASET_DIR)
                pending_rows = 0

        # Write whatever is left for this file
        _flush_frames(pending, OUT_DATASET_DIR)

    # Reached only when every window of the file was written successfully.
    part_dir.mkdir(parents=True, exist_ok=True)
    done_flag.touch()
    built_this_run.add((year, month))

print(f"✅ Done. Parquet dataset at:\n{OUT_DATASET_DIR}\n(partitioned by year=/month=)")


Building partitioned Parquet dataset:   6%|▌         | 16/275 [01:51<30:02,  6.96s/it]

In [14]:
't'

't'