This script calculates total burned area per year and per ecoregion for Fire CCI, MCD64A1, and our predictions. For our predictions, we take the monthly predictions and compute the per-pixel maximum across months, so that a location that burns in several months of the same year is counted only once. For Fire CCI and MCD64A1, we call a pixel burned if its burned fraction is > 0.50.

All models

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Compute annual burned area (Mha) per ecoregion for ALL Option 4 focal models
(10x negatives, AUC-threshold) and merge into existing CSV that already has:

  - ba_mcd_native_Mha
  - ba_firecci_native_Mha
  - ba_pred_Mha (original predictions)
  - ba_pred_Mha_* from Option 2, etc.

For each prediction directory:

  /explore/nobackup/people/spotter5/clelland_fire_ml/
      predictions_option4_focal_10x_negative_auc_thresh_negfrac{pct:03d}_mcd/class/

we:

  - For each year (2001–2023), stack 12 monthly predicted class rasters
    (cems_pred_class_YYYY_MM_thr*.tif), take max across months
    -> annual 0/1 mask (1=burned at least once in the year).
  - For each ecoregion polygon:
      * rasterize polygon
      * count burned pixels
      * multiply by pixel area (m²) and convert to Mha.
  - Add a column:

      ba_pred_Mha_focal10x_negfrac{pct}

    e.g. ba_pred_Mha_focal10x_negfrac10, ba_pred_Mha_focal10x_negfrac50

and merge into the existing CSV.
"""

import os
import re
from pathlib import Path
from typing import Optional, Dict, Tuple

import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio as rio
from rasterio.features import geometry_mask

# ============================
# CONFIG
# ============================

# Years and months to process; range(2001, 2024) covers 2001..2023 inclusive,
# range(1, 13) covers January..December.
YEARS_MCD = list(range(2001, 2024))
MONTHS    = list(range(1, 13))

# --- Native MCD monthly fraction rasters (for template / pixel area only) ---
# Only one file from this directory is opened, to read the grid definition
# (CRS, affine transform, raster shape).
NATIVE_MCD_DIR = Path(
    "/explore/nobackup/people/spotter5/clelland_fire_ml/training_e5l_cems_mcd_with_fraction"
)

# Root where prediction dirs live (must match prediction script)
PRED_ROOT_DIR = Path("/explore/nobackup/people/spotter5/clelland_fire_ml")

# Pattern of prediction dirs created by the Option 4 focal prediction script
# e.g. predictions_option4_focal_10x_negative_auc_thresh_negfrac050_mcd
# The class rasters are looked up in the "class" subdirectory of each match.
PRED_DIR_PATTERN = "predictions_option4_focal_10x_negative_auc_thresh_negfrac*_mcd"

# Glob pattern for predicted class TIFFs:
#   cems_pred_class_YYYY_MM_thr{thr}.tif
# Filled in with str.format(year=..., month=...); the trailing "*" matches any
# threshold suffix.
PRED_CLASS_PATTERN = "cems_pred_class_{year}_{month:02d}_thr*.tif"

# --- Ecoregion shapefile ---
ECOS_PATH = "/explore/nobackup/people/spotter5/helene/raw/merge_eco_v2.shp"
# Attribute column holding the ecoregion identifier; also used as a merge key
# (together with "year") against the existing CSV.
ECO_ID_COL = "ecoregion"

# --- Existing + new CSV paths ---
OUT_DIR = Path("/explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries")
# NOTE: this runs when the module is loaded, so the directory is created even
# if main() is never called.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Existing CSV that already has MCD, FireCCI, Option 2 cols, etc.
IN_CSV_OLD  = OUT_DIR / "burned_area_by_ecoregion_predictions_all_models.csv"

# New CSV with added Option 4 focal cols.
# If you prefer to update in place, set this equal to IN_CSV_OLD.
OUT_CSV_NEW = OUT_DIR / "burned_area_by_ecoregion_predictions_all_models_option4_focal_auc_thresh.csv"


# ============================
# HELPERS
# ============================

def find_native_monthly_path(
    base_dir: Path,
    prefix: str,
    year: int,
    month: int
) -> Optional[Path]:
    """
    Locate the native monthly raster for one year/month inside base_dir.

    Filenames are probed in a fixed priority order and the first one that
    exists on disk is returned:

      1. {prefix}_{year}_{month}_with_fraction.tif      (month not padded)
      2. {prefix}_{year}_{month:02d}_with_fraction.tif  (month zero-padded)
      3. {prefix}_{year}_{month}.tif
      4. {prefix}_{year}_{month:02d}.tif

    Returns None when none of the variants exists.
    """
    # Both month spellings are built up front, before any filesystem access.
    month_tags = (f"{month}", f"{month:02d}")

    # "_with_fraction" names take priority over the plain ".tif" names.
    for suffix in ("_with_fraction.tif", ".tif"):
        for tag in month_tags:
            candidate = base_dir / f"{prefix}_{year}_{tag}{suffix}"
            if candidate.exists():
                return candidate

    return None


def find_template_path_native(
    base_dir: Path,
    prefix: str,
    years,
    months
) -> Path:
    """
    Return the first native monthly file that exists, for use as a grid template.

    Year/month combinations are probed lazily in order (years outer, months
    inner) through find_native_monthly_path, stopping at the first hit.

    Raises FileNotFoundError if no combination resolves to an existing file.
    """
    probes = (
        find_native_monthly_path(base_dir, prefix, yr, mo)
        for yr in years
        for mo in months
    )
    template = next((hit for hit in probes if hit is not None), None)

    if template is None:
        raise FileNotFoundError(f"No native files found for prefix {prefix} in {base_dir}")
    return template


def find_pred_class_month_path(
    pred_class_dir: Path,
    year: int,
    month: int
) -> Optional[Path]:
    """
    Find predicted class raster for given year/month in a given model dir.
    Expects filenames like cems_pred_class_YYYY_MM_thr*.tif

    Returns the matching path, or None when nothing matches. If several
    threshold variants exist for the same year/month, the lexicographically
    smallest path is returned.
    """
    # Directory listing order is not guaranteed, so sort before picking:
    # otherwise the chosen threshold file could differ between months or runs.
    matches = sorted(
        pred_class_dir.glob(PRED_CLASS_PATTERN.format(year=year, month=month))
    )
    return matches[0] if matches else None


def build_annual_pred_mask(
    year: int,
    months,
    pred_class_dir: Path
) -> Optional[np.ndarray]:
    """
    Collapse the monthly predicted class rasters of one year into a single
    annual burned mask.

    A pixel is 1 in the result if band 1 equals 1 in at least one of the
    monthly rasters that were found; every other value (0, or e.g. a 255
    nodata code) counts as not burned. Months without a file are skipped.

    Returns a uint8 array, or None if no monthly raster was found at all.
    """
    # Lazily resolve one path per month so lookup and reading stay interleaved.
    month_paths = (
        find_pred_class_month_path(pred_class_dir, year, mo) for mo in months
    )

    burned_any = None
    for tif_path in month_paths:
        if tif_path is None:
            continue

        with rio.open(tif_path) as ds:
            band = ds.read(1)
            month_flags = (band == 1).astype(np.uint8)

            # Element-wise max over 0/1 flags == "burned in any month so far".
            burned_any = (
                month_flags
                if burned_any is None
                else np.maximum(burned_any, month_flags)
            )

    return burned_any


def prepare_ecos_for_dataset(template_path: Path, ecos: gpd.GeoDataFrame):
    """
    Read the grid definition from a template raster and reproject the
    ecoregions onto it.

    Parameters
    ----------
    template_path : path of a raster whose CRS, affine transform and shape
        define the working grid.
    ecos : ecoregion polygons in any CRS.

    Returns
    -------
    (ecos_reproj, transform, (height, width), pixel_area_m2)
        ecos_reproj   : `ecos` reprojected to the template CRS.
        transform     : affine transform of the template.
        (height, width): raster shape of the template.
        pixel_area_m2 : area of one pixel in squared CRS units. Callers treat
            this as m^2, which assumes the projected CRS uses metres
            (not verified here).

    Raises
    ------
    ValueError if the template has no CRS or its CRS is geographic (degrees).
    """
    # Only metadata is needed, so the file is closed before the reprojection.
    with rio.open(template_path) as ds:
        crs = ds.crs
        transform = ds.transform
        height, width = ds.height, ds.width

    if crs is None:
        raise ValueError(f"Template {template_path} has no CRS")

    if crs.is_geographic:
        raise ValueError(
            f"Template CRS {crs} is geographic (degrees). "
            "Reproject rasters to an equal-area projection (meters) before area calculation."
        )

    # Pixel area is |det| of the 2x2 linear part of the affine transform.
    # For a north-up grid (b == d == 0) this is exactly |a * e| (pixel width
    # times pixel height); the b/d terms keep it correct for rotated or
    # sheared grids as well.
    pixel_area_m2 = abs(transform.a * transform.e - transform.b * transform.d)
    ecos_reproj = ecos.to_crs(crs)

    return ecos_reproj, transform, (height, width), pixel_area_m2


def compute_area_per_ecoregion(
    annual_mask: np.ndarray,
    ecos_reproj: gpd.GeoDataFrame,
    transform,
    pixel_area_m2: float,
    id_col: str,
) -> Dict[str, float]:
    """
    Given an annual 0/1 mask and an ecoregion GeoDataFrame (already in raster CRS),
    compute burned area (Mha) for each ecoregion.

    Parameters
    ----------
    annual_mask : 2-D array; pixels equal to 1 are counted as burned.
    ecos_reproj : ecoregion polygons in the raster CRS.
    transform : affine transform of the raster grid, used to rasterize polygons.
    pixel_area_m2 : area of one pixel in m^2.
    id_col : column of `ecos_reproj` holding the ecoregion identifier.

    Returns
    -------
    dict: {ecoregion_id: burned_area_Mha}

    If several rows share the same identifier, their areas are summed (rather
    than the last row overwriting the earlier ones). Overlapping polygons with
    the same identifier would therefore be counted once per polygon.
    Rows with a missing or empty geometry contribute 0.
    """
    height, width = annual_mask.shape
    # Loop-invariant: compute the boolean burned raster once, not per polygon.
    burned = annual_mask == 1
    results: Dict[str, float] = {}

    for _, row in ecos_reproj.iterrows():
        eco_id = row[id_col]
        geom = row.geometry
        if geom is None or geom.is_empty:
            # Make sure the id is present, without wiping an area already
            # accumulated from another row with the same id.
            results.setdefault(eco_id, 0.0)
            continue

        # invert=True -> True for pixels inside the polygon.
        eco_mask = geometry_mask(
            [geom.__geo_interface__],
            transform=transform,
            invert=True,
            out_shape=(height, width),
        )

        n_burned = int((burned & eco_mask).sum())
        area_Mha = n_burned * pixel_area_m2 / 1e10  # m^2 -> Mha

        results[eco_id] = results.get(eco_id, 0.0) + float(area_Mha)

    return results


def parse_negfrac_from_dir(dirname: str) -> Optional[int]:
    """
    Extract the negative-fraction percentage encoded in a prediction
    directory name such as

      predictions_option4_focal_10x_negative_auc_thresh_negfrac050_mcd

    The 2-3 digits between "negfrac" and "_mcd" are returned as an int
    (leading zeros dropped, e.g. "050" -> 50). Returns None when the name
    does not contain that pattern.
    """
    hit = re.search(r"negfrac(\d{2,3})_mcd", dirname)
    return int(hit.group(1)) if hit else None


# ============================
# MAIN
# ============================

def main():
    """
    Compute annual burned area per ecoregion for every Option 4 focal model
    found under PRED_ROOT_DIR and write the merged table to OUT_CSV_NEW.

    Steps:
      1. Load the ecoregion polygons and the existing CSV (IN_CSV_OLD).
      2. Take the grid definition from one native MCD raster and reproject
         the ecoregions onto it.
      3. Discover the model "class" directories and parse the negfrac
         percentage from each directory name.
      4. Check that the first model's rasters share the native grid.
      5. For each model and year, build the annual burned mask, compute area
         per ecoregion, and left-merge the result into the table as column
         ba_pred_Mha_focal10x_negfrac{pct}.

    A column that already exists in the input CSV is replaced rather than
    duplicated, so the script can be re-run (including in place).

    Raises ValueError / FileNotFoundError / RuntimeError when required
    inputs are missing or the grids do not match.
    """
    # ---------- Load ecoregions & old CSV ----------
    print("Loading ecoregions...")
    ecos = gpd.read_file(ECOS_PATH)
    if ECO_ID_COL not in ecos.columns:
        raise ValueError(
            f"ECO_ID_COL='{ECO_ID_COL}' not found in {ECOS_PATH}. "
            f"Columns: {list(ecos.columns)}"
        )

    print(f"Reading existing burned area CSV: {IN_CSV_OLD}")
    df_old = pd.read_csv(IN_CSV_OLD)

    # Fail fast: without these keys the merge below would only blow up after
    # a full model's worth of raster processing.
    missing_keys = [c for c in (ECO_ID_COL, "year") if c not in df_old.columns]
    if missing_keys:
        raise ValueError(
            f"Existing CSV {IN_CSV_OLD} is missing merge key column(s): {missing_keys}"
        )

    # ---------- Prepare template & reproject ecos ----------
    print("Preparing template and reprojecting ecoregions (MCD grid)...")
    tmpl_mcd_native = find_template_path_native(
        NATIVE_MCD_DIR, "cems_e5l_mcd", YEARS_MCD, MONTHS
    )
    ecos_mcd, transform_mcd, shape_mcd, pixel_area_mcd = prepare_ecos_for_dataset(
        tmpl_mcd_native, ecos
    )

    # ---------- Find all Option 4 focal model 'class' directories ----------
    pred_class_dirs = sorted(PRED_ROOT_DIR.glob(f"{PRED_DIR_PATTERN}/class"))
    if not pred_class_dirs:
        raise FileNotFoundError(
            f"No prediction class directories matching '{PRED_DIR_PATTERN}' under {PRED_ROOT_DIR}"
        )

    print("\nFound Option 4 focal prediction model class directories:")
    model_info = []  # list of (neg_pct, class_dir)
    for class_dir in pred_class_dirs:
        neg_pct = parse_negfrac_from_dir(class_dir.parent.name)
        if neg_pct is None:
            print(f"  [SKIP] Could not parse negfrac from: {class_dir.parent.name}")
            continue
        print(f"  - {class_dir.parent.name}  -> negfrac={neg_pct}%")
        model_info.append((neg_pct, class_dir))

    if not model_info:
        raise RuntimeError("No valid Option 4 focal model prediction directories parsed from names.")

    # ---------- Check that prediction grids match native MCD grid ----------
    print("\nChecking that prediction grids match native MCD grid (using the first model)...")
    first_class_dir = model_info[0][1]
    # First existing prediction raster of the first model (years outer, months inner).
    tmpl_new_pred = next(
        (
            p
            for p in (
                find_pred_class_month_path(first_class_dir, y, m)
                for y in YEARS_MCD
                for m in MONTHS
            )
            if p is not None
        ),
        None,
    )

    if tmpl_new_pred is None:
        raise FileNotFoundError(f"No prediction class files found in {first_class_dir}")

    # NOTE(review): only transform and size are compared, not the CRS, and only
    # for the first model; the other models are covered by the per-year shape
    # check below.
    with rio.open(tmpl_new_pred) as ds_pred, rio.open(tmpl_mcd_native) as ds_nat:
        if (
            ds_pred.transform != ds_nat.transform
            or ds_pred.width != ds_nat.width
            or ds_pred.height != ds_nat.height
        ):
            raise ValueError("Predicted MCD grid does not match native MCD grid.")

    # ---------- Start from the existing CSV and add columns per model ----------
    df_merged = df_old.copy()

    # ---------- For each model, compute burned area and merge ----------
    for neg_pct, class_dir in model_info:
        col_name = f"ba_pred_Mha_focal10x_negfrac{neg_pct}"
        print("\n" + "=" * 80)
        print(f"Processing Option 4 focal model: negfrac={neg_pct}%")
        print(f"Class rasters dir: {class_dir}")
        print(f"New column name:   {col_name}")

        results = {}  # key: (eco_id, year) -> area

        for year in YEARS_MCD:
            print(f"  Year {year}...")
            annual_mask = build_annual_pred_mask(year, MONTHS, class_dir)
            if annual_mask is None:
                print(f"    -> No predictions found for {year} in {class_dir}, skipping.")
                continue

            if annual_mask.shape != shape_mcd:
                raise ValueError(
                    f"Shape mismatch for predictions {year} in {class_dir}: "
                    f"{annual_mask.shape} vs {shape_mcd}"
                )

            area_dict = compute_area_per_ecoregion(
                annual_mask, ecos_mcd, transform_mcd, pixel_area_mcd, ECO_ID_COL
            )

            for eco_id, area_Mha in area_dict.items():
                results[(eco_id, year)] = area_Mha

        if not results:
            print(f"  -> No valid annual masks computed for model negfrac{neg_pct}; skipping merge.")
            continue

        # Convert results to DataFrame
        df_new = pd.DataFrame(
            [
                {ECO_ID_COL: eco_id, "year": year, col_name: area_Mha}
                for (eco_id, year), area_Mha in results.items()
            ]
        )

        # If the column is already there (re-run, or OUT_CSV_NEW == IN_CSV_OLD),
        # merging would create '<col>_x' / '<col>_y' duplicates. Replace it.
        if col_name in df_merged.columns:
            print(f"  Column '{col_name}' already exists in CSV; replacing it.")
            df_merged = df_merged.drop(columns=[col_name])

        print(f"  Merging {len(df_new)} rows into main CSV for column '{col_name}'...")
        # Left merge: keeps exactly the (ecoregion, year) rows of the existing CSV.
        df_merged = df_merged.merge(df_new, on=[ECO_ID_COL, "year"], how="left")

    # ---------- Final save ----------
    df_merged = df_merged.sort_values(by=["year", ECO_ID_COL])
    df_merged.to_csv(OUT_CSV_NEW, index=False)

    print("\n✅ Saved updated CSV with all Option 4 focal model prediction columns to:")
    print(f"  {OUT_CSV_NEW}")


if __name__ == "__main__":
    main()


Loading ecoregions...
Reading existing burned area CSV: /explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries/burned_area_by_ecoregion_predictions_all_models.csv
Preparing template and reprojecting ecoregions (MCD grid)...

Found Option 4 focal prediction model class directories:
  - predictions_option4_focal_10x_negative_auc_thresh_negfrac010_mcd  -> negfrac=10%
  - predictions_option4_focal_10x_negative_auc_thresh_negfrac020_mcd  -> negfrac=20%
  - predictions_option4_focal_10x_negative_auc_thresh_negfrac030_mcd  -> negfrac=30%
  - predictions_option4_focal_10x_negative_auc_thresh_negfrac040_mcd  -> negfrac=40%
  - predictions_option4_focal_10x_negative_auc_thresh_negfrac050_mcd  -> negfrac=50%
  - predictions_option4_focal_10x_negative_auc_thresh_negfrac060_mcd  -> negfrac=60%
  - predictions_option4_focal_10x_negative_auc_thresh_negfrac070_mcd  -> negfrac=70%
  - predictions_option4_focal_10x_negative_auc_thresh_negfrac080_mcd  -> negfrac=80%
  - predictions_opti

All plots

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
For EACH Option 4 focal prediction column (ba_pred_Mha_focal10x_negfracXX, ...),
create a multipanel plot (ecoregions as subplots) comparing:

    - MCD64A1 (ba_mcd_native_Mha)
    - Fire CCI (ba_firecci_native_Mha)
    - That specific Option 4 focal prediction column

Input:
    burned_area_by_ecoregion_predictions_all_models_option4_focal_auc_thresh.csv

Output:
    <OUT_DIR>/multipanel_option4_focal10x_auc_thresh/
        burned_area_multipanel_<pred_col>.png
"""

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# ============================
# CONFIG
# ============================

# Column of the input CSV holding the ecoregion identifier (one subplot each).
ECO_ID_COL = "ecoregion"

OUT_DIR = Path("/explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries")
# Input table: must contain ECO_ID_COL, "year", the two base columns below and
# at least one column starting with "ba_pred_Mha_focal10x_".
IN_CSV  = OUT_DIR / "burned_area_by_ecoregion_predictions_all_models_option4_focal_auc_thresh.csv"

# Folder where all multipanel PNGs (one per focal model) will go
OUT_PNG_DIR = OUT_DIR / "multipanel_option4_focal10x_auc_thresh"
# NOTE: this runs when the module is loaded, so the folder is created even if
# main() is never called.
OUT_PNG_DIR.mkdir(parents=True, exist_ok=True)

# Base / native columns
MCD_COL      = "ba_mcd_native_Mha"
FIRECCI_COL  = "ba_firecci_native_Mha"

# Legend labels for the two base columns.
BASE_LABELS = {
    MCD_COL: "MCD64A1",
    FIRECCI_COL: "Fire CCI",
}

# Ecoregions to exclude from plotting
# (compared against ECO_ID_COL values by exact string match).
EXCLUDE_ECOS = {
    "WATER",
    "MIXED WOOD SHIELD",
    "TEMPERATE PRAIRIES",
    "WESTERN CORDILLERA",
}


def nice_pred_label(colname: str) -> str:
    """
    Map a prediction column name to a short legend label.

    Rules, in order:
      ba_pred_Mha                        -> "Pred (orig)"
      ba_pred_Mha_focal10x_negfrac<N>    -> "Focal10x negfrac<N>%"
      ba_pred_Mha_focal10x_negfrac<...>  -> "Focal10x (AUC-thresh)"  (no trailing digits)
      ba_pred_Mha_<suffix>               -> "Pred <suffix with '_' as spaces>"
      anything else                      -> returned unchanged
    """
    if colname == "ba_pred_Mha":
        return "Pred (orig)"

    # Anything outside the prediction-column family is passed through as-is.
    if not colname.startswith("ba_pred_Mha_"):
        return colname

    # Option 4 focal 10x columns get a dedicated label with the percentage.
    if colname.startswith("ba_pred_Mha_focal10x_negfrac"):
        pct_match = re.search(r"negfrac(\d+)$", colname)
        if pct_match is None:
            return "Focal10x (AUC-thresh)"
        return f"Focal10x negfrac{pct_match.group(1)}%"

    suffix_words = colname.replace("ba_pred_Mha_", "").replace("_", " ")
    return f"Pred {suffix_words}"


# ============================
# MAIN
# ============================

def main():
    """
    Write one multipanel PNG per Option 4 focal prediction column.

    Reads IN_CSV, checks that the base columns are present, and for every
    column starting with 'ba_pred_Mha_focal10x_' draws a grid of subplots
    (one per ecoregion not listed in EXCLUDE_ECOS) comparing MCD64A1,
    Fire CCI and that prediction column over the years. Figures are saved to
    OUT_PNG_DIR as burned_area_multipanel_<tag>.png.

    Each dataset keeps the same colour in every panel, and the shared legend
    lists every dataset drawn in at least one panel.

    Raises ValueError if base columns or prediction columns are missing.
    """
    print(f"Reading CSV: {IN_CSV}")
    df = pd.read_csv(IN_CSV)

    # Basic sanity check
    needed_base = [ECO_ID_COL, "year", MCD_COL, FIRECCI_COL]
    missing_base = [c for c in needed_base if c not in df.columns]
    if missing_base:
        raise ValueError(f"Missing base columns in CSV: {missing_base}")

    df = df.sort_values(by=["year", ECO_ID_COL])

    # Find all Option 4 focal prediction columns
    # (anything starting with 'ba_pred_Mha_focal10x_')
    pred_cols = [c for c in df.columns if c.startswith("ba_pred_Mha_focal10x_")]
    if not pred_cols:
        raise ValueError(
            "No Option 4 focal prediction columns starting with "
            "'ba_pred_Mha_focal10x_' found in CSV."
        )

    print("Option 4 focal prediction columns to plot:")
    for c in pred_cols:
        print("  -", c)

    # Unique ecoregions excluding the undesired ones
    ecos_all = sorted(df[ECO_ID_COL].dropna().unique())
    ecos_list = [e for e in ecos_all if e not in EXCLUDE_ECOS]

    if not ecos_list:
        print("No ecoregions left after exclusion; nothing to plot.")
        return

    n_ecos = len(ecos_list)
    print(f"Found {n_ecos} ecoregions after exclusion: {ecos_list}")

    # Layout: up to 4 columns of subplots (same for every figure)
    ncols = min(4, n_ecos)
    nrows = int(np.ceil(n_ecos / ncols))

    # Per-ecoregion slices do not depend on the prediction column: build once.
    eco_frames = {eco_id: df[df[ECO_ID_COL] == eco_id] for eco_id in ecos_list}

    # Loop over each prediction column and make a separate multipanel PNG
    for pred_col in pred_cols:
        print("\n==============================================")
        print(f"Creating multipanel plot for prediction column: {pred_col}")

        # Columns to plot in this figure
        cols_area = [MCD_COL, FIRECCI_COL, pred_col]
        labels = {
            MCD_COL: BASE_LABELS[MCD_COL],
            FIRECCI_COL: BASE_LABELS[FIRECCI_COL],
            pred_col: nice_pred_label(pred_col),
        }
        # Pin one colour per dataset. Relying on the default colour cycle would
        # shift colours in any panel where a series is skipped (all NaN), and
        # the shared legend would then be wrong for that panel.
        colors = {c: f"C{k}" for k, c in enumerate(cols_area)}

        # squeeze=False -> axes is always a 2-D array, whatever nrows/ncols are.
        fig, axes = plt.subplots(
            nrows=nrows,
            ncols=ncols,
            figsize=(4 * ncols, 3 * nrows),
            sharex=True,
            sharey=False,  # floating y-axis per panel
            squeeze=False,
        )

        # column name -> line handle, filled from whichever panel draws it first
        legend_handles = {}

        # Plot per ecoregion
        for i, eco_id in enumerate(ecos_list):
            row, col = divmod(i, ncols)
            ax = axes[row, col]
            df_eco = eco_frames[eco_id]

            # Plot all three datasets for this model
            for col_name in cols_area:
                if df_eco[col_name].notna().any():
                    (line,) = ax.plot(
                        df_eco["year"],
                        df_eco[col_name],
                        marker="o",
                        color=colors[col_name],
                        label=labels[col_name],
                    )
                    legend_handles.setdefault(col_name, line)

            ax.set_title(str(eco_id))
            ax.grid(True, ls="--", alpha=0.4)

            # Floating y-axis logic: choose sensible default if all zeros / NaN
            ydata = np.concatenate([df_eco[c].dropna().values for c in cols_area])
            if ydata.size == 0 or np.nanmax(ydata) == 0:
                ax.set_ylim(0, 1.0)

            # A panel is the lowest visible one in its grid column when no
            # ecoregion occupies the slot directly below it. With sharex=True
            # only the last grid row gets tick labels, so a panel sitting
            # above a hidden (unused) slot needs them switched back on.
            if i + ncols >= n_ecos:
                ax.set_xlabel("Year")
                ax.tick_params(axis="x", labelbottom=True)
            if col == 0:
                ax.set_ylabel("Burned area (Mha)")

        # Hide unused panels
        for j in range(n_ecos, nrows * ncols):
            axes[j // ncols, j % ncols].axis("off")

        # Global legend at bottom, in dataset order
        shown = [c for c in cols_area if c in legend_handles]
        if shown:
            fig.legend(
                [legend_handles[c] for c in shown],
                [labels[c] for c in shown],
                loc="lower center",
                ncol=len(shown),
                bbox_to_anchor=(0.5, -0.02),
            )

        # Clean filenames: replace spaces with underscores
        pred_tag = pred_col.replace("ba_pred_Mha_", "") if pred_col != "ba_pred_Mha" else "orig"
        pred_tag = pred_tag.replace(" ", "_")

        out_png = OUT_PNG_DIR / f"burned_area_multipanel_{pred_tag}.png"

        # Act on this figure explicitly rather than on pyplot's "current figure".
        fig.tight_layout(rect=[0, 0.05, 1, 1])
        fig.savefig(out_png, dpi=150, bbox_inches="tight")
        plt.close(fig)

        print(f"  -> Saved multipanel PNG for {pred_col} to:\n     {out_png}")

    print("\n✅ Done. One multipanel PNG per Option 4 focal prediction model written to:")
    print(f"  {OUT_PNG_DIR}")


if __name__ == "__main__":
    main()


Reading CSV: /explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries/burned_area_by_ecoregion_predictions_all_models_option4_focal_auc_thresh.csv
Option 4 focal prediction columns to plot:
  - ba_pred_Mha_focal10x_negfrac10
  - ba_pred_Mha_focal10x_negfrac20
  - ba_pred_Mha_focal10x_negfrac30
  - ba_pred_Mha_focal10x_negfrac40
  - ba_pred_Mha_focal10x_negfrac50
  - ba_pred_Mha_focal10x_negfrac60
  - ba_pred_Mha_focal10x_negfrac70
  - ba_pred_Mha_focal10x_negfrac80
  - ba_pred_Mha_focal10x_negfrac90
  - ba_pred_Mha_focal10x_negfrac100
Found 23 ecoregions after exclusion: ['ALASKA BOREAL INTERIOR', 'ALASKA TUNDRA', 'ARCTIC CORDILLERA', 'Arctic Deserts and Tundra', 'BOREAL CORDILLERA', 'BOREAL PLAIN', 'BROOKS RANGE TUNDRA', 'Central Taiga', 'Forest Tundra', 'HUDSON PLAIN', 'MARINE WEST COAST FOREST', 'Montane Boreal', 'Montane Sub-Arctic', 'Montane Sub-Boreal', 'NORTHERN ARCTIC', 'Northern Taiga', 'SOFTWOOD SHIELD', 'SOUTHERN ARCTIC', 'Southern Taiga', 'TAIGA CORDILLERA',