This script calculates total burned area per year and per ecoregion for FireCCI, MCD64A1, and our predictions. For our predictions, we take the monthly predictions and compute the per-pixel maximum across months, so a location that burns in several months of the same year is counted only once. For FireCCI and MCD64A1, a pixel is considered burned in a month if its burned fraction is > 0.50.

In [None]:
import geopandas as gpd
import os
import pandas as pd


# Ecoregion polygons. The "ecoregion" column is the unique identifier; reproject the
# polygons to the CRS of the rasters before extracting by mask.
ecos = gpd.read_file("/explore/nobackup/people/spotter5/helene/raw/merge_eco_v2.shp")



In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Compute annual burned area (Mha) per ecoregion for three datasets:

  1) Native MCD64A1 (monthly fraction > 0.5 -> burned)
  2) Native FireCCI  (monthly fraction > 0.5 -> burned)
  3) Predictions (MCD64A1 model outputs; monthly class TIFFs, 1=burned)

Steps per dataset:
  - For each year, stack 12 monthly rasters, take max across months -> annual 0/1 mask.
  - For each ecoregion polygon (from merge_eco_v2.shp):
      * rasterize a polygon mask on the same grid
      * count burned pixels inside the polygon
      * multiply by pixel area (assumes projected CRS in meters)
      * convert to Mha.

Outputs:
  - CSV: burned_area_by_ecoregion_predictions.csv
  - PNG: burned_area_multipanel_by_ecoregion.png  (one panel per ecoregion)
"""

import os
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio as rio
from rasterio.features import geometry_mask
import matplotlib.pyplot as plt

# ============================
# CONFIG
# ============================

# Years
# NOTE: range() excludes its end value, so these are 2001..2019 and 2001..2023.
YEARS_FIRECCI = list(range(2001, 2020))   # Native FireCCI
YEARS_MCD     = list(range(2001, 2024))   # Native MCD + predictions

MONTHS = list(range(1, 13))               # 1..12

# --- Native monthly fraction rasters (last band = "fraction") ---
NATIVE_FIRECCI_DIR = Path(
    "/explore/nobackup/people/spotter5/clelland_fire_ml/training_e5l_cems_firecci_with_fraction"
)
NATIVE_MCD_DIR     = Path(
    "/explore/nobackup/people/spotter5/clelland_fire_ml/training_e5l_cems_mcd_with_fraction"
)

# --- Predicted class rasters (0/1, 255 nodata) ---
# MCD predictions live here:
PRED_MCD_DIR       = Path(
    "/explore/nobackup/people/spotter5/clelland_fire_ml/predictions_option2_neg040pct_w3_mcd"
)
# Monthly class TIFFs are looked up in the "class" subdirectory.
PRED_MCD_CLASS_DIR = PRED_MCD_DIR / "class"

# Glob pattern for predicted class TIFFs:
#   cems_pred_class_YYYY_MM_thr{thr}.tif
# {year} and {month} are filled in with str.format(); the trailing "thr*" is a
# glob wildcard, so any threshold suffix matches.
PRED_CLASS_PATTERN = "cems_pred_class_{year}_{month:02d}_thr*.tif"

# --- Ecoregion shapefile ---
ECOS_PATH = "/explore/nobackup/people/spotter5/helene/raw/merge_eco_v2.shp"

# Column in ecos that uniquely identifies an ecoregion
ECO_ID_COL = "ecoregion"

# --- Output paths ---
OUT_DIR = Path("/explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries")
# Side effect when this cell/module runs: the output directory is created if missing.
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_CSV = OUT_DIR / "burned_area_by_ecoregion_predictions.csv"
OUT_PNG_MULTIPANEL = OUT_DIR / "burned_area_multipanel_by_ecoregion.png"

# Threshold for native monthly "fraction" band to consider burned
# (the comparison is strict: fraction > FRACTION_THRESHOLD).
FRACTION_THRESHOLD = 0.5


# ============================
# HELPERS
# ============================

def find_native_monthly_path(
    base_dir: Path,
    prefix: str,
    year: int,
    month: int
) -> Optional[Path]:
    """
    Locate the native monthly raster for ``year``/``month`` inside ``base_dir``.

    Filename variants are probed in this order; the first one that exists wins:
      1. {prefix}_{year}_{month}_with_fraction.tif
      2. {prefix}_{year}_{month:02d}_with_fraction.tif
      3. {prefix}_{year}_{month}.tif
      4. {prefix}_{year}_{month:02d}.tif

    prefix is 'cems_e5l_mcd' or 'cems_e5l_firecci'.
    Returns the matching Path, or None if none of the variants exists.
    """
    # Unpadded month first, then zero-padded, for each suffix in turn.
    month_tags = (str(month), f"{month:02d}")
    for suffix in ("_with_fraction.tif", ".tif"):
        for tag in month_tags:
            candidate = base_dir / f"{prefix}_{year}_{tag}{suffix}"
            if candidate.exists():
                return candidate
    return None


def build_annual_native_mask(
    year: int,
    months,
    base_dir: Path,
    prefix: str
) -> Optional[np.ndarray]:
    """
    Build annual 0/1 burned mask from native monthly fraction rasters for given year.

    For every month in ``months`` that has a file (see find_native_monthly_path),
    the last band is read as the burned fraction. A pixel counts as burned in
    that month when its fraction is strictly greater than FRACTION_THRESHOLD,
    is finite, and is not the band's declared nodata value. The annual mask is
    the per-pixel maximum over the months found, so a pixel burned in several
    months is counted once. Months without a file are skipped silently.

    Returns uint8 array with shape (H, W) or None if no months found.
    """
    annual = None

    for m in months:
        src_path = find_native_monthly_path(base_dir, prefix, year, m)
        if src_path is None:
            continue

        with rio.open(src_path) as ds:
            band = ds.count  # assume last band is 'fraction'
            frac = ds.read(band).astype(np.float32)
            nodata = ds.nodatavals[band - 1]

        # NaN/inf are never valid. A declared nodata sentinel is excluded too,
        # otherwise a sentinel larger than the threshold (e.g. 255) would be
        # counted as burned.
        valid = np.isfinite(frac)
        if nodata is not None and np.isfinite(nodata):
            valid &= frac != np.float32(nodata)

        monthly_burn = ((frac > FRACTION_THRESHOLD) & valid).astype(np.uint8)

        if annual is None:
            annual = monthly_burn
        else:
            annual = np.maximum(annual, monthly_burn)

    return annual


def find_pred_class_month_path(
    pred_class_dir: Path,
    year: int,
    month: int,
    pattern: Optional[str] = None,
) -> Optional[Path]:
    """
    Find predicted class raster for given year/month.
    Expects filenames like cems_pred_class_YYYY_MM_thr*.tif

    ``pattern`` is a str.format template with ``{year}`` and ``{month}`` fields
    that expands to a glob pattern; it defaults to PRED_CLASS_PATTERN.

    Returns the matching Path, or None if nothing matches. If several files
    match (e.g. multiple thresholds), the lexicographically smallest path is
    returned so that repeated runs always pick the same file.
    """
    if pattern is None:
        pattern = PRED_CLASS_PATTERN

    # glob() yields entries in arbitrary (filesystem) order; sort so the choice
    # among multiple thresholds is deterministic.
    matches = sorted(pred_class_dir.glob(pattern.format(year=year, month=month)))
    return matches[0] if matches else None


def build_annual_pred_mask(
    year: int,
    months,
    pred_class_dir: Path
) -> Optional[np.ndarray]:
    """
    Collapse one year's monthly predicted class rasters into an annual burned mask.

    Band 1 of each monthly file found is compared with 1 (the burned class);
    any other value (0 = unburned, 255 = nodata, as assumed for these TIFFs)
    counts as not burned. The annual mask is the per-pixel maximum over the
    months found. Months without a file are skipped.

    Returns a uint8 array of shape (H, W), or None if no monthly file exists.
    """
    yearly = None

    for month in months:
        tif = find_pred_class_month_path(pred_class_dir, year, month)
        if tif is not None:
            with rio.open(tif) as src:
                burned = np.equal(src.read(1), 1).astype(np.uint8)
            # First month seeds the mask; later months are merged by maximum.
            yearly = burned if yearly is None else np.maximum(yearly, burned)

    return yearly


def find_template_path_native(
    base_dir: Path,
    prefix: str,
    years,
    months
) -> Path:
    """
    Return the first native monthly file that exists, scanning years in order
    and, within each year, months in order. The file serves as a template
    (for CRS, transform, shape).

    Raises FileNotFoundError if no year/month combination has a file.
    """
    # Lazy scan: probing stops at the first hit.
    probes = (
        find_native_monthly_path(base_dir, prefix, yr, mo)
        for yr in years
        for mo in months
    )
    template = next((hit for hit in probes if hit is not None), None)
    if template is None:
        raise FileNotFoundError(f"No native files found for prefix {prefix} in {base_dir}")
    return template


def find_template_path_pred(
    pred_class_dir: Path,
    years,
    months
) -> Path:
    """
    Return the first predicted class file that exists, scanning years in order
    and, within each year, months in order. The file serves as a template.

    Raises FileNotFoundError if no year/month combination has a file.
    """
    # Lazy scan: probing stops at the first hit.
    probes = (
        find_pred_class_month_path(pred_class_dir, yr, mo)
        for yr in years
        for mo in months
    )
    template = next((hit for hit in probes if hit is not None), None)
    if template is None:
        raise FileNotFoundError(f"No predicted class files found in {pred_class_dir}")
    return template


def prepare_ecos_for_dataset(template_path: Path, ecos: gpd.GeoDataFrame):
    """
    Open template raster, check CRS, compute pixel area, and reproject ecos to that CRS.

    Returns a 4-tuple ``(ecos_reproj, transform, (height, width), pixel_area_m2)``:
      - ecos_reproj: ``ecos`` reprojected to the template's CRS
      - transform: the template's affine transform
      - (height, width): the template's raster shape
      - pixel_area_m2: area of one pixel in squared CRS units (treated as m^2
        by the callers; this function only rejects geographic CRSs, it does not
        verify that the projected units are metres or that the projection is
        equal-area)

    Raises ValueError if the template has no CRS or its CRS is geographic.
    """
    with rio.open(template_path) as ds:
        crs = ds.crs
        transform = ds.transform
        height, width = ds.height, ds.width

    if crs is None:
        raise ValueError(f"Template {template_path} has no CRS")

    if crs.is_geographic:
        raise ValueError(
            f"Template CRS {crs} is geographic (degrees). "
            "Reproject rasters to an equal-area projection (meters) before area calculation."
        )

    # Pixel area is |det| of the affine's 2x2 linear part. For the usual
    # north-up grid (b = d = 0) this is |a * e|; including b and d keeps the
    # area correct for rotated or sheared grids as well.
    pixel_area_m2 = abs(transform.a * transform.e - transform.b * transform.d)
    ecos_reproj = ecos.to_crs(crs)

    return ecos_reproj, transform, (height, width), pixel_area_m2


def compute_area_per_ecoregion(
    annual_mask: np.ndarray,
    ecos_reproj: gpd.GeoDataFrame,
    transform,
    pixel_area_m2: float,
    id_col: str,
) -> dict:
    """
    Given an annual 0/1 mask and an ecoregion GeoDataFrame (already in raster CRS),
    compute burned area (Mha) for each ecoregion.

    Each feature's polygon is rasterized onto the mask grid (``transform`` and
    the shape of ``annual_mask``), burned pixels (value 1) inside it are
    counted, and the count is multiplied by ``pixel_area_m2`` and converted to
    Mha. Features with a missing or empty geometry contribute 0. If several
    features share the same ``id_col`` value, their areas are summed instead
    of the last one overwriting the others.

    Returns dict: {ecoregion_id: burned_area_Mha}
    """
    height, width = annual_mask.shape
    burned = annual_mask == 1  # loop-invariant, computed once
    results = {}

    for eco_id, geom in zip(ecos_reproj[id_col], ecos_reproj.geometry):
        # Register the id even when the geometry is unusable, so that every
        # ecoregion appears in the output.
        results.setdefault(eco_id, 0.0)
        if geom is None or geom.is_empty:
            continue

        # invert=True: True marks pixels inside the polygon.
        eco_mask = geometry_mask(
            [geom.__geo_interface__],
            transform=transform,
            invert=True,
            out_shape=(height, width),
        )

        n_burned = int((burned & eco_mask).sum())
        area_m2 = n_burned * pixel_area_m2
        results[eco_id] += area_m2 / 1e10  # m^2 -> Mha

    return results


# ============================
# MAIN
# ============================

def _accumulate_dataset(
    results: dict,
    years,
    build_mask,
    grid,
    col_name: str,
    year_tag: str,
    missing_tag: str,
    mismatch_tag: str,
) -> None:
    """
    Add one dataset's annual per-ecoregion burned area (Mha) to ``results``.

    results:      dict keyed by (eco_id, year) -> row dict; updated in place.
    years:        years to process.
    build_mask:   callable ``year -> annual 0/1 mask or None``.
    grid:         tuple returned by prepare_ecos_for_dataset for this dataset.
    col_name:     column under which the area is stored in each row dict.
    *_tag:        dataset wording used in the progress / error messages.

    Raises ValueError if an annual mask does not have the template's shape.
    """
    ecos_reproj, transform, shape, pixel_area_m2 = grid

    for year in years:
        print(f"  Year {year} ({year_tag})")
        annual_mask = build_mask(year)
        if annual_mask is None:
            print(f"    -> No {missing_tag} found for {year}, skipping.")
            continue

        if annual_mask.shape != shape:
            raise ValueError(f"Shape mismatch for {mismatch_tag} {year}: {annual_mask.shape} vs {shape}")

        area_dict = compute_area_per_ecoregion(
            annual_mask, ecos_reproj, transform, pixel_area_m2, ECO_ID_COL
        )

        for eco_id, area_Mha in area_dict.items():
            row = results.setdefault((eco_id, year), {ECO_ID_COL: eco_id, "year": year})
            row[col_name] = area_Mha


def _plot_multipanel(df: pd.DataFrame, out_png: Path) -> bool:
    """
    Save a multipanel figure (one panel per ecoregion) comparing the datasets.

    Returns False, without saving, when ``df`` contains no ecoregion ids;
    returns True after the figure has been written to ``out_png``.
    """
    cols_area = [
        "ba_mcd_native_Mha",
        "ba_firecci_native_Mha",
        "ba_pred_Mha",
    ]
    labels = {
        "ba_mcd_native_Mha": "MCD native",
        "ba_firecci_native_Mha": "FireCCI native",
        "ba_pred_Mha": "Predictions",
    }
    # One fixed colour per series: with the default colour cycle a series that
    # is missing in one panel would shift the colours of the others and make
    # the shared legend wrong for that panel.
    colors = {name: f"C{k}" for k, name in enumerate(cols_area)}

    # Unique ecoregions
    ecos_list = sorted(df[ECO_ID_COL].dropna().unique())
    n_ecos = len(ecos_list)
    if n_ecos == 0:
        print("No ecoregions found in results; skipping multipanel plot.")
        return False

    # Global y-limit for comparability (falls back to 1.0 if nothing is positive)
    present = [c for c in cols_area if c in df.columns]
    global_max = df[present].max(skipna=True).max(skipna=True) if present else 0.0
    if not np.isfinite(global_max) or global_max <= 0:
        global_max = 1.0

    # Layout: up to 4 columns
    ncols = min(4, n_ecos)
    nrows = int(np.ceil(n_ecos / ncols))

    fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(4 * ncols, 3 * nrows),
        sharex=True,
        sharey=True,
        squeeze=False,  # always a 2D array of axes, whatever the layout
    )

    legend_entries = {}  # col_name -> first line drawn for that series

    # Plot per ecoregion
    for i, eco_id in enumerate(ecos_list):
        row, col = divmod(i, ncols)
        ax = axes[row, col]

        df_eco = df[df[ECO_ID_COL] == eco_id]

        for col_name in present:
            if df_eco[col_name].notna().any():
                (line,) = ax.plot(
                    df_eco["year"],
                    df_eco[col_name],
                    marker="o",
                    color=colors[col_name],
                    label=labels[col_name],
                )
                legend_entries.setdefault(col_name, line)

        ax.set_title(str(eco_id))
        ax.grid(True, ls="--", alpha=0.4)
        ax.set_ylim(0, global_max * 1.05)

        if row == nrows - 1:
            ax.set_xlabel("Year")
        if col == 0:
            ax.set_ylabel("Burned area (Mha)")

    # Hide any unused subplots
    for j in range(n_ecos, nrows * ncols):
        axes[divmod(j, ncols)].axis("off")

    # Global legend covering every series drawn in any panel (not only the
    # first panel), in the fixed cols_area order.
    if legend_entries:
        ordered = [c for c in cols_area if c in legend_entries]
        fig.legend(
            [legend_entries[c] for c in ordered],
            [labels[c] for c in ordered],
            loc="upper center",
            ncol=len(ordered),
            bbox_to_anchor=(0.5, 1.02),
        )

    fig.tight_layout(rect=[0, 0, 1, 0.95])
    # The legend is anchored slightly above the figure; bbox_inches="tight"
    # expands the saved area so it is not cropped.
    fig.savefig(out_png, dpi=150, bbox_inches="tight")
    plt.close(fig)
    return True


def main():
    """
    Compute annual burned area (Mha) per ecoregion for native MCD64A1, native
    FireCCI and the MCD-model predictions, write the combined table to OUT_CSV
    and save a per-ecoregion comparison figure to OUT_PNG_MULTIPANEL.

    Raises:
        ValueError: ECO_ID_COL missing from the shapefile, prediction grid
            differing from the native MCD grid, or an annual mask whose shape
            differs from its template.
        FileNotFoundError: no template raster found for a dataset.
        RuntimeError: no burned-area values were computed at all.
    """
    print("Loading ecoregions...")
    ecos = gpd.read_file(ECOS_PATH)
    if ECO_ID_COL not in ecos.columns:
        raise ValueError(f"ECO_ID_COL='{ECO_ID_COL}' not found in {ECOS_PATH}. Columns: {list(ecos.columns)}")

    # ------------- Prepare templates + reproject ecos -------------
    print("Preparing templates and reprojecting ecoregions...")

    # MCD (native & predictions) use same grid
    tmpl_mcd_native = find_template_path_native(NATIVE_MCD_DIR, "cems_e5l_mcd", YEARS_MCD, MONTHS)
    grid_mcd = prepare_ecos_for_dataset(tmpl_mcd_native, ecos)

    # FireCCI (native) grid
    tmpl_firecci_native = find_template_path_native(NATIVE_FIRECCI_DIR, "cems_e5l_firecci", YEARS_FIRECCI, MONTHS)
    grid_firecci = prepare_ecos_for_dataset(tmpl_firecci_native, ecos)

    # ------------- Accumulate results -------------
    # Key: (eco_id, year) -> dict of values
    results = {}

    # --- 1) Native MCD ---
    print("\n[1/3] Processing native MCD64A1...")
    _accumulate_dataset(
        results,
        YEARS_MCD,
        lambda y: build_annual_native_mask(y, MONTHS, NATIVE_MCD_DIR, "cems_e5l_mcd"),
        grid_mcd,
        "ba_mcd_native_Mha",
        year_tag="native MCD64A1",
        missing_tag="native MCD data",
        mismatch_tag="native MCD",
    )

    # --- 2) Native FireCCI ---
    print("\n[2/3] Processing native FireCCI...")
    _accumulate_dataset(
        results,
        YEARS_FIRECCI,
        lambda y: build_annual_native_mask(y, MONTHS, NATIVE_FIRECCI_DIR, "cems_e5l_firecci"),
        grid_firecci,
        "ba_firecci_native_Mha",
        year_tag="native FireCCI",
        missing_tag="native FireCCI data",
        mismatch_tag="native FireCCI",
    )

    # --- 3) Predictions (MCD model) ---
    print("\n[3/3] Processing predictions (MCD model)...")
    tmpl_mcd_pred = find_template_path_pred(PRED_MCD_CLASS_DIR, YEARS_MCD, MONTHS)
    # Ensure the predicted grid matches native MCD grid
    with rio.open(tmpl_mcd_pred) as ds_pred, rio.open(tmpl_mcd_native) as ds_nat:
        if (ds_pred.transform != ds_nat.transform) or (ds_pred.width != ds_nat.width) or (ds_pred.height != ds_nat.height):
            raise ValueError("Predicted MCD grid does not match native MCD grid.")

    _accumulate_dataset(
        results,
        YEARS_MCD,
        lambda y: build_annual_pred_mask(y, MONTHS, PRED_MCD_CLASS_DIR),
        grid_mcd,
        "ba_pred_Mha",
        year_tag="predictions / MCD model",
        missing_tag="predictions",
        mismatch_tag="predictions",
    )

    # ------------- Build DataFrame & save -------------
    if not results:
        # Without this guard an empty table fails later with an opaque KeyError
        # when sorting by columns that do not exist.
        raise RuntimeError("No burned-area values were computed; nothing to save.")

    print("\nBuilding DataFrame and saving CSV...")
    df = pd.DataFrame(list(results.values()))
    df = df.sort_values(by=["year", ECO_ID_COL])
    df.to_csv(OUT_CSV, index=False)
    print(f"Saved per-ecoregion annual burned area to:\n  {OUT_CSV}")

    # ------------- Multipanel plot: per-ecoregion comparisons -------------
    print("Creating multipanel plot (one panel per ecoregion)...")
    if not _plot_multipanel(df, OUT_PNG_MULTIPANEL):
        return
    print(f"Saved multipanel per-ecoregion plot to:\n  {OUT_PNG_MULTIPANEL}")

    print("\n✅ Done.")


# Run the workflow only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Loading ecoregions...
Preparing templates and reprojecting ecoregions...

[1/3] Processing native MCD64A1...
  Year 2001 (native MCD64A1)
  Year 2002 (native MCD64A1)
  Year 2003 (native MCD64A1)
  Year 2004 (native MCD64A1)
  Year 2005 (native MCD64A1)
  Year 2006 (native MCD64A1)
  Year 2007 (native MCD64A1)
  Year 2008 (native MCD64A1)
  Year 2009 (native MCD64A1)
  Year 2010 (native MCD64A1)
  Year 2011 (native MCD64A1)
  Year 2012 (native MCD64A1)
  Year 2013 (native MCD64A1)
  Year 2014 (native MCD64A1)
  Year 2015 (native MCD64A1)
  Year 2016 (native MCD64A1)
  Year 2017 (native MCD64A1)
  Year 2018 (native MCD64A1)
  Year 2019 (native MCD64A1)
  Year 2020 (native MCD64A1)
  Year 2021 (native MCD64A1)
  Year 2022 (native MCD64A1)
  Year 2023 (native MCD64A1)

[2/3] Processing native FireCCI...
  Year 2001 (native FireCCI)
  Year 2002 (native FireCCI)
  Year 2003 (native FireCCI)
  Year 2004 (native FireCCI)
  Year 2005 (native FireCCI)
  Year 2006 (native FireCCI)
  Year 2007 (n

Now get burned area with a different model (NEG=50%, w=2)

In [11]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Compute annual burned area (Mha) per ecoregion for NEW predictions
(NEG=50%, w=2) and merge into existing CSV that already has:

  - ba_mcd_native_Mha
  - ba_firecci_native_Mha
  - ba_pred_Mha   (old predictions)

New column added:
  - ba_pred_Mha_neg50_w2

Steps:
  - For each year (2001–2023), stack 12 monthly predicted class rasters
    from predictions_option2_neg050pct_w2_mcd/class, take max across months
    -> annual 0/1 mask (1=burned at least once in the year).
  - For each ecoregion polygon:
      * rasterize polygon
      * count burned pixels
      * multiply by pixel area (m²) and convert to Mha.
  - Merge results into existing CSV and save a new CSV.
"""

import os
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio as rio
from rasterio.features import geometry_mask

# ============================
# CONFIG
# ============================

# Years for MCD / predictions
# NOTE: range() excludes its end value, so this is 2001..2023 and months 1..12.
YEARS_MCD = list(range(2001, 2024))
MONTHS    = list(range(1, 13))

# --- Native MCD monthly fraction rasters (for template / pixel area only) ---
NATIVE_MCD_DIR = Path(
    "/explore/nobackup/people/spotter5/clelland_fire_ml/training_e5l_cems_mcd_with_fraction"
)

# --- NEW predicted class rasters (NEG=50%, w=2) ---
NEW_PRED_MCD_DIR       = Path(
    "/explore/nobackup/people/spotter5/clelland_fire_ml/predictions_option2_neg050pct_w2_mcd"
)
# Monthly class TIFFs are looked up in the "class" subdirectory.
NEW_PRED_MCD_CLASS_DIR = NEW_PRED_MCD_DIR / "class"

# Glob pattern for predicted class TIFFs:
#   cems_pred_class_YYYY_MM_thr{thr}.tif
# {year} and {month} are filled in with str.format(); the trailing "thr*" is a
# glob wildcard, so any threshold suffix matches.
PRED_CLASS_PATTERN = "cems_pred_class_{year}_{month:02d}_thr*.tif"

# --- Ecoregion shapefile ---
ECOS_PATH = "/explore/nobackup/people/spotter5/helene/raw/merge_eco_v2.shp"
# Column in the shapefile that identifies an ecoregion; also a merge key for the CSV.
ECO_ID_COL = "ecoregion"

# --- Existing + new CSV paths ---
OUT_DIR = Path("/explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries")
# Side effect when this cell/module runs: the output directory is created if missing.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# IN_CSV_OLD is read, never overwritten; the merged table goes to OUT_CSV_NEW.
IN_CSV_OLD  = OUT_DIR / "burned_area_by_ecoregion_predictions.csv"
OUT_CSV_NEW = OUT_DIR / "burned_area_by_ecoregion_predictions_neg50_w2.csv"

# Name of new column
NEW_COL_NAME = "ba_pred_Mha_neg50_w2"


# ============================
# HELPERS
# ============================

def find_native_monthly_path(
    base_dir: Path,
    prefix: str,
    year: int,
    month: int
) -> Optional[Path]:
    """
    Locate the native monthly raster for ``year``/``month`` inside ``base_dir``.

    Filename variants are probed in this order; the first one that exists wins:
      1. {prefix}_{year}_{month}_with_fraction.tif
      2. {prefix}_{year}_{month:02d}_with_fraction.tif
      3. {prefix}_{year}_{month}.tif
      4. {prefix}_{year}_{month:02d}.tif

    prefix is 'cems_e5l_mcd'.
    Returns the matching Path, or None if none of the variants exists.
    """
    # Unpadded month first, then zero-padded, for each suffix in turn.
    month_tags = (str(month), f"{month:02d}")
    for suffix in ("_with_fraction.tif", ".tif"):
        for tag in month_tags:
            candidate = base_dir / f"{prefix}_{year}_{tag}{suffix}"
            if candidate.exists():
                return candidate
    return None


def find_template_path_native(
    base_dir: Path,
    prefix: str,
    years,
    months
) -> Path:
    """
    Return the first native monthly file that exists, scanning years in order
    and, within each year, months in order. The file serves as a template
    (for CRS, transform, shape).

    Raises FileNotFoundError if no year/month combination has a file.
    """
    # Lazy scan: probing stops at the first hit.
    probes = (
        find_native_monthly_path(base_dir, prefix, yr, mo)
        for yr in years
        for mo in months
    )
    template = next((hit for hit in probes if hit is not None), None)
    if template is None:
        raise FileNotFoundError(f"No native files found for prefix {prefix} in {base_dir}")
    return template


def find_pred_class_month_path(
    pred_class_dir: Path,
    year: int,
    month: int,
    pattern: Optional[str] = None,
) -> Optional[Path]:
    """
    Find NEW predicted class raster for given year/month.
    Expects filenames like cems_pred_class_YYYY_MM_thr*.tif

    ``pattern`` is a str.format template with ``{year}`` and ``{month}`` fields
    that expands to a glob pattern; it defaults to PRED_CLASS_PATTERN.

    Returns the matching Path, or None if nothing matches. If several files
    match (e.g. multiple thresholds), the lexicographically smallest path is
    returned so that repeated runs always pick the same file.
    """
    if pattern is None:
        pattern = PRED_CLASS_PATTERN

    # glob() yields entries in arbitrary (filesystem) order; sort so the choice
    # among multiple thresholds is deterministic.
    matches = sorted(pred_class_dir.glob(pattern.format(year=year, month=month)))
    return matches[0] if matches else None


def build_annual_pred_mask(
    year: int,
    months,
    pred_class_dir: Path
) -> Optional[np.ndarray]:
    """
    Collapse one year's monthly predicted class rasters into an annual burned mask.

    Band 1 of each monthly file found is compared with 1 (the burned class);
    any other value (0 = unburned, 255 = nodata, as assumed for these TIFFs)
    counts as not burned. The annual mask is the per-pixel maximum over the
    months found. Months without a file are skipped.

    Returns a uint8 array of shape (H, W), or None if no monthly file exists.
    """
    yearly = None

    for month in months:
        tif = find_pred_class_month_path(pred_class_dir, year, month)
        if tif is not None:
            with rio.open(tif) as src:
                burned = np.equal(src.read(1), 1).astype(np.uint8)
            # First month seeds the mask; later months are merged by maximum.
            yearly = burned if yearly is None else np.maximum(yearly, burned)

    return yearly


def prepare_ecos_for_dataset(template_path: Path, ecos: gpd.GeoDataFrame):
    """
    Open template raster, check CRS, compute pixel area, and reproject ecos to that CRS.

    Returns a 4-tuple ``(ecos_reproj, transform, (height, width), pixel_area_m2)``:
      - ecos_reproj: ``ecos`` reprojected to the template's CRS
      - transform: the template's affine transform
      - (height, width): the template's raster shape
      - pixel_area_m2: area of one pixel in squared CRS units (treated as m^2
        by the callers; this function only rejects geographic CRSs, it does not
        verify that the projected units are metres or that the projection is
        equal-area)

    Raises ValueError if the template has no CRS or its CRS is geographic.
    """
    with rio.open(template_path) as ds:
        crs = ds.crs
        transform = ds.transform
        height, width = ds.height, ds.width

    if crs is None:
        raise ValueError(f"Template {template_path} has no CRS")

    if crs.is_geographic:
        raise ValueError(
            f"Template CRS {crs} is geographic (degrees). "
            "Reproject rasters to an equal-area projection (meters) before area calculation."
        )

    # Pixel area is |det| of the affine's 2x2 linear part. For the usual
    # north-up grid (b = d = 0) this is |a * e|; including b and d keeps the
    # area correct for rotated or sheared grids as well.
    pixel_area_m2 = abs(transform.a * transform.e - transform.b * transform.d)
    ecos_reproj = ecos.to_crs(crs)

    return ecos_reproj, transform, (height, width), pixel_area_m2


def compute_area_per_ecoregion(
    annual_mask: np.ndarray,
    ecos_reproj: gpd.GeoDataFrame,
    transform,
    pixel_area_m2: float,
    id_col: str,
) -> dict:
    """
    Given an annual 0/1 mask and an ecoregion GeoDataFrame (already in raster CRS),
    compute burned area (Mha) for each ecoregion.

    Each feature's polygon is rasterized onto the mask grid (``transform`` and
    the shape of ``annual_mask``), burned pixels (value 1) inside it are
    counted, and the count is multiplied by ``pixel_area_m2`` and converted to
    Mha. Features with a missing or empty geometry contribute 0. If several
    features share the same ``id_col`` value, their areas are summed instead
    of the last one overwriting the others.

    Returns dict: {ecoregion_id: burned_area_Mha}
    """
    from rasterio.features import geometry_mask  # local import just in case

    height, width = annual_mask.shape
    burned = annual_mask == 1  # loop-invariant, computed once
    results = {}

    for eco_id, geom in zip(ecos_reproj[id_col], ecos_reproj.geometry):
        # Register the id even when the geometry is unusable, so that every
        # ecoregion appears in the output.
        results.setdefault(eco_id, 0.0)
        if geom is None or geom.is_empty:
            continue

        # invert=True: True marks pixels inside the polygon.
        eco_mask = geometry_mask(
            [geom.__geo_interface__],
            transform=transform,
            invert=True,
            out_shape=(height, width),
        )

        n_burned = int((burned & eco_mask).sum())
        area_m2 = n_burned * pixel_area_m2
        results[eco_id] += area_m2 / 1e10  # m^2 -> Mha

    return results


# ============================
# MAIN
# ============================

def main():
    """
    Compute annual per-ecoregion burned area (Mha) for the NEW predictions and
    left-merge it, as column NEW_COL_NAME, into the table read from IN_CSV_OLD;
    the merged table is written to OUT_CSV_NEW.
    """
    # ---------- Inputs: ecoregions and the existing table ----------
    print("Loading ecoregions...")
    ecos = gpd.read_file(ECOS_PATH)
    if ECO_ID_COL not in ecos.columns:
        raise ValueError(f"ECO_ID_COL='{ECO_ID_COL}' not found in {ECOS_PATH}. Columns: {list(ecos.columns)}")

    print(f"Reading existing burned area CSV: {IN_CSV_OLD}")
    previous = pd.read_csv(IN_CSV_OLD)

    # ---------- Grid definition from the native MCD template ----------
    print("Preparing template and reprojecting ecoregions (MCD grid)...")
    native_template = find_template_path_native(NATIVE_MCD_DIR, "cems_e5l_mcd", YEARS_MCD, MONTHS)
    eco_on_grid, grid_transform, grid_shape, cell_area_m2 = prepare_ecos_for_dataset(native_template, ecos)

    # ---------- The new predictions must live on that same grid ----------
    print("Checking that new prediction grid matches native MCD grid...")
    # First existing prediction file, scanning years then months.
    probes = (
        find_pred_class_month_path(NEW_PRED_MCD_CLASS_DIR, yr, mo)
        for yr in YEARS_MCD
        for mo in MONTHS
    )
    pred_template = next((hit for hit in probes if hit is not None), None)
    if pred_template is None:
        raise FileNotFoundError(f"No new prediction class files found in {NEW_PRED_MCD_CLASS_DIR}")

    with rio.open(pred_template) as pred_ds, rio.open(native_template) as native_ds:
        grid_differs = (
            pred_ds.transform != native_ds.transform
            or pred_ds.width != native_ds.width
            or pred_ds.height != native_ds.height
        )
        if grid_differs:
            raise ValueError("New predicted MCD grid does not match native MCD grid.")

    # ---------- Per-year, per-ecoregion burned area ----------
    print("\n[1/1] Processing NEW predictions (NEG=50%, w=2)...")
    records = []  # one dict per (ecoregion, year)

    for year in YEARS_MCD:
        print(f"  Year {year} (new predictions)")
        annual_mask = build_annual_pred_mask(year, MONTHS, NEW_PRED_MCD_CLASS_DIR)
        if annual_mask is None:
            print(f"    -> No new predictions found for {year}, skipping.")
            continue

        if annual_mask.shape != grid_shape:
            raise ValueError(f"Shape mismatch for new predictions {year}: {annual_mask.shape} vs {grid_shape}")

        per_eco = compute_area_per_ecoregion(
            annual_mask, eco_on_grid, grid_transform, cell_area_m2, ECO_ID_COL
        )
        records.extend(
            {ECO_ID_COL: eco_id, "year": year, NEW_COL_NAME: area_Mha}
            for eco_id, area_Mha in per_eco.items()
        )

    print("\nBuilding DataFrame of new prediction areas...")
    df_new = pd.DataFrame(records)

    # ---------- Merge and save ----------
    print("Merging new prediction areas into existing CSV...")
    # Left merge: every row of the existing table is kept; rows without a new
    # value get NaN in the new column.
    merged = previous.merge(df_new, on=[ECO_ID_COL, "year"], how="left")
    merged = merged.sort_values(by=["year", ECO_ID_COL])

    merged.to_csv(OUT_CSV_NEW, index=False)
    print(f"\n✅ Saved updated CSV with new prediction column '{NEW_COL_NAME}' to:\n  {OUT_CSV_NEW}")


# Run the workflow only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Loading ecoregions...
Reading existing burned area CSV: /explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries/burned_area_by_ecoregion_predictions.csv
Preparing template and reprojecting ecoregions (MCD grid)...
Checking that new prediction grid matches native MCD grid...

[1/1] Processing NEW predictions (NEG=50%, w=2)...
  Year 2001 (new predictions)
  Year 2002 (new predictions)
  Year 2003 (new predictions)
  Year 2004 (new predictions)
  Year 2005 (new predictions)
  Year 2006 (new predictions)
  Year 2007 (new predictions)
  Year 2008 (new predictions)
  Year 2009 (new predictions)
  Year 2010 (new predictions)
  Year 2011 (new predictions)
  Year 2012 (new predictions)
  Year 2013 (new predictions)
  Year 2014 (new predictions)
  Year 2015 (new predictions)
  Year 2016 (new predictions)
  Year 2017 (new predictions)
  Year 2018 (new predictions)
  Year 2019 (new predictions)
  Year 2020 (new predictions)
  Year 2021 (new predictions)
  Year 2022 (new predictio

All models

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Compute annual burned area (Mha) per ecoregion for ALL Option 2 models
and merge into existing CSV that already has:

  - ba_mcd_native_Mha
  - ba_firecci_native_Mha
  - ba_pred_Mha   (original predictions)
  - optionally ba_pred_Mha_neg50_w2, etc.

For each prediction directory:

  /explore/nobackup/people/spotter5/clelland_fire_ml/
      predictions_option2_neg{pct:03d}pct_w{w_tag}_mcd/class/

we:

  - For each year (2001–2023), stack 12 monthly predicted class rasters
    (cems_pred_class_YYYY_MM_thr*.tif), take max across months
    -> annual 0/1 mask (1=burned at least once in the year).
  - For each ecoregion polygon:
      * rasterize polygon
      * count burned pixels
      * multiply by pixel area (m²) and convert to Mha.
  - Add a column:

      ba_pred_Mha_neg{pct}_w{w_tag}

    e.g. ba_pred_Mha_neg50_w2

and merge into the existing CSV.
"""

import os
import re
from pathlib import Path
from typing import Optional, Dict, Tuple

import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio as rio
from rasterio.features import geometry_mask

# ============================
# CONFIG
# ============================

# Years for MCD / predictions
# NOTE: range() excludes its end value, so this is 2001..2023 and months 1..12.
YEARS_MCD = list(range(2001, 2024))
MONTHS    = list(range(1, 13))

# --- Native MCD monthly fraction rasters (for template / pixel area only) ---
NATIVE_MCD_DIR = Path(
    "/explore/nobackup/people/spotter5/clelland_fire_ml/training_e5l_cems_mcd_with_fraction"
)

# Root where prediction dirs live (must match prediction script)
PRED_ROOT_DIR = Path("/explore/nobackup/people/spotter5/clelland_fire_ml")

# Pattern of prediction dirs created by the all-models script
# e.g. predictions_option2_neg050pct_w2_mcd
# ("*" are glob wildcards standing for the percentage and the w tag)
PRED_DIR_PATTERN = "predictions_option2_neg*pct_w*_mcd"

# Glob pattern for predicted class TIFFs:
#   cems_pred_class_YYYY_MM_thr{thr}.tif
# {year} and {month} are filled in with str.format(); the trailing "thr*" is a
# glob wildcard, so any threshold suffix matches.
PRED_CLASS_PATTERN = "cems_pred_class_{year}_{month:02d}_thr*.tif"

# --- Ecoregion shapefile ---
ECOS_PATH = "/explore/nobackup/people/spotter5/helene/raw/merge_eco_v2.shp"
# Column in the shapefile that identifies an ecoregion.
ECO_ID_COL = "ecoregion"

# --- Existing + new CSV paths ---
OUT_DIR = Path("/explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries")
# Side effect when this cell/module runs: the output directory is created if missing.
OUT_DIR.mkdir(parents=True, exist_ok=True)

IN_CSV_OLD  = OUT_DIR / "burned_area_by_ecoregion_predictions.csv"
OUT_CSV_NEW = OUT_DIR / "burned_area_by_ecoregion_predictions_all_models.csv"


# ============================
# HELPERS
# ============================

def find_native_monthly_path(
    base_dir: Path,
    prefix: str,
    year: int,
    month: int
) -> Optional[Path]:
    """
    Locate the native monthly raster for ``year``/``month`` inside ``base_dir``.

    Filename variants are probed in this order; the first one that exists wins:
      1. {prefix}_{year}_{month}_with_fraction.tif
      2. {prefix}_{year}_{month:02d}_with_fraction.tif
      3. {prefix}_{year}_{month}.tif
      4. {prefix}_{year}_{month:02d}.tif

    prefix is 'cems_e5l_mcd'.
    Returns the matching Path, or None if none of the variants exists.
    """
    # Unpadded month first, then zero-padded, for each suffix in turn.
    month_tags = (str(month), f"{month:02d}")
    for suffix in ("_with_fraction.tif", ".tif"):
        for tag in month_tags:
            candidate = base_dir / f"{prefix}_{year}_{tag}{suffix}"
            if candidate.exists():
                return candidate
    return None


def find_template_path_native(
    base_dir: Path,
    prefix: str,
    years,
    months
) -> Path:
    """
    Return the first native monthly file that exists, scanning years in order
    and, within each year, months in order. The file serves as a template
    (for CRS, transform, shape).

    Raises FileNotFoundError if no year/month combination has a file.
    """
    # Lazy scan: probing stops at the first hit.
    probes = (
        find_native_monthly_path(base_dir, prefix, yr, mo)
        for yr in years
        for mo in months
    )
    template = next((hit for hit in probes if hit is not None), None)
    if template is None:
        raise FileNotFoundError(f"No native files found for prefix {prefix} in {base_dir}")
    return template


def find_pred_class_month_path(
    pred_class_dir: Path,
    year: int,
    month: int,
    pattern: Optional[str] = None,
) -> Optional[Path]:
    """
    Find predicted class raster for given year/month in a given model dir.
    Expects filenames like cems_pred_class_YYYY_MM_thr*.tif

    ``pattern`` is a str.format template with ``{year}`` and ``{month}`` fields
    that expands to a glob pattern; it defaults to PRED_CLASS_PATTERN.

    Returns the matching Path, or None if nothing matches. If several files
    match (e.g. multiple thresholds), the lexicographically smallest path is
    returned so that repeated runs always pick the same file.
    """
    if pattern is None:
        pattern = PRED_CLASS_PATTERN

    # glob() yields entries in arbitrary (filesystem) order; sort so the choice
    # among multiple thresholds is deterministic.
    matches = sorted(pred_class_dir.glob(pattern.format(year=year, month=month)))
    return matches[0] if matches else None


def build_annual_pred_mask(
    year: int,
    months,
    pred_class_dir: Path
) -> Optional[np.ndarray]:
    """
    Collapse the monthly predicted class rasters of one year into a single
    0/1 "burned at least once" mask.

    Only band 1 is read and only pixels equal to 1 count as burned; every
    other value (including nodata) becomes 0. Months without a file are
    skipped. Returns a uint8 array, or None if no month had a file.
    """
    burned_any_month = None

    for month in months:
        tif_path = find_pred_class_month_path(pred_class_dir, year, month)
        if tif_path is None:
            continue

        with rio.open(tif_path) as src:
            band = src.read(1)

        burned_this_month = (band == 1).astype(np.uint8)
        # Element-wise max keeps a pixel burned once any month flagged it.
        burned_any_month = (
            burned_this_month
            if burned_any_month is None
            else np.maximum(burned_any_month, burned_this_month)
        )

    return burned_any_month


def prepare_ecos_for_dataset(template_path: Path, ecos: gpd.GeoDataFrame):
    """
    Read grid metadata from a template raster and bring the ecoregions onto it.

    Returns a 4-tuple: (ecoregions reprojected to the raster CRS, affine
    transform, (height, width), pixel area in squared CRS units).

    Raises ValueError if the template has no CRS or a geographic (degree) CRS,
    because pixel area is derived directly from the transform's cell size.
    """
    with rio.open(template_path) as src:
        raster_crs = src.crs
        affine = src.transform
        grid_shape = (src.height, src.width)

    if raster_crs is None:
        raise ValueError(f"Template {template_path} has no CRS")

    if raster_crs.is_geographic:
        raise ValueError(
            f"Template CRS {raster_crs} is geographic (degrees). "
            "Reproject rasters to an equal-area projection (meters) before area calculation."
        )

    # affine.a is the cell width and affine.e the (negative) cell height.
    cell_area = abs(affine.a * affine.e)
    ecos_on_grid = ecos.to_crs(raster_crs)

    return ecos_on_grid, affine, grid_shape, cell_area


def compute_area_per_ecoregion(
    annual_mask: np.ndarray,
    ecos_reproj: gpd.GeoDataFrame,
    transform,
    pixel_area_m2: float,
    id_col: str,
) -> Dict[str, float]:
    """
    Compute burned area (Mha) inside every ecoregion polygon.

    Args:
        annual_mask: 2-D array; pixels equal to 1 are counted as burned.
        ecos_reproj: ecoregions already in the raster CRS.
        transform: affine transform of the raster grid, used to rasterize polygons.
        pixel_area_m2: area of one pixel in square metres.
        id_col: column holding the ecoregion identifier.

    Returns:
        {ecoregion_id: burned_area_Mha}. Rows with a missing or empty geometry
        map to 0.0. If an identifier occurs on several rows, the last row wins.
    """
    height, width = annual_mask.shape
    # Loop-invariant: build the boolean burned grid once, not once per polygon.
    burned = annual_mask == 1
    results = {}

    for _, row in ecos_reproj.iterrows():
        eco_id = row[id_col]
        geom = row.geometry
        if geom is None or geom.is_empty:
            results[eco_id] = 0.0
            continue

        # invert=True -> True for pixels that fall inside the polygon.
        eco_mask = geometry_mask(
            [geom.__geo_interface__],
            transform=transform,
            invert=True,
            out_shape=(height, width),
        )

        n_burned = np.count_nonzero(burned & eco_mask)
        results[eco_id] = float(n_burned * pixel_area_m2 / 1e10)  # m^2 -> Mha

    return results


def parse_model_tag_from_dir(dirname: str) -> Optional[Tuple[int, int]]:
    """
    Extract (neg_percent, w_tag) as integers from a prediction directory name.

    Example: 'predictions_option2_neg050pct_w2_mcd' -> (50, 2).
    Leading zeros of the percentage are dropped by the int conversion.
    Returns None when the name does not contain the expected tag.
    """
    # Two or three digits are accepted for the percentage part.
    found = re.search(r"neg(\d{2,3})pct_w(\d+)_mcd", dirname)
    if found is None:
        return None
    neg_digits, w_digits = found.groups()
    return int(neg_digits), int(w_digits)


# ============================
# MAIN
# ============================

def main():
    """
    Extend the existing burned-area CSV with one column per prediction model.

    For every model directory whose name yields a (neg, w) tag, annual burned
    masks are built for each year in YEARS_MCD, summarised per ecoregion and
    left-merged onto the old table as 'ba_pred_Mha_neg<neg>_w<w>'. The result
    is written to OUT_CSV_NEW.
    """
    # ---------- Inputs: ecoregion polygons and the previously written table ----------
    print("Loading ecoregions...")
    ecos = gpd.read_file(ECOS_PATH)
    if ECO_ID_COL not in ecos.columns:
        raise ValueError(f"ECO_ID_COL='{ECO_ID_COL}' not found in {ECOS_PATH}. Columns: {list(ecos.columns)}")

    print(f"Reading existing burned area CSV: {IN_CSV_OLD}")
    df_old = pd.read_csv(IN_CSV_OLD)

    # ---------- Grid template (native MCD) and ecoregions in that CRS ----------
    print("Preparing template and reprojecting ecoregions (MCD grid)...")
    tmpl_mcd_native = find_template_path_native(NATIVE_MCD_DIR, "cems_e5l_mcd", YEARS_MCD, MONTHS)
    ecos_mcd, transform_mcd, shape_mcd, pixel_area_mcd = prepare_ecos_for_dataset(tmpl_mcd_native, ecos)

    # ---------- Discover model 'class' directories and parse their tags ----------
    pred_class_dirs = sorted(PRED_ROOT_DIR.glob(f"{PRED_DIR_PATTERN}/class"))
    if not pred_class_dirs:
        raise FileNotFoundError(f"No prediction class directories matching '{PRED_DIR_PATTERN}' under {PRED_ROOT_DIR}")

    print("\nFound prediction model class directories:")
    model_info = []  # entries are (neg_percent, w_tag, class_dir)
    for class_dir in pred_class_dirs:
        model_dir_name = class_dir.parent.name
        parsed = parse_model_tag_from_dir(model_dir_name)
        if parsed is None:
            print(f"  [SKIP] Could not parse neg/w from: {model_dir_name}")
            continue
        print(f"  - {model_dir_name}  -> neg={parsed[0]}%, w={parsed[1]}")
        model_info.append((parsed[0], parsed[1], class_dir))

    if not model_info:
        raise RuntimeError("No valid model prediction directories parsed from names.")

    # ---------- Sanity check: first model's grid must equal the native grid ----------
    print("\nChecking that prediction grids match native MCD grid (using the first model)...")
    first_class_dir = model_info[0][2]
    # Lazy scan in year-major order; stops at the first existing file.
    monthly_lookups = (
        find_pred_class_month_path(first_class_dir, y, m)
        for y in YEARS_MCD
        for m in MONTHS
    )
    tmpl_new_pred = next((p for p in monthly_lookups if p is not None), None)

    if tmpl_new_pred is None:
        raise FileNotFoundError(f"No prediction class files found in {first_class_dir}")

    with rio.open(tmpl_new_pred) as ds_pred, rio.open(tmpl_mcd_native) as ds_nat:
        grid_differs = (
            (ds_pred.transform != ds_nat.transform)
            or (ds_pred.width != ds_nat.width)
            or (ds_pred.height != ds_nat.height)
        )
        if grid_differs:
            raise ValueError("Predicted MCD grid does not match native MCD grid.")

    # ---------- One new column per model, merged onto a copy of the old table ----------
    df_merged = df_old.copy()

    for neg_percent, w_tag, class_dir in model_info:
        col_name = f"ba_pred_Mha_neg{neg_percent}_w{w_tag}"
        print("\n" + "=" * 80)
        print(f"Processing model: neg={neg_percent}%, w={w_tag}")
        print(f"Class rasters dir: {class_dir}")
        print(f"New column name:   {col_name}")

        rows = []  # one record per (ecoregion, year)

        for year in YEARS_MCD:
            print(f"  Year {year}...")
            annual_mask = build_annual_pred_mask(year, MONTHS, class_dir)
            if annual_mask is None:
                print(f"    -> No predictions found for {year} in {class_dir}, skipping.")
                continue

            if annual_mask.shape != shape_mcd:
                raise ValueError(
                    f"Shape mismatch for predictions {year} in {class_dir}: "
                    f"{annual_mask.shape} vs {shape_mcd}"
                )

            per_eco = compute_area_per_ecoregion(
                annual_mask, ecos_mcd, transform_mcd, pixel_area_mcd, ECO_ID_COL
            )
            rows.extend(
                {ECO_ID_COL: eco_id, "year": year, col_name: area_Mha}
                for eco_id, area_Mha in per_eco.items()
            )

        if not rows:
            print(f"  -> No valid annual masks computed for model neg{neg_percent}_w{w_tag}; skipping merge.")
            continue

        df_new = pd.DataFrame(rows)

        print(f"  Merging {len(df_new)} rows into main CSV for column '{col_name}'...")
        # Left merge keeps every row of the old table; unmatched rows get NaN.
        df_merged = df_merged.merge(df_new, on=[ECO_ID_COL, "year"], how="left")

    # ---------- Final save ----------
    df_merged = df_merged.sort_values(by=["year", ECO_ID_COL])
    df_merged.to_csv(OUT_CSV_NEW, index=False)

    print("\n✅ Saved updated CSV with all model prediction columns to:")
    print(f"  {OUT_CSV_NEW}")


if __name__ == "__main__":
    main()


Loading ecoregions...
Reading existing burned area CSV: /explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries/burned_area_by_ecoregion_predictions.csv
Preparing template and reprojecting ecoregions (MCD grid)...

Found prediction model class directories:
  - predictions_option2_neg010pct_w1_mcd  -> neg=10%, w=1
  - predictions_option2_neg010pct_w2_mcd  -> neg=10%, w=2
  - predictions_option2_neg010pct_w3_mcd  -> neg=10%, w=3
  - predictions_option2_neg010pct_w5_mcd  -> neg=10%, w=5
  - predictions_option2_neg020pct_w1_mcd  -> neg=20%, w=1
  - predictions_option2_neg020pct_w2_mcd  -> neg=20%, w=2
  - predictions_option2_neg020pct_w3_mcd  -> neg=20%, w=3
  - predictions_option2_neg020pct_w5_mcd  -> neg=20%, w=5
  - predictions_option2_neg030pct_w1_mcd  -> neg=30%, w=1
  - predictions_option2_neg030pct_w2_mcd  -> neg=30%, w=2
  - predictions_option2_neg030pct_w3_mcd  -> neg=30%, w=3
  - predictions_option2_neg030pct_w5_mcd  -> neg=30%, w=5
  - predictions_option2_neg040

Plot only (reads the precomputed CSV; no raster processing)

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Read precomputed burned area CSV and create a multipanel plot:

  - Input: burned_area_by_ecoregion_predictions.csv
      columns:
        - ecoregion (ECO_ID_COL)
        - year
        - ba_mcd_native_Mha
        - ba_firecci_native_Mha
        - ba_pred_Mha

  - Output:
      burned_area_multipanel_by_ecoregion_floating_y.png
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# ============================
# CONFIG
# ============================

# Name of the CSV column that identifies the ecoregion (one subplot per value).
ECO_ID_COL = "ecoregion"

# Input table and output figure live in the same summary directory.
OUT_DIR = Path("/explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries")
IN_CSV  = OUT_DIR / "burned_area_by_ecoregion_predictions.csv"
OUT_PNG = OUT_DIR / "burned_area_multipanel_by_ecoregion_floating_y.png"

# Burned-area columns drawn in every panel, in plotting order.
# main() raises if any of them is missing from the CSV.
COLS_AREA = [
    "ba_mcd_native_Mha",
    "ba_firecci_native_Mha",
    "ba_pred_Mha",
]

# Legend text for each column in COLS_AREA (every column needs an entry here).
LABELS = {
    "ba_mcd_native_Mha": "MCD64A1",
    "ba_firecci_native_Mha": "Fire CCI",
    "ba_pred_Mha": "Predictions",
}

# Ecoregions to exclude from plotting (matched exactly against ECO_ID_COL values)
EXCLUDE_ECOS = {
    "WATER",
    "MIXED WOOD SHIELD",
    "TEMPERATE PRAIRIES",
    "WESTERN CORDILLERA",
}

# ============================
# MAIN
# ============================

def main():
    """
    Draw one panel per ecoregion comparing the burned-area series in COLS_AREA.

    Reads IN_CSV, drops the ecoregions listed in EXCLUDE_ECOS, arranges the
    remaining ones on a grid of at most four columns with independent y-axes,
    and saves the figure to OUT_PNG.

    Raises:
        ValueError: if the CSV lacks the ecoregion column, 'year', or any
            column named in COLS_AREA.
    """
    print(f"Reading CSV: {IN_CSV}")
    df = pd.read_csv(IN_CSV)

    # Fail early if a key or data column is missing.
    missing_cols = [c for c in [ECO_ID_COL, "year"] + COLS_AREA if c not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing expected columns in CSV: {missing_cols}")

    # Sorting by year makes every line run left to right.
    df = df.sort_values(by=["year", ECO_ID_COL])

    # Unique ecoregions excluding the undesired ones
    ecos_all = sorted(df[ECO_ID_COL].dropna().unique())
    ecos_list = [e for e in ecos_all if e not in EXCLUDE_ECOS]

    if not ecos_list:
        print("No ecoregions left after exclusion; nothing to plot.")
        return

    n_ecos = len(ecos_list)
    print(f"Found {n_ecos} ecoregions after exclusion: {ecos_list}")

    # Layout: up to 4 columns
    ncols = min(n_ecos, 4)
    nrows = int(np.ceil(n_ecos / ncols))

    # squeeze=False always returns a 2-D axes array, whatever the grid size.
    fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(4 * ncols, 3 * nrows),
        sharex=True,
        sharey=False,  # floating y-axis
        squeeze=False,
    )

    # One fixed colour per dataset: a series skipped in some panel must not
    # shift the colour cycle, otherwise colours stop matching the shared legend.
    series_colors = {c: f"C{k}" for k, c in enumerate(COLS_AREA)}
    # Column -> first line drawn for it, gathered across ALL panels so the
    # legend is complete even if the first panel lacks a dataset.
    legend_handles = {}

    # Plot per ecoregion
    for i, eco_id in enumerate(ecos_list):
        row, col = divmod(i, ncols)
        ax = axes[row, col]

        df_eco = df[df[ECO_ID_COL] == eco_id]

        for col_name in COLS_AREA:
            if not df_eco[col_name].notna().any():
                continue  # nothing to draw for this dataset in this ecoregion
            (line,) = ax.plot(
                df_eco["year"],
                df_eco[col_name],
                marker="o",
                color=series_colors[col_name],
                label=LABELS[col_name],
            )
            legend_handles.setdefault(col_name, line)

        ax.set_title(str(eco_id))
        ax.grid(True, ls="--", alpha=0.4)

        # Floating y-axis: autoscale unless there is no data or it is all zero.
        chunks = [df_eco[c].dropna().values for c in COLS_AREA]
        ydata = np.concatenate(chunks) if chunks else np.empty(0)
        if ydata.size == 0 or np.nanmax(ydata) == 0:
            ax.set_ylim(0, 1.0)

        # A panel is the lowest visible one in its column when the slot below
        # it is unused. With sharex=True its tick labels are hidden by default,
        # so re-enable them there.
        if i + ncols >= n_ecos:
            ax.set_xlabel("Year")
            ax.tick_params(labelbottom=True)
        if col == 0:
            ax.set_ylabel("Burned area (Mha)")

    # Hide unused panels
    for unused_ax in axes.ravel()[n_ecos:]:
        unused_ax.axis("off")

    # Global legend at bottom, in COLS_AREA order
    if legend_handles:
        shown = [c for c in COLS_AREA if c in legend_handles]
        fig.legend(
            [legend_handles[c] for c in shown],
            [LABELS[c] for c in shown],
            loc="lower center",
            ncol=len(shown),
            bbox_to_anchor=(0.5, -0.02),
        )

    fig.tight_layout(rect=[0, 0.05, 1, 1])
    fig.savefig(OUT_PNG, dpi=150, bbox_inches="tight")
    plt.close(fig)

    print(f"Saved multipanel burned area plot to:\n  {OUT_PNG}")


if __name__ == "__main__":
    main()


New plot with a fourth line (neg50 w2 predictions)

In [12]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Read precomputed burned area CSV and create a multipanel plot:

  - Input: burned_area_by_ecoregion_predictions_neg50_w2.csv
      columns:
        - ecoregion (ECO_ID_COL)
        - year
        - ba_mcd_native_Mha
        - ba_firecci_native_Mha
        - ba_pred_Mha                (older prediction set)
        - ba_pred_Mha_neg50_w2       (new NEG=50%, w=2 predictions)

  - Output:
      burned_area_multipanel_by_ecoregion_floating_y_neg50_w2.png
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# ============================
# CONFIG
# ============================

# Name of the CSV column that identifies the ecoregion (one subplot per value).
ECO_ID_COL = "ecoregion"

# Input table (with the extra neg50/w2 column) and output figure.
OUT_DIR = Path("/explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries")
IN_CSV  = OUT_DIR / "burned_area_by_ecoregion_predictions_neg50_w2.csv"
OUT_PNG = OUT_DIR / "burned_area_multipanel_by_ecoregion_floating_y_neg50_w2.png"

# Burned-area columns drawn in every panel, in plotting order.
# main() raises if any of them is missing from the CSV.
COLS_AREA = [
    "ba_mcd_native_Mha",
    "ba_firecci_native_Mha",
    "ba_pred_Mha",
    "ba_pred_Mha_neg50_w2",
]

# Legend text for each column in COLS_AREA; per these labels the older
# prediction column is shown as the "neg40 w3" model.
LABELS = {
    "ba_mcd_native_Mha":      "MCD64A1",
    "ba_firecci_native_Mha":  "Fire CCI",
    "ba_pred_Mha":            "Predictions (neg40 w3)",
    "ba_pred_Mha_neg50_w2":   "Predictions (neg50 w2)",
}

# Ecoregions to exclude from plotting (matched exactly against ECO_ID_COL values)
EXCLUDE_ECOS = {
    "WATER",
    "MIXED WOOD SHIELD",
    "TEMPERATE PRAIRIES",
    "WESTERN CORDILLERA",
}

# ============================
# MAIN
# ============================

def main():
    """
    Draw one panel per ecoregion comparing the burned-area series in COLS_AREA
    (native products plus both prediction sets).

    Reads IN_CSV, drops the ecoregions listed in EXCLUDE_ECOS, arranges the
    remaining ones on a grid of at most four columns with independent y-axes,
    and saves the figure to OUT_PNG.

    Raises:
        ValueError: if the CSV lacks the ecoregion column, 'year', or any
            column named in COLS_AREA.
    """
    print(f"Reading CSV: {IN_CSV}")
    df = pd.read_csv(IN_CSV)

    # Fail early if a key or data column is missing.
    missing_cols = [c for c in [ECO_ID_COL, "year"] + COLS_AREA if c not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing expected columns in CSV: {missing_cols}")

    # Sorting by year makes every line run left to right.
    df = df.sort_values(by=["year", ECO_ID_COL])

    # Unique ecoregions excluding the undesired ones
    ecos_all = sorted(df[ECO_ID_COL].dropna().unique())
    ecos_list = [e for e in ecos_all if e not in EXCLUDE_ECOS]

    if not ecos_list:
        print("No ecoregions left after exclusion; nothing to plot.")
        return

    n_ecos = len(ecos_list)
    print(f"Found {n_ecos} ecoregions after exclusion: {ecos_list}")

    # Layout: up to 4 columns
    ncols = min(n_ecos, 4)
    nrows = int(np.ceil(n_ecos / ncols))

    # squeeze=False always returns a 2-D axes array, whatever the grid size.
    fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(4 * ncols, 3 * nrows),
        sharex=True,
        sharey=False,  # floating y-axis
        squeeze=False,
    )

    # One fixed colour per dataset: a series skipped in some panel must not
    # shift the colour cycle, otherwise colours stop matching the shared legend.
    series_colors = {c: f"C{k}" for k, c in enumerate(COLS_AREA)}
    # Column -> first line drawn for it, gathered across ALL panels so the
    # legend is complete even if the first panel lacks a dataset.
    legend_handles = {}

    # Plot per ecoregion
    for i, eco_id in enumerate(ecos_list):
        row, col = divmod(i, ncols)
        ax = axes[row, col]

        df_eco = df[df[ECO_ID_COL] == eco_id]

        for col_name in COLS_AREA:
            if not df_eco[col_name].notna().any():
                continue  # nothing to draw for this dataset in this ecoregion
            (line,) = ax.plot(
                df_eco["year"],
                df_eco[col_name],
                marker="o",
                color=series_colors[col_name],
                label=LABELS[col_name],
            )
            legend_handles.setdefault(col_name, line)

        ax.set_title(str(eco_id))
        ax.grid(True, ls="--", alpha=0.4)

        # Floating y-axis: autoscale unless there is no data or it is all zero.
        chunks = [df_eco[c].dropna().values for c in COLS_AREA]
        ydata = np.concatenate(chunks) if chunks else np.empty(0)
        if ydata.size == 0 or np.nanmax(ydata) == 0:
            ax.set_ylim(0, 1.0)

        # A panel is the lowest visible one in its column when the slot below
        # it is unused. With sharex=True its tick labels are hidden by default,
        # so re-enable them there.
        if i + ncols >= n_ecos:
            ax.set_xlabel("Year")
            ax.tick_params(labelbottom=True)
        if col == 0:
            ax.set_ylabel("Burned area (Mha)")

    # Hide unused panels
    for unused_ax in axes.ravel()[n_ecos:]:
        unused_ax.axis("off")

    # Global legend at bottom, in COLS_AREA order
    if legend_handles:
        shown = [c for c in COLS_AREA if c in legend_handles]
        fig.legend(
            [legend_handles[c] for c in shown],
            [LABELS[c] for c in shown],
            loc="lower center",
            ncol=len(shown),
            bbox_to_anchor=(0.5, -0.02),
        )

    fig.tight_layout(rect=[0, 0.05, 1, 1])
    fig.savefig(OUT_PNG, dpi=150, bbox_inches="tight")
    plt.close(fig)

    print(f"Saved multipanel burned area plot to:\n  {OUT_PNG}")


if __name__ == "__main__":
    main()


Reading CSV: /explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries/burned_area_by_ecoregion_predictions_neg50_w2.csv
Found 23 ecoregions after exclusion: ['ALASKA BOREAL INTERIOR', 'ALASKA TUNDRA', 'ARCTIC CORDILLERA', 'Arctic Deserts and Tundra', 'BOREAL CORDILLERA', 'BOREAL PLAIN', 'BROOKS RANGE TUNDRA', 'Central Taiga', 'Forest Tundra', 'HUDSON PLAIN', 'MARINE WEST COAST FOREST', 'Montane Boreal', 'Montane Sub-Arctic', 'Montane Sub-Boreal', 'NORTHERN ARCTIC', 'Northern Taiga', 'SOFTWOOD SHIELD', 'SOUTHERN ARCTIC', 'Southern Taiga', 'TAIGA CORDILLERA', 'TAIGA PLAIN', 'TAIGA SHIELD', 'Wetlands']
Saved multipanel burned area plot to:
  /explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries/burned_area_multipanel_by_ecoregion_floating_y_neg50_w2.png


All plots: one multipanel figure per prediction model

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
For EACH prediction column (ba_pred_Mha, ba_pred_Mha_negXX_wY, ...),
create a multipanel plot (ecoregions as subplots) comparing:

    - MCD64A1 (ba_mcd_native_Mha)
    - Fire CCI (ba_firecci_native_Mha)
    - That specific prediction column

Input:
    burned_area_by_ecoregion_predictions_all_models.csv

Output:
    <OUT_DIR>/multipanel_per_model/burned_area_multipanel_<pred_col>.png
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# ============================
# CONFIG
# ============================

# Name of the CSV column that identifies the ecoregion (one subplot per value).
ECO_ID_COL = "ecoregion"

# Input: the table holding one burned-area column per prediction model.
OUT_DIR = Path("/explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries")
IN_CSV  = OUT_DIR / "burned_area_by_ecoregion_predictions_all_models.csv"

# Folder where all multipanel PNGs (one per model) will go;
# it is created as soon as this cell runs, if it is missing.
OUT_PNG_DIR = OUT_DIR / "multipanel_per_model"
OUT_PNG_DIR.mkdir(parents=True, exist_ok=True)

# Base / native columns (drawn in every figure next to one prediction column)
MCD_COL      = "ba_mcd_native_Mha"
FIRECCI_COL  = "ba_firecci_native_Mha"

# Legend text for the two native columns.
BASE_LABELS = {
    MCD_COL: "MCD64A1",
    FIRECCI_COL: "Fire CCI",
}

# Ecoregions to exclude from plotting (matched exactly against ECO_ID_COL values)
EXCLUDE_ECOS = {
    "WATER",
    "MIXED WOOD SHIELD",
    "TEMPERATE PRAIRIES",
    "WESTERN CORDILLERA",
}


def nice_pred_label(colname: str) -> str:
    """
    Build a human-readable legend label from a prediction column name.

      ba_pred_Mha              -> "Pred (orig)"
      ba_pred_Mha_neg50_w2     -> "Pred neg50 w2"

    Names that do not follow the 'ba_pred_Mha' scheme are returned unchanged.
    """
    base = "ba_pred_Mha"
    tagged_prefix = "ba_pred_Mha_"

    if colname == base:
        return "Pred (orig)"
    if not colname.startswith(tagged_prefix):
        return colname

    # Drop the prefix text, then turn the remaining underscores into spaces.
    model_tag = colname.replace(tagged_prefix, "")
    return "Pred " + model_tag.replace("_", " ")


# ============================
# MAIN
# ============================

def main():
    """
    Write one multipanel figure per prediction column found in IN_CSV.

    Each figure has one panel per ecoregion (those in EXCLUDE_ECOS are
    dropped) and compares MCD64A1, Fire CCI and a single prediction column.
    Figures go to OUT_PNG_DIR as 'burned_area_multipanel_<tag>.png', where
    <tag> is the column name without the 'ba_pred_Mha_' prefix ('orig' for
    the bare 'ba_pred_Mha' column).

    Raises:
        ValueError: if a base column is missing or no prediction column exists.
    """
    print(f"Reading CSV: {IN_CSV}")
    df = pd.read_csv(IN_CSV)

    # Basic sanity check
    needed_base = [ECO_ID_COL, "year", MCD_COL, FIRECCI_COL]
    missing_base = [c for c in needed_base if c not in df.columns]
    if missing_base:
        raise ValueError(f"Missing base columns in CSV: {missing_base}")

    # Sorting by year makes every line run left to right.
    df = df.sort_values(by=["year", ECO_ID_COL])

    # Find all prediction columns (anything starting with ba_pred_Mha)
    pred_cols = [c for c in df.columns if c.startswith("ba_pred_Mha")]
    if not pred_cols:
        raise ValueError("No prediction columns starting with 'ba_pred_Mha' found in CSV.")

    print("Prediction columns to plot:")
    for c in pred_cols:
        print("  -", c)

    # Unique ecoregions excluding the undesired ones
    ecos_all = sorted(df[ECO_ID_COL].dropna().unique())
    ecos_list = [e for e in ecos_all if e not in EXCLUDE_ECOS]

    if not ecos_list:
        print("No ecoregions left after exclusion; nothing to plot.")
        return

    n_ecos = len(ecos_list)
    print(f"Found {n_ecos} ecoregions after exclusion: {ecos_list}")

    # Layout: up to 4 columns of subplots (identical for every model, so
    # computed once outside the loop).
    ncols = min(n_ecos, 4)
    nrows = int(np.ceil(n_ecos / ncols))

    # Loop over each prediction column and make a separate multipanel PNG
    for pred_col in pred_cols:
        print("\n==============================================")
        print(f"Creating multipanel plot for prediction column: {pred_col}")

        # Columns to plot in this figure, with their legend text
        cols_area = [MCD_COL, FIRECCI_COL, pred_col]
        labels = {
            MCD_COL: BASE_LABELS[MCD_COL],
            FIRECCI_COL: BASE_LABELS[FIRECCI_COL],
            pred_col: nice_pred_label(pred_col),
        }
        # One fixed colour per dataset: a series skipped in some panel must
        # not shift the colour cycle, otherwise colours stop matching the
        # shared legend.
        series_colors = {c: f"C{k}" for k, c in enumerate(cols_area)}

        # squeeze=False always returns a 2-D axes array, whatever the grid size.
        fig, axes = plt.subplots(
            nrows=nrows,
            ncols=ncols,
            figsize=(4 * ncols, 3 * nrows),
            sharex=True,
            sharey=False,  # floating y-axis per panel
            squeeze=False,
        )

        # Column -> first line drawn for it, gathered across ALL panels so the
        # legend is complete even if the first panel lacks a dataset.
        legend_handles = {}

        # Plot per ecoregion
        for i, eco_id in enumerate(ecos_list):
            row, col = divmod(i, ncols)
            ax = axes[row, col]

            df_eco = df[df[ECO_ID_COL] == eco_id]

            for col_name in cols_area:
                if not df_eco[col_name].notna().any():
                    continue  # nothing to draw for this dataset here
                (line,) = ax.plot(
                    df_eco["year"],
                    df_eco[col_name],
                    marker="o",
                    color=series_colors[col_name],
                    label=labels[col_name],
                )
                legend_handles.setdefault(col_name, line)

            ax.set_title(str(eco_id))
            ax.grid(True, ls="--", alpha=0.4)

            # Floating y-axis: autoscale unless there is no data or all zeros.
            ydata = np.concatenate([df_eco[c].dropna().values for c in cols_area])
            if ydata.size == 0 or np.nanmax(ydata) == 0:
                ax.set_ylim(0, 1.0)

            # A panel is the lowest visible one in its column when the slot
            # below it is unused. With sharex=True its tick labels are hidden
            # by default, so re-enable them there.
            if i + ncols >= n_ecos:
                ax.set_xlabel("Year")
                ax.tick_params(labelbottom=True)
            if col == 0:
                ax.set_ylabel("Burned area (Mha)")

        # Hide unused panels
        for unused_ax in axes.ravel()[n_ecos:]:
            unused_ax.axis("off")

        # Global legend at bottom, in plotting order
        if legend_handles:
            shown = [c for c in cols_area if c in legend_handles]
            fig.legend(
                [legend_handles[c] for c in shown],
                [labels[c] for c in shown],
                loc="lower center",
                ncol=len(shown),
                bbox_to_anchor=(0.5, -0.02),
            )

        # Clean filenames: replace spaces with underscores
        pred_tag = pred_col.replace("ba_pred_Mha_", "") if pred_col != "ba_pred_Mha" else "orig"
        pred_tag = pred_tag.replace(" ", "_")

        out_png = OUT_PNG_DIR / f"burned_area_multipanel_{pred_tag}.png"

        fig.tight_layout(rect=[0, 0.05, 1, 1])
        fig.savefig(out_png, dpi=150, bbox_inches="tight")
        # Close explicitly so figures do not accumulate across models.
        plt.close(fig)

        print(f"  -> Saved multipanel PNG for {pred_col} to:\n     {out_png}")

    print("\n✅ Done. One multipanel PNG per prediction model written to:")
    print(f"  {OUT_PNG_DIR}")


if __name__ == "__main__":
    main()


Reading CSV: /explore/nobackup/people/spotter5/clelland_fire_ml/burned_area_summaries/burned_area_by_ecoregion_predictions_all_models.csv
Prediction columns to plot:
  - ba_pred_Mha
  - ba_pred_Mha_neg10_w1
  - ba_pred_Mha_neg10_w2
  - ba_pred_Mha_neg10_w3
  - ba_pred_Mha_neg10_w5
  - ba_pred_Mha_neg20_w1
  - ba_pred_Mha_neg20_w2
  - ba_pred_Mha_neg20_w3
  - ba_pred_Mha_neg20_w5
  - ba_pred_Mha_neg30_w1
  - ba_pred_Mha_neg30_w2
  - ba_pred_Mha_neg30_w3
  - ba_pred_Mha_neg30_w5
  - ba_pred_Mha_neg40_w1
  - ba_pred_Mha_neg40_w2
  - ba_pred_Mha_neg40_w3
  - ba_pred_Mha_neg40_w5
  - ba_pred_Mha_neg50_w1
  - ba_pred_Mha_neg50_w2
  - ba_pred_Mha_neg50_w3
  - ba_pred_Mha_neg50_w5
  - ba_pred_Mha_neg60_w1
  - ba_pred_Mha_neg60_w2
  - ba_pred_Mha_neg60_w3
  - ba_pred_Mha_neg60_w5
  - ba_pred_Mha_neg70_w1
  - ba_pred_Mha_neg70_w2
  - ba_pred_Mha_neg70_w3
  - ba_pred_Mha_neg70_w5
  - ba_pred_Mha_neg80_w1
  - ba_pred_Mha_neg80_w2
  - ba_pred_Mha_neg80_w3
  - ba_pred_Mha_neg80_w5
  - ba_pred_Mha_ne