Distributions

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Make multi-panel density plots for predictors in the 100x Train/Val dataset.

- Input (100x Train/Val parquet directory):
    /explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_trainval_100x

- For each numeric predictor:
    * Plot density (KDE) with x = predictor, y = density
    * Color curves by burned class (0 = no fire, 1 = fire)

- To avoid huge data volume, we:
    * Keep ALL burned==1 samples (with fraction > 0.5)
    * Downsample burned==0 samples (with fraction == 0) to match the number of 1s

- Global Cleaning:
    * Remove rows where ANY numeric variable is < -1000 (nodata/fill values).

- Output:
    One multi-panel PNG with all predictor distributions.

    NOTE: This script *only* uses:
        - 0-class where fraction == 0
        - 1-class where fraction > 0.5
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.dataset as ds

# ============================================================
# CONFIG
# ============================================================

# Seed reused by the negative down-sampling and by the final row shuffle.
RANDOM_STATE = 42

# Input: directory containing the 100x Train/Val parquet files.
TRAINVAL_DIR = "/explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_trainval_100x"

# Output: <OUT_ROOT>/predictor_distributions_100x/<figure>.png
OUT_ROOT = "/explore/nobackup/people/spotter5/clelland_fire_ml/ml_training/neg_ratio_experiments_globaltest"
OUT_DIR = os.path.join(OUT_ROOT, "predictor_distributions_100x")
# NOTE(review): the other density-plot cell in this notebook (the variant
# without the "< -1000" cleaning) saves to this exact same path, so whichever
# cell runs last silently overwrites the other's figure.
FIG_PATH = os.path.join(OUT_DIR, "trainval100x_predictor_distributions_balanced_frac0_vs_gt0p5.png")

# Create the output directory up front; a no-op when it already exists.
os.makedirs(OUT_DIR, exist_ok=True)

# Label, bookkeeping and location/time columns that must not be plotted as predictors.
DROP_COLS = {
    "fraction",
    "burned",
    "bin",
    "year",
    "month",
    "latitude",
    "longitude",
}

# ============================================================
# 1. LOAD TRAIN/VAL 100x DATASET
# ============================================================

# Announce the source directory before starting the read.
print(f"Loading Train/Val 100x dataset from:\n  {TRAINVAL_DIR}")

# Open the parquet directory as a pyarrow dataset, read it into a single
# Arrow table, then convert that table to a pandas DataFrame.
tv_dataset = ds.dataset(TRAINVAL_DIR, format="parquet")
tv_table = tv_dataset.to_table()
df_tv_full = tv_table.to_pandas()

n_rows_raw = len(df_tv_full)
print(f"Train/Val 100x raw size: {n_rows_raw:,} rows\n")

# ============================================================
# 2. PREPARE LABEL + PREDICTORS
# ============================================================

# Values below this threshold are treated as nodata/fill values: a row is
# discarded when ANY numeric predictor falls below it.
FILL_VALUE_THRESHOLD = -1000

# Work on a copy so the raw frame loaded above is left untouched.
df = df_tv_full.copy()

# Ensure fraction is clean and present
if "fraction" not in df.columns:
    raise ValueError("Column 'fraction' is required in the Train/Val dataset.")

df["fraction"] = df["fraction"].astype("float32").clip(0, 1)

# Create burned label if needed (1 if fraction > 0.5 else 0)
if "burned" not in df.columns:
    df["burned"] = (df["fraction"] > 0.5).astype("uint8")

# Replace inf with NaN so the NaN drop further down also removes infinities
df = df.replace([np.inf, -np.inf], np.nan)

# Define predictor columns: everything except DROP_COLS
predictors = [c for c in df.columns if c not in DROP_COLS]

print(f"Total columns: {len(df.columns)}")
print(f"Num predictors (before numeric filter): {len(predictors)}\n")

# Build a frame with predictors + label
tv_plot = df[predictors].copy()
tv_plot["burned"]   = df["burned"].astype("uint8")
tv_plot["fraction"] = df["fraction"].astype("float32")

# Keep only numeric predictors for KDE.
# pd.api.types.is_numeric_dtype is used instead of np.issubdtype because the
# latter raises TypeError on pandas extension dtypes (categorical, nullable
# integers, Arrow-backed strings), which a parquet -> pandas conversion can
# produce. Booleans are excluded explicitly to keep the previous selection:
# np.issubdtype(bool, np.number) is False while is_numeric_dtype(bool) is True.
numeric_predictors = [
    c for c in predictors
    if pd.api.types.is_numeric_dtype(tv_plot[c].dtype)
    and not pd.api.types.is_bool_dtype(tv_plot[c].dtype)
]

print(f"Numeric predictors for plotting: {len(numeric_predictors)}")
if not numeric_predictors:
    raise RuntimeError("No numeric predictors found to plot.")

# --- GLOBAL CLEANING FOR FILL VALUES (< FILL_VALUE_THRESHOLD) ---
print(
    f"--- Global cleaning: Removing rows with values < {FILL_VALUE_THRESHOLD} "
    "in any numeric predictor ---"
)
before_glob = len(tv_plot)

# Row mask: True where any numeric predictor is below the threshold.
# Comparisons with NaN are False, so NaN rows survive this step and are
# removed by the dropna() below instead.
mask_low_values = (tv_plot[numeric_predictors] < FILL_VALUE_THRESHOLD).any(axis=1)
tv_plot = tv_plot[~mask_low_values]

print(f"Dropped {before_glob - len(tv_plot):,} rows containing values < {FILL_VALUE_THRESHOLD}.")
print("----------------------------------------------------------------------------------\n")

# Drop rows with NaNs in any numeric predictor or in burned
cols_for_clean = numeric_predictors + ["burned"]
before_clean = len(tv_plot)
tv_plot = tv_plot.dropna(subset=cols_for_clean)
after_clean = len(tv_plot)
print(f"Dropped {before_clean - after_clean:,} rows with NaNs in predictors/burned.")
print(f"Remaining rows for plotting pool (before fraction filtering): {after_clean:,}\n")

# ============================================================
# 3. FILTER BY FRACTION & BALANCE 0/1 FOR PLOTTING
# ============================================================

# Two strict groups are kept; rows with burned == 0 but fraction > 0 fall
# into neither group and are left out:
#   - negatives: burned == 0 AND fraction == 0
#   - positives: burned == 1 AND fraction > 0.5
is_strict_negative = tv_plot["burned"].eq(0) & tv_plot["fraction"].eq(0)
is_strict_positive = tv_plot["burned"].eq(1) & tv_plot["fraction"].gt(0.5)

neg = tv_plot.loc[is_strict_negative]
pos = tv_plot.loc[is_strict_positive]

n_pos, n_neg = len(pos), len(neg)

print("After fraction filter:")
print(f"  Positives (burned=1, fraction>0.5): {n_pos:,}")
print(f"  Negatives (burned=0, fraction==0): {n_neg:,}")

# Both classes are needed for a 0-vs-1 comparison.
if not (n_pos and n_neg):
    raise RuntimeError(
        "One of the filtered classes is empty (after applying fraction==0 for 0s "
        "and fraction>0.5 for 1s); cannot make 0/1 comparison plots."
    )

# Draw at most as many negatives as there are positives (all positives are kept).
n_neg_sample = min(n_neg, n_pos)
neg_sample = neg.sample(n=n_neg_sample, random_state=RANDOM_STATE)

# Stack both classes, shuffle the rows reproducibly, and renumber the index.
stacked = pd.concat([pos, neg_sample], axis=0)
shuffled = stacked.sample(frac=1.0, random_state=RANDOM_STATE)
plot_df = shuffled.reset_index(drop=True)

print(
    f"\nBalanced plotting sample size (after fraction filter): {len(plot_df):,} rows "
    f"(0: {n_neg_sample:,}, 1: {n_pos:,})\n"
)

# ============================================================
# 4. MULTI-PANEL DENSITY PLOTS
# ============================================================

sns.set_style("whitegrid")

# Grid geometry: three panels per row, with as many rows as the predictors need.
ncols = 3
num_pred = len(numeric_predictors)
nrows = int(np.ceil(num_pred / ncols))

fig, axes = plt.subplots(
    nrows=nrows,
    ncols=ncols,
    figsize=(5 * ncols, 3.5 * nrows),
    squeeze=False,  # keep `axes` two-dimensional even for a single row
)

# Row-major flattening: panel i sits at (i // ncols, i % ncols).
panels = axes.ravel()

# One KDE panel per predictor, one curve per burned class
# (0 = no fire, fraction==0; 1 = fire, fraction>0.5), each normalised on its own.
for ax, col in zip(panels, numeric_predictors):
    sns.kdeplot(
        data=plot_df,
        x=col,
        hue="burned",
        common_norm=False,
        fill=False,
        ax=ax,
    )
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel("Density")

# Hide the leftover panels in the last row, if any.
for spare_ax in panels[num_pred:]:
    spare_ax.axis("off")

fig.tight_layout()
fig.savefig(FIG_PATH, dpi=150)
plt.close(fig)

print(f"Saved balanced predictor distribution multipanel plot to:\n  {FIG_PATH}")
print("\n✅ Done.")

Loading Train/Val 100x dataset from:
  /explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_trainval_100x
Train/Val 100x raw size: 35,953,144 rows

Total columns: 19
Num predictors (before numeric filter): 15

Numeric predictors for plotting: 15
--- Global cleaning: Removing rows with values < -1000 in any numeric predictor ---
Dropped 352,246 rows containing values < -1000.
----------------------------------------------------------------------------------

Dropped 14,208,107 rows with NaNs in predictors/burned.
Remaining rows for plotting pool (before fraction filtering): 21,392,791

After fraction filter:
  Positives (burned=1, fraction>0.5): 192,601
  Negatives (burned=0, fraction==0): 20,886,538

Balanced plotting sample size (after fraction filter): 385,202 rows (0: 192,601, 1: 192,601)

Saved balanced predictor distribution multipanel plot to:
  /explore/nobackup/people/spotter5/clelland_fire_ml/ml_training/neg_ratio_experiments_globaltest/predictor_distributions_100x/

In [None]:
# NOTE(review): leftover scratch cell — a bare string expression that was never executed (In [None]); candidate for deletion.
't'

Use true zeros (fraction == 0) as negatives and fraction > 0.5 as positives

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Make multi-panel density plots for predictors in the 100x Train/Val dataset.

- Input (100x Train/Val parquet directory):
    /explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_trainval_100x

- For each numeric predictor:
    * Plot density (KDE) with x = predictor, y = density
    * Color curves by burned class (0 = no fire, 1 = fire)

- To avoid huge data volume, we:
    * Keep ALL burned==1 samples (with fraction > 0.5)
    * Downsample burned==0 samples (with fraction == 0) to match the number of 1s

- Output:
    One multi-panel PNG with all predictor distributions.

    NOTE: This script *only* uses:
        - 0-class where fraction == 0
        - 1-class where fraction > 0.5
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.dataset as ds

# ============================================================
# CONFIG
# ============================================================

# Seed reused by the negative down-sampling and by the final row shuffle.
RANDOM_STATE = 42

# Input: directory containing the 100x Train/Val parquet files.
TRAINVAL_DIR = "/explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_trainval_100x"

# Output: <OUT_ROOT>/predictor_distributions_100x/<figure>.png
OUT_ROOT = "/explore/nobackup/people/spotter5/clelland_fire_ml/ml_training/neg_ratio_experiments_globaltest"
OUT_DIR = os.path.join(OUT_ROOT, "predictor_distributions_100x")
# NOTE(review): the other density-plot cell in this notebook (the variant that
# additionally removes rows with values < -1000) saves to this exact same path,
# so whichever cell runs last silently overwrites the other's figure.
FIG_PATH = os.path.join(OUT_DIR, "trainval100x_predictor_distributions_balanced_frac0_vs_gt0p5.png")

# Create the output directory up front; a no-op when it already exists.
os.makedirs(OUT_DIR, exist_ok=True)

# Label, bookkeeping and location/time columns that must not be plotted as predictors.
DROP_COLS = {
    "fraction",
    "burned",
    "bin",
    "year",
    "month",
    "latitude",
    "longitude",
}

# ============================================================
# 1. LOAD TRAIN/VAL 100x DATASET
# ============================================================

# Announce the source directory before starting the read.
print(f"Loading Train/Val 100x dataset from:\n  {TRAINVAL_DIR}")

# Open the parquet directory as a pyarrow dataset, read it into a single
# Arrow table, then convert that table to a pandas DataFrame.
tv_dataset = ds.dataset(TRAINVAL_DIR, format="parquet")
tv_table = tv_dataset.to_table()
df_tv_full = tv_table.to_pandas()

n_rows_raw = len(df_tv_full)
print(f"Train/Val 100x raw size: {n_rows_raw:,} rows\n")

# ============================================================
# 2. PREPARE LABEL + PREDICTORS
# ============================================================

# Work on a copy so the raw frame loaded above is left untouched.
df = df_tv_full.copy()

# Ensure fraction is clean and present
if "fraction" not in df.columns:
    raise ValueError("Column 'fraction' is required in the Train/Val dataset.")

df["fraction"] = df["fraction"].astype("float32").clip(0, 1)

# Create burned label if needed (1 if fraction > 0.5 else 0)
if "burned" not in df.columns:
    df["burned"] = (df["fraction"] > 0.5).astype("uint8")

# Replace inf with NaN so the NaN drop further down also removes infinities
df = df.replace([np.inf, -np.inf], np.nan)

# Define predictor columns: everything except DROP_COLS
predictors = [c for c in df.columns if c not in DROP_COLS]

print(f"Total columns: {len(df.columns)}")
print(f"Num predictors (before numeric filter): {len(predictors)}\n")

# Build a frame with predictors + label
tv_plot = df[predictors].copy()
tv_plot["burned"]   = df["burned"].astype("uint8")
tv_plot["fraction"] = df["fraction"].astype("float32")

# Keep only numeric predictors for KDE.
# pd.api.types.is_numeric_dtype is used instead of np.issubdtype because the
# latter raises TypeError on pandas extension dtypes (categorical, nullable
# integers, Arrow-backed strings), which a parquet -> pandas conversion can
# produce. Booleans are excluded explicitly to keep the previous selection:
# np.issubdtype(bool, np.number) is False while is_numeric_dtype(bool) is True.
numeric_predictors = [
    c for c in predictors
    if pd.api.types.is_numeric_dtype(tv_plot[c].dtype)
    and not pd.api.types.is_bool_dtype(tv_plot[c].dtype)
]

print(f"Numeric predictors for plotting: {len(numeric_predictors)}")
if not numeric_predictors:
    raise RuntimeError("No numeric predictors found to plot.")

# Drop rows with NaNs in any numeric predictor or in burned
cols_for_clean = numeric_predictors + ["burned"]
before_clean = len(tv_plot)
tv_plot = tv_plot.dropna(subset=cols_for_clean)
after_clean = len(tv_plot)
print(f"Dropped {before_clean - after_clean:,} rows with NaNs in predictors/burned.")
print(f"Remaining rows for plotting pool (before fraction filtering): {after_clean:,}\n")

# ============================================================
# 3. FILTER BY FRACTION & BALANCE 0/1 FOR PLOTTING
# ============================================================

# Two strict groups are kept; rows with burned == 0 but fraction > 0 fall
# into neither group and are left out:
#   - negatives: burned == 0 AND fraction == 0
#   - positives: burned == 1 AND fraction > 0.5
is_strict_negative = tv_plot["burned"].eq(0) & tv_plot["fraction"].eq(0)
is_strict_positive = tv_plot["burned"].eq(1) & tv_plot["fraction"].gt(0.5)

neg = tv_plot.loc[is_strict_negative]
pos = tv_plot.loc[is_strict_positive]

n_pos, n_neg = len(pos), len(neg)

print("After fraction filter:")
print(f"  Positives (burned=1, fraction>0.5): {n_pos:,}")
print(f"  Negatives (burned=0, fraction==0): {n_neg:,}")

# Both classes are needed for a 0-vs-1 comparison.
if not (n_pos and n_neg):
    raise RuntimeError(
        "One of the filtered classes is empty (after applying fraction==0 for 0s "
        "and fraction>0.5 for 1s); cannot make 0/1 comparison plots."
    )

# Draw at most as many negatives as there are positives (all positives are kept).
n_neg_sample = min(n_neg, n_pos)
neg_sample = neg.sample(n=n_neg_sample, random_state=RANDOM_STATE)

# Stack both classes, shuffle the rows reproducibly, and renumber the index.
stacked = pd.concat([pos, neg_sample], axis=0)
shuffled = stacked.sample(frac=1.0, random_state=RANDOM_STATE)
plot_df = shuffled.reset_index(drop=True)

print(
    f"\nBalanced plotting sample size (after fraction filter): {len(plot_df):,} rows "
    f"(0: {n_neg_sample:,}, 1: {n_pos:,})\n"
)

# ============================================================
# 4. MULTI-PANEL DENSITY PLOTS
# ============================================================

sns.set_style("whitegrid")

# Grid geometry: three panels per row, with as many rows as the predictors need.
ncols = 3
num_pred = len(numeric_predictors)
nrows = int(np.ceil(num_pred / ncols))

fig, axes = plt.subplots(
    nrows=nrows,
    ncols=ncols,
    figsize=(5 * ncols, 3.5 * nrows),
    squeeze=False,  # keep `axes` two-dimensional even for a single row
)

# Row-major flattening: panel i sits at (i // ncols, i % ncols).
panels = axes.ravel()

# One KDE panel per predictor, one curve per burned class
# (0 = no fire, fraction==0; 1 = fire, fraction>0.5), each normalised on its own.
for ax, col in zip(panels, numeric_predictors):
    sns.kdeplot(
        data=plot_df,
        x=col,
        hue="burned",
        common_norm=False,
        fill=False,
        ax=ax,
    )
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel("Density")

# Hide the leftover panels in the last row, if any.
for spare_ax in panels[num_pred:]:
    spare_ax.axis("off")

fig.tight_layout()
fig.savefig(FIG_PATH, dpi=150)
plt.close(fig)

print(f"Saved balanced predictor distribution multipanel plot to:\n  {FIG_PATH}")
print("\n✅ Done.")


Loading Train/Val 100x dataset from:
  /explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_trainval_100x
Train/Val 100x raw size: 35,953,144 rows

Total columns: 19
Num predictors (before numeric filter): 15

Numeric predictors for plotting: 15
Dropped 14,208,107 rows with NaNs in predictors/burned.
Remaining rows for plotting pool (before fraction filtering): 21,745,037

After fraction filter:
  Positives (burned=1, fraction>0.5): 193,414
  Negatives (burned=0, fraction==0): 21,168,830

Balanced plotting sample size (after fraction filter): 386,828 rows (0: 193,414, 1: 193,414)

Saved balanced predictor distribution multipanel plot to:
  /explore/nobackup/people/spotter5/clelland_fire_ml/ml_training/neg_ratio_experiments_globaltest/predictor_distributions_100x/trainval100x_predictor_distributions_balanced_frac0_vs_gt0p5.png

✅ Done.


In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pyarrow.dataset as ds
import pandas as pd

# Path to your 100× dataset
TRAINVAL_DIR = "/explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_trainval_100x"

print(f"Loading dataset from:\n  {TRAINVAL_DIR}")
dataset = ds.dataset(TRAINVAL_DIR, format="parquet")

# Check the schema up front so a missing column fails before any data is read
if "fraction" not in dataset.schema.names:
    raise ValueError("Column 'fraction' not found in dataset.")

# Only 'fraction' is needed for the counts, so read just that column instead
# of materialising every column of the dataset in memory.
# NOTE: `df` therefore holds only 'fraction' (plus 'burned' added below).
df = dataset.to_table(columns=["fraction"]).to_pandas()

# Create burned label (1 if fraction > 0.5 else 0).
# A NaN fraction compares False and is therefore counted as unburned.
df["burned"] = (df["fraction"] > 0.5).astype("uint8")

# Count burned/unburned
counts = df["burned"].value_counts().sort_index()
total = len(df)

# .get(..., 0) covers the case where one of the classes is absent
n_unburned = counts.get(0, 0)
n_burned   = counts.get(1, 0)

print("\n==============================")
print(" Burned / Unburned Counts")
print("==============================")
print(f"Unburned (fraction <= 0.5): {n_unburned:,}")
print(f"Burned   (fraction  > 0.5): {n_burned:,}")
print(f"Total rows: {total:,}")

# Percentages (skipped for an empty dataset to avoid dividing by zero)
if total > 0:
    print("\nPercentages:")
    print(f"Unburned: {n_unburned / total * 100:.4f}%")
    print(f"Burned:   {n_burned   / total * 100:.4f}%")
else:
    print("\nDataset is empty; percentages are undefined.")

# Ratio
if n_burned > 0:
    print(f"\nNeg:Pos ratio ≈ {n_unburned / n_burned:.2f}:1")
else:
    print("\nNo burned pixels found.")


Loading dataset from:
  /explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_trainval_100x

 Burned / Unburned Counts
Unburned (fraction <= 0.5): 35,596,995
Burned   (fraction  > 0.5): 356,149
Total rows: 35,953,144

Percentages:
Unburned: 99.0094%
Burned:   0.9906%

Neg:Pos ratio ≈ 99.95:1
