In [2]:
#!/usr/bin/env python
# coding: utf-8

import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay

# Silence FutureWarning / UserWarning messages for the rest of the session.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# -------------------- Config --------------------
# Root folder that holds the inputs and receives every output sub-folder.
BASE_OUT_PATH = "/explore/nobackup/people/spotter5/new_combustion"
INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-08-13_LC_FISL_Original_combustionModelPredictors.csv"
OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/Combustion_SynthesisData_05042018_XJW.csv"

# Target column -> output folder for that target's figures.
OUT_DIRS = {
    target_name: os.path.join(BASE_OUT_PATH, folder_name)
    for target_name, folder_name in (
        ('above.carbon.combusted', "pdp_aboveground_fisl"),
        ('below.ground.carbon.combusted', "pdp_belowground_fisl"),
        ('burn.depth', "pdp_depth_fisl"),
    )
}

# Columns that must never be used as predictors: the response variables
# themselves plus identifying / bookkeeping columns.
POTENTIAL_TARGETS = [
    'below.ground.carbon.combusted',
    'above.carbon.combusted',
    'burn.depth',
]
METADATA_COLUMNS = [
    'burn_year', 'project.name', 'latitude', 'longitude', 'Date', 'id',
    'CNA_MAR', 'fireYr', 'lat', 'lon', 'project_name',
]
COLS_TO_DROP_FROM_X = [*POTENTIAL_TARGETS, *METADATA_COLUMNS]

# Features to plot for each target (at most six fit in the 3x2 grid).
TOP6 = {
    'above.carbon.combusted': [
        'BS', 'twi', 'brightness', 'ISI', 'Tree.cover', 'Temperature',
    ],
    'below.ground.carbon.combusted': [
        'BUI', 'twi', 'CNA_PAS', 'Silt_30', 'brightness', 'NV',
    ],
    'burn.depth': [
        'BUI', 'twi', 'CNA_PAS', 'CNA_DD_0', 'WS', 'DC',
    ],
}

# -------------------- Helpers --------------------
def normalize_name(s):
    """Return ``str(s)`` lower-cased with every non-alphanumeric character removed.

    Used as a loose matching key so that e.g. ``'Tree.cover'`` and
    ``'tree_cover'`` compare equal.
    """
    return "".join(ch for ch in str(s).lower() if ch.isalnum())

def map_requested_features_to_columns(requested, df_columns):
    """Resolve requested feature names to actual column names.

    Parameters
    ----------
    requested : iterable
        Feature names to look up, in the order they should be returned.
    df_columns : iterable
        Available column names (e.g. ``DataFrame.columns``).

    Returns
    -------
    (mapped, missing) : tuple of lists
        ``mapped`` holds the matching column names in request order;
        ``missing`` holds the requested names that matched no column.

    Notes
    -----
    An exact match always wins. Only when there is none is the loose,
    normalised comparison used. Without that priority, two columns that
    normalise to the same key (e.g. ``'twi'`` and ``'TWI'``) would silently
    shadow each other and a request for ``'twi'`` could return ``'TWI'``.
    """
    columns = list(df_columns)
    # Loose lookup table; on a normalised-key collision the last column wins,
    # which is the fallback behaviour when no exact match exists.
    norm_map = {normalize_name(c): c for c in columns}
    mapped, missing = [], []
    for r in requested:
        if r in columns:
            mapped.append(columns[columns.index(r)])
            continue
        nr = normalize_name(r)
        if nr in norm_map:
            mapped.append(norm_map[nr])
        else:
            missing.append(r)
    return mapped, missing

def median_impute_numeric(df):
    """Return a copy of ``df`` whose numeric columns have NaNs replaced by the column median.

    Non-numeric columns are copied through unchanged and the input frame is
    not modified. A numeric column with no observed values has a NaN median
    and therefore stays NaN.
    """
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    filled = df.copy()
    column_medians = filled[numeric_columns].median()
    filled[numeric_columns] = filled[numeric_columns].fillna(column_medians)
    return filled

# -------------------- Load and split --------------------
print(f"\n{'='*70}\n🚀 STARTING PDP (New Data only)\n{'='*70}")
df_main = pd.read_csv(INPUT_CSV)
df_old  = pd.read_csv(OLD_PREDICTORS_CSV)

# Both tables are joined on 'id' below; fail with a clear message instead of
# a bare KeyError when either file lacks that column.
if 'id' not in df_main.columns or 'id' not in df_old.columns:
    raise SystemExit("❌ Both files must have an 'id' column.")

# "New" rows are the ones in the main table whose id does not appear in the
# older synthesis table.
# NOTE(review): isin() compares values as-is, so this presumably relies on
# 'id' having the same dtype in both files — confirm.
old_ids = pd.unique(df_old['id'])
df_new  = df_main[~df_main['id'].isin(old_ids)].copy()
print(f"New Data rows: {len(df_new)}")

# -------------------- Train & plot --------------------
# For every target: fit a random forest on the "new" rows, then save a 3x2
# grid of one-feature partial-dependence plots for that target's TOP6 features.
for target in OUT_DIRS:
    os.makedirs(OUT_DIRS[target], exist_ok=True)
    print(f"\n— Target: {target}")

    if target not in df_new.columns:
        print(f"  ⚠️ Target '{target}' not found. Skipping.")
        continue

    # Only rows with an observed target value can be used for training.
    df_t = df_new.dropna(subset=[target]).copy()
    if df_t.empty:
        print(f"  ⚠️ No rows with non-NA '{target}'. Skipping.")
        continue

    # Predictors = every numeric column that is neither a target nor metadata.
    X = df_t.drop(columns=COLS_TO_DROP_FROM_X, errors='ignore').select_dtypes(include=[np.number])
    if X.shape[1] == 0:
        print("  ⚠️ No numeric predictors after exclusions. Skipping.")
        continue

    X_imp = median_impute_numeric(X)
    y = df_t[target].astype(float)

    # Out-of-bag scoring is only switched on when there are more than 10 rows.
    rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1, oob_score=len(y)>10)
    rf.fit(X_imp, y)
    print(f"  ✓ Trained RF (n={len(y)})")

    mapped, missing = map_requested_features_to_columns(TOP6[target], X_imp.columns)
    if missing:
        print(f"  ⚠️ Missing requested features: {missing}")
    if not mapped:
        continue

    # 3x2 figure with bigger fonts
    fig, axes = plt.subplots(3, 2, figsize=(16, 20))
    axes = axes.ravel()
    for i in range(6):
        ax = axes[i]
        if i < len(mapped):
            feat = mapped[i]
            try:
                # The Axes is passed wrapped in a list on purpose. Given a
                # bare Axes, scikit-learn treats it as a bounding box: it
                # switches that Axes off and draws the curve in a new child
                # Axes, so the label/tick/grid styling below would land on a
                # hidden Axes. An array-like of Axes is drawn into directly.
                PartialDependenceDisplay.from_estimator(
                    rf, X_imp, [feat], ax=[ax], line_kw={"linewidth": 3})
                ax.set_title(feat, fontsize=18)
                ax.set_xlabel(ax.get_xlabel(), fontsize=16)
                ax.set_ylabel(ax.get_ylabel(), fontsize=16)
                ax.tick_params(axis='both', labelsize=14)
                ax.grid(True, linestyle='--', alpha=0.6)
            except Exception as e:
                # Best effort: show the failure in the panel and keep going
                # so the remaining features are still plotted.
                ax.text(0.5,0.5,f"PDP failed\n{feat}\n{e}",ha='center',va='center',
                        fontsize=16,color='red')
                ax.axis('off')
        else:
            # Fewer than six features were found: blank the unused panels.
            ax.axis('off')

    pretty = {'above.carbon.combusted':'Aboveground Carbon',
              'below.ground.carbon.combusted':'Belowground Carbon',
              'burn.depth':'Burn Depth'}[target]
    fig.suptitle(f'New Data — Top 6 PDPs ({pretty})', fontsize=24, y=0.98)

    save_path = os.path.join(OUT_DIRS[target], f"pdp_new_{target.replace('.', '_')}_3x2.png")
    # Operate on this figure explicitly rather than on pyplot's "current"
    # figure; rect leaves the top 4% free for the suptitle.
    fig.tight_layout(rect=[0,0,1,0.96])
    fig.savefig(save_path, dpi=150)
    plt.close(fig)

print(f"\n{'='*70}\n🎉 Done. Large-font PDPs saved in *_new_3x2 folders.\n{'='*70}")



🚀 STARTING PDP (New Data only)
New Data rows: 867

— Target: above.carbon.combusted
  ✓ Trained RF (n=867)

— Target: below.ground.carbon.combusted
  ✓ Trained RF (n=867)

— Target: burn.depth
  ✓ Trained RF (n=866)

🎉 Done. Large-font PDPs saved in *_new_3x2 folders.


In [3]:
#!/usr/bin/env python
# coding: utf-8

import os
import warnings
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay

# Silence FutureWarning / UserWarning messages for the rest of the session.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# -------------------- Font/Style: force larger text everywhere --------------------
# Point sizes shared by the rcParams defaults here and the plotting code below.
FS_SUPTITLE = 36   # figure title
FS_LABEL    = 26   # x/y axis labels
FS_TICKS    = 22   # tick labels

# Global matplotlib defaults, set one key at a time.
plt.rcParams["font.size"] = FS_TICKS            # base font size
plt.rcParams["axes.titlesize"] = FS_SUPTITLE    # no subplot titles are drawn, kept as a safe default
plt.rcParams["axes.labelsize"] = FS_LABEL       # default axis-label size
plt.rcParams["xtick.labelsize"] = FS_TICKS
plt.rcParams["ytick.labelsize"] = FS_TICKS
plt.rcParams["legend.fontsize"] = FS_TICKS

# -------------------- Config --------------------
# Input tables and the root folder that receives every output sub-folder.
BASE_OUT_PATH = "/explore/nobackup/people/spotter5/new_combustion"
INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-08-13_LC_FISL_Original_combustionModelPredictors.csv"
OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/Combustion_SynthesisData_05042018_XJW.csv"

# Target column -> output folder for that target's figures.
OUT_DIRS = dict(
    (target_name, os.path.join(BASE_OUT_PATH, folder_name))
    for target_name, folder_name in [
        ('above.carbon.combusted', "pdp_aboveground_fisl"),
        ('below.ground.carbon.combusted', "pdp_belowground_fisl"),
        ('burn.depth', "pdp_depth_fisl"),
    ]
)

# Excluded predictor columns: the response variables plus bookkeeping fields.
POTENTIAL_TARGETS = [
    'below.ground.carbon.combusted',
    'above.carbon.combusted',
    'burn.depth',
]
METADATA_COLUMNS = [
    'burn_year', 'project.name', 'latitude', 'longitude', 'Date', 'id',
    'CNA_MAR', 'fireYr', 'lat', 'lon', 'project_name',
]
COLS_TO_DROP_FROM_X = list(POTENTIAL_TARGETS) + list(METADATA_COLUMNS)

# Top-6 features per target
TOP6 = {
    'above.carbon.combusted': [
        'BS', 'twi', 'brightness', 'ISI', 'Tree.cover', 'Temperature',
    ],
    'below.ground.carbon.combusted': [
        'BUI', 'twi', 'CNA_PAS', 'Silt_30', 'brightness', 'NV',
    ],
    'burn.depth': [
        'BUI', 'twi', 'CNA_PAS', 'CNA_DD_0', 'WS', 'DC',
    ],
}

# Plot style for the background histogram of each feature.
HIST_COLOR = 'skyblue'
HIST_ALPHA = 0.30
HIST_BINS  = 40

# -------------------- Helpers --------------------
def normalize_name(s):
    """Lower-case ``str(s)`` and keep only its alphanumeric characters.

    Produces the loose key used to match feature names that differ only in
    case or punctuation.
    """
    return "".join(ch for ch in str(s).lower() if ch.isalnum())

def map_requested_features_to_columns(requested, df_columns):
    """Match each requested feature name to an available column.

    Parameters
    ----------
    requested : iterable
        Feature names to resolve; the output keeps this order.
    df_columns : iterable
        Column names to search (e.g. ``DataFrame.columns``).

    Returns
    -------
    (mapped, missing) : tuple of lists
        Column names that were found, and requested names that were not.

    Notes
    -----
    A column whose name equals the request exactly is always preferred. The
    normalised comparison is only a fallback: when several columns share a
    normalised key the lookup table keeps just one of them (the last), so
    relying on it alone could return a different column than the one named.
    """
    columns = list(df_columns)
    norm_map = {normalize_name(c): c for c in columns}
    mapped, missing = [], []
    for r in requested:
        if r in columns:
            # Exact hit: return the column object itself.
            mapped.append(columns[columns.index(r)])
            continue
        key = normalize_name(r)
        if key in norm_map:
            mapped.append(norm_map[key])
        else:
            missing.append(r)
    return mapped, missing

def median_impute_numeric(df):
    """Fill missing values in numeric columns with each column's median.

    Works on a copy, so the caller's frame is left untouched; columns that
    are not numeric are returned as they were.
    """
    result = df.copy()
    numeric_part = result.select_dtypes(include=[np.number])
    result[numeric_part.columns] = numeric_part.fillna(numeric_part.median())
    return result

# -------------------- Load and split --------------------
print(f"\n{'='*70}\n🚀 STARTING PDP (New Data only)\n{'='*70}")
# Read the main predictor table first, then the older synthesis table.
df_main, df_old = (pd.read_csv(csv_path) for csv_path in (INPUT_CSV, OLD_PREDICTORS_CSV))

# The split below is keyed on 'id', so both tables must provide it.
if not ('id' in df_main.columns and 'id' in df_old.columns):
    raise SystemExit("❌ Both files must have an 'id' column.")

# "New" rows = rows of the main table whose id is absent from the old table.
old_ids = pd.unique(df_old['id'])
df_new = df_main[~df_main['id'].isin(old_ids)].copy()
print(f"New Data rows: {len(df_new)}")

# -------------------- Train & plot --------------------
# For every target: fit a random forest on the "new" rows, then save a 3x2
# grid of partial-dependence curves, each drawn over a histogram of the
# feature's values.
for target in OUT_DIRS:
    os.makedirs(OUT_DIRS[target], exist_ok=True)
    print(f"\n— Target: {target}")

    if target not in df_new.columns:
        print(f"  ⚠️ Target '{target}' not found. Skipping.")
        continue

    # Only rows with an observed target value can be used for training.
    df_t = df_new.dropna(subset=[target]).copy()
    if df_t.empty:
        print(f"  ⚠️ No rows with non-NA '{target}'. Skipping.")
        continue

    # Predictors = every numeric column that is neither a target nor metadata.
    X = df_t.drop(columns=COLS_TO_DROP_FROM_X, errors='ignore').select_dtypes(include=[np.number])
    if X.shape[1] == 0:
        print("  ⚠️ No numeric predictors after exclusions. Skipping.")
        continue

    X_imp = median_impute_numeric(X)
    y = df_t[target].astype(float)

    # Out-of-bag scoring is only switched on when there are more than 10 rows.
    rf = RandomForestRegressor(
        n_estimators=500,
        random_state=42,
        n_jobs=-1,
        oob_score=len(y) > 10
    )
    rf.fit(X_imp, y)
    print(f"  ✓ Trained RF (n={len(y)})")

    requested = TOP6[target]
    mapped, missing = map_requested_features_to_columns(requested, X_imp.columns)
    # Report unresolved names so a partially filled grid is not a surprise.
    if missing:
        print(f"  ⚠️ Missing requested features: {missing}")
    if not mapped:
        print("  ⚠️ No requested features found. Skipping PDP.")
        continue

    fig, axes = plt.subplots(3, 2, figsize=(22, 26))
    axes = axes.ravel()

    for i in range(6):
        ax = axes[i]
        if i < len(mapped):
            feat = mapped[i]
            try:
                # The Axes is passed wrapped in a list on purpose. Given a
                # bare Axes, scikit-learn treats it as a bounding box: it
                # switches that Axes off and draws the curve in a new child
                # Axes. The font sizing below and the twinx() histogram would
                # then be attached to a hidden Axes and never line up with
                # the curve. An array-like of Axes is drawn into directly.
                PartialDependenceDisplay.from_estimator(
                    rf, X_imp, [feat], ax=[ax],
                    line_kw={"linewidth": 4, "color": "black"}
                )
            except Exception as e:
                # Best effort: show the failure in the panel and keep going
                # so the remaining features are still plotted.
                ax.text(0.5, 0.5, f"PDP failed\n{feat}\n{e}",
                        ha='center', va='center', fontsize=FS_LABEL, color='red')
                ax.axis('off')
                continue

            # Axis labels and sizes, applied to the Axes the curve lives on.
            ax.set_xlabel(feat)  # ensure xlabel text
            ax.set_ylabel("Partial Dependence")
            ax.xaxis.label.set_size(FS_LABEL)
            ax.yaxis.label.set_size(FS_LABEL)
            ax.tick_params(axis='both', labelsize=FS_TICKS)
            # optional: add a bit of padding to labels
            ax.xaxis.labelpad = 12
            ax.yaxis.labelpad = 12

            ax.grid(True, linestyle='--', alpha=0.6)

            # Secondary y-axis histogram (no label); it shares the x-axis
            # with the PDP curve.
            ax2 = ax.twinx()
            vals = X_imp[feat].values
            ax2.hist(vals[~np.isnan(vals)], bins=HIST_BINS,
                     alpha=HIST_ALPHA, color=HIST_COLOR)
            ax2.set_yticks([])
            ax2.set_ylabel("")            # no label
            ax2.tick_params(axis='both', length=0)
            ax2.grid(False)
            # Draw the curve above the histogram; hide the curve Axes'
            # background patch so the bars stay visible underneath it.
            ax2.set_zorder(1)
            ax.set_zorder(2)
            ax.patch.set_visible(False)
        else:
            # Fewer than six features were found: blank the unused panels.
            ax.axis('off')

    pretty = {
        'above.carbon.combusted': 'Aboveground Carbon',
        'below.ground.carbon.combusted': 'Belowground Carbon',
        'burn.depth': 'Burn Depth'
    }[target]
    fig.suptitle(f'New Data — Top 6 PDPs ({pretty})', fontsize=FS_SUPTITLE, y=0.98)

    save_path = os.path.join(OUT_DIRS[target], f"pdp_new_{target.replace('.', '_')}_3x2.png")
    # rect leaves the top 4% of the figure free for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.savefig(save_path, dpi=150)
    plt.close(fig)

print(f"\n{'='*70}\n🎉 Done. Axis labels are now force-sized after PDP draws.\n{'='*70}")



🚀 STARTING PDP (New Data only)
New Data rows: 867

— Target: above.carbon.combusted
  ✓ Trained RF (n=867)

— Target: below.ground.carbon.combusted
  ✓ Trained RF (n=867)

— Target: burn.depth
  ✓ Trained RF (n=866)

🎉 Done. Axis labels are now force-sized after PDP draws.


In [5]:
#!/usr/bin/env python
# coding: utf-8

import os
import warnings
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay

# Silence FutureWarning / UserWarning messages for the rest of the session.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# -------------------- Font/Style: force larger text everywhere --------------------
# Point sizes shared by the rcParams defaults here and the plotting code below.
FS_SUPTITLE = 36   # figure title
FS_LABEL    = 26   # x/y axis labels
FS_TICKS    = 22   # tick labels

plt.rcParams.update({
    # Everything that should use the tick-label size (also the base font size).
    **{
        rc_key: FS_TICKS
        for rc_key in ("font.size", "xtick.labelsize", "ytick.labelsize", "legend.fontsize")
    },
    "axes.titlesize": FS_SUPTITLE,   # no subplot titles are drawn, kept as a safe default
    "axes.labelsize": FS_LABEL,      # default axis-label size
})

# -------------------- Config --------------------
# Input tables and the root folder that receives every output sub-folder.
INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-08-13_LC_FISL_Original_combustionModelPredictors.csv"
OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/Combustion_SynthesisData_05042018_XJW.csv"
BASE_OUT_PATH = "/explore/nobackup/people/spotter5/new_combustion"

# Target column -> output folder for that target's figures.
OUT_DIRS = {
    'above.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_aboveground_fisl"),
    'below.ground.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_belowground_fisl"),
    'burn.depth': os.path.join(BASE_OUT_PATH, "pdp_depth_fisl"),
}

# Excluded predictor columns: the response variables plus bookkeeping fields.
POTENTIAL_TARGETS = ['below.ground.carbon.combusted', 'above.carbon.combusted', 'burn.depth']
METADATA_COLUMNS = ['burn_year','project.name','latitude','longitude','Date','id','CNA_MAR',
                    'fireYr','lat','lon','project_name']
COLS_TO_DROP_FROM_X = POTENTIAL_TARGETS + METADATA_COLUMNS

# Top-6 features per target (ORIGINAL column names)
TOP6 = {
    'above.carbon.combusted': ['BS','twi','brightness','ISI','Tree.cover','Temperature'],
    'below.ground.carbon.combusted': ['BUI','twi','CNA_PAS','Silt_30','brightness','NV'],
    'burn.depth': ['BUI','twi','CNA_PAS','CNA_DD_0','WS','DC'],
}

# Pretty replacements for axis/feature names (map ORIGINAL -> PRETTY).
# These strings end up as column names and axis labels, so typos show in figures.
PRETTY_LABELS = {
    'BS': 'Black Spruce',
    'twi': 'Topographic Wetness Index',
    'brightness': 'Brightness',
    'ISI': 'Initial Spread Index',
    'Tree.cover': 'Tree Cover (%)',
    'Temperature': 'Temperature (°C)',
    'BUI': 'Build Up Index',
    'CNA_PAS': 'Precipitation as Snow',
    'Silt_30': 'Silt (%)',  # was 'Silt %)' — missing opening parenthesis
    'NV': 'Non-Vegetation Cover (%)',
    'CNA_DD_0': 'Degree-Days < 0',
    'WS': 'White Spruce',
    'DC': 'Drought Code',
    # add more mappings as needed
}

# Plot style for the background histogram of each feature.
HIST_BINS  = 40
HIST_ALPHA = 0.30
HIST_COLOR = 'skyblue'

# -------------------- Helpers --------------------
def normalize_name(s):
    """Collapse a name to its lower-cased alphanumeric characters only.

    For example ``'Tree Cover (%)'`` becomes ``'treecover'``.
    """
    return "".join(ch for ch in str(s).lower() if ch.isalnum())

def map_requested_features_to_columns(requested, df_columns):
    """Find the column that corresponds to each requested feature name.

    Parameters
    ----------
    requested : iterable
        Names to resolve, in the order the result should follow.
    df_columns : iterable
        Candidate column names (e.g. ``DataFrame.columns``).

    Returns
    -------
    (mapped, missing) : tuple of lists
        Resolved column names, and the requests that matched nothing.

    Notes
    -----
    Exact equality is tried first. The normalised comparison that follows
    strips punctuation, so distinct columns can share a key and only the last
    one survives in the lookup table. Without the exact-match step, a request
    that names a column verbatim could be redirected to such a look-alike.
    """
    columns = list(df_columns)
    norm_map = {normalize_name(c): c for c in columns}
    mapped, missing = [], []
    for r in requested:
        if r in columns:
            # Verbatim hit: hand back the column object itself.
            mapped.append(columns[columns.index(r)])
            continue
        key = normalize_name(r)
        if key in norm_map:
            mapped.append(norm_map[key])
        else:
            missing.append(r)
    return mapped, missing

def median_impute_numeric(df):
    """Impute NaNs in the numeric columns of ``df`` with per-column medians.

    The input is copied first and never modified; non-numeric columns are
    passed through as-is.
    """
    imputed = df.copy()
    numeric_names = imputed.select_dtypes(include=[np.number]).columns
    numeric_block = imputed[numeric_names]
    imputed[numeric_names] = numeric_block.fillna(numeric_block.median())
    return imputed

# -------------------- Load and split --------------------
print(f"\n{'='*70}\n🚀 STARTING PDP (New Data only)\n{'='*70}")
# Main predictor table, then the older synthesis table used to tell which
# ids count as "old".
df_main = pd.read_csv(INPUT_CSV)
df_old = pd.read_csv(OLD_PREDICTORS_CSV)

# Guard: the split below is keyed on 'id' in both tables.
for _frame in (df_main, df_old):
    if 'id' not in _frame.columns:
        raise SystemExit("❌ Both files must have an 'id' column.")
del _frame  # keep the notebook namespace as it was

# Keep only rows of the main table whose id is absent from the old table.
old_ids = pd.unique(df_old['id'])
df_new = df_main.loc[~df_main['id'].isin(old_ids)].copy()
print(f"New Data rows: {len(df_new)}")

# -------------------- Train & plot --------------------
# For every target: rename predictors to their display labels, fit a random
# forest on the "new" rows, then save a 3x2 grid of partial-dependence curves,
# each drawn over a histogram of the feature's values.
for target in OUT_DIRS:
    os.makedirs(OUT_DIRS[target], exist_ok=True)
    print(f"\n— Target: {target}")

    if target not in df_new.columns:
        print(f"  ⚠️ Target '{target}' not found. Skipping.")
        continue

    # Only rows with an observed target value can be used for training.
    df_t = df_new.dropna(subset=[target]).copy()
    if df_t.empty:
        print(f"  ⚠️ No rows with non-NA '{target}'. Skipping.")
        continue

    # Numeric predictors only, excluding meta/targets
    X = df_t.drop(columns=COLS_TO_DROP_FROM_X, errors='ignore').select_dtypes(include=[np.number])
    if X.shape[1] == 0:
        print("  ⚠️ No numeric predictors after exclusions. Skipping.")
        continue

    # ---- RENAME COLUMNS IN THE DATAFRAME TO PRETTY LABELS ----
    # Only labels whose original column is actually present are applied.
    rename_map = {k: v for k, v in PRETTY_LABELS.items() if k in X.columns}
    X_renamed = X.rename(columns=rename_map)

    # Impute and set target
    X_imp = median_impute_numeric(X_renamed)
    y = df_t[target].astype(float)

    # Train model on the RENAMED feature set.
    # Out-of-bag scoring is only switched on when there are more than 10 rows.
    rf = RandomForestRegressor(
        n_estimators=500,
        random_state=42,
        n_jobs=-1,
        oob_score=len(y) > 10
    )
    rf.fit(X_imp, y)
    print(f"  ✓ Trained RF (n={len(y)}) on renamed columns")

    # Use pretty names for requested features so they match the renamed columns
    requested = TOP6[target]
    requested_pretty = [PRETTY_LABELS.get(r, r) for r in requested]
    mapped, missing = map_requested_features_to_columns(requested_pretty, X_imp.columns)
    if not mapped:
        print(f"  ⚠️ No requested features found after renaming. Missing: {missing}")
        continue
    if missing:
        print(f"  ℹ️ Some requested features not found after renaming: {missing}")

    fig, axes = plt.subplots(3, 2, figsize=(22, 26))
    axes = axes.ravel()

    for i in range(6):
        ax = axes[i]
        if i < len(mapped):
            feat = mapped[i]  # this is the PRETTY column name now
            try:
                # The Axes is passed wrapped in a list on purpose. Given a
                # bare Axes, scikit-learn treats it as a bounding box: it
                # switches that Axes off and draws the curve in a new child
                # Axes. The label sizing below and the twinx() histogram would
                # then be attached to a hidden Axes and never line up with
                # the curve. An array-like of Axes is drawn into directly.
                PartialDependenceDisplay.from_estimator(
                    rf, X_imp, [feat], ax=[ax],
                    line_kw={"linewidth": 4, "color": "black"}
                )
            except Exception as e:
                # Best effort: show the failure in the panel and keep going
                # so the remaining features are still plotted.
                ax.text(0.5, 0.5, f"PDP failed\n{feat}\n{e}",
                        ha='center', va='center', fontsize=FS_LABEL, color='red')
                ax.axis('off')
                continue

            # Axis labels and sizes, applied to the Axes the curve lives on.
            ax.set_xlabel(feat)  # pretty feature name
            ax.set_ylabel("Partial Dependence")
            ax.xaxis.label.set_size(FS_LABEL)
            ax.yaxis.label.set_size(FS_LABEL)
            ax.tick_params(axis='both', labelsize=FS_TICKS)
            ax.xaxis.labelpad = 12
            ax.yaxis.labelpad = 12
            ax.grid(True, linestyle='--', alpha=0.6)

            # Secondary y-axis histogram (matching the renamed column); it
            # shares the x-axis with the PDP curve.
            ax2 = ax.twinx()
            vals = X_imp[feat].values
            ax2.hist(vals[~np.isnan(vals)], bins=HIST_BINS,
                     alpha=HIST_ALPHA, color=HIST_COLOR)
            ax2.set_yticks([])
            ax2.set_ylabel("")            # no label
            ax2.tick_params(axis='both', length=0)
            ax2.grid(False)
            # Draw the curve above the histogram; hide the curve Axes'
            # background patch so the bars stay visible underneath it.
            ax2.set_zorder(1)
            ax.set_zorder(2)
            ax.patch.set_visible(False)
        else:
            # Fewer than six features were found: blank the unused panels.
            ax.axis('off')

    pretty = {
        'above.carbon.combusted': 'Aboveground Carbon',
        'below.ground.carbon.combusted': 'Belowground Carbon',
        'burn.depth': 'Burn Depth'
    }[target]
    fig.suptitle(f'New Data — Top 6 PDPs ({pretty})', fontsize=FS_SUPTITLE, y=0.98)

    save_path = os.path.join(OUT_DIRS[target], f"pdp_new_{target.replace('.', '_')}_3x2.png")
    # rect leaves the top 4% of the figure free for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.savefig(save_path, dpi=150)
    plt.close(fig)

print(f"\n{'='*70}\n🎉 Done. Column names are renamed in pandas, so plots use pretty labels.\n{'='*70}")



🚀 STARTING PDP (New Data only)
New Data rows: 867

— Target: above.carbon.combusted
  ✓ Trained RF (n=867) on renamed columns

— Target: below.ground.carbon.combusted
  ✓ Trained RF (n=867) on renamed columns

— Target: burn.depth
  ✓ Trained RF (n=866) on renamed columns

🎉 Done. Column names are renamed in pandas, so plots use pretty labels.
