above ground

In [None]:
# 1. Load and clean data
df = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/2025-08-13_LC_FISL_Original_combustionModelPredictors.csv")

out_path = "/explore/nobackup/people/spotter5/new_combustion/all_data"
os.makedirs(out_path, exist_ok = True)

# 2. Exclude columns not relevant for modeling
exclude_columns = [
    #'below.ground.carbon.combusted',
    'above.carbon.combusted'
    'burn.depth',
    'burn_year',
    #'rdnbr_old',
    'project.name',
    'latitude',
    'longitude',
    'Date',
    'id',
    'CNA_MAR'
    #  'fireYr',
    # 'lat',
    # 'lon',
    # 'project_name'
]

# 3. Drop excluded columns and NaNs
all_data = df.drop(columns=exclude_columns).dropna()


# 1. Load and clean data
df = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/2025-08-08_LC_FISL_Original_combustionModelPredictors.csv")

old = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/all_predictors.csv")

# Get the unique IDs to remove
old_ids = old['id'].unique()

# Filter the DataFrame using the ~ operator ✅
new_data = df[~df['id'].isin(old_ids)].drop(columns=exclude_columns).dropna()

# 1. Load and clean data
df = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/2025-08-08_LC_FISL_Original_combustionModelPredictors.csv")

old = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/all_predictors.csv")

# Get the unique IDs to remove
old_ids = old['id'].unique()

# Filter the DataFrame using the ~ operator ✅
old_data = df[df['id'].isin(old_ids)].drop(columns=exclude_columns).dropna()


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay
import os
import warnings

def run_pdp_analysis(target_variable, output_directory, input_csv_path, old_ids_csv_path):
    """
    Runs the full modeling and PDP generation pipeline for a specific target variable.

    Args:
        target_variable (str): The name of the column to use as the target.
        output_directory (str): The path to save the generated PDP images.
        input_csv_path (str): Path to the main input CSV file.
        old_ids_csv_path (str): Path to the CSV containing old plot IDs.
    """
    # --- Setup & Introduction ---
    print(f"\n{'='*70}")
    print(f"🚀 STARTING ANALYSIS | TARGET: '{target_variable}'")
    print(f"{'='*70}")

    warnings.filterwarnings('ignore', category=FutureWarning)
    warnings.filterwarnings('ignore', category=UserWarning)
    os.makedirs(output_directory, exist_ok=True)

    # --- 1. Data Preparation ---
    print("Step 1: Loading and preparing data...")
    try:
        df_main = pd.read_csv(input_csv_path)
        df_old_ids = pd.read_csv(old_ids_csv_path)
    except FileNotFoundError as e:
        print(f"❌ Error: Could not find input file. {e}")
        return

    # Define columns to exclude from predictors.
    # CRITICAL: We must exclude BOTH potential target variables from the predictors
    # to prevent data leakage between the two analyses.
    POTENTIAL_TARGETS = ['below.ground.carbon.combusted', 'above.carbon.combusted', 'burn.depth']
    METADATA_COLUMNS = [
        'burn_year', 'project.name', 'latitude', 'longitude', 'Date', 'id',
        'CNA_MAR'
    ]
    
    # Combine all columns to be dropped when creating the predictor set (X)
    COLS_TO_DROP_FROM_X = POTENTIAL_TARGETS + METADATA_COLUMNS

    # Prepare the three datasets (All, Old, New)
    old_ids = df_old_ids['id'].unique()
    data_splits = {
        "All Data": df_main,
        "Old Data": df_main[df_main['id'].isin(old_ids)],
        "New Data": df_main[~df_main['id'].isin(old_ids)]
    }

    # --- 2. Model Training ---
    print(f"Step 2: Training Random Forest models on '{target_variable}'...")
    models = {}
    for name, data in data_splits.items():
        print(f"  - Training on '{name}' ({len(data)} rows)...")

        # Drop rows where the CURRENT target variable is missing
        df_clean = data.dropna(subset=[target_variable])

        # Define predictors (X) and target (y)
        X = df_clean.drop(columns=COLS_TO_DROP_FROM_X, errors='ignore')
        y = df_clean[target_variable]

        # Initialize and train the model
        rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1, oob_score=True)
        rf.fit(X, y)
        print(f"    ...Done. Model OOB Score (R²): {rf.oob_score_:.3f}")

        # Store the trained model and predictor data
        models[name] = {'model': rf, 'X': X}

    # --- 3. Generate and Save Partial Dependence Plots ---
    feature_list = models['All Data']['X'].columns
    print(f"\nStep 3: Generating {len(feature_list)} Partial Dependence Plots...")

    for feature in feature_list:
        print(f"  - Plotting for: {feature}")
        fig, axes = plt.subplots(3, 1, figsize=(8, 12), sharex=True)
        fig.suptitle(f'Partial Dependence on: {feature}\n(Target: {target_variable})', fontsize=16, y=0.96)

        plot_order = ["All Data", "Old Data", "New Data"]
        for i, model_name in enumerate(plot_order):
            ax = axes[i]
            model_info = models[model_name]
            PartialDependenceDisplay.from_estimator(
                estimator=model_info['model'], X=model_info['X'], features=[feature],
                ax=ax, line_kw={"color": "darkcyan", "linewidth": 2.5}
            )
            ax.set_title(f"{model_name} (n={len(model_info['X'])})")
            ax.set_ylabel("Partial Dependence")
            ax.grid(True, linestyle='--', alpha=0.6)

        save_path = os.path.join(output_directory, f'{feature}.png')
        plt.tight_layout(rect=[0, 0, 1, 0.94])
        plt.savefig(save_path)
        plt.close(fig)

    print(f"\n✅ ANALYSIS COMPLETE for '{target_variable}'. Plots saved to: {output_directory}")


if __name__ == "__main__":
    # --- Master Configuration ---
    INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-08-13_LC_FISL_Original_combustionModelPredictors.csv"
    # OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/all_predictors.csv"
    OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/Combustion_SynthesisData_05042018_XJW.csv"
    BASE_OUT_PATH = "/explore/nobackup/people/spotter5/new_combustion"

    # Define the analyses to run
    analyses = {
        'below.ground.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_belowground"),
        'above.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_aboveground"),
        'burn.depth': os.path.join(BASE_OUT_PATH, "pdp_depth")
    }

    # Execute each analysis
    for target_col, out_dir in analyses.items():
        run_pdp_analysis(
            target_variable=target_col,
            output_directory=out_dir,
            input_csv_path=INPUT_CSV,
            old_ids_csv_path=OLD_PREDICTORS_CSV
        )

    print(f"\n{'='*70}")
    print("🎉 All tasks finished.")
    print(f"{'='*70}")


🚀 STARTING ANALYSIS | TARGET: 'below.ground.carbon.combusted'
Step 1: Loading and preparing data...
Step 2: Training Random Forest models on 'below.ground.carbon.combusted'...
  - Training on 'All Data' (1877 rows)...
    ...Done. Model OOB Score (R²): 0.434
  - Training on 'Old Data' (1010 rows)...
    ...Done. Model OOB Score (R²): 0.228
  - Training on 'New Data' (867 rows)...
    ...Done. Model OOB Score (R²): 0.607

Step 3: Generating 78 Partial Dependence Plots...
  - Plotting for: PFI
  - Plotting for: pH_30
  - Plotting for: Sand_30
  - Plotting for: Silt_30
  - Plotting for: Clay_30
  - Plotting for: DOB_lst
  - Plotting for: Relative.humidity
  - Plotting for: Temperature
  - Plotting for: VPD
  - Plotting for: Wind.speed
  - Plotting for: JP
  - Plotting for: BS
  - Plotting for: DEC
  - Plotting for: GRSH
  - Plotting for: NV
  - Plotting for: OCON
  - Plotting for: WS
  - Plotting for: CNA_Tmax_5_8
  - Plotting for: CNA_PPT_5_8
  - Plotting for: CNA_Rad_5_8
  - Plotting f

ValueError: Found array with 0 sample(s) (shape=(0, 78)) while a minimum of 1 is required by RandomForestRegressor.

With depth

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay
import os
import warnings

def run_pdp_analysis(target_variable, output_directory, input_csv_path, old_ids_csv_path):
    """
    Runs the full modeling and PDP generation pipeline for a specific target variable.

    Args:
        target_variable (str): The name of the column to use as the target.
        output_directory (str): The path to save the generated PDP images.
        input_csv_path (str): Path to the main input CSV file.
        old_ids_csv_path (str): Path to the CSV containing old plot IDs and, if available, burn.depth.
    """
    # --- Setup & Introduction ---
    print(f"\n{'='*70}")
    print(f"🚀 STARTING ANALYSIS | TARGET: '{target_variable}'")
    print(f"{'='*70}")

    warnings.filterwarnings('ignore', category=FutureWarning)
    warnings.filterwarnings('ignore', category=UserWarning)
    os.makedirs(output_directory, exist_ok=True)

    # --- 1. Data Preparation ---
    print("Step 1: Loading and preparing data...")
    try:
        df_main = pd.read_csv(input_csv_path)
        df_old_ids = pd.read_csv(old_ids_csv_path)
    except FileNotFoundError as e:
        print(f"❌ Error: Could not find input file. {e}")
        return

    # Normalize 'id' types & dedupe
    if 'id' not in df_main.columns or 'id' not in df_old_ids.columns:
        print("❌ Error: Both input files must have an 'id' column.")
        return

    for df in (df_main, df_old_ids):
        # coerce to numeric if possible (keeps strings if not)
        try:
            df['id'] = pd.to_numeric(df['id'], errors='ignore')
        except Exception:
            pass
    df_old_ids = df_old_ids.drop_duplicates(subset='id')

    # If we're modeling burn.depth, try to backfill from old file (if present there)
    if target_variable == 'burn.depth' and 'burn.depth' in df_old_ids.columns:
        print("  - Backfilling 'burn.depth' from old CSV where missing in main...")
        before_na = df_main['burn.depth'].isna().sum() if 'burn.depth' in df_main.columns else None
        if 'burn.depth' not in df_main.columns:
            df_main['burn.depth'] = np.nan
        df_main = df_main.set_index('id')
        df_old_depth = df_old_ids.set_index('id')['burn.depth']
        df_main['burn.depth'] = df_main['burn.depth'].combine_first(df_old_depth)
        df_main = df_main.reset_index()
        after_na = df_main['burn.depth'].isna().sum()
        if before_na is not None:
            print(f"    Filled {before_na - after_na} missing burn.depth values.")

    # Define columns to exclude from predictors (X)
    POTENTIAL_TARGETS = ['below.ground.carbon.combusted', 'above.carbon.combusted', 'burn.depth']
    METADATA_COLUMNS = [
        'burn_year', 'project.name', 'latitude', 'longitude', 'Date', 'id', 'CNA_MAR'
    ]
    COLS_TO_DROP_FROM_X = POTENTIAL_TARGETS + METADATA_COLUMNS

    # Prepare splits
    old_ids = df_old_ids['id'].unique()
    data_splits = {
        "All Data": df_main,
        "Old Data": df_main[df_main['id'].isin(old_ids)],
        "New Data": df_main[~df_main['id'].isin(old_ids)]
    }

    # --- 2. Model Training ---
    print(f"Step 2: Training Random Forest models on '{target_variable}'...")
    models = {}
    for name, data in data_splits.items():
        print(f"  - Training on '{name}' ({len(data)} rows)...")

        if target_variable not in data.columns:
            print(f"    ⚠️ Skipping '{name}' – target '{target_variable}' not in columns.")
            continue

        # Drop rows missing the CURRENT target variable
        df_clean = data.dropna(subset=[target_variable]).copy()

        # Build X (numeric only), drop constant/all-NaN cols
        X = df_clean.drop(columns=COLS_TO_DROP_FROM_X, errors='ignore')
        X = X.select_dtypes(include=[np.number])
        if X.shape[1] == 0:
            print(f"    ⚠️ Skipping '{name}' – no numeric predictors after cleaning.")
            continue
        # drop all-NaN columns
        X = X.loc[:, X.notna().any(axis=0)]
        # drop constant columns
        constant_cols = [c for c in X.columns if X[c].nunique(dropna=True) <= 1]
        if constant_cols:
            X = X.drop(columns=constant_cols)

        y = df_clean[target_variable].astype(float)

        n = len(y)
        if n < 2:
            print(f"    ⚠️ Skipping '{name}' – insufficient samples after dropna (n={n}).")
            continue

        # Enable OOB only when there are enough samples to make it meaningful
        use_oob = n > 10
        rf = RandomForestRegressor(
            n_estimators=500,
            random_state=42,
            n_jobs=-1,
            oob_score=use_oob
        )
        rf.fit(X, y)
        if use_oob:
            print(f"    ...Done. Model OOB Score (R²): {rf.oob_score_:.3f}")
        else:
            print(f"    ...Done. (OOB disabled; n={n})")

        models[name] = {'model': rf, 'X': X}

    if "All Data" not in models:
        print("❌ No trainable 'All Data' model. Aborting PDP stage.")
        return

    # --- 3. Generate and Save Partial Dependence Plots ---
    feature_list = models['All Data']['X'].columns
    print(f"\nStep 3: Generating {len(feature_list)} Partial Dependence Plots...")

    # Only include splits that successfully trained
    trained_order = [k for k in ["All Data", "Old Data", "New Data"] if k in models]
    if not trained_order:
        print("❌ No trained models available for PDP.")
        return

    for feature in feature_list:
        print(f"  - Plotting for: {feature}")
        fig, axes = plt.subplots(len(trained_order), 1, figsize=(8, 4 * len(trained_order)), sharex=True)
        if len(trained_order) == 1:
            axes = [axes]
        fig.suptitle(f'Partial Dependence on: {feature}\n(Target: {target_variable})', fontsize=16, y=0.96)

        for ax, model_name in zip(axes, trained_order):
            model_info = models[model_name]
            try:
                PartialDependenceDisplay.from_estimator(
                    estimator=model_info['model'], X=model_info['X'], features=[feature],
                    ax=ax, line_kw={"color": "darkcyan", "linewidth": 2.5}
                )
                ax.set_title(f"{model_name} (n={len(model_info['X'])})")
                ax.set_ylabel("Partial Dependence")
                ax.grid(True, linestyle='--', alpha=0.6)
            except Exception as e:
                ax.set_title(f"{model_name} – PDP failed for '{feature}' ({e})")
                ax.axis('off')

        save_path = os.path.join(output_directory, f'{feature}.png')
        plt.tight_layout(rect=[0, 0, 1, 0.94])
        plt.savefig(save_path)
        plt.close(fig)

    print(f"\n✅ ANALYSIS COMPLETE for '{target_variable}'. Plots saved to: {output_directory}")


if __name__ == "__main__":
    # --- Master Configuration ---
    INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-08-13_LC_FISL_Original_combustionModelPredictors.csv"
    OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/Combustion_SynthesisData_05042018_XJW.csv"
    BASE_OUT_PATH = "/explore/nobackup/people/spotter5/new_combustion"

    analyses = {
        'below.ground.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_belowground"),
        'above.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_aboveground"),
        'burn.depth': os.path.join(BASE_OUT_PATH, "pdp_depth")
    }

    for target_col, out_dir in analyses.items():
        run_pdp_analysis(
            target_variable=target_col,
            output_directory=out_dir,
            input_csv_path=INPUT_CSV,
            old_ids_csv_path=OLD_PREDICTORS_CSV
        )

    print(f"\n{'='*70}")
    print("🎉 All tasks finished.")
    print(f"{'='*70}")



🚀 STARTING ANALYSIS | TARGET: 'below.ground.carbon.combusted'
Step 1: Loading and preparing data...
Step 2: Training Random Forest models on 'below.ground.carbon.combusted'...
  - Training on 'All Data' (1877 rows)...
    ...Done. Model OOB Score (R²): 0.434
  - Training on 'Old Data' (1010 rows)...
    ...Done. Model OOB Score (R²): 0.226
  - Training on 'New Data' (867 rows)...
    ...Done. Model OOB Score (R²): 0.607

Step 3: Generating 78 Partial Dependence Plots...
  - Plotting for: PFI
  - Plotting for: pH_30
  - Plotting for: Sand_30
  - Plotting for: Silt_30
  - Plotting for: Clay_30
  - Plotting for: DOB_lst
  - Plotting for: Relative.humidity
  - Plotting for: Temperature
  - Plotting for: VPD
  - Plotting for: Wind.speed
  - Plotting for: JP
  - Plotting for: BS
  - Plotting for: DEC
  - Plotting for: GRSH
  - Plotting for: NV
  - Plotting for: OCON
  - Plotting for: WS
  - Plotting for: CNA_Tmax_5_8
  - Plotting for: CNA_PPT_5_8
  - Plotting for: CNA_Rad_5_8
  - Plotting f

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay
import os
import warnings

def run_pdp_analysis(target_variable, output_directory, input_csv_path, old_ids_csv_path):
    """
    Runs the full modeling and PDP generation pipeline for a specific target variable.

    Args:
        target_variable (str): The name of the column to use as the target.
        output_directory (str): The path to save the generated PDP images.
        input_csv_path (str): Path to the main input CSV file.
        old_ids_csv_path (str): Path to the CSV containing old plot IDs.
    """
    # --- Setup & Introduction ---
    print(f"\n{'='*70}")
    print(f"🚀 STARTING ANALYSIS | TARGET: '{target_variable}'")
    print(f"{'='*70}")

    warnings.filterwarnings('ignore', category=FutureWarning)
    warnings.filterwarnings('ignore', category=UserWarning)
    os.makedirs(output_directory, exist_ok=True)

    # --- 1. Data Preparation ---
    print("Step 1: Loading and preparing data...")
    try:
        df_main = pd.read_csv(input_csv_path)
        df_old_ids = pd.read_csv(old_ids_csv_path)
    except FileNotFoundError as e:
        print(f"❌ Error: Could not find input file. {e}")
        return

    # Define columns to exclude from predictors.
    POTENTIAL_TARGETS = ['below.ground.carbon.combusted', 'above.carbon.combusted']
    METADATA_COLUMNS = [
        'burn_year', 'project.name', 'latitude', 'longitude', 'Date', 'id',
        'CNA_MAR', 'fireYr', 'lat', 'lon', 'project_name'
    ]
    
    # Combine all columns to be dropped when creating the predictor set (X)
    COLS_TO_DROP_FROM_X = POTENTIAL_TARGETS + METADATA_COLUMNS

    # Prepare the three datasets (All, Old, New)
    old_ids = df_old_ids['id'].unique()
    data_splits = {
        "All Data": df_main,
        "Old Data": df_main[df_main['id'].isin(old_ids)],
        "New Data": df_main[~df_main['id'].isin(old_ids)]
    }

    # --- 2. Model Training ---
    print(f"Step 2: Training Random Forest models on '{target_variable}'...")
    models = {}
    for name, data in data_splits.items():
        print(f"  - Preparing data for '{name}' ({len(data)} rows)...")

        # First, drop rows where the CURRENT target variable is missing
        df_clean_target = data.dropna(subset=[target_variable])

        # Define predictors (X) and target (y) from this pre-cleaned data
        X = df_clean_target.drop(columns=COLS_TO_DROP_FROM_X, errors='ignore')
        y = df_clean_target[target_variable]

        # --- START: FIX ---
        # **CRITICAL FIX**: Now, drop rows with NaNs in the PREDICTOR (X) columns
        # This ensures the model receives completely clean data.
        rows_before_cleaning_predictors = len(X)
        X = X.dropna()
        
        # **CRITICAL FIX**: Re-align y to match the cleaned X's index
        y = y.loc[X.index]
        rows_after_cleaning_predictors = len(X)
        
        if rows_before_cleaning_predictors > rows_after_cleaning_predictors:
            rows_dropped = rows_before_cleaning_predictors - rows_after_cleaning_predictors
            print(f"    ... Cleaned predictor NaNs. Dropped {rows_dropped} rows. Final training size: {rows_after_cleaning_predictors}")
        # --- END: FIX ---

        # Check if there is still data to train on
        if X.empty:
            print(f"    ... ❌ Skipping '{name}': No data left after cleaning.")
            models[name] = None # Store None to indicate a failed model
            continue

        # Initialize and train the model
        rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1, oob_score=True)
        rf.fit(X, y)
        print(f"    ... Done. Model OOB Score (R²): {rf.oob_score_:.3f}")

        # Store the trained model and predictor data
        models[name] = {'model': rf, 'X': X}

    # --- 3. Generate and Save Partial Dependence Plots ---
    # Use the 'All Data' model's features as the reference list
    if models['All Data'] is None:
        print("\n❌ Cannot generate plots because 'All Data' model failed to train.")
        return
        
    feature_list = models['All Data']['X'].columns
    print(f"\nStep 3: Generating {len(feature_list)} Partial Dependence Plots...")

    for feature in feature_list:
        print(f"  - Plotting for: {feature}")
        fig, axes = plt.subplots(3, 1, figsize=(8, 12), sharex=True, squeeze=False) # squeeze=False ensures axes is always 2D
        axes = axes.flatten() # Flatten to a 1D array for easy iteration
        fig.suptitle(f'Partial Dependence on: {feature}\n(Target: {target_variable})', fontsize=16, y=0.96)

        plot_order = ["All Data", "Old Data", "New Data"]
        for i, model_name in enumerate(plot_order):
            ax = axes[i]
            model_info = models.get(model_name) # Use .get() for safety

            if model_info:
                PartialDependenceDisplay.from_estimator(
                    estimator=model_info['model'], X=model_info['X'], features=[feature],
                    ax=ax, line_kw={"color": "darkcyan", "linewidth": 2.5}
                )
                ax.set_title(f"{model_name} (n={len(model_info['X'])})")
            else:
                ax.text(0.5, 0.5, 'Model could not be trained\n(No data available)', 
                        ha='center', va='center', transform=ax.transAxes, fontsize=12, color='red')
                ax.set_title(f"{model_name} (n=0)")

            ax.set_ylabel("Partial Dependence")
            ax.grid(True, linestyle='--', alpha=0.6)

        save_path = os.path.join(output_directory, f'{feature}.png')
        plt.tight_layout(rect=[0, 0, 1, 0.94])
        plt.savefig(save_path)
        plt.close(fig)

    print(f"\n✅ ANALYSIS COMPLETE for '{target_variable}'. Plots saved to: {output_directory}")


if __name__ == "__main__":
    # --- Master Configuration ---
    INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-08-08_LC_FISL_Original_combustionModelPredictors.csv"
    OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/all_predictors.csv"
    BASE_OUT_PATH = "/explore/nobackup/people/spotter5/new_combustion"

    # Define the analyses to run
    analyses = {
        'below.ground.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_belowground"),
        'above.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_aboveground")
    }

    # Execute each analysis
    for target_col, out_dir in analyses.items():
        run_pdp_analysis(
            target_variable=target_col,
            output_directory=out_dir,
            input_csv_path=INPUT_CSV,
            old_ids_csv_path=OLD_PREDICTORS_CSV
        )

    print(f"\n{'='*70}")
    print("🎉 All tasks finished.")
    print(f"{'='*70}")


🚀 STARTING ANALYSIS | TARGET: 'below.ground.carbon.combusted'
Step 1: Loading and preparing data...
Step 2: Training Random Forest models on 'below.ground.carbon.combusted'...
  - Preparing data for 'All Data' (1877 rows)...
    ... Cleaned predictor NaNs. Dropped 562 rows. Final training size: 1201
    ... Done. Model OOB Score (R²): 0.358
  - Preparing data for 'Old Data' (1011 rows)...
    ... Cleaned predictor NaNs. Dropped 127 rows. Final training size: 770
    ... Done. Model OOB Score (R²): 0.218
  - Preparing data for 'New Data' (866 rows)...
    ... Cleaned predictor NaNs. Dropped 435 rows. Final training size: 431
    ... Done. Model OOB Score (R²): 0.559

Step 3: Generating 73 Partial Dependence Plots...
  - Plotting for: PFI
  - Plotting for: pH_30
  - Plotting for: Sand_30
  - Plotting for: Silt_30
  - Plotting for: Clay_30
  - Plotting for: DOB_lst
  - Plotting for: Relative.humidity
  - Plotting for: Temperature
  - Plotting for: VPD
  - Plotting for: Wind.speed
  - Plo

In [5]:
#!/usr/bin/env python
# coding: utf-8

import os
import warnings
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# -------------------- Font/Style: large text everywhere --------------------
FS_SUPTITLE = 36   # figure title
FS_LABEL    = 26   # x/y axis labels
FS_TICKS    = 22   # tick labels
FS_SUBTITLE = 26   # per-panel titles

plt.rcParams.update({
    "font.size": FS_TICKS,
    "axes.labelsize": FS_LABEL,
    "xtick.labelsize": FS_TICKS,
    "ytick.labelsize": FS_TICKS,
    "legend.fontsize": FS_TICKS,
})

# -------------------- Config --------------------
INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-08-13_LC_FISL_Original_combustionModelPredictors.csv"
OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/Combustion_SynthesisData_05042018_XJW.csv"
BASE_OUT_PATH = "/explore/nobackup/people/spotter5/new_combustion"

OUT_DIRS = {
    'above.carbon.combusted':        os.path.join(BASE_OUT_PATH, "pdp_aboveground_fisl"),
    'below.ground.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_belowground_fisl"),
    'burn.depth':                    os.path.join(BASE_OUT_PATH, "pdp_depth_fisl"),
}

POTENTIAL_TARGETS = ['below.ground.carbon.combusted', 'above.carbon.combusted', 'burn.depth']
METADATA_COLUMNS = ['burn_year','project.name','latitude','longitude','Date','id','CNA_MAR',
                    'fireYr','lat','lon','project_name']
COLS_TO_DROP_FROM_X = POTENTIAL_TARGETS + METADATA_COLUMNS

# Requested top-6 features
TOP6 = {
    'above.carbon.combusted':        ['BS','twi','brightness','ISI','Tree.cover','Temperature'],
    'below.ground.carbon.combusted': ['BUI','twi','CNA_PAS','Silt_30','brightness','NV'],
    'burn.depth':                    ['BUI','twi','CNA_PAS','CNA_DD_0','WS','DC'],
}

# Pretty labels for plotting
PRETTY_LABELS = {
    'BS':           'Black Spruce',
    'twi':          'The Wetness Index',
    'brightness':   'Brightness',
    'ISI':          'Initial Spread Index',
    'Tree.cover':   'Tree Cover',
    'Temperature':  'Temperature (FWI)',
    'BUI':          'Buildup Index',
    'CNA_PAS':      'Precipitation as Snow',
    'Silt_30':      'Silt %',
    'NV':           'Non Vegetation',
    'CNA_DD_0':     'Growing Degree Days < 0',
    'WS':           'White Spruce',
    'DC':           'Drought code',
}

# Histogram style
HIST_BINS  = 40
HIST_ALPHA = 0.30
HIST_COLOR = 'skyblue'

# Manual PDP config
PDP_MAX_SAMPLES   = 5000   # average predictions over up to this many rows
PDP_GRID_POINTS   = 60     # grid resolution along x
PDP_CLIP_PERCENT  = (1, 99)  # clip to avoid extreme tails

# -------------------- Helpers --------------------
def normalize_name(s):
    return "".join(ch for ch in str(s).lower() if ch.isalnum())

def map_requested_features_to_columns_with_labels(requested, df_columns):
    """Return list of (actual_column_name, pretty_label) pairs."""
    norm_map = {normalize_name(c): c for c in df_columns}
    mapped_pairs, missing = [], []
    for r in requested:
        key = normalize_name(r)
        if key in norm_map:
            mapped_pairs.append((norm_map[key], PRETTY_LABELS.get(r, r)))
        else:
            missing.append(r)
    return mapped_pairs, missing

def median_impute_numeric(df):
    out = df.copy()
    num_cols = out.select_dtypes(include=[np.number]).columns
    out[num_cols] = out[num_cols].fillna(out[num_cols].median())
    return out

def manual_pdp(model, X: pd.DataFrame, feature: str,
               max_samples=PDP_MAX_SAMPLES,
               grid_points=PDP_GRID_POINTS,
               clip_percent=PDP_CLIP_PERCENT,
               random_state=42):
    """
    Compute 1D partial dependence for `feature` by averaging predictions
    over rows of X while sweeping `feature` across an evenly spaced grid
    between the given percentiles.
    """
    if feature not in X.columns:
        return None, None

    # sample rows for speed
    X_base = X.sample(n=min(len(X), max_samples), random_state=random_state) if len(X) > max_samples else X

    vals = X_base[feature].to_numpy()
    vals = vals[np.isfinite(vals)]
    if vals.size < 2:
        return None, None

    lo, hi = np.nanpercentile(vals, clip_percent)
    if not np.isfinite(lo) or not np.isfinite(hi) or lo == hi:
        return None, None

    grid = np.linspace(lo, hi, grid_points)
    pdp = np.empty_like(grid, dtype=float)

    X_work = X_base.copy()
    for i, g in enumerate(grid):
        X_work[feature] = g
        pdp[i] = model.predict(X_work).mean()

    return grid, pdp

# -------------------- Load and split --------------------
print(f"\n{'='*70}\n🚀 STARTING PDP (New Data only)\n{'='*70}")
df_main = pd.read_csv(INPUT_CSV)
df_old  = pd.read_csv(OLD_PREDICTORS_CSV)

if 'id' not in df_main.columns or 'id' not in df_old.columns:
    raise SystemExit("❌ Both files must have an 'id' column.")

old_ids = pd.unique(df_old['id'])
df_new  = df_main[~df_main['id'].isin(old_ids)].copy()
print(f"New Data rows: {len(df_new)}")

# -------------------- Train & plot --------------------
for target in OUT_DIRS:
    os.makedirs(OUT_DIRS[target], exist_ok=True)
    print(f"\n— Target: {target}")

    if target not in df_new.columns:
        print(f"  ⚠️ Target '{target}' not found. Skipping.")
        continue

    df_t = df_new.dropna(subset=[target]).copy()
    if df_t.empty:
        print(f"  ⚠️ No rows with non-NA '{target}'. Skipping.")
        continue

    X = df_t.drop(columns=COLS_TO_DROP_FROM_X, errors='ignore').select_dtypes(include=[np.number])
    if X.shape[1] == 0:
        print("  ⚠️ No numeric predictors after exclusions. Skipping.")
        continue

    X_imp = median_impute_numeric(X)
    y = df_t[target].astype(float)

    rf = RandomForestRegressor(
        n_estimators=500,
        random_state=42,
        n_jobs=-1,
        oob_score=len(y) > 10
    )
    rf.fit(X_imp, y)
    print(f"  ✓ Trained RF (n={len(y)})")

    requested = TOP6[target]
    mapped_pairs, missing = map_requested_features_to_columns_with_labels(requested, X_imp.columns)
    if not mapped_pairs:
        print("  ⚠️ No requested features found. Skipping PDP.")
        continue

    # figure setup with big fonts
    fig, axes = plt.subplots(3, 2, figsize=(22, 26))
    axes = axes.ravel()

    for i in range(6):
        ax = axes[i]
        if i < len(mapped_pairs):
            feat_col, pretty_label = mapped_pairs[i]

            # --- Histogram (secondary y-axis, behind the line) ---
            ax_hist = ax.twinx()
            values = X_imp[feat_col].to_numpy()
            values = values[np.isfinite(values)]
            if values.size > 0:
                ax_hist.hist(values, bins=HIST_BINS, alpha=HIST_ALPHA, color=HIST_COLOR)
            ax_hist.set_yticks([])
            ax_hist.set_ylabel("")
            ax_hist.tick_params(axis='both', length=0)
            ax_hist.grid(False)
            ax_hist.set_zorder(1)
            ax.set_zorder(2)
            ax.patch.set_visible(False)

            # --- Manual PDP line (primary y-axis) ---
            xs, ys = manual_pdp(rf, X_imp, feat_col)
            if xs is not None:
                ax.plot(xs, ys, linewidth=4, color="black", zorder=3)
                # align x-limits to data range so hist + line overlap cleanly
                xmin, xmax = (np.min(values), np.max(values)) if values.size else (xs.min(), xs.max())
                if np.isfinite(xmin) and np.isfinite(xmax) and xmin != xmax:
                    ax.set_xlim(xmin, xmax)
                    ax_hist.set_xlim(xmin, xmax)

            # Labels/titles
            ax.set_title(pretty_label, fontsize=FS_SUBTITLE, pad=18)
            ax.set_xlabel(pretty_label, fontsize=FS_LABEL)
            pretty_target = {
                'above.carbon.combusted': 'Aboveground Carbon',
                'below.ground.carbon.combusted': 'Belowground Carbon',
                'burn.depth': 'Burn Depth'
            }[target]
            ax.set_ylabel(pretty_target, fontsize=FS_LABEL)
            ax.tick_params(axis='both', labelsize=FS_TICKS)
            ax.xaxis.labelpad = 14
            ax.yaxis.labelpad = 14
            ax.grid(True, linestyle='--', alpha=0.6)

        else:
            ax.axis('off')

    pretty = {
        'above.carbon.combusted': 'Aboveground Carbon',
        'below.ground.carbon.combusted': 'Belowground Carbon',
        'burn.depth': 'Burn Depth'
    }[target]
    fig.suptitle(f'New Data — Top 6 PDPs ({pretty})', fontsize=FS_SUPTITLE, y=0.98)

    save_path = os.path.join(OUT_DIRS[target], f"pdp_new_{target.replace('.', '_')}_3x2.png")
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.savefig(save_path, dpi=150)
    plt.close(fig)

print(f"\n{'='*70}\n🎉 Done. Histograms (sky-blue) + manual PDP lines saved in *_new_3x2 folders.\n{'='*70}")



🚀 STARTING PDP (New Data only)
New Data rows: 867

— Target: above.carbon.combusted
  ✓ Trained RF (n=867)

— Target: below.ground.carbon.combusted
  ✓ Trained RF (n=867)

— Target: burn.depth
  ✓ Trained RF (n=866)

🎉 Done. Histograms (sky-blue) + manual PDP lines saved in *_new_3x2 folders.


In [4]:
't'

't'

In [6]:
#!/usr/bin/env python
# coding: utf-8

import os
import warnings
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# -------------------- Font/Style: large text everywhere --------------------
FS_SUPTITLE = 36   # figure title
FS_LABEL    = 26   # x/y axis labels
FS_TICKS    = 22   # tick labels
FS_SUBTITLE = 26   # per-panel titles

plt.rcParams.update({
    "font.size": FS_TICKS,
    "axes.labelsize": FS_LABEL,
    "xtick.labelsize": FS_TICKS,
    "ytick.labelsize": FS_TICKS,
    "legend.fontsize": FS_TICKS,
})

# -------------------- Config --------------------
INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-08-13_LC_FISL_Original_combustionModelPredictors.csv"
OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/Combustion_SynthesisData_05042018_XJW.csv"
BASE_OUT_PATH = "/explore/nobackup/people/spotter5/new_combustion"

OUT_DIRS = {
    'above.carbon.combusted':        os.path.join(BASE_OUT_PATH, "pdp_aboveground_fisl"),
    'below.ground.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_belowground_fisl"),
    'burn.depth':                    os.path.join(BASE_OUT_PATH, "pdp_depth_fisl"),
}

POTENTIAL_TARGETS = ['below.ground.carbon.combusted', 'above.carbon.combusted', 'burn.depth']
METADATA_COLUMNS = ['burn_year','project.name','latitude','longitude','Date','id','CNA_MAR',
                    'fireYr','lat','lon','project_name']
COLS_TO_DROP_FROM_X = POTENTIAL_TARGETS + METADATA_COLUMNS

# Requested top-6 features
TOP6 = {
    'above.carbon.combusted':        ['BS','twi','brightness','ISI','Tree.cover','Temperature'],
    'below.ground.carbon.combusted': ['BUI','twi','CNA_PAS','Silt_30','brightness','NV'],
    'burn.depth':                    ['BUI','twi','CNA_PAS','CNA_DD_0','WS','DC'],
}

# Pretty labels for plotting
PRETTY_LABELS = {
    'BS':           'Black Spruce',
    'twi':          'The Wetness Index',
    'brightness':   'Brightness',
    'ISI':          'Initial Spread Index',
    'Tree.cover':   'Tree Cover',
    'Temperature':  'Temperature (FWI)',
    'BUI':          'Buildup Index',
    'CNA_PAS':      'Precipitation as Snow',
    'Silt_30':      'Silt %',
    'NV':           'Non Vegetation',
    'CNA_DD_0':     'Growing Degree Days < 0',
    'WS':           'White Spruce',
    'DC':           'Drought code',
}

# Histogram style
HIST_BINS  = 40
HIST_ALPHA = 0.30
HIST_COLOR = 'skyblue'

# PDP config
PDP_GRID_RES = 60

# -------------------- Helpers --------------------
def normalize_name(s):
    return "".join(ch for ch in str(s).lower() if ch.isalnum())

def map_requested_features_to_columns_with_labels(requested, df_columns):
    """Return list of (actual_column_name, pretty_label) pairs."""
    norm_map = {normalize_name(c): c for c in df_columns}
    mapped_pairs, missing = [], []
    for r in requested:
        key = normalize_name(r)
        if key in norm_map:
            mapped_pairs.append((norm_map[key], PRETTY_LABELS.get(r, r)))
        else:
            missing.append(r)
    return mapped_pairs, missing

def median_impute_numeric(df):
    out = df.copy()
    num_cols = out.select_dtypes(include=[np.number]).columns
    out[num_cols] = out[num_cols].fillna(out[num_cols].median())
    return out

def pdp_curve_from_sklearn(estimator, X, feature, grid_resolution=PDP_GRID_RES):
    """
    Try sklearn's partial_dependence to get (x, y) PDP arrays.
    Fallback: call PartialDependenceDisplay.from_estimator on a temp axis and
    extract line data, then close the temp figure.
    """
    # Attempt modern API first
    try:
        from sklearn.inspection import partial_dependence
        res = partial_dependence(
            estimator, X, [feature], kind="average",
            grid_resolution=grid_resolution, method="auto"
        )
        # Newer versions expose either 'values' or 'grid_values'
        xs = res.get('values', res.get('grid_values'))[0]
        ys = res['average'][0].ravel()
        return np.asarray(xs), np.asarray(ys)
    except Exception:
        pass

    # Fallback: render off-screen and grab the line data
    fig_tmp, ax_tmp = plt.subplots()
    try:
        disp = PartialDependenceDisplay.from_estimator(
            estimator, X, [feature], kind="average",
            grid_resolution=grid_resolution, ax=ax_tmp
        )
        # First (and only) line in first (and only) axis for 1D PDP
        line = disp.lines_[0][0]
        xs = np.asarray(line.get_xdata())
        ys = np.asarray(line.get_ydata())
        return xs, ys
    finally:
        plt.close(fig_tmp)

# -------------------- Load and split --------------------
print(f"\n{'='*70}\n🚀 STARTING PDP (New Data only)\n{'='*70}")
df_main = pd.read_csv(INPUT_CSV)
df_old  = pd.read_csv(OLD_PREDICTORS_CSV)

if 'id' not in df_main.columns or 'id' not in df_old.columns:
    raise SystemExit("❌ Both files must have an 'id' column.")

old_ids = pd.unique(df_old['id'])
df_new  = df_main[~df_main['id'].isin(old_ids)].copy()
print(f"New Data rows: {len(df_new)}")

# -------------------- Train & plot --------------------
for target in OUT_DIRS:
    os.makedirs(OUT_DIRS[target], exist_ok=True)
    print(f"\n— Target: {target}")

    if target not in df_new.columns:
        print(f"  ⚠️ Target '{target}' not found. Skipping.")
        continue

    df_t = df_new.dropna(subset=[target]).copy()
    if df_t.empty:
        print(f"  ⚠️ No rows with non-NA '{target}'. Skipping.")
        continue

    X = df_t.drop(columns=COLS_TO_DROP_FROM_X, errors='ignore').select_dtypes(include=[np.number])
    if X.shape[1] == 0:
        print("  ⚠️ No numeric predictors after exclusions. Skipping.")
        continue

    X_imp = median_impute_numeric(X)
    y = df_t[target].astype(float)

    rf = RandomForestRegressor(
        n_estimators=500,
        random_state=42,
        n_jobs=-1,
        oob_score=len(y) > 10
    )
    rf.fit(X_imp, y)
    print(f"  ✓ Trained RF (n={len(y)})")

    requested = TOP6[target]
    mapped_pairs, missing = map_requested_features_to_columns_with_labels(requested, X_imp.columns)
    if not mapped_pairs:
        print("  ⚠️ No requested features found. Skipping PDP.")
        continue

    # figure setup with big fonts
    fig, axes = plt.subplots(3, 2, figsize=(22, 26))
    axes = axes.ravel()

    for i in range(6):
        ax = axes[i]
        if i < len(mapped_pairs):
            feat_col, pretty_label = mapped_pairs[i]

            # --- Histogram (secondary y-axis, behind the line) ---
            ax_hist = ax.twinx()
            values = X_imp[feat_col].to_numpy()
            values = values[np.isfinite(values)]
            if values.size > 0:
                ax_hist.hist(values, bins=HIST_BINS, alpha=HIST_ALPHA, color=HIST_COLOR)
            ax_hist.set_yticks([])
            ax_hist.set_ylabel("")
            ax_hist.tick_params(axis='both', length=0)
            ax_hist.grid(False)
            ax_hist.set_zorder(1)
            ax.set_zorder(2)
            ax.patch.set_visible(False)

            # --- PDP line (extracted from sklearn, plotted manually) ---
            xs, ys = pdp_curve_from_sklearn(rf, X_imp, feat_col, grid_resolution=PDP_GRID_RES)
            if xs is not None and ys is not None:
                ax.plot(xs, ys, linewidth=4, color="black", zorder=3)
                # Align x-limits for clean overlay
                xmin, xmax = (np.min(values), np.max(values)) if values.size else (np.min(xs), np.max(xs))
                if np.isfinite(xmin) and np.isfinite(xmax) and xmin != xmax:
                    ax.set_xlim(xmin, xmax)
                    ax_hist.set_xlim(xmin, xmax)

            # Labels/titles
            ax.set_title(pretty_label, fontsize=FS_SUBTITLE, pad=18)
            ax.set_xlabel(pretty_label, fontsize=FS_LABEL)
            pretty_target = {
                'above.carbon.combusted': 'Aboveground Carbon',
                'below.ground.carbon.combusted': 'Belowground Carbon',
                'burn.depth': 'Burn Depth'
            }[target]
            ax.set_ylabel(pretty_target, fontsize=FS_LABEL)
            ax.tick_params(axis='both', labelsize=FS_TICKS)
            ax.xaxis.labelpad = 14
            ax.yaxis.labelpad = 14
            ax.grid(True, linestyle='--', alpha=0.6)
        else:
            ax.axis('off')

    pretty = {
        'above.carbon.combusted': 'Aboveground Carbon',
        'below.ground.carbon.combusted': 'Belowground Carbon',
        'burn.depth': 'Burn Depth'
    }[target]
    fig.suptitle(f'New Data — Top 6 PDPs ({pretty})', fontsize=FS_SUPTITLE, y=0.98)

    save_path = os.path.join(OUT_DIRS[target], f"pdp_new_{target.replace('.', '_')}_3x2.png")
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.savefig(save_path, dpi=150)
    plt.close(fig)

print(f"\n{'='*70}\n🎉 Done. PDP vectors pulled from sklearn + sky-blue histograms.\n{'='*70}")



🚀 STARTING PDP (New Data only)
New Data rows: 867

— Target: above.carbon.combusted
  ✓ Trained RF (n=867)

— Target: below.ground.carbon.combusted
  ✓ Trained RF (n=867)

— Target: burn.depth
  ✓ Trained RF (n=866)

🎉 Done. PDP vectors pulled from sklearn + sky-blue histograms.
