above ground

In [None]:
# 1. Load and clean data
df = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/2025-08-13_LC_FISL_Original_combustionModelPredictors.csv")

out_path = "/explore/nobackup/people/spotter5/new_combustion/all_data"
os.makedirs(out_path, exist_ok = True)

# 2. Exclude columns not relevant for modeling
exclude_columns = [
    #'below.ground.carbon.combusted',
    'above.carbon.combusted'
    'burn.depth',
    'burn_year',
    #'rdnbr_old',
    'project.name',
    'latitude',
    'longitude',
    'Date',
    'id',
    'CNA_MAR'
    #  'fireYr',
    # 'lat',
    # 'lon',
    # 'project_name'
]

# 3. Drop excluded columns and NaNs
all_data = df.drop(columns=exclude_columns).dropna()


# 1. Load and clean data
df = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/2025-08-08_LC_FISL_Original_combustionModelPredictors.csv")

old = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/all_predictors.csv")

# Get the unique IDs to remove
old_ids = old['id'].unique()

# Filter the DataFrame using the ~ operator ✅
new_data = df[~df['id'].isin(old_ids)].drop(columns=exclude_columns).dropna()

# 1. Load and clean data
df = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/2025-08-08_LC_FISL_Original_combustionModelPredictors.csv")

old = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/all_predictors.csv")

# Get the unique IDs to remove
old_ids = old['id'].unique()

# Filter the DataFrame using the ~ operator ✅
old_data = df[df['id'].isin(old_ids)].drop(columns=exclude_columns).dropna()


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay
import os
import warnings

def run_pdp_analysis(target_variable, output_directory, input_csv_path, old_ids_csv_path):
    """
    Runs the full modeling and PDP generation pipeline for a specific target variable.

    Args:
        target_variable (str): The name of the column to use as the target.
        output_directory (str): The path to save the generated PDP images.
        input_csv_path (str): Path to the main input CSV file.
        old_ids_csv_path (str): Path to the CSV containing old plot IDs.
    """
    # --- Setup & Introduction ---
    print(f"\n{'='*70}")
    print(f"🚀 STARTING ANALYSIS | TARGET: '{target_variable}'")
    print(f"{'='*70}")

    warnings.filterwarnings('ignore', category=FutureWarning)
    warnings.filterwarnings('ignore', category=UserWarning)
    os.makedirs(output_directory, exist_ok=True)

    # --- 1. Data Preparation ---
    print("Step 1: Loading and preparing data...")
    try:
        df_main = pd.read_csv(input_csv_path)
        df_old_ids = pd.read_csv(old_ids_csv_path)
    except FileNotFoundError as e:
        print(f"❌ Error: Could not find input file. {e}")
        return

    # Define columns to exclude from predictors.
    # CRITICAL: We must exclude BOTH potential target variables from the predictors
    # to prevent data leakage between the two analyses.
    POTENTIAL_TARGETS = ['below.ground.carbon.combusted', 'above.carbon.combusted', 'burn.depth']
    METADATA_COLUMNS = [
        'burn_year', 'project.name', 'latitude', 'longitude', 'Date', 'id',
        'CNA_MAR'
    ]
    
    # Combine all columns to be dropped when creating the predictor set (X)
    COLS_TO_DROP_FROM_X = POTENTIAL_TARGETS + METADATA_COLUMNS

    # Prepare the three datasets (All, Old, New)
    old_ids = df_old_ids['id'].unique()
    data_splits = {
        "All Data": df_main,
        "Old Data": df_main[df_main['id'].isin(old_ids)],
        "New Data": df_main[~df_main['id'].isin(old_ids)]
    }

    # --- 2. Model Training ---
    print(f"Step 2: Training Random Forest models on '{target_variable}'...")
    models = {}
    for name, data in data_splits.items():
        print(f"  - Training on '{name}' ({len(data)} rows)...")

        # Drop rows where the CURRENT target variable is missing
        df_clean = data.dropna(subset=[target_variable])

        # Define predictors (X) and target (y)
        X = df_clean.drop(columns=COLS_TO_DROP_FROM_X, errors='ignore')
        y = df_clean[target_variable]

        # Initialize and train the model
        rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1, oob_score=True)
        rf.fit(X, y)
        print(f"    ...Done. Model OOB Score (R²): {rf.oob_score_:.3f}")

        # Store the trained model and predictor data
        models[name] = {'model': rf, 'X': X}

    # --- 3. Generate and Save Partial Dependence Plots ---
    feature_list = models['All Data']['X'].columns
    print(f"\nStep 3: Generating {len(feature_list)} Partial Dependence Plots...")

    for feature in feature_list:
        print(f"  - Plotting for: {feature}")
        fig, axes = plt.subplots(3, 1, figsize=(8, 12), sharex=True)
        fig.suptitle(f'Partial Dependence on: {feature}\n(Target: {target_variable})', fontsize=16, y=0.96)

        plot_order = ["All Data", "Old Data", "New Data"]
        for i, model_name in enumerate(plot_order):
            ax = axes[i]
            model_info = models[model_name]
            PartialDependenceDisplay.from_estimator(
                estimator=model_info['model'], X=model_info['X'], features=[feature],
                ax=ax, line_kw={"color": "darkcyan", "linewidth": 2.5}
            )
            ax.set_title(f"{model_name} (n={len(model_info['X'])})")
            ax.set_ylabel("Partial Dependence")
            ax.grid(True, linestyle='--', alpha=0.6)

        save_path = os.path.join(output_directory, f'{feature}.png')
        plt.tight_layout(rect=[0, 0, 1, 0.94])
        plt.savefig(save_path)
        plt.close(fig)

    print(f"\n✅ ANALYSIS COMPLETE for '{target_variable}'. Plots saved to: {output_directory}")


if __name__ == "__main__":
    # --- Master Configuration ---
    INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-08-13_LC_FISL_Original_combustionModelPredictors.csv"
    # OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/all_predictors.csv"
    OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/Combustion_SynthesisData_05042018_XJW.csv"
    BASE_OUT_PATH = "/explore/nobackup/people/spotter5/new_combustion"

    # Define the analyses to run
    analyses = {
        'below.ground.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_belowground"),
        'above.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_aboveground"),
        'burn.depth': os.path.join(BASE_OUT_PATH, "pdp_depth")
    }

    # Execute each analysis
    for target_col, out_dir in analyses.items():
        run_pdp_analysis(
            target_variable=target_col,
            output_directory=out_dir,
            input_csv_path=INPUT_CSV,
            old_ids_csv_path=OLD_PREDICTORS_CSV
        )

    print(f"\n{'='*70}")
    print("🎉 All tasks finished.")
    print(f"{'='*70}")


🚀 STARTING ANALYSIS | TARGET: 'below.ground.carbon.combusted'
Step 1: Loading and preparing data...
Step 2: Training Random Forest models on 'below.ground.carbon.combusted'...
  - Training on 'All Data' (1877 rows)...
    ...Done. Model OOB Score (R²): 0.434
  - Training on 'Old Data' (1010 rows)...
    ...Done. Model OOB Score (R²): 0.228
  - Training on 'New Data' (867 rows)...
    ...Done. Model OOB Score (R²): 0.607

Step 3: Generating 78 Partial Dependence Plots...
  - Plotting for: PFI
  - Plotting for: pH_30
  - Plotting for: Sand_30
  - Plotting for: Silt_30
  - Plotting for: Clay_30
  - Plotting for: DOB_lst
  - Plotting for: Relative.humidity
  - Plotting for: Temperature
  - Plotting for: VPD
  - Plotting for: Wind.speed
  - Plotting for: JP
  - Plotting for: BS
  - Plotting for: DEC
  - Plotting for: GRSH
  - Plotting for: NV
  - Plotting for: OCON
  - Plotting for: WS
  - Plotting for: CNA_Tmax_5_8
  - Plotting for: CNA_PPT_5_8
  - Plotting for: CNA_Rad_5_8
  - Plotting f

ValueError: Found array with 0 sample(s) (shape=(0, 78)) while a minimum of 1 is required by RandomForestRegressor.

With depth

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay
import os
import warnings

def run_pdp_analysis(target_variable, output_directory, input_csv_path, old_ids_csv_path):
    """
    Runs the full modeling and PDP generation pipeline for a specific target variable.

    Args:
        target_variable (str): The name of the column to use as the target.
        output_directory (str): The path to save the generated PDP images.
        input_csv_path (str): Path to the main input CSV file.
        old_ids_csv_path (str): Path to the CSV containing old plot IDs and, if available, burn.depth.
    """
    # --- Setup & Introduction ---
    print(f"\n{'='*70}")
    print(f"🚀 STARTING ANALYSIS | TARGET: '{target_variable}'")
    print(f"{'='*70}")

    warnings.filterwarnings('ignore', category=FutureWarning)
    warnings.filterwarnings('ignore', category=UserWarning)
    os.makedirs(output_directory, exist_ok=True)

    # --- 1. Data Preparation ---
    print("Step 1: Loading and preparing data...")
    try:
        df_main = pd.read_csv(input_csv_path)
        df_old_ids = pd.read_csv(old_ids_csv_path)
    except FileNotFoundError as e:
        print(f"❌ Error: Could not find input file. {e}")
        return

    # Normalize 'id' types & dedupe
    if 'id' not in df_main.columns or 'id' not in df_old_ids.columns:
        print("❌ Error: Both input files must have an 'id' column.")
        return

    for df in (df_main, df_old_ids):
        # coerce to numeric if possible (keeps strings if not)
        try:
            df['id'] = pd.to_numeric(df['id'], errors='ignore')
        except Exception:
            pass
    df_old_ids = df_old_ids.drop_duplicates(subset='id')

    # If we're modeling burn.depth, try to backfill from old file (if present there)
    if target_variable == 'burn.depth' and 'burn.depth' in df_old_ids.columns:
        print("  - Backfilling 'burn.depth' from old CSV where missing in main...")
        before_na = df_main['burn.depth'].isna().sum() if 'burn.depth' in df_main.columns else None
        if 'burn.depth' not in df_main.columns:
            df_main['burn.depth'] = np.nan
        df_main = df_main.set_index('id')
        df_old_depth = df_old_ids.set_index('id')['burn.depth']
        df_main['burn.depth'] = df_main['burn.depth'].combine_first(df_old_depth)
        df_main = df_main.reset_index()
        after_na = df_main['burn.depth'].isna().sum()
        if before_na is not None:
            print(f"    Filled {before_na - after_na} missing burn.depth values.")

    # Define columns to exclude from predictors (X)
    POTENTIAL_TARGETS = ['below.ground.carbon.combusted', 'above.carbon.combusted', 'burn.depth']
    METADATA_COLUMNS = [
        'burn_year', 'project.name', 'latitude', 'longitude', 'Date', 'id', 'CNA_MAR'
    ]
    COLS_TO_DROP_FROM_X = POTENTIAL_TARGETS + METADATA_COLUMNS

    # Prepare splits
    old_ids = df_old_ids['id'].unique()
    data_splits = {
        "All Data": df_main,
        "Old Data": df_main[df_main['id'].isin(old_ids)],
        "New Data": df_main[~df_main['id'].isin(old_ids)]
    }

    # --- 2. Model Training ---
    print(f"Step 2: Training Random Forest models on '{target_variable}'...")
    models = {}
    for name, data in data_splits.items():
        print(f"  - Training on '{name}' ({len(data)} rows)...")

        if target_variable not in data.columns:
            print(f"    ⚠️ Skipping '{name}' – target '{target_variable}' not in columns.")
            continue

        # Drop rows missing the CURRENT target variable
        df_clean = data.dropna(subset=[target_variable]).copy()

        # Build X (numeric only), drop constant/all-NaN cols
        X = df_clean.drop(columns=COLS_TO_DROP_FROM_X, errors='ignore')
        X = X.select_dtypes(include=[np.number])
        if X.shape[1] == 0:
            print(f"    ⚠️ Skipping '{name}' – no numeric predictors after cleaning.")
            continue
        # drop all-NaN columns
        X = X.loc[:, X.notna().any(axis=0)]
        # drop constant columns
        constant_cols = [c for c in X.columns if X[c].nunique(dropna=True) <= 1]
        if constant_cols:
            X = X.drop(columns=constant_cols)

        y = df_clean[target_variable].astype(float)

        n = len(y)
        if n < 2:
            print(f"    ⚠️ Skipping '{name}' – insufficient samples after dropna (n={n}).")
            continue

        # Enable OOB only when there are enough samples to make it meaningful
        use_oob = n > 10
        rf = RandomForestRegressor(
            n_estimators=500,
            random_state=42,
            n_jobs=-1,
            oob_score=use_oob
        )
        rf.fit(X, y)
        if use_oob:
            print(f"    ...Done. Model OOB Score (R²): {rf.oob_score_:.3f}")
        else:
            print(f"    ...Done. (OOB disabled; n={n})")

        models[name] = {'model': rf, 'X': X}

    if "All Data" not in models:
        print("❌ No trainable 'All Data' model. Aborting PDP stage.")
        return

    # --- 3. Generate and Save Partial Dependence Plots ---
    feature_list = models['All Data']['X'].columns
    print(f"\nStep 3: Generating {len(feature_list)} Partial Dependence Plots...")

    # Only include splits that successfully trained
    trained_order = [k for k in ["All Data", "Old Data", "New Data"] if k in models]
    if not trained_order:
        print("❌ No trained models available for PDP.")
        return

    for feature in feature_list:
        print(f"  - Plotting for: {feature}")
        fig, axes = plt.subplots(len(trained_order), 1, figsize=(8, 4 * len(trained_order)), sharex=True)
        if len(trained_order) == 1:
            axes = [axes]
        fig.suptitle(f'Partial Dependence on: {feature}\n(Target: {target_variable})', fontsize=16, y=0.96)

        for ax, model_name in zip(axes, trained_order):
            model_info = models[model_name]
            try:
                PartialDependenceDisplay.from_estimator(
                    estimator=model_info['model'], X=model_info['X'], features=[feature],
                    ax=ax, line_kw={"color": "darkcyan", "linewidth": 2.5}
                )
                ax.set_title(f"{model_name} (n={len(model_info['X'])})")
                ax.set_ylabel("Partial Dependence")
                ax.grid(True, linestyle='--', alpha=0.6)
            except Exception as e:
                ax.set_title(f"{model_name} – PDP failed for '{feature}' ({e})")
                ax.axis('off')

        save_path = os.path.join(output_directory, f'{feature}.png')
        plt.tight_layout(rect=[0, 0, 1, 0.94])
        plt.savefig(save_path)
        plt.close(fig)

    print(f"\n✅ ANALYSIS COMPLETE for '{target_variable}'. Plots saved to: {output_directory}")


if __name__ == "__main__":
    # --- Master Configuration ---
    INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-08-13_LC_FISL_Original_combustionModelPredictors.csv"
    OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/Combustion_SynthesisData_05042018_XJW.csv"
    BASE_OUT_PATH = "/explore/nobackup/people/spotter5/new_combustion"

    analyses = {
        'below.ground.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_belowground"),
        'above.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_aboveground"),
        'burn.depth': os.path.join(BASE_OUT_PATH, "pdp_depth")
    }

    for target_col, out_dir in analyses.items():
        run_pdp_analysis(
            target_variable=target_col,
            output_directory=out_dir,
            input_csv_path=INPUT_CSV,
            old_ids_csv_path=OLD_PREDICTORS_CSV
        )

    print(f"\n{'='*70}")
    print("🎉 All tasks finished.")
    print(f"{'='*70}")



🚀 STARTING ANALYSIS | TARGET: 'below.ground.carbon.combusted'
Step 1: Loading and preparing data...
Step 2: Training Random Forest models on 'below.ground.carbon.combusted'...
  - Training on 'All Data' (1877 rows)...
    ...Done. Model OOB Score (R²): 0.434
  - Training on 'Old Data' (1010 rows)...
    ...Done. Model OOB Score (R²): 0.226
  - Training on 'New Data' (867 rows)...
    ...Done. Model OOB Score (R²): 0.607

Step 3: Generating 78 Partial Dependence Plots...
  - Plotting for: PFI
  - Plotting for: pH_30
  - Plotting for: Sand_30
  - Plotting for: Silt_30
  - Plotting for: Clay_30
  - Plotting for: DOB_lst
  - Plotting for: Relative.humidity
  - Plotting for: Temperature
  - Plotting for: VPD
  - Plotting for: Wind.speed
  - Plotting for: JP
  - Plotting for: BS
  - Plotting for: DEC
  - Plotting for: GRSH
  - Plotting for: NV
  - Plotting for: OCON
  - Plotting for: WS
  - Plotting for: CNA_Tmax_5_8
  - Plotting for: CNA_PPT_5_8
  - Plotting for: CNA_Rad_5_8
  - Plotting f

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay
import os
import warnings

def run_pdp_analysis(target_variable, output_directory, input_csv_path, old_ids_csv_path):
    """
    Runs the full modeling and PDP generation pipeline for a specific target variable.

    Args:
        target_variable (str): The name of the column to use as the target.
        output_directory (str): The path to save the generated PDP images.
        input_csv_path (str): Path to the main input CSV file.
        old_ids_csv_path (str): Path to the CSV containing old plot IDs.
    """
    # --- Setup & Introduction ---
    print(f"\n{'='*70}")
    print(f"🚀 STARTING ANALYSIS | TARGET: '{target_variable}'")
    print(f"{'='*70}")

    warnings.filterwarnings('ignore', category=FutureWarning)
    warnings.filterwarnings('ignore', category=UserWarning)
    os.makedirs(output_directory, exist_ok=True)

    # --- 1. Data Preparation ---
    print("Step 1: Loading and preparing data...")
    try:
        df_main = pd.read_csv(input_csv_path)
        df_old_ids = pd.read_csv(old_ids_csv_path)
    except FileNotFoundError as e:
        print(f"❌ Error: Could not find input file. {e}")
        return

    # Define columns to exclude from predictors.
    POTENTIAL_TARGETS = ['below.ground.carbon.combusted', 'above.carbon.combusted']
    METADATA_COLUMNS = [
        'burn_year', 'project.name', 'latitude', 'longitude', 'Date', 'id',
        'CNA_MAR', 'fireYr', 'lat', 'lon', 'project_name'
    ]
    
    # Combine all columns to be dropped when creating the predictor set (X)
    COLS_TO_DROP_FROM_X = POTENTIAL_TARGETS + METADATA_COLUMNS

    # Prepare the three datasets (All, Old, New)
    old_ids = df_old_ids['id'].unique()
    data_splits = {
        "All Data": df_main,
        "Old Data": df_main[df_main['id'].isin(old_ids)],
        "New Data": df_main[~df_main['id'].isin(old_ids)]
    }

    # --- 2. Model Training ---
    print(f"Step 2: Training Random Forest models on '{target_variable}'...")
    models = {}
    for name, data in data_splits.items():
        print(f"  - Preparing data for '{name}' ({len(data)} rows)...")

        # First, drop rows where the CURRENT target variable is missing
        df_clean_target = data.dropna(subset=[target_variable])

        # Define predictors (X) and target (y) from this pre-cleaned data
        X = df_clean_target.drop(columns=COLS_TO_DROP_FROM_X, errors='ignore')
        y = df_clean_target[target_variable]

        # --- START: FIX ---
        # **CRITICAL FIX**: Now, drop rows with NaNs in the PREDICTOR (X) columns
        # This ensures the model receives completely clean data.
        rows_before_cleaning_predictors = len(X)
        X = X.dropna()
        
        # **CRITICAL FIX**: Re-align y to match the cleaned X's index
        y = y.loc[X.index]
        rows_after_cleaning_predictors = len(X)
        
        if rows_before_cleaning_predictors > rows_after_cleaning_predictors:
            rows_dropped = rows_before_cleaning_predictors - rows_after_cleaning_predictors
            print(f"    ... Cleaned predictor NaNs. Dropped {rows_dropped} rows. Final training size: {rows_after_cleaning_predictors}")
        # --- END: FIX ---

        # Check if there is still data to train on
        if X.empty:
            print(f"    ... ❌ Skipping '{name}': No data left after cleaning.")
            models[name] = None # Store None to indicate a failed model
            continue

        # Initialize and train the model
        rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1, oob_score=True)
        rf.fit(X, y)
        print(f"    ... Done. Model OOB Score (R²): {rf.oob_score_:.3f}")

        # Store the trained model and predictor data
        models[name] = {'model': rf, 'X': X}

    # --- 3. Generate and Save Partial Dependence Plots ---
    # Use the 'All Data' model's features as the reference list
    if models['All Data'] is None:
        print("\n❌ Cannot generate plots because 'All Data' model failed to train.")
        return
        
    feature_list = models['All Data']['X'].columns
    print(f"\nStep 3: Generating {len(feature_list)} Partial Dependence Plots...")

    for feature in feature_list:
        print(f"  - Plotting for: {feature}")
        fig, axes = plt.subplots(3, 1, figsize=(8, 12), sharex=True, squeeze=False) # squeeze=False ensures axes is always 2D
        axes = axes.flatten() # Flatten to a 1D array for easy iteration
        fig.suptitle(f'Partial Dependence on: {feature}\n(Target: {target_variable})', fontsize=16, y=0.96)

        plot_order = ["All Data", "Old Data", "New Data"]
        for i, model_name in enumerate(plot_order):
            ax = axes[i]
            model_info = models.get(model_name) # Use .get() for safety

            if model_info:
                PartialDependenceDisplay.from_estimator(
                    estimator=model_info['model'], X=model_info['X'], features=[feature],
                    ax=ax, line_kw={"color": "darkcyan", "linewidth": 2.5}
                )
                ax.set_title(f"{model_name} (n={len(model_info['X'])})")
            else:
                ax.text(0.5, 0.5, 'Model could not be trained\n(No data available)', 
                        ha='center', va='center', transform=ax.transAxes, fontsize=12, color='red')
                ax.set_title(f"{model_name} (n=0)")

            ax.set_ylabel("Partial Dependence")
            ax.grid(True, linestyle='--', alpha=0.6)

        save_path = os.path.join(output_directory, f'{feature}.png')
        plt.tight_layout(rect=[0, 0, 1, 0.94])
        plt.savefig(save_path)
        plt.close(fig)

    print(f"\n✅ ANALYSIS COMPLETE for '{target_variable}'. Plots saved to: {output_directory}")


if __name__ == "__main__":
    # --- Master Configuration ---
    INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-08-08_LC_FISL_Original_combustionModelPredictors.csv"
    OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/all_predictors.csv"
    BASE_OUT_PATH = "/explore/nobackup/people/spotter5/new_combustion"

    # Define the analyses to run
    analyses = {
        'below.ground.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_belowground"),
        'above.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_aboveground")
    }

    # Execute each analysis
    for target_col, out_dir in analyses.items():
        run_pdp_analysis(
            target_variable=target_col,
            output_directory=out_dir,
            input_csv_path=INPUT_CSV,
            old_ids_csv_path=OLD_PREDICTORS_CSV
        )

    print(f"\n{'='*70}")
    print("🎉 All tasks finished.")
    print(f"{'='*70}")


🚀 STARTING ANALYSIS | TARGET: 'below.ground.carbon.combusted'
Step 1: Loading and preparing data...
Step 2: Training Random Forest models on 'below.ground.carbon.combusted'...
  - Preparing data for 'All Data' (1877 rows)...
    ... Cleaned predictor NaNs. Dropped 562 rows. Final training size: 1201
    ... Done. Model OOB Score (R²): 0.358
  - Preparing data for 'Old Data' (1011 rows)...
    ... Cleaned predictor NaNs. Dropped 127 rows. Final training size: 770
    ... Done. Model OOB Score (R²): 0.218
  - Preparing data for 'New Data' (866 rows)...
    ... Cleaned predictor NaNs. Dropped 435 rows. Final training size: 431
    ... Done. Model OOB Score (R²): 0.559

Step 3: Generating 73 Partial Dependence Plots...
  - Plotting for: PFI
  - Plotting for: pH_30
  - Plotting for: Sand_30
  - Plotting for: Silt_30
  - Plotting for: Clay_30
  - Plotting for: DOB_lst
  - Plotting for: Relative.humidity
  - Plotting for: Temperature
  - Plotting for: VPD
  - Plotting for: Wind.speed
  - Plo

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay
import os
import warnings

# --- 1. Setup & Configuration ---
print(f"\n{'='*70}")
print("🚀 STARTING COMBINED PDP ANALYSIS")
print(f"{'='*70}")

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Master Configuration ---
INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-08-08_LC_FISL_Original_combustionModelPredictors.csv"
OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/all_predictors.csv"
BASE_OUT_PATH = "/explore/nobackup/people/spotter5/new_combustion"
COMBINED_OUTPUT_DIR = os.path.join(BASE_OUT_PATH, "pdp_combined_3x2")

os.makedirs(COMBINED_OUTPUT_DIR, exist_ok=True)

# --- 2. Data Preparation ---
print("Step 1: Loading and preparing data...")
try:
    df_main = pd.read_csv(INPUT_CSV)
    df_old_ids = pd.read_csv(OLD_PREDICTORS_CSV)
except FileNotFoundError as e:
    print(f"❌ Error: Could not find input file. {e}")
    exit()

# Define columns to exclude from predictors
POTENTIAL_TARGETS = ['above.carbon.combusted', 'below.ground.carbon.combusted']
METADATA_COLUMNS = [
    'burn_year', 'project.name', 'latitude', 'longitude', 'Date', 'id',
    'CNA_MAR', 'fireYr', 'lat', 'lon', 'project_name'
]
COLS_TO_DROP_FROM_X = POTENTIAL_TARGETS + METADATA_COLUMNS

# Prepare the three datasets (All, Old, New)
old_ids = df_old_ids['id'].unique()
data_splits = {
    "All Data": df_main,
    "Old Data": df_main[df_main['id'].isin(old_ids)],
    "New Data": df_main[~df_main['id'].isin(old_ids)]
}

# --- 3. Train All Models ---
print("\nStep 2: Training all Random Forest models...")
models = {target: {} for target in POTENTIAL_TARGETS} # Nested dictionary to hold all 6 models

for target_variable in POTENTIAL_TARGETS:
    print(f"  Training models for target: '{target_variable}'")
    for name, data in data_splits.items():
        print(f"    - Training on '{name}' ({len(data)} rows)...")
        
        # Clean data for the current model
        df_clean = data.dropna(subset=[target_variable])
        X = df_clean.drop(columns=COLS_TO_DROP_FROM_X, errors='ignore')
        y = df_clean[target_variable]
        
        # Drop NaNs from predictors and align y
        X = X.dropna()
        y = y.loc[X.index]

        if X.empty:
            print(f"      ... ❌ Skipping '{name}': No data left after cleaning.")
            models[target_variable][name] = None
            continue
            
        rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1, oob_score=True)
        rf.fit(X, y)
        print(f"      ... Done. Model OOB Score (R²): {rf.oob_score_:.3f}")
        
        # Store the trained model and predictor data
        models[target_variable][name] = {'model': rf, 'X': X}

# --- 4. Generate and Save 3x2 Partial Dependence Plots ---
# Use one consistent feature list from the 'All Data' model
all_data_model_info = models['above.carbon.combusted'].get('All Data')
if not all_data_model_info:
    print("\n❌ Cannot generate plots because the 'All Data' model for aboveground failed to train.")
    exit()

feature_list = all_data_model_info['X'].columns
print(f"\nStep 3: Generating {len(feature_list)} combined 3x2 Partial Dependence Plots...")

plot_order = ["All Data", "New Data", "Old Data"]
target_left = 'above.carbon.combusted'
target_right = 'below.ground.carbon.combusted'

for feature in feature_list:
    print(f"  - Plotting for: {feature}")
    
    fig, axes = plt.subplots(3, 2, figsize=(15, 18), sharex=True)
    fig.suptitle(f'Partial Dependence on: {feature}', fontsize=20, y=0.98)

    for i, model_name in enumerate(plot_order):
        # --- Left Column: Aboveground ---
        ax_left = axes[i, 0]
        model_info_left = models[target_left].get(model_name)
        
        if model_info_left:
            PartialDependenceDisplay.from_estimator(
                estimator=model_info_left['model'], X=model_info_left['X'], features=[feature],
                ax=ax_left, line_kw={"color": "darkcyan", "linewidth": 2.5}
            )
        else:
            ax_left.text(0.5, 0.5, 'Model Not Trained', ha='center', va='center', color='red')
        
        ax_left.set_ylabel(f"{model_name}\nPartial Dependence")
        ax_left.grid(True, linestyle='--', alpha=0.6)

        # --- Right Column: Belowground ---
        ax_right = axes[i, 1]
        model_info_right = models[target_right].get(model_name)
        
        if model_info_right:
            PartialDependenceDisplay.from_estimator(
                estimator=model_info_right['model'], X=model_info_right['X'], features=[feature],
                ax=ax_right, line_kw={"color": "saddlebrown", "linewidth": 2.5}
            )
        else:
            ax_right.text(0.5, 0.5, 'Model Not Trained', ha='center', va='center', color='red')

        ax_right.set_ylabel("")
        ax_right.grid(True, linestyle='--', alpha=0.6)

    # Set titles for the top row
    axes[0, 0].set_title("Target: Aboveground Carbon", fontsize=14)
    axes[0, 1].set_title("Target: Belowground Carbon", fontsize=14)
    
    # Set shared x-axis label for the bottom row
    axes[2, 0].set_xlabel(feature)
    axes[2, 1].set_xlabel(feature)

    save_path = os.path.join(COMBINED_OUTPUT_DIR, f'{feature}_combined_pdp.png')
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.savefig(save_path, dpi=150)
    plt.close(fig)

print(f"\n{'='*70}")
print(f"🎉 All tasks finished. Plots saved to: {COMBINED_OUTPUT_DIR}")
print(f"{'='*70}")


🚀 STARTING COMBINED PDP ANALYSIS
Step 1: Loading and preparing data...

Step 2: Training all Random Forest models...
  Training models for target: 'above.carbon.combusted'
    - Training on 'All Data' (1877 rows)...
      ... Done. Model OOB Score (R²): 0.355
    - Training on 'Old Data' (1011 rows)...
      ... Done. Model OOB Score (R²): 0.341
    - Training on 'New Data' (866 rows)...
      ... Done. Model OOB Score (R²): 0.330
  Training models for target: 'below.ground.carbon.combusted'
    - Training on 'All Data' (1877 rows)...
      ... Done. Model OOB Score (R²): 0.358
    - Training on 'Old Data' (1011 rows)...
      ... Done. Model OOB Score (R²): 0.218
    - Training on 'New Data' (866 rows)...
      ... Done. Model OOB Score (R²): 0.559

Step 3: Generating 73 combined 3x2 Partial Dependence Plots...
  - Plotting for: PFI
  - Plotting for: pH_30
  - Plotting for: Sand_30
  - Plotting for: Silt_30
  - Plotting for: Clay_30
  - Plotting for: DOB_lst
  - Plotting for: Relativ

In [4]:
't'

't'