In [2]:
import pandas as pd
import numpy as np

# --- Load all data sources ---
input_data = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v4.csv")
soil       = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/integrated_soil_data_1km_v2_sites.csv")
landcover  = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/extracted_landcover_values_v2.csv")
# keep these for potential use, but we’ll rename properly below
landcover  = landcover[['site_refer', 'land_cover_code']]

sm         = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/soil_moisture_by_site_monthly_2000_2023.csv")
cont       = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/co2_cont.csv")
alt        = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/ALT_by_site.csv")

# --- Initial Data Cleaning ---
# Keep EC only, drop rows without site_reference
input_data = input_data[input_data['flux_method'] == 'EC'].copy()
input_data = input_data.dropna(subset=['site_reference'])

# Drop rows without keys in other tables
soil      = soil.dropna(subset=['site_refer']).copy()
landcover = landcover.dropna(subset=['site_refer']).copy()

# Ensure key types are consistent before merging
for df in [input_data, alt, sm]:
    if 'site_reference' in df.columns:
        df['site_reference'] = df['site_reference'].astype(str)
for df in [soil, landcover]:
    if 'site_refer' in df.columns:
        df['site_refer'] = df['site_refer'].astype(str)

for df in [input_data, alt, sm, cont]:
    if 'year' in df.columns:
        df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')
    if 'month' in df.columns:
        df['month'] = pd.to_numeric(df['month'], errors='coerce').astype('Int64')

# Deduplicate on merge keys
input_data = input_data.drop_duplicates(subset=['site_reference', 'year', 'month'])
soil       = soil.drop_duplicates(subset=['site_refer'])
landcover  = landcover.drop_duplicates(subset=['site_refer'])
alt        = alt.drop_duplicates(subset=['site_reference', 'year'])
sm         = sm.drop_duplicates(subset=['site_reference', 'year', 'month'])

print(f"Initial shape: {input_data.shape}")

# --- Prepare and Merge Soil (static) ---
# keep only 100 cm depth columns; carry site_reference for join
soil_filtered = soil.filter(regex='100cm$').copy()
soil_filtered["site_reference"] = soil["site_refer"].values
input_data = input_data.merge(soil_filtered, on="site_reference", how="left", validate="m:1")
print(f"After soil merge: {input_data.shape}")

# --- Prepare and Merge Land Cover (static) ---
# rename to match keys and column you want in final data
landcover = landcover.rename(columns={'site_refer': 'site_reference',
                                      'land_cover_code': 'land_cover'})
# keep only the necessary columns (keep lat/lon too if you want them in final)
landcover = landcover[['site_reference', 'land_cover']]
input_data = input_data.merge(landcover, on="site_reference", how="left", validate="m:1")
print(f"After landcover merge: {input_data.shape}")

# --- Prepare and Merge CO2 (time-varying by year/month) ---
co2_to_merge = cont[['year', 'month', 'value']].copy()
co2_to_merge = co2_to_merge.rename(columns={'value': 'co2_cont'})
co2_to_merge = co2_to_merge.drop_duplicates(subset=['year', 'month'])
input_data = input_data.merge(co2_to_merge, on=['year', 'month'], how='left', validate="m:1")
print(f"After CO2 merge: {input_data.shape}")

# --- Prepare and Merge ALT (time-varying by site/year) ---
alt_to_merge = alt[['site_reference', 'year', 'ALT']].copy()
alt_to_merge = alt_to_merge.drop_duplicates(subset=['site_reference', 'year'])
input_data = input_data.merge(alt_to_merge, on=['site_reference', 'year'], how='left', validate="m:1")
print(f"After ALT merge: {input_data.shape}")

# --- Prepare and Merge Soil Moisture (time-varying by site/year/month) ---
# Expecting columns: site_reference, year, month, sm_surface, sm_rootzone
needed_cols = {'site_reference', 'year', 'month', 'sm_surface', 'sm_rootzone'}
missing = needed_cols.difference(set(sm.columns))
if missing:
    raise ValueError(f"Soil moisture CSV is missing expected columns: {missing}")

input_data = input_data.merge(
    sm[['site_reference', 'year', 'month', 'sm_surface', 'sm_rootzone']],
    on=['site_reference', 'year', 'month'],
    how='left',
    validate='m:1'
)
print(f"After soil moisture merge: {input_data.shape}")

# --- Final Data Type Conversion for Land Cover ---
# Fill any missing values (NaN) in 'land_cover' with -9999 and cast to int
if 'land_cover' in input_data.columns:
    input_data['land_cover'] = input_data['land_cover'].fillna(-9999).astype(int)

# --- Save Final Combined Data ---
output_path_final = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final.csv"
input_data.to_csv(output_path_final, index=False)

print(f"\nSuccessfully merged all data and saved to: {output_path_final}")
print("Final DataFrame head:")
print(input_data.head())
print("\nFinal DataFrame columns:")
print(input_data.columns)
if 'land_cover' in input_data.columns:
    print(f"\nData type of 'land_cover' column: {input_data['land_cover'].dtype}")


  input_data = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v4.csv")


Initial shape: (56731, 51)
After soil merge: (56731, 61)
After landcover merge: (56731, 62)
After CO2 merge: (56731, 63)
After ALT merge: (56731, 64)
After soil moisture merge: (56731, 66)

Successfully merged all data and saved to: /explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final.csv
Final DataFrame head:
                                           site_name  \
0                                         Skyttorp 2   
1                                  Wolf_creek_forest   
2  Alberta - Western Peatland - LaBiche River,Bla...   
3                             Elgeeii forest station   
4                                           Faejemyr   

                                      site_reference   latitude   longitude  \
0                            Skyttorp 2_SE-Sk2_tower  60.129667   17.840056   
1                     Wolf_creek_forest_CA-WCF_tower  60.596886 -134.952833   
2  Alberta - Western Peatland - LaBiche River,Bla...  54.953840 -112.466980   
3                Elg

With depth

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay
import os
import warnings

def run_pdp_analysis(target_variable, output_directory, input_csv_path, old_ids_csv_path):
    """
    Runs the full modeling and PDP generation pipeline for a specific target variable.

    Args:
        target_variable (str): The name of the column to use as the target.
        output_directory (str): The path to save the generated PDP images.
        input_csv_path (str): Path to the main input CSV file.
        old_ids_csv_path (str): Path to the CSV containing old plot IDs and, if available, burn.depth.
    """
    # --- Setup & Introduction ---
    print(f"\n{'='*70}")
    print(f"🚀 STARTING ANALYSIS | TARGET: '{target_variable}'")
    print(f"{'='*70}")

    warnings.filterwarnings('ignore', category=FutureWarning)
    warnings.filterwarnings('ignore', category=UserWarning)
    os.makedirs(output_directory, exist_ok=True)

    # --- 1. Data Preparation ---
    print("Step 1: Loading and preparing data...")
    try:
        df_main = pd.read_csv(input_csv_path)
        df_old_ids = pd.read_csv(old_ids_csv_path)
    except FileNotFoundError as e:
        print(f"❌ Error: Could not find input file. {e}")
        return

    # Normalize 'id' types & dedupe
    if 'id' not in df_main.columns or 'id' not in df_old_ids.columns:
        print("❌ Error: Both input files must have an 'id' column.")
        return

    for df in (df_main, df_old_ids):
        # coerce to numeric if possible (keeps strings if not)
        try:
            df['id'] = pd.to_numeric(df['id'], errors='ignore')
        except Exception:
            pass
    df_old_ids = df_old_ids.drop_duplicates(subset='id')

    # If we're modeling burn.depth, try to backfill from old file (if present there)
    if target_variable == 'burn.depth' and 'burn.depth' in df_old_ids.columns:
        print("  - Backfilling 'burn.depth' from old CSV where missing in main...")
        before_na = df_main['burn.depth'].isna().sum() if 'burn.depth' in df_main.columns else None
        if 'burn.depth' not in df_main.columns:
            df_main['burn.depth'] = np.nan
        df_main = df_main.set_index('id')
        df_old_depth = df_old_ids.set_index('id')['burn.depth']
        df_main['burn.depth'] = df_main['burn.depth'].combine_first(df_old_depth)
        df_main = df_main.reset_index()
        after_na = df_main['burn.depth'].isna().sum()
        if before_na is not None:
            print(f"    Filled {before_na - after_na} missing burn.depth values.")

    # Define columns to exclude from predictors (X)
    POTENTIAL_TARGETS = ['below.ground.carbon.combusted', 'above.carbon.combusted', 'burn.depth']
    METADATA_COLUMNS = [
        'burn_year', 'project.name', 'latitude', 'longitude', 'Date', 'id', 'CNA_MAR'
    ]
    COLS_TO_DROP_FROM_X = POTENTIAL_TARGETS + METADATA_COLUMNS

    # Prepare splits
    old_ids = df_old_ids['id'].unique()
    data_splits = {
        "All Data": df_main,
        "Old Data": df_main[df_main['id'].isin(old_ids)],
        "New Data": df_main[~df_main['id'].isin(old_ids)]
    }

    # --- 2. Model Training ---
    print(f"Step 2: Training Random Forest models on '{target_variable}'...")
    models = {}
    for name, data in data_splits.items():
        print(f"  - Training on '{name}' ({len(data)} rows)...")

        if target_variable not in data.columns:
            print(f"    ⚠️ Skipping '{name}' – target '{target_variable}' not in columns.")
            continue

        # Drop rows missing the CURRENT target variable
        df_clean = data.dropna(subset=[target_variable]).copy()

        # Build X (numeric only), drop constant/all-NaN cols
        X = df_clean.drop(columns=COLS_TO_DROP_FROM_X, errors='ignore')
        X = X.select_dtypes(include=[np.number])
        if X.shape[1] == 0:
            print(f"    ⚠️ Skipping '{name}' – no numeric predictors after cleaning.")
            continue
        # drop all-NaN columns
        X = X.loc[:, X.notna().any(axis=0)]
        # drop constant columns
        constant_cols = [c for c in X.columns if X[c].nunique(dropna=True) <= 1]
        if constant_cols:
            X = X.drop(columns=constant_cols)

        y = df_clean[target_variable].astype(float)

        n = len(y)
        if n < 2:
            print(f"    ⚠️ Skipping '{name}' – insufficient samples after dropna (n={n}).")
            continue

        # Enable OOB only when there are enough samples to make it meaningful
        use_oob = n > 10
        rf = RandomForestRegressor(
            n_estimators=500,
            random_state=42,
            n_jobs=-1,
            oob_score=use_oob
        )
        rf.fit(X, y)
        if use_oob:
            print(f"    ...Done. Model OOB Score (R²): {rf.oob_score_:.3f}")
        else:
            print(f"    ...Done. (OOB disabled; n={n})")

        models[name] = {'model': rf, 'X': X}

    if "All Data" not in models:
        print("❌ No trainable 'All Data' model. Aborting PDP stage.")
        return

    # --- 3. Generate and Save Partial Dependence Plots ---
    feature_list = models['All Data']['X'].columns
    print(f"\nStep 3: Generating {len(feature_list)} Partial Dependence Plots...")

    # Only include splits that successfully trained
    trained_order = [k for k in ["All Data", "Old Data", "New Data"] if k in models]
    if not trained_order:
        print("❌ No trained models available for PDP.")
        return

    for feature in feature_list:
        print(f"  - Plotting for: {feature}")
        fig, axes = plt.subplots(len(trained_order), 1, figsize=(8, 4 * len(trained_order)), sharex=True)
        if len(trained_order) == 1:
            axes = [axes]
        fig.suptitle(f'Partial Dependence on: {feature}\n(Target: {target_variable})', fontsize=16, y=0.96)

        for ax, model_name in zip(axes, trained_order):
            model_info = models[model_name]
            try:
                PartialDependenceDisplay.from_estimator(
                    estimator=model_info['model'], X=model_info['X'], features=[feature],
                    ax=ax, line_kw={"color": "darkcyan", "linewidth": 2.5}
                )
                ax.set_title(f"{model_name} (n={len(model_info['X'])})")
                ax.set_ylabel("Partial Dependence")
                ax.grid(True, linestyle='--', alpha=0.6)
            except Exception as e:
                ax.set_title(f"{model_name} – PDP failed for '{feature}' ({e})")
                ax.axis('off')

        save_path = os.path.join(output_directory, f'{feature}.png')
        plt.tight_layout(rect=[0, 0, 1, 0.94])
        plt.savefig(save_path)
        plt.close(fig)

    print(f"\n✅ ANALYSIS COMPLETE for '{target_variable}'. Plots saved to: {output_directory}")


if __name__ == "__main__":
    # --- Master Configuration ---
    INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-08-13_LC_FISL_Original_combustionModelPredictors.csv"
    OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/Combustion_SynthesisData_05042018_XJW.csv"
    BASE_OUT_PATH = "/explore/nobackup/people/spotter5/new_combustion"

    analyses = {
        'below.ground.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_belowground"),
        'above.carbon.combusted': os.path.join(BASE_OUT_PATH, "pdp_aboveground"),
        'burn.depth': os.path.join(BASE_OUT_PATH, "pdp_depth")
    }

    for target_col, out_dir in analyses.items():
        run_pdp_analysis(
            target_variable=target_col,
            output_directory=out_dir,
            input_csv_path=INPUT_CSV,
            old_ids_csv_path=OLD_PREDICTORS_CSV
        )

    print(f"\n{'='*70}")
    print("🎉 All tasks finished.")
    print(f"{'='*70}")



🚀 STARTING ANALYSIS | TARGET: 'below.ground.carbon.combusted'
Step 1: Loading and preparing data...
Step 2: Training Random Forest models on 'below.ground.carbon.combusted'...
  - Training on 'All Data' (1877 rows)...


In [6]:
import pandas as pd
input_data = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_final.csv")
input_data = input_data[input_data['site_reference'] == 'Zackenberg Heath_GL-ZaH_tower']

input_data = input_data[['year', 'month', 'nee', 'tmmx', 'tmmn', 'pr']]
input_data.sort_values(by = 'year')

FileNotFoundError: [Errno 2] No such file or directory: '/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_final.csv'

In [10]:
import pandas as pd
input_data = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final.csv")
input_data = input_data.drop_duplicates(subset = 'site_reference')

input_data = input_data[['site_reference', 'latitude', 'longitude', 'land_cover_eco', 'land_cover_plot', 'bawld_class']]
input_data

# input_data = input_data[['year', 'month', 'nee', 'tmmx', 'tmmn', 'pr']]
# input_data.sort_values(by = 'year')

Unnamed: 0,site_reference,latitude,longitude,land_cover_eco,land_cover_plot,bawld_class
0,Skyttorp 2_SE-Sk2_tower,60.129667,17.840056,70.0,70.0,Boreal Forest
1,Wolf_creek_forest_CA-WCF_tower,60.596886,-134.952833,70.0,70.0,Boreal Forest
2,"Alberta - Western Peatland - LaBiche River,Bla...",54.953840,-112.466980,160.0,160.0,Fen
3,Elgeeii forest station_RU-Ege_tower,60.015516,133.824012,90.0,90.0,Boreal Forest
4,Faejemyr_SE-Faj_tower,56.265500,13.553500,180.0,180.0,Bog
...,...,...,...,...,...,...
429,ARM-NSA-Barrow_US-A10_tower,71.323000,-156.609000,153.0,153.0,Wet Tundra
445,Barrow-CMDL_US-Brw_tower,71.322525,-156.609200,180.0,180.0,Wet Tundra
446,"Bayelva, Spitsbergen_SJ-Blv_tower",78.921600,11.831100,130.0,130.0,Dry Tundra
457,Central Marsh_US-Cms_tower,71.320190,-156.622270,180.0,180.0,Wet Tundra


In [2]:
input_data.columns

Index(['site_name', 'site_reference', 'latitude', 'longitude', 'flux_method',
       'country', 'land_cover_eco', 'land_cover_plot', 'bawld_class', 'year',
       'month', 'siteID', 'EVI', 'NDVI', 'SummaryQA', 'sur_refl_b01',
       'sur_refl_b02', 'sur_refl_b03', 'sur_refl_b07', 'NDWI', 'aet', 'def',
       'pdsi', 'pet', 'pr', 'ro', 'soil', 'srad', 'swe', 'tmmn', 'tmmx', 'vap',
       'vpd', 'vs', 'lai', 'fpar', 'Percent_NonTree_Vegetation',
       'Percent_NonVegetated', 'Percent_Tree_Cover', 'nee', 'gpp', 'reco',
       'ch4_flux_total', 'Flux'],
      dtype='object')

same thing but use thew 16day modis data

In [4]:
import pandas as pd
import numpy as np

# --- Load all data sources ---
input_data = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_16daymodis.csv")
input_data2 = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v3.csv")
soil = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/integrated_soil_data_1km_v2_sites.csv")
landcover = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/extracted_landcover_values.csv")
cont = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/co2_cont.csv")
alt = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/ALT_by_site.csv")

# --- Initial Data Cleaning ---
input_data = input_data[input_data['flux_method'] == 'EC']
input_data = input_data.dropna(subset=['site_reference'])
soil = soil.dropna(subset=['site_refer'])
landcover = landcover.dropna(subset=['site_refer'])

input_data = input_data.drop_duplicates(subset=['site_reference', 'year', 'month'])
soil = soil.drop_duplicates(subset=['site_refer'])
landcover = landcover.drop_duplicates(subset=['site_refer'])

print(f"Initial shape: {input_data.shape}")

# --- START: EDITED SECTION ---
# This section replaces the surface reflectance bands in input_data with those from input_data2

# 1. Define the reflectance columns to be replaced and the keys for merging
reflectance_cols = ['sur_refl_b01', 'sur_refl_b02', 'sur_refl_b03', 'sur_refl_b07']
merge_keys = ['site_reference', 'year', 'month']

# 2. Create a small DataFrame from input_data2 with only the keys and the desired reflectance columns
reflectance_to_merge = input_data2[merge_keys + reflectance_cols].copy()
reflectance_to_merge = reflectance_to_merge.drop_duplicates(subset=merge_keys)

# 3. Drop the old reflectance columns from the main dataframe to avoid conflicts
# input_data = input_data.drop(columns=reflectance_cols)

# 4. Merge the new reflectance values into the main dataframe
input_data = input_data.merge(reflectance_to_merge, on=merge_keys, how='left')

print(f"After replacing reflectance bands: {input_data.shape}")
# --- END: EDITED SECTION ---


# --- Prepare and Merge Soil Data ---
soil_filtered = soil.filter(regex='100cm$').copy()
soil_filtered["site_reference"] = soil["site_refer"]
input_data = input_data.merge(soil_filtered, on="site_reference", how="left")

print(f"After soil merge: {input_data.shape}")

# --- Prepare and Merge Land Cover Data ---
landcover = landcover.rename(columns={'site_refer': 'site_reference'})
landcover = landcover[['site_reference', 'land_cover']]
input_data = input_data.merge(landcover, on="site_reference", how="left")

print(f"After landcover merge: {input_data.shape}")

# --- Prepare and Merge CO2 Data ---
co2_to_merge = cont[['year', 'month', 'value']].copy()
co2_to_merge = co2_to_merge.rename(columns={'value': 'co2_cont'})
co2_to_merge = co2_to_merge.drop_duplicates(subset=['year', 'month'])
input_data = input_data.merge(co2_to_merge, on=['year', 'month'], how='left')

print(f"After CO2 merge: {input_data.shape}")

# --- Prepare and Merge ALT Data ---
alt_to_merge = alt[['site_reference', 'year', 'ALT']].copy()
alt_to_merge = alt_to_merge.drop_duplicates(subset=['site_reference', 'year'])
input_data = input_data.merge(alt_to_merge, on=['site_reference', 'year'], how='left')

print(f"After ALT merge: {input_data.shape}")

# --- Final Data Type Conversion for Land Cover ---
# Fill any missing values (NaN) in 'land_cover' with -9999
input_data['land_cover'] = input_data['land_cover'].fillna(-9999)

# Convert the 'land_cover' column to integer type
input_data['land_cover'] = input_data['land_cover'].astype(int)

# --- Save Final Combined Data ---
# Note: The output filename includes 'mod16', you may want to change this to reflect the new data source
output_path_final = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final_mod16.csv"
input_data.to_csv(output_path_final, index=False)

print(f"\nSuccessfully merged all data and saved to: {output_path_final}")
print("Final DataFrame head:")
print(input_data.head())
print("\nFinal DataFrame columns:")
print(input_data.columns)
print(f"\nData type of 'land_cover' column: {input_data['land_cover'].dtype}")

Initial shape: (56731, 39)
After replacing reflectance bands: (56731, 43)
After soil merge: (56731, 53)
After landcover merge: (56731, 54)
After CO2 merge: (56731, 55)
After ALT merge: (56731, 56)

Successfully merged all data and saved to: /explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final_mod16.csv
Final DataFrame head:
                                           site_name  \
0                                     ARM-NSA-Barrow   
1                                    ARM-NSA-Oliktok   
2                      Abisko Stordalen birch forest   
3                                        Adventdalen   
4  Alberta - Western Peatland - LaBiche River,Bla...   

                                      site_reference   latitude   longitude  \
0                        ARM-NSA-Barrow_US-A10_tower  71.323000 -156.609000   
1                       ARM-NSA-Oliktok_US-A03_tower  70.495000 -149.886000   
2                Abisko Stordalen birch forest_tower  68.347939   19.049769   
3     

In [3]:
input_data2.columns

Index(['site_name', 'site_reference', 'latitude', 'longitude', 'flux_method',
       'country', 'land_cover_eco', 'land_cover_plot', 'bawld_class', 'year',
       'month', 'siteID', 'EVI', 'NDVI', 'SummaryQA', 'sur_refl_b01',
       'sur_refl_b02', 'sur_refl_b03', 'sur_refl_b07', 'NDWI', 'aet', 'def',
       'pdsi', 'pet', 'pr', 'ro', 'soil', 'srad', 'swe', 'tmmn', 'tmmx', 'vap',
       'vpd', 'vs', 'lai', 'fpar', 'Percent_NonTree_Vegetation',
       'Percent_NonVegetated', 'Percent_Tree_Cover', 'nee', 'gpp', 'reco',
       'ch4_flux_total', 'Flux'],
      dtype='object')