In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

data_dir = '../data'
for file in os.listdir(data_dir):
    if os.path.isdir(file):
        continue
    print(file)

plot_ndvi.pkl
weather_25_clipped.pkl
df_savi.pkl
plot_elev_features.pkl
df_ndwi.pkl
df_ndvi_rendvi.pkl
df_evi.pkl
ndvi_raw_2025.pkl
DEM
plot_features.pkl
df_mcari2.pkl
ndvi_2025.pkl
df_2025.pkl
PRISM
df.pkl
plot_ndvi_filtered.pkl
plot_ndvi_filtered_2025.pkl
polygons
ndvi
plot_ndvi_2025.pkl


In [2]:
df_path = os.path.join(data_dir, 'df_ndvi_rendvi.pkl')
df = pd.read_pickle(df_path)

df

In [4]:
# --- Identify leakage columns ---
leakage_cols = [col for col in df.columns if any(
    col.endswith(f"_{m}") for m in range(36,45)
)]
leakage_cols.extend([col for col in df.columns if col.endswith('length')])
leakage_cols.extend([col for col in df.columns if 'mcari2' in col])

In [5]:
# dynamic_features = ['ndvi','rendvi', 'tmean', 'tmin', 'tmax', 'gdd', 'vpdmax']


In [6]:
# --- Define target and features ---
target_cols = [
    'ndvi_smooth_mean_36',
    'ndvi_smooth_mean_37',
    'ndvi_smooth_mean_38',
    'ndvi_smooth_mean_39',
    'ndvi_smooth_mean_40',
    'ndvi_smooth_mean_41',
    'ndvi_smooth_mean_42',
    'ndvi_smooth_mean_43',
    'ndvi_smooth_mean_44',
    
    'rendvi_smooth_mean_36',
    'rendvi_smooth_mean_37',
    'rendvi_smooth_mean_38',
    'rendvi_smooth_mean_39',
    'rendvi_smooth_mean_40',
    'rendvi_smooth_mean_41',
    'rendvi_smooth_mean_42',
    'rendvi_smooth_mean_43',
    'rendvi_smooth_mean_44',
]
X = df.drop(columns=leakage_cols + ['plot_id', 'year'] + target_cols)
y = df[target_cols]

In [7]:
from sklearn.model_selection import train_test_split

X_test, X_hold, y_test, y_hold = train_test_split(
    X, y, test_size = 0.8, random_state = 42
)

X_train, X_tune, y_train, y_tune = train_test_split(
    X_hold, y_hold, test_size = 0.25, random_state = 42
)

print(X_train.shape)
print(X_tune.shape)
print(X_test.shape)

print(y_train.shape)
print(y_tune.shape)
print(y_test.shape)

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler


numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns

numeric_transformer = Pipeline(steps = [
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features)
    ],
    remainder = 'drop'
)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    ExtraTreesRegressor,
    StackingRegressor
)
from sklearn.linear_model import Ridge, Lasso, ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor

from xgboost import XGBRegressor

# --------------------------
# Base models
# --------------------------
base_models = [
    ('rf', RandomForestRegressor(
        n_estimators=200, max_depth=12, random_state=8274, n_jobs=-1
    )),
    ('et', ExtraTreesRegressor(
        n_estimators=300, max_depth=12, random_state=836409, n_jobs=-1
    )),
    ('gb', GradientBoostingRegressor(
        n_estimators=150, learning_rate=0.05, max_depth=8, random_state=13425
    )),
    ('xgb', XGBRegressor(
        n_estimators=200, learning_rate=0.05, max_depth=6,
        subsample=0.5, colsample_bytree=0.5, random_state=4111112, n_jobs=-1
    )),
    ('knn', KNeighborsRegressor(n_neighbors=8, weights='distance', n_jobs=-1))
]

In [11]:
meta_model = ElasticNetCV(
    l1_ratio=[0.1, 0.5, 0.9],
    alphas=np.logspace(-2, 1, 10),  
    cv=5,
    max_iter=20000,
    n_jobs=-1
)

In [12]:
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    passthrough=True,
    cv=5,
    n_jobs=-1
)

In [13]:
multi_output_model = MultiOutputRegressor(stacked_model, n_jobs=-1)

In [14]:
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),  
    ('regressor', TransformedTargetRegressor(
        regressor=multi_output_model,
        transformer=StandardScaler()  
    ))
])

In [15]:
model_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,regressor,MultiOutputRe... n_jobs=-1)
,transformer,StandardScaler()
,func,
,inverse_func,
,check_inverse,True

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,12
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,12
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False

0,1,2
,loss,'squared_error'
,learning_rate,0.05
,n_estimators,150
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,8
,min_impurity_decrease,0.0

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.5
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,n_neighbors,8
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,-1

0,1,2
,l1_ratio,"[0.1, 0.5, ...]"
,eps,0.001
,n_alphas,'deprecated'
,alphas,array([ 0.01 ... 10. ])
,fit_intercept,True
,precompute,'auto'
,max_iter,20000
,tol,0.0001
,cv,5
,copy_X,True

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [16]:
import joblib

# Save the model
joblib.dump(model_pipeline, 'models/tree_ensemble.pkl')

['models/tree_ensemble.pkl']

In [18]:
y_pred = model_pipeline.predict(X_tune)
y_pred.shape

(117, 18)

In [19]:
from sklearn.metrics import mean_squared_error, r2_score


print("Decision Tree R2:", r2_score(y_tune, y_pred))
print("Decision Tree RMSE:", mean_squared_error(y_tune, y_pred))

Decision Tree R2: 0.909187203835522
Decision Tree RMSE: 0.0005356480382494175
