# ML -- Decision Trees
### Let's start with getting data ready

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

data_dir = '../data'
for file in os.listdir(data_dir):
    if os.path.isdir(file):
        continue
    print(file)

plot_ndvi.pkl
weather_25_clipped.pkl
df_savi.pkl
plot_elev_features.pkl
df_ndwi.pkl
df_ndvi_rendvi.pkl
df_evi.pkl
ndvi_raw_2025.pkl
DEM
plot_features.pkl
df_mcari2.pkl
ndvi_2025.pkl
df_2025.pkl
PRISM
df.pkl
plot_ndvi_filtered.pkl
plot_ndvi_filtered_2025.pkl
polygons
ndvi
plot_ndvi_2025.pkl


### For this notebook, let's pick NDVI / RENDVI. They are coupled together by a quirk in data assembly, but will make good test cases.

In [2]:
df_path = os.path.join(data_dir, 'df_ndvi_rendvi.pkl')
df = pd.read_pickle(df_path)

### Now identify leakage columns. That's anything after week 35, and other various data that might be captured.

In [3]:
# --- Identify leakage columns ---
leakage_cols = [col for col in df.columns if any(
    col.endswith(f"_{m}") for m in range(36,45)
)]
leakage_cols.extend([col for col in df.columns if col.endswith('length')])
leakage_cols.extend([col for col in df.columns if 'mcari2' in col])

### identify targets we're estimating, and features

In [4]:
# --- Define target and features ---
target_cols = [
    'ndvi_smooth_mean_36',
    'ndvi_smooth_mean_37',
    'ndvi_smooth_mean_38',
    'ndvi_smooth_mean_39',
    'ndvi_smooth_mean_40',
    'ndvi_smooth_mean_41',
    'ndvi_smooth_mean_42',
    'ndvi_smooth_mean_43',
    'ndvi_smooth_mean_44',
    
    'rendvi_smooth_mean_36',
    'rendvi_smooth_mean_37',
    'rendvi_smooth_mean_38',
    'rendvi_smooth_mean_39',
    'rendvi_smooth_mean_40',
    'rendvi_smooth_mean_41',
    'rendvi_smooth_mean_42',
    'rendvi_smooth_mean_43',
    'rendvi_smooth_mean_44',
]
X = df.drop(columns=leakage_cols + ['plot_id', 'year'] + target_cols)
y = df[target_cols]

### Split the data

In [5]:
from sklearn.model_selection import train_test_split

X_test, X_hold, y_test, y_hold = train_test_split(
    X, y, test_size = 0.8, random_state = 42
)

X_train, X_tune, y_train, y_tune = train_test_split(
    X_hold, y_hold, test_size = 0.25, random_state = 42
)

print(X_train.shape)
print(X_tune.shape)
print(X_test.shape)

print(y_train.shape)
print(y_tune.shape)
print(y_test.shape)

(351, 175)
(117, 175)
(117, 175)
(351, 18)
(117, 18)
(117, 18)


### Train a single decision tree.
### It turns out sklearn now automajickally applies multi-output regression.

In [6]:
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(random_state=84726)
tree.fit(X_train, y_train)

import joblib

# Save the model
joblib.dump(tree, 'models/decision_tree.pkl')
# tree_loaded = joblib.load('decision_tree.pkl')

['models/decision_tree.pkl']

In [7]:
y_pred = tree.predict(X_tune)
y_pred.shape

(117, 18)

In [8]:
from sklearn.metrics import mean_squared_error, r2_score


print("Decision Tree R2:", r2_score(y_tune, y_pred))
print("Decision Tree RMSE:", mean_squared_error(y_tune, y_pred))

Decision Tree R2: 0.7291268321248822
Decision Tree RMSE: 0.001668009221535867


### Let's see a random forest do better

In [9]:

from sklearn.ensemble import RandomForestRegressor

# Initialize a Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=93628)
rf.fit(X_train, y_train)
joblib.dump(rf, 'models/random_forest.pkl')

y_pred = rf.predict(X_tune)
print("Random Forest R2:", r2_score(y_tune, y_pred))
print("Random Forest RMSE:", mean_squared_error(y_tune, y_pred))


Random Forest R2: 0.8799212195806292
Random Forest RMSE: 0.0007499239335900776


In [10]:
y_pred.shape

(117, 18)