In [1]:
import xarray as xr
import rioxarray
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, explained_variance_score
import os

try:
    import util
except ImportError:
    if 'notebooks' in os.getcwd():
        os.chdir("..")
finally:
    import util



In [2]:
# Prepare data - since we are working with tabular data here we convert
# to a dataframe.
ds = xr.open_dataset("data_working/westmort.nc")

# Calculate total basal area based on all the genus rasters
ba_vars = ["abies", "picea", "populus", "tsuga", "pseudotsuga"]
ds["forest_ba"] = ds[ba_vars].to_stacked_array("band", sample_dims=["x", "y", "time"]).sum(dim="band")

# Smoothing
ds["mort_ewma"] = xr.apply_ufunc(
    util.manip.numpy_ewma_vectorized,
    ds.mortality,
    input_core_dims=[["time"]],
    output_core_dims=[["time"]],
    vectorize=True,
    kwargs=dict(window=3)
)

# Shift so we are doing proper out-year forecasting
ds["mort_nextyear"] = ds["mort_ewma"].shift(time=-1)

In [3]:
df = ds[["elev", "forest_ba", "fire", "mort_ewma", "mort_nextyear", "prcp", "vp", "vod"]].to_dataframe().reset_index().dropna()
df

Unnamed: 0,time,y,x,elev,forest_ba,fire,mort_ewma,mort_nextyear,prcp,vp,vod
620454,1998,4682302.419,-1.243822e+07,2784.0,78.0,0.0,0.750000,0.375000,791.0,838.0,452.0
620455,1998,4682302.419,-1.243422e+07,2998.0,38.0,0.0,0.750000,0.375000,760.0,813.0,452.0
620457,1998,4682302.419,-1.242622e+07,3118.0,49.0,0.0,20.000000,14.000000,868.0,733.0,452.0
620458,1998,4682302.419,-1.242222e+07,3081.0,39.0,0.0,12.000000,9.500000,931.0,695.0,452.0
620462,1998,4682302.419,-1.240622e+07,2515.0,9.0,0.0,0.750000,0.375000,476.0,688.0,351.0
...,...,...,...,...,...,...,...,...,...,...,...
8461005,2018,3686302.419,-1.227422e+07,1714.0,0.0,0.0,0.000122,0.000061,471.0,1348.0,358.0
8461593,2018,3682302.419,-1.228222e+07,1897.0,4.0,0.0,0.031447,0.015723,571.0,1357.0,358.0
8461594,2018,3682302.419,-1.227822e+07,2000.0,21.0,0.0,0.001768,0.000884,731.0,1317.0,358.0
8461595,2018,3682302.419,-1.227422e+07,1856.0,0.0,0.0,0.000549,0.000275,496.0,1365.0,358.0


In [4]:
years = ds.time.values

train_years = years[16:]
valid_years = years[:8]
test_years = years[8:16]

df_train = df[df.time.isin(train_years)]
df_valid = df[df.time.isin(valid_years)]
df_test  = df[df.time.isin(test_years)]

exclude = ["time", "y", "x"]

df_train = df_train.drop(columns=exclude)
df_valid = df_valid.drop(columns=exclude)
df_test  = df_test.drop(columns=exclude)

In [5]:
def split_xy(df, target):
    return (
        df.drop(columns=target).to_numpy(),
        df[target].to_numpy()
    )

def get_results(m, X, y):
    y_hat = m.predict(X)
    return {
        "mse": mean_squared_error(y, y_hat),
        "r2": explained_variance_score(y, y_hat)
    }

In [6]:
# Training
target = "mort_nextyear"

m = RandomForestRegressor(
    oob_score=True,
    max_features="sqrt"
)

downsample_factor = 5

X, y = split_xy(df_train, target)

X = X[::downsample_factor]
y = y[::downsample_factor]

m.fit(
    X, y
)

# This is artificially inflated because we aren't running
# on out of bag samples. Need to use the oob_prediction_
# property.
# train_result = get_results(m, X, y)
# print(train_result)

In [7]:
# This is a more accurate representation of training R2
print({
    "mse": mean_squared_error(y, m.oob_prediction_),
    "r2": explained_variance_score(y, m.oob_prediction_)
})

{'mse': 15.547926559429648, 'r2': 0.7076079018467228}


In [8]:
X_valid, y_valid = split_xy(df_valid, target)
valid_result = get_results(m, X_valid, y_valid)
print(valid_result)

{'mse': 26.539373368226492, 'r2': 0.5345122605585748}


In [9]:
X_test, y_test = split_xy(df_test, target)
test_result = get_results(m, X_test, y_test)
print(test_result)

{'mse': 14.813521885194314, 'r2': 0.6697971034015373}


In [10]:
def get_naive_error(ds, target):
    y = ds[target].values.flatten()
    y_hat = ds[target].shift(time=-1).values.flatten()

    # remove nans from calculation
    mask = (~np.isnan(y)) & (~np.isnan(y_hat))
    y = y[mask]
    y_hat = y_hat[mask]

    return {
        "mse": mean_squared_error(y, y_hat),
        "r2": explained_variance_score(y, y_hat)
    }

In [11]:
get_naive_error(ds.sel(time=train_years), "mort_ewma")

{'mse': 19.77752, 'r2': 0.5958353281021118}

In [12]:
get_naive_error(ds.sel(time=valid_years), "mort_ewma")

{'mse': 19.721703, 'r2': 0.22245818376541138}

In [13]:
get_naive_error(ds.sel(time=test_years), "mort_ewma")

{'mse': 12.427801, 'r2': 0.7395472526550293}