In [47]:
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score,precision_score,classification_report,confusion_matrix,r2_score
from sklearn.model_selection import GridSearchCV

root_file = './data/FuelsDataWBio/'
dist_file = './data/fuel_class_distributions_2023.csv'
evt_file = './data/LF2024_EVT.csv'
bps_file = './data/LF16_BPS.csv'
dist_frame = pd.read_csv(dist_file)

# zones = [
#     6,
#     17,
#     18,
#     19,
#     26,
#     27,
#     28,
#     29,
#     30,
#     31,
#     32,
#     33,
#     34,
#     35,
#     36,
#     41
# ]

zones = [
    27,
    28,
    29,
    30,
    31,
    32,
    33,
    34,
    35,
    41
]

pyromes= [
    33
]
eval_pyromes = [
    30
]
year = 2023
years = [
    # 2020,
    # 2021,
    2022,
    2023
]

from_vals = [
    91,92,93,98,99,
    101,102,103,104,105,106,107,108,109,
    121,122,123,124,
    141,142,143,144,145,146,147,148,149,
    161,162,163,164,165,
    181,182,183,184,185,186,187,188,189,
    201,202,203,204
]

to_vals = [
    1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,
    3,3,3,3,
    4,4,4,4,4,4,4,4,4,
    5,5,5,5,5,
    6,6,6,6,6,6,6,6,6,
    7,7,7,7
]

In [2]:
lf_25_file = './data/LF24_F40_250.ods'

lfdf = pd.read_excel(lf_25_file,engine='odf')

def load_data(pyromes,years):
    yearly_frames = []
    for pyrome in pyromes:
        for year in years:
            year_csv_file = root_file + f'stratified_sample_fbfm40_30m_{pyrome}_{year}.csv'
            year_fuels_sample = pd.read_csv(year_csv_file)
            yearly_frames.append(year_fuels_sample)

    fuels_frame = pd.concat(yearly_frames)

    return fuels_frame
fuels_sample = load_data(pyromes,years)
fuels_sample_eval = load_data(eval_pyromes,years)

feature_list = fuels_sample.columns.to_list()
feature_list.remove('system:index')
feature_list.remove('.geo')

In [3]:
alphaearth_features = [f'A{str(i).zfill(2)}' for i in range(64)]
label_list = ['FBFM40','FBFM40Parent']
feature_list_wo_alphaearth = [feature for feature in feature_list if feature not in (alphaearth_features +label_list)]

In [4]:
fuels_labels = fuels_sample[label_list]

fuels_data_alphaearth = fuels_sample[alphaearth_features]
fuels_data_wo_alphaearth = fuels_sample[feature_list_wo_alphaearth]
fuels_data_full = fuels_sample[alphaearth_features + feature_list_wo_alphaearth]

fuel_characteristics = [
    'VALUE',
    'Fuel Load 1hr', 'Fuel Load 10hr','Fuel Load 100hr', 'Fuel Load Live Herb','Fuel Load Live Woody',
    'SAV ratio Dead 1hr','SAV Live Herb','SAV Live Woody',
    'fuel bed depth',
    'dead fuel extinction moisture'
]
lf_characteristics = lfdf[fuel_characteristics]

fuels_labels_w_characteristics = fuels_labels.merge(lf_characteristics,how='left',left_on='FBFM40',right_on='VALUE').drop('VALUE',axis=1)

cols = fuels_labels_w_characteristics.columns.to_numpy()
col_map = dict(zip(cols,range(len(cols))))

In [5]:
col_map

{'FBFM40': 0,
 'FBFM40Parent': 1,
 'Fuel Load 1hr': 2,
 'Fuel Load 10hr': 3,
 'Fuel Load 100hr': 4,
 'Fuel Load Live Herb': 5,
 'Fuel Load Live Woody': 6,
 'SAV ratio Dead 1hr': 7,
 'SAV Live Herb': 8,
 'SAV Live Woody': 9,
 'fuel bed depth': 10,
 'dead fuel extinction moisture': 11}

In [6]:
fuels_labels_w_characteristics.head()

Unnamed: 0,FBFM40,FBFM40Parent,Fuel Load 1hr,Fuel Load 10hr,Fuel Load 100hr,Fuel Load Live Herb,Fuel Load Live Woody,SAV ratio Dead 1hr,SAV Live Herb,SAV Live Woody,fuel bed depth,dead fuel extinction moisture
0,93,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,-1.0,-1
1,91,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,-1.0,-1
2,91,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,-1.0,-1
3,93,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,-1.0,-1
4,98,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,-1.0,-1


In [7]:
fuels_labels_w_characteristics.dtypes

FBFM40                             int64
FBFM40Parent                       int64
Fuel Load 1hr                    float64
Fuel Load 10hr                   float64
Fuel Load 100hr                  float64
Fuel Load Live Herb              float64
Fuel Load Live Woody             float64
SAV ratio Dead 1hr                 int64
SAV Live Herb                      int64
SAV Live Woody                     int64
fuel bed depth                   float64
dead fuel extinction moisture      int64
dtype: object

In [8]:
cols = list(fuels_labels_w_characteristics.columns)

target_keys = dict(zip(range(len(cols)),cols))

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder

seed = 1917
test_pct = .3

X_train, X_test, y_train, y_test = train_test_split(fuels_data_full.to_numpy(), fuels_labels_w_characteristics.to_numpy(),test_size=test_pct,random_state=seed)

X_train = np.nan_to_num(X_train,0)
X_test = np.nan_to_num(X_test,0)

scaler = StandardScaler()
encoder = LabelEncoder()
fbfm40_encoder = LabelEncoder()

X_train_scaled = scaler.fit_transform(X_train)
y_train_encode = encoder.fit_transform(y_train[:,1])
y_train_fbfm40_encode = fbfm40_encoder.fit_transform(y_train[:,0])

X_test_scaled = scaler.transform(X_test)
y_test_encode = encoder.transform(y_test[:,1])
y_test_fbfm40_encode = fbfm40_encoder.transform(y_test[:,0])


#prepare neighboring pyrome data for validation
eval_data_full = fuels_sample_eval[alphaearth_features + feature_list_wo_alphaearth]
eval_data_full= np.nan_to_num(eval_data_full.to_numpy(),0)

eval_labels = fuels_sample_eval[label_list]

eval_data_scaled = scaler.transform(eval_data_full)


eval_labels_encode = encoder.transform(eval_labels.to_numpy()[:,1])
eval_labels_fbfm40_encode = fbfm40_encoder.transform(eval_labels.to_numpy()[:,0])

---

Standard FBFM40 Classification

---

In [10]:
from sklearn.ensemble import RandomForestClassifier

rfc_superclass = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    n_jobs=-1
)

rfc_superclass.fit(X_train_scaled,y_train_encode)

superclass_pred = rfc_superclass.predict(X_test_scaled)
print('Superclass Metrics:')
print(classification_report(y_test_encode,superclass_pred))
# print(confusion_matrix(y_test_encode,superclass_pred))

rfc_fbfm40 = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    n_jobs=-1
)

rfc_fbfm40.fit(X_train_scaled,y_train_fbfm40_encode)
fbfm40_pred = rfc_fbfm40.predict(X_test_scaled)
print('FBFM40 Metrics')
print(classification_report(y_test_fbfm40_encode,fbfm40_pred))
# print(confusion_matrix(y_test_fbfm40_encode,fbfm40_pred))


Superclass Metrics:
              precision    recall  f1-score   support

           0       0.84      0.83      0.84      1794
           1       0.61      0.70      0.65      1719
           2       0.58      0.31      0.40      1776
           3       0.63      0.76      0.69      1803
           4       0.61      0.64      0.62      1842
           5       0.62      0.64      0.63      1856
           6       0.91      0.96      0.93      1810

    accuracy                           0.69     12600
   macro avg       0.69      0.69      0.68     12600
weighted avg       0.69      0.69      0.68     12600

FBFM40 Metrics
              precision    recall  f1-score   support

           0       0.77      0.77      0.77       870
           1       0.86      0.87      0.87       530
           2       0.89      0.84      0.87       334
           3       0.93      0.62      0.74        60
           4       1.00      0.04      0.07        82
           5       0.57      0.81      0.67

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
neighboring_pyrome_pred = rfc_fbfm40.predict(eval_data_scaled)
print(classification_report(neighboring_pyrome_pred,eval_labels_encode))
print(confusion_matrix(neighboring_pyrome_pred,eval_labels_encode))

              precision    recall  f1-score   support

           0       0.24      0.40      0.30      3650
           1       0.17      0.18      0.17      5672
           2       0.04      0.29      0.07       817
           3       0.00      0.18      0.01        87
           4       0.00      0.00      0.00         3
           5       0.21      0.08      0.12     15086
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00       345
           8       0.00      0.00      0.00      4012
          12       0.00      0.00      0.00       884
          13       0.00      0.00      0.00         5
          17       0.00      0.00      0.00        69
          18       0.00      0.00      0.00        15
          19       0.00      0.00      0.00       198
          22       0.00      0.00      0.00       390
          24       0.00      0.00      0.00         3
          25       0.00      0.00      0.00      4271
          27       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
fbfm40_pred_df = pd.DataFrame.from_dict({
    'FBFM40':fbfm40_encoder.inverse_transform(fbfm40_pred)
})
fbfm40_pred_df = fbfm40_pred_df.merge(lf_characteristics,how='left',left_on='FBFM40',right_on='VALUE').drop('VALUE',axis=1)
fbfm40_pred_df

Unnamed: 0,FBFM40,Fuel Load 1hr,Fuel Load 10hr,Fuel Load 100hr,Fuel Load Live Herb,Fuel Load Live Woody,SAV ratio Dead 1hr,SAV Live Herb,SAV Live Woody,fuel bed depth,dead fuel extinction moisture
0,145.0,3.60,2.10,0.00,0.0,2.90,750,9999,1600,6.0,15
1,102.0,0.10,0.00,0.00,1.0,0.00,2000,1800,9999,1.0,15
2,202.0,4.50,4.25,4.00,0.0,0.00,2000,9999,9999,1.0,25
3,162.0,0.95,1.80,1.25,0.0,0.20,2000,9999,1400,1.0,30
4,202.0,4.50,4.25,4.00,0.0,0.00,2000,9999,9999,1.0,25
...,...,...,...,...,...,...,...,...,...,...,...
12595,145.0,3.60,2.10,0.00,0.0,2.90,750,9999,1600,6.0,15
12596,183.0,0.50,2.20,2.80,0.0,0.00,2000,9999,9999,0.3,20
12597,142.0,1.35,2.40,0.75,0.0,3.85,2000,9999,1600,1.0,15
12598,122.0,0.50,0.50,0.00,0.6,1.00,2000,1800,1800,1.5,15


---

Fuel Bed Depth Regression

---

In [13]:
var_name = 'dead fuel extinction moisture'
var_idx = col_map[var_name]

y_train_var = y_train[:,var_idx]
y_test_var = y_test[:,var_idx]

y_train_depth = y_train[:,var_idx]
y_test_depth = y_test[:,var_idx]


bins = fuels_labels_w_characteristics[var_name].unique()
bins.sort()
# bins = np.insert(bins,0,-np.inf)

In [14]:
bins

array([-1, 15, 20, 25, 30, 35, 40])

In [15]:
var_encoder = LabelEncoder()
var_encoder.fit(bins)
y_train_var_cats = var_encoder.transform(y_train_var)
y_test_var_cats = var_encoder.transform(y_test_var)


var_clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    n_jobs=-1
)

var_clf.fit(X_train_scaled,y_train_var_cats)
var_clf_pred = var_clf.predict(X_test_scaled)
var_clf_pred_decode = var_encoder.inverse_transform(var_clf_pred)

mse_decode_clf_pred = np.sum(np.sqrt((var_clf_pred_decode-y_test_var)**2))
var_clf_acc = accuracy_score(y_test_var,var_clf_pred_decode)

print(f'Acc Single Var: {var_clf_acc}')
print(f'MSE Decoded Single Var {mse_decode_clf_pred}')

Acc Single Var: 0.7119047619047619
MSE Decoded Single Var 42755.0


In [16]:
from sklearn.ensemble import RandomForestRegressor

rfdepth = RandomForestRegressor(
    n_estimators=200,
    criterion='squared_error',
    n_jobs=-1
)

rfdepth.fit(X_train_scaled,y_train_depth)

rf_depth_pred = rfdepth.predict(X_test_scaled)

rf_depth_pred[rf_depth_pred > lf_characteristics[var_name].max()] = lf_characteristics[var_name].max()
rf_depth_pred[rf_depth_pred < lf_characteristics[var_name].min()] = lf_characteristics[var_name].min()

In [17]:
rf_depth_pred
mse_direct_pred = np.sum(np.sqrt((rf_depth_pred - y_test_depth)**2))
print(mse_direct_pred)

52447.55


In [18]:
rf_depth_bin_nearest_idxs = np.argmin(np.abs(np.expand_dims(rf_depth_pred,axis=1) - np.expand_dims(bins,axis=0)),axis=1)
rf_depth_bin_nearest_idxs.shape

rf_depth_pred_nearest = bins[rf_depth_bin_nearest_idxs]

mse_nearest_bin = np.sum(np.sqrt((rf_depth_pred_nearest - y_test_depth)**2))
print(mse_nearest_bin)

48083.0


In [19]:
categorical_fuel_depth_pred = fbfm40_pred_df[var_name].to_numpy()

print(categorical_fuel_depth_pred)

mse_categorical_pred = np.sum(np.sqrt((categorical_fuel_depth_pred - y_test_depth)**2))
print(mse_categorical_pred)

[15 15 25 ... 15 15 25]
43679.0


In [20]:
import xgboost as xgb

xgb_clf = xgb.XGBClassifier(n_estimators=100)
xgb_clf.fit(X_train_scaled,y_train_fbfm40_encode)
xgb_fbfm40_pred = xgb_clf.predict(X_test_scaled)
fbfm40_pred_xgb_df = pd.DataFrame.from_dict({
    'FBFM40':fbfm40_encoder.inverse_transform(xgb_fbfm40_pred)
})
fbfm40_pred_xgb_df = fbfm40_pred_xgb_df.merge(lf_characteristics,how='left',left_on='FBFM40',right_on='VALUE').drop('VALUE',axis=1)
categorical_fuel_depth_pred_xgb = fbfm40_pred_xgb_df[var_name].to_numpy()
mse_categorical_pred_xgb = np.sum(np.sqrt((categorical_fuel_depth_pred_xgb - y_test_depth)**2))


xgb_depth = xgb.XGBRegressor()

xgb_depth.fit(X_train_scaled,y_train_depth)

xgb_reg_direct_pred = xgb_depth.predict(X_test_scaled)

xgb_reg_direct_pred[xgb_reg_direct_pred > lf_characteristics[var_name].max()] = lf_characteristics[var_name].max()
xgb_reg_direct_pred[xgb_reg_direct_pred < lf_characteristics[var_name].min()] = lf_characteristics[var_name].min()

mse_direct_pred_xgb = np.sum(np.sqrt((xgb_reg_direct_pred - y_test_depth)**2))

xgb_depth_bin_nearest_idxs = np.argmin(np.abs(np.expand_dims(xgb_reg_direct_pred,axis=1) - np.expand_dims(bins,axis=0)),axis=1)
xgb_depth_pred_nearest = bins[xgb_depth_bin_nearest_idxs]
mse_depth_pred_nearest = np.sum(np.sqrt((xgb_depth_pred_nearest - y_test_depth)**2))

print(f'MSE Categorical Pred (XGB): {mse_categorical_pred_xgb}')
print(f'MSE Direct Pred (XGB): {mse_direct_pred_xgb}')
print(f'MSE Nearest Bin Pred (XGB): {mse_depth_pred_nearest}')

MSE Categorical Pred (XGB): 39882.0
MSE Direct Pred (XGB): 54430.638408595696
MSE Nearest Bin Pred (XGB): 49034.0


---

Table of Regression Results

---

In [31]:
xgb_clf = xgb.XGBClassifier(n_estimators=100)
xgb_clf.fit(X_train_scaled,y_train_fbfm40_encode)
xgb_fbfm40_pred = xgb_clf.predict(X_test_scaled)
fbfm40_pred_xgb_df = pd.DataFrame.from_dict({
    'FBFM40':fbfm40_encoder.inverse_transform(xgb_fbfm40_pred)
})
fbfm40_pred_xgb_df = fbfm40_pred_xgb_df.merge(lf_characteristics,how='left',left_on='FBFM40',right_on='VALUE').drop('VALUE',axis=1)

acc_fbfm40 = accuracy_score(y_test_fbfm40_encode,xgb_fbfm40_pred)

In [51]:
test_vars = ['Fuel Load 1hr', 'Fuel Load 10hr','Fuel Load 100hr', 'Fuel Load Live Herb','Fuel Load Live Woody',
    'SAV ratio Dead 1hr','SAV Live Herb','SAV Live Woody',
    'fuel bed depth',
    'dead fuel extinction moisture'
]

results = []

for var_name in test_vars:
    var_idx = col_map[var_name]

    y_train_var = y_train[:,var_idx]
    y_test_var = y_test[:,var_idx]

    bins = fuels_labels_w_characteristics[var_name].unique()
    bins.sort()
    categorical_var_pred_xgb = fbfm40_pred_xgb_df[var_name].to_numpy()
    
    var_encoder = LabelEncoder()
    var_encoder.fit(bins)
    y_train_var_cats = var_encoder.transform(y_train_var)
    y_test_var_cats = var_encoder.transform(y_test_var)

    acc_fbfm40_var_xgb = accuracy_score(y_test_var_cats,var_encoder.transform(categorical_var_pred_xgb))
    mse_categorical_pred_xgb = np.sum(np.sqrt((categorical_var_pred_xgb - y_test_var)**2))
    r2_categorical_pred_xgb = r2_score(y_test_var,categorical_var_pred_xgb)

    xgb_cat = xgb.XGBClassifier(n_estimators=100)
    xgb_cat.fit(X_train_scaled,y_train_var_cats)
    xgb_cat_pred = xgb_cat.predict(X_test_scaled)

    acc_cat_pred = accuracy_score(y_test_var_cats,xgb_cat_pred)
    xgb_cat_var_pred = var_encoder.inverse_transform(xgb_cat_pred) 

    mse_local_cat = np.sum(np.sqrt((xgb_cat_var_pred - y_test_var)**2))
    r2_local_cat_xgb = r2_score(y_test_var,xgb_cat_var_pred)

    xgb_reg = xgb.XGBRegressor(n_estimators=100)
    xgb_reg.fit(X_train_scaled,y_train_var)
    xgb_reg_direct_pred = xgb_reg.predict(X_test_scaled)

    
    xgb_reg_direct_pred[xgb_reg_direct_pred > lf_characteristics[var_name].max()] = lf_characteristics[var_name].max()
    xgb_reg_direct_pred[xgb_reg_direct_pred < lf_characteristics[var_name].min()] = lf_characteristics[var_name].min()

    mse_direct_pred_xgb = np.sum(np.sqrt((xgb_reg_direct_pred - y_test_var)**2))
    r2_direct_pred_xgb = r2_score(y_test_var,xgb_reg_direct_pred)

    xgb_reg_bin_nearest_idxs = np.argmin(np.abs(np.expand_dims(xgb_reg_direct_pred,axis=1) - np.expand_dims(bins,axis=0)),axis=1)
    xgb_reg_bin_nearest_pred = bins[xgb_reg_bin_nearest_idxs]

    xgb_reg_bin_nearest_pred_cat = var_encoder.transform(xgb_reg_bin_nearest_pred)
    acc_nearest_bin = accuracy_score(y_test_var_cats,xgb_reg_bin_nearest_pred_cat)

    mse_nearest_bin_pred_xgb = np.sum(np.sqrt((xgb_reg_bin_nearest_pred - y_test_var)**2))
    r2_nearest_bin_pred_xgb = r2_score(y_test_var,xgb_reg_bin_nearest_pred)



    result = {
        'var':var_name,
        'acc_fbfm40_global':acc_fbfm40,
        'acc_single_var':acc_fbfm40_var_xgb,
        'acc_single_var_local':acc_cat_pred,
        'acc_reg_binned':acc_nearest_bin,
        'mse_categorical_global':mse_categorical_pred_xgb,
        'r2_categorical_global':r2_categorical_pred_xgb,
        'mse_categorical_local':mse_local_cat,
        'r2_categorical_local':r2_local_cat_xgb,
        'mse_direct':mse_direct_pred_xgb,
        'r2_direct':r2_direct_pred_xgb,
        'mse_nearest_bin':mse_nearest_bin_pred_xgb,
        'r2_nearest_bin':r2_nearest_bin_pred_xgb
    }

    results.append(result)

In [52]:
results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,var,acc_fbfm40_global,acc_single_var,acc_single_var_local,acc_reg_binned,mse_categorical_global,r2_categorical_global,mse_categorical_local,r2_categorical_local,mse_direct,r2_direct,mse_nearest_bin,r2_nearest_bin
0,Fuel Load 1hr,0.645159,0.655952,0.649603,0.254127,7200.95,0.561852,7186.85,0.571887,9976.209,0.648789,9672.25,0.642232
1,Fuel Load 10hr,0.645159,0.668889,0.670476,0.277778,5869.3,0.672878,5930.4,0.666547,7825.192,0.742288,7574.95,0.738374
2,Fuel Load 100hr,0.645159,0.705952,0.704127,0.323413,5821.35,0.63371,6029.1,0.610982,8342.581,0.690736,7995.55,0.685795
3,Fuel Load Live Herb,0.645159,0.769365,0.776429,0.437937,2291.25,0.432266,2313.3,0.418565,3181.733,0.551405,2957.25,0.529214
4,Fuel Load Live Woody,0.645159,0.688175,0.697778,0.315,5680.7,0.411636,5464.9,0.432764,7660.107,0.540754,7313.1,0.520772
5,SAV ratio Dead 1hr,0.645159,0.823254,0.82373,0.505873,1938322.0,0.57558,1950133.0,0.563776,3091581.0,0.64806,2898018.0,0.623897
6,SAV Live Herb,0.645159,0.819048,0.81381,0.511032,14351703.0,0.522776,14915654.0,0.50472,21968210.0,0.632265,16759203.0,0.498571
7,SAV Live Woody,0.645159,0.724603,0.724841,0.44373,22111998.0,0.254753,23923974.0,0.185829,31747590.0,0.44738,25332113.0,0.235193
8,fuel bed depth,0.645159,0.679286,0.678968,0.335317,5245.8,0.518622,5138.5,0.539238,6896.96,0.61152,6578.9,0.585986
9,dead fuel extinction moisture,0.645159,0.73746,0.73246,0.512698,39882.0,0.547919,40407.0,0.547739,54430.64,0.637216,49034.0,0.610006


In [48]:
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

test_vars = ['Fuel Load 1hr', 'Fuel Load 10hr','Fuel Load 100hr', 'Fuel Load Live Herb','Fuel Load Live Woody',
    'SAV ratio Dead 1hr','SAV Live Herb','SAV Live Woody',
    'fuel bed depth',
    'dead fuel extinction moisture'
]

knn_results = []

for var_name in test_vars:
    var_idx = col_map[var_name]

    y_train_var = y_train[:,var_idx]
    y_test_var = y_test[:,var_idx]

    bins = fuels_labels_w_characteristics[var_name].unique()
    bins.sort()
    categorical_var_pred_xgb = fbfm40_pred_xgb_df[var_name].to_numpy()
    
    var_encoder = LabelEncoder()
    var_encoder.fit(bins)
    y_train_var_cats = var_encoder.transform(y_train_var)
    y_test_var_cats = var_encoder.transform(y_test_var)

    acc_fbfm40_var_xgb = accuracy_score(y_test_var_cats,var_encoder.transform(categorical_var_pred_xgb))
    mse_categorical_pred_xgb = np.sum(np.sqrt((categorical_var_pred_xgb - y_test_var)**2))
    r2_categorical_pred_xgb = r2_score(y_test_var,categorical_var_pred_xgb)

    xgb_cat = xgb.XGBClassifier(n_estimators=100)
    xgb_cat.fit(X_train_scaled,y_train_var_cats)
    xgb_cat_pred = xgb_cat.predict(X_test_scaled)

    acc_cat_pred = accuracy_score(y_test_var_cats,xgb_cat_pred)
    xgb_cat_var_pred = var_encoder.inverse_transform(xgb_cat_pred) 

    mse_local_cat = np.sum(np.sqrt((xgb_cat_var_pred - y_test_var)**2))
    r2_local_cat_xgb = r2_score(y_test_var,xgb_cat_var_pred)

    xgb_reg = KNeighborsRegressor(n_neighbors=7,metric='cosine')
    # xgb_reg = xgb.XGBRegressor(n_estimators=100)
    xgb_reg.fit(X_train_scaled,y_train_var)
    xgb_reg_direct_pred = xgb_reg.predict(X_test_scaled)

    
    xgb_reg_direct_pred[xgb_reg_direct_pred > lf_characteristics[var_name].max()] = lf_characteristics[var_name].max()
    xgb_reg_direct_pred[xgb_reg_direct_pred < lf_characteristics[var_name].min()] = lf_characteristics[var_name].min()

    mse_direct_pred_xgb = np.sum(np.sqrt((xgb_reg_direct_pred - y_test_var)**2))
    r2_direct_pred_xgb = r2_score(y_test_var,xgb_reg_direct_pred)

    xgb_reg_bin_nearest_idxs = np.argmin(np.abs(np.expand_dims(xgb_reg_direct_pred,axis=1) - np.expand_dims(bins,axis=0)),axis=1)
    xgb_reg_bin_nearest_pred = bins[xgb_reg_bin_nearest_idxs]

    xgb_reg_bin_nearest_pred_cat = var_encoder.transform(xgb_reg_bin_nearest_pred)
    acc_nearest_bin = accuracy_score(y_test_var_cats,xgb_reg_bin_nearest_pred_cat)

    mse_nearest_bin_pred_xgb = np.sum(np.sqrt((xgb_reg_bin_nearest_pred - y_test_var)**2))
    r2_nearest_bin_pred_xgb = r2_score(y_test_var,xgb_reg_bin_nearest_pred)



    result = {
        'var':var_name,
        'acc_fbfm40_global':acc_fbfm40,
        'acc_single_var':acc_fbfm40_var_xgb,
        'acc_single_var_local':acc_cat_pred,
        'acc_reg_binned':acc_nearest_bin,
        'mse_categorical_global':mse_categorical_pred_xgb,
        'r2_categorical_global':r2_categorical_pred_xgb,
        'mse_categorical_local':mse_local_cat,
        'r2_categorical_local':r2_local_cat_xgb,
        'mse_direct':mse_direct_pred_xgb,
        'r2_direct':r2_direct_pred_xgb,
        'mse_nearest_bin':mse_nearest_bin_pred_xgb,
        'r2_nearest_bin':r2_nearest_bin_pred_xgb
    }

    knn_results.append(result)

In [50]:
knn_results_df = pd.DataFrame.from_dict(knn_results)
knn_results_df

Unnamed: 0,var,acc_fbfm40_global,acc_single_var,acc_single_var_local,acc_reg_binned,mse_categorical_global,r2_categorical_global,mse_categorical_local,r2_categorical_local,mse_direct,r2_direct,mse_nearest_bin,r2_nearest_bin
0,Fuel Load 1hr,0.645159,0.655952,0.649603,0.302778,7200.95,0.561852,7186.85,0.571887,9438.071,0.64833,9261.35,0.644519
1,Fuel Load 10hr,0.645159,0.668889,0.670476,0.333651,5869.3,0.672878,5930.4,0.666547,7356.579,0.740614,7213.45,0.737207
2,Fuel Load 100hr,0.645159,0.705952,0.704127,0.429921,5821.35,0.63371,6029.1,0.610982,7801.6,0.686685,7629.2,0.679993
3,Fuel Load Live Herb,0.645159,0.769365,0.776429,0.492698,2291.25,0.432266,2313.3,0.418565,2935.364,0.548703,2822.8,0.52573
4,Fuel Load Live Woody,0.645159,0.688175,0.697778,0.365079,5680.7,0.411636,5464.9,0.432764,7373.864,0.534146,7262.65,0.508547
5,SAV ratio Dead 1hr,0.645159,0.823254,0.82373,0.591111,1938322.0,0.57558,1950133.0,0.563776,2808304.0,0.645958,2652640.0,0.625051
6,SAV Live Herb,0.645159,0.819048,0.81381,0.616825,14351703.0,0.522776,14915654.0,0.50472,20382170.0,0.627652,16304383.0,0.492704
7,SAV Live Woody,0.645159,0.724603,0.724841,0.513016,22111998.0,0.254753,23923974.0,0.185829,30666680.0,0.432805,25667255.0,0.196922
8,fuel bed depth,0.645159,0.679286,0.678968,0.411984,5245.8,0.518622,5138.5,0.539238,6578.743,0.609754,6339.8,0.585625
9,dead fuel extinction moisture,0.645159,0.73746,0.73246,0.544365,39882.0,0.547919,40407.0,0.547739,50839.14,0.643091,47237.0,0.611857
