# CrabNet Hyperparameter Surrogate Model

Here, we train a surrogate model for the CrabNet hyperparameter optimization on Matbench
datasets. We capture average model performance (MAE and RMSE) as well as model
complexity and runtime.

In [77]:
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, GroupKFold
import pandas as pd
import numpy as np
import joblib
from os import path
import json

# attempted use of skl2onnx to convert to onnx failing due to protobuf error
# https://github.com/onnx/onnx/issues/4469

# from skl2onnx import convert_sklearn
# from skl2onnx.common.data_types import FloatTensorType

In [78]:
# Dummy-mode switch: when True, models are written to a "dummy" subfolder and
# the data-loading cell keeps only the first 100 rows.
dummy = True

task_name = "crabnet_hyperparameter"

# Both locations are relative to the notebook's working directory.
repo_root = path.join("..", "..")
data_dir = path.join(repo_root, "data", "processed", task_name)
model_dir = path.join(repo_root, "models", task_name)
model_dir = path.join(model_dir, "dummy") if dummy else model_dir

## Import Data

In [79]:
# Read the processed Sobol regression table for this task.
sobol_reg_csv = path.join(data_dir, "sobol_regression.csv")
sobol_reg = pd.read_csv(sobol_reg_csv)

# Dummy mode works on the first 100 rows only.
if dummy:
    sobol_reg = sobol_reg.iloc[:100]

### Prepare data for the Random Forest Regressor -- Convert Categorical Data to Numerical Data

In [80]:
# Categorical columns that are one-hot encoded into "<column>_<value>"
# indicator columns (e.g. "criterion_RobustL1").
categorical_columns = ["criterion", "elem_prop", "hardware"]

# "bias" is cast to an integer column rather than one-hot encoded.
# `assign` returns a new frame, so nothing is written into the `.head(100)`
# slice taken in dummy mode (which can trigger SettingWithCopyWarning).
sobol_reg = sobol_reg.assign(bias=sobol_reg["bias"].astype(int))

# Encode only the columns that are still present: on a re-run of this cell the
# originals have already been replaced, and the cell then leaves the frame as is
# instead of raising a KeyError.
columns_to_encode = [col for col in categorical_columns if col in sobol_reg.columns]
if columns_to_encode:
    # `columns=` drops each original column and appends its indicator columns,
    # using the column name as prefix.
    sobol_reg = pd.get_dummies(sobol_reg, columns=columns_to_encode)

## Define a function to compute cross-validated MAE scores

In [81]:
def rfr_group_mae(
    X_array,
    y_array,
    group_array,
    model_name_stem,
    objective_name,
    random_state=13,
    n_splits=5,
):
    """Cross-validate a random forest regressor with grouped folds.

    For every fold of a ``GroupKFold`` split, a ``RandomForestRegressor`` is
    fitted on the training rows, evaluated on the held-out rows, and saved with
    joblib to ``f"{model_name_stem}_{fold}.pkl"``. The mean and standard
    deviation of the per-fold MAE are printed.

    Parameters
    ----------
    X_array : array
        Feature matrix; rows are selected with the integer index arrays
        produced by the splitter.
    y_array : array
        Target values, indexed the same way as ``X_array``.
    group_array : array-like
        Group label for each row, passed to ``GroupKFold.split``.
    model_name_stem : str
        Path prefix for the saved models; ``_<fold>.pkl`` is appended.
    objective_name : str
        Label used in the printed summary line.
    random_state : int, default 13
        Seed passed to every ``RandomForestRegressor``.
    n_splits : int, default 5
        Number of folds for ``GroupKFold``.

    Returns
    -------
    dict
        ``{"mae": [...], "y_pred": [[...], ...], "y_true": [[...], ...]}`` with
        one entry per fold, in fold order.
    """
    import os  # local import: the notebook itself only imports `path` from `os`

    # Create the target directory up front so saving the first fold does not
    # fail on a fresh checkout (e.g. when writing into the "dummy" subfolder).
    output_dir = path.dirname(model_name_stem)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    kf = GroupKFold(n_splits=n_splits)
    mae_scores = []
    y_preds = []
    y_trues = []
    for fold, (train_index, test_index) in enumerate(
        kf.split(X_array, y_array, group_array)
    ):
        X_train, X_test = X_array[train_index], X_array[test_index]
        y_train = y_array[train_index]
        # Plain lists keep the returned results JSON-serializable.
        y_test = y_array[test_index].tolist()

        model = RandomForestRegressor(random_state=random_state)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test).tolist()

        y_preds.append(y_pred)
        y_trues.append(y_test)
        mae_scores.append(mean_absolute_error(y_test, y_pred))

        # Persist this fold's fitted model.
        joblib.dump(model, f"{model_name_stem}_{fold}.pkl")

    avg_mae = np.mean(mae_scores)
    std_mae = np.std(mae_scores)

    print(f"MAE for {objective_name}: {avg_mae:.4f} +/- {std_mae:.4f}")
    return {"mae": mae_scores, "y_pred": y_preds, "y_true": y_trues}

## Features

In [84]:
# Hyperparameter columns used as-is ("bias" is cast to int during data prep).
hyperparameter_columns = [
    "N",
    "alpha",
    "d_model",
    "dim_feedforward",
    "dropout",
    "emb_scaler",
    "eps",
    "epochs_step",
    "fudge",
    "heads",
    "k",
    "lr",
    "pe_resolution",
    "ple_resolution",
    "pos_scaler",
    "weight_decay",
    "batch_size",
    "out_hidden4",
    "betas1",
    "betas2",
    "train_frac",
    "bias",
]

# Indicator columns produced by one-hot encoding the categorical columns.
one_hot_columns = [
    "criterion_RobustL1",
    "criterion_RobustL2",
    "elem_prop_magpie",
    "elem_prop_mat2vec",
    "elem_prop_onehot",
    "hardware_2080ti",
]

# Input columns shared by every objective's surrogate model; the order here
# fixes the column order of the feature matrices built from it.
common_features = hyperparameter_columns + one_hot_columns

# fba_isna_prob_features = common_features
# ls_isna_prob_features = common_features

# fba_features = common_features + ["fba_rank"]
# ls_features = common_features + ["ls_rank"]

# fba_time_s_features = common_features + ["fba_time_s_rank"]
# ls_time_s_features = common_features + ["ls_time_s_rank"]

## Objective Surrogate Models
#### no NaN values
### mae

In [85]:
# Inputs for the MAE surrogate: the shared features plus the "mae_rank" column.
mae_features = [*common_features, "mae_rank"]  ## ToDo: mae or mae_rank (mae.1)

mae_inputs = sobol_reg[mae_features]
X_array_mae = mae_inputs.to_numpy()
y_array_mae = sobol_reg["mae"].to_numpy()

# One string key per row: feature values rounded to 6 decimals, joined by "_".
# NOTE(review): the key includes "mae_rank"; if that column differs between
# repeats of the same hyperparameter set, repeats end up in different groups --
# confirm this is intended.
sobol_reg_mae_group = mae_inputs.round(6).apply(
    lambda features: "_".join(features.values.astype(str)), axis=1
)

mae_model_stem = path.join(model_dir, "sobol_reg_mae")
mae_results = rfr_group_mae(
    X_array=X_array_mae,
    y_array=y_array_mae,
    group_array=sobol_reg_mae_group,
    model_name_stem=mae_model_stem,
    objective_name="mae",
)

MAE for mae: 0.0886 +/- 0.0158


### rmse

In [86]:
# Inputs for the RMSE surrogate: the shared features plus the "rmse_rank" column.
rmse_features = [*common_features, "rmse_rank"]  ## ToDo: rmse or rmse_rank (rmse.1)

rmse_inputs = sobol_reg[rmse_features]
X_array_rmse = rmse_inputs.to_numpy()
y_array_rmse = sobol_reg["rmse"].to_numpy()

# One string key per row: feature values rounded to 6 decimals, joined by "_".
# NOTE(review): the key includes "rmse_rank"; if that column differs between
# repeats of the same hyperparameter set, repeats end up in different groups --
# confirm this is intended.
sobol_reg_rmse_group = rmse_inputs.round(6).apply(
    lambda features: "_".join(features.values.astype(str)), axis=1
)

rmse_model_stem = path.join(model_dir, "sobol_reg_rmse")
rmse_results = rfr_group_mae(
    X_array=X_array_rmse,
    y_array=y_array_rmse,
    group_array=sobol_reg_rmse_group,
    model_name_stem=rmse_model_stem,
    objective_name="rmse",
)

MAE for rmse: 0.0883 +/- 0.0257


### model size ###

In [87]:
# The model-size surrogate uses the shared features only (no rank column).
model_size_features = common_features

model_size_inputs = sobol_reg[model_size_features]
X_array_model_size = model_size_inputs.to_numpy()
y_array_model_size = sobol_reg["model_size"].to_numpy()

# One string key per row: feature values rounded to 6 decimals, joined by "_".
sobol_reg_model_size_group = model_size_inputs.round(6).apply(
    lambda features: "_".join(features.values.astype(str)), axis=1
)

model_size_model_stem = path.join(model_dir, "sobol_reg_model_size")
model_size_results = rfr_group_mae(
    X_array=X_array_model_size,
    y_array=y_array_model_size,
    group_array=sobol_reg_model_size_group,
    model_name_stem=model_size_model_stem,
    objective_name="model_size",
)

MAE for model_size: 7078125.7105 +/- 791229.1921


### runtime

In [88]:
# Inputs for the runtime surrogate: the shared features plus "runtime_rank".
runtime_features = [*common_features, "runtime_rank"]  ## ToDo: runtime or runtime_rank (runtime.1)

runtime_inputs = sobol_reg[runtime_features]
X_array_runtime = runtime_inputs.to_numpy()
y_array_runtime = sobol_reg["runtime"].to_numpy()

# One string key per row: feature values rounded to 6 decimals, joined by "_".
# NOTE(review): the key includes "runtime_rank"; if that column differs between
# repeats of the same hyperparameter set, repeats end up in different groups --
# confirm this is intended.
sobol_reg_runtime_group = runtime_inputs.round(6).apply(
    lambda features: "_".join(features.values.astype(str)), axis=1
)

runtime_model_stem = path.join(model_dir, "sobol_reg_runtime")
runtime_results = rfr_group_mae(
    X_array=X_array_runtime,
    y_array=y_array_runtime,
    group_array=sobol_reg_runtime_group,
    model_name_stem=runtime_model_stem,
    objective_name="runtime",
)

MAE for runtime: 71.1022 +/- 12.7256


In [89]:
# Collect the per-fold cross-validation results of every objective.
main_results = dict(
    mae=mae_results,
    rmse=rmse_results,
    model_size=model_size_results,
    runtime=runtime_results,
)

# Store the collected results as JSON next to the processed data.
# NOTE(review): `data_dir` is the same in dummy and full runs, so a dummy run
# overwrites this file -- confirm this is intended.
metadata_path = path.join(data_dir, "model_metadata.json")
with open(metadata_path, "w") as metadata_file:
    json.dump(main_results, metadata_file)

main_results

{'mae': {'mae': [0.09736450358524826,
   0.09584076684453244,
   0.0802529964475443,
   0.10721362715898333,
   0.06212542711224781],
  'y_pred': [[1.0470445123262662,
    1.1388262858078486,
    1.0937742371687706,
    1.092910442683129,
    1.0438372602378871,
    1.0979699665832756,
    0.4680361319697645,
    0.4694053084366586,
    0.558221162505513,
    0.8858907433235408,
    0.7634087634084612,
    0.9416579741236427,
    0.4489247723793967,
    0.438963360103239,
    0.4383556589032155,
    0.4480121911319065,
    0.3982781319502199,
    0.44545946894258875,
    0.44618629554354483,
    0.4244994333327561],
   [0.9314951872437939,
    1.0570991161603451,
    1.0727408998185621,
    0.8653992024529414,
    0.4556970487915757,
    0.6124129060295667,
    0.39256521816360324,
    0.4853519476046882,
    0.45922470429958195,
    0.4867680789728061,
    0.3957088248979133,
    0.3986279115375167,
    0.4179404932305161,
    0.46804234963511776,
    0.43770644923194396,
    0.500553

In [90]:
# Path stem of the saved fold models for each objective.
model_paths = dict(
    mae=mae_model_stem,
    rmse=rmse_model_stem,
    model_size=model_size_model_stem,
    runtime=runtime_model_stem,
)

# Reload the five per-fold models ("<stem>_0.pkl" ... "<stem>_4.pkl") of every
# objective into one dictionary.
models = {
    objective: [joblib.load(f"{stem}_{fold}.pkl") for fold in range(5)]
    for objective, stem in model_paths.items()
}

# Bundle all fold models into a single pickle stored in the data directory.
# NOTE(review): `data_dir` is the same in dummy and full runs, so a dummy run
# overwrites this file -- confirm this is intended.
bundle_path = path.join(data_dir, "cross_validation_models.pkl")
with open(bundle_path, "wb") as bundle_file:
    pickle.dump(models, bundle_file)

models

{'mae': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)],
 'rmse': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)],
 'model_size': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)],
 'runtime': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)]}

### code graveyard 

In [91]:
### Ordinal Encoding

# "bias" [False, True]},
# "criterion" ["RobustL1", "RobustL2"]},
# "elem_prop"["mat2vec", "magpie", "onehot"],

# sobol_reg["bias"].replace(["False", "True"], [0,1], inplace=True)
# sobol_reg["criterion"].replace(["RobustL1", "RobustL2"], [0,1], inplace=True)
# sobol_reg["elem_prop"].replace(["mat2vec", "magpie", "onehot"], [0,1,2], inplace=True)
# sobol_reg["hardware"].replace(["2080ti"], [0], inplace=True)


In [52]:
# print("Average MAE for fba_isna_prob",rfr_mae(X_array_fba_isna_prob, y_array_fba_isna_prob,'fba_isna_prob.pkl'))

# load trained model
# loaded_model = joblib.load('fba_isna_prob_model.pkl')

# Save the model
# with open('../models/fba_isna_prob.pkl', 'wb') as f:
#     pickle.dump(model, f)

# # Load the model
# with open('path/to/save/model.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)

In [53]:
# url_sobol_filter = "https://zenodo.org/record/7513019/files/sobol_probability_filter.csv"
# sobol_filter = pd.read_csv(url_sobol_filter)

# url_sobol_reg = "https://zenodo.org/record/7513019/files/sobol_regression.csv"
# sobol_reg = pd.read_csv(url_sobol_reg)

In [54]:
# os.getcwd()
# os.chdir("../data/raw")

# sobol_filter.to_csv('sobol_filter.csv', index=False)

# sobol_reg.to_csv('sobol_reg.csv', index=False)

In [55]:
# read in sobol_regression.csv
# url_sobol_reg = "https://zenodo.org/record/7513019/files/sobol_regression.csv"

In [56]:
# sobol_reg_x = sobol_reg[
#     [
#         "mu1_div_mu3",
#         "mu2_div_mu3",
#         "std1",
#         "std2",
#         "std3",
#         "comp1",
#         "comp2",
#         "num_particles",
#         "safety_factor",
#         "fba_rank",
#         "ls_rank",
#         "fba_time_s_rank",
#         "ls_time_s_rank",
#     ]
# ]

In [57]:
# print(len(sobol_reg_x))
# print(len(fba))

In [58]:
# print(
#     "Average MAE for ls_time_s",
#     rfr_mae(X_array_fba_time_s, y_array_ls_time_s, "sobol_reg_ls_time_s.pkl"),
# )

In [59]:
# # parse data for target "fba_isna_prob"
# fba_isna_prob = sobol_filter["fba_isna_prob"]
# sobolPF_fba_isna_prob = sobol_filter.drop(["ls_isna_prob", "fba_isna_prob"], axis=1)
# fba_isna_prob = fba_isna_prob.to_frame()