# CrabNet Hyperparameter Surrogate Model

Here, we train random forest surrogate models for the CrabNet hyperparameter
optimization task on Matbench datasets. Separate models capture average model
performance (MAE and RMSE), model complexity (model size), and runtime.

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, GroupKFold
import pandas as pd
import numpy as np
import joblib
from os import path
import json

# attempted use of skl2onnx to convert to onnx failing due to protobuf error
# https://github.com/onnx/onnx/issues/4469

# from skl2onnx import convert_sklearn
# from skl2onnx.common.data_types import FloatTensorType

In [24]:
from pathlib import Path

# When True, model artifacts are written to a "dummy" subfolder; later cells
# also read this flag.
dummy = False

task_name = "crabnet_hyperparameter"

# Input data location, relative to this notebook
data_dir = path.join("..", "..", "data", "processed", task_name)

# Output location for trained models; dummy runs get their own subfolder
model_dir_parts = ["..", "..", "models", task_name]
if dummy:
    model_dir_parts.append("dummy")
model_dir = path.join(*model_dir_parts)

# Cross-validation model bundles live one level below the model directory
cv_model_dir = path.join(model_dir, "cv")

# Creating model_dir explicitly is redundant (cv_model_dir is created with
# parents=True) but kept for clarity.
for output_dir in (model_dir, cv_model_dir):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

## Import Data

In [25]:
# The CSV is always read from the non-dummy data directory.
sobol_regression_csv = path.join(data_dir, "sobol_regression.csv")
sobol_reg = pd.read_csv(sobol_regression_csv)

# In dummy mode keep only the first 100 rows and redirect data_dir to a
# "dummy" subfolder from here on.
if dummy:
    sobol_reg = sobol_reg.head(100)
    data_dir = path.join(data_dir, "dummy")

Path(data_dir).mkdir(parents=True, exist_ok=True)

### Prepare data for Random Forest Regressor -- Convert Categorical Data to Numerical Data

In [26]:
# One-hot encode the categorical hyperparameters. The dtype is pinned to uint8
# because the default dtype of pd.get_dummies changed from uint8 to bool in
# pandas 2.0, and bool columns mixed with float columns can make
# DataFrame.to_numpy() return an object-dtype array.
categorical_columns = ["criterion", "elem_prop", "hardware"]
criterion_ohe, elemprop_ohe, hardware_ohe = (
    pd.get_dummies(sobol_reg[column], prefix=column, dtype="uint8")
    for column in categorical_columns
)

# "bias" is not one-hot encoded; it is cast to an integer column instead.
sobol_reg["bias"] = sobol_reg["bias"].astype(int)

# Replace the original categorical columns with their indicator columns
# (appended at the end, in the order criterion, elem_prop, hardware).
sobol_reg = pd.concat(
    [
        sobol_reg.drop(columns=categorical_columns),
        criterion_ohe,
        elemprop_ohe,
        hardware_ohe,
    ],
    axis=1,
)

## Define a function to compute cross-validated MAE scores

In [27]:
# argument for rfr_mae, X_array, y_array, model_name to save model as .pkl
def rfr_group_mae(
    X_array, y_array, group_array, model_name_stem, objective_name, random_state=13
):
    kf = GroupKFold(n_splits=5)
    mae_scores = []
    y_preds = []
    y_trues = []
    for i, (train_index, test_index) in enumerate(
        kf.split(X_array, y_array, group_array)
    ):
        X_train, X_test = X_array[train_index], X_array[test_index]
        y_train, y_test = y_array[train_index], y_array[test_index]
        y_test = y_test.tolist()

        model = RandomForestRegressor(random_state=random_state)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test).tolist()

        y_preds.append(y_pred)
        y_trues.append(y_test)
        mae = mean_absolute_error(y_test, y_pred)
        mae_scores.append(mae)
        # save model as .pkl
        joblib.dump(model, f"{model_name_stem}_{i}.pkl", compress=7)

    avg_mae = np.mean(mae_scores)
    std_mae = np.std(mae_scores)

    print(f"MAE for {objective_name}: {avg_mae:.4f} +/- {std_mae:.4f}")
    results = {"mae": mae_scores, "y_pred": y_preds, "y_true": y_trues}
    return results

## Features

In [28]:
# Hyperparameter columns used as-is ("bias" included)
hyperparameter_columns = [
    "N",
    "alpha",
    "d_model",
    "dim_feedforward",
    "dropout",
    "emb_scaler",
    "eps",
    "epochs_step",
    "fudge",
    "heads",
    "k",
    "lr",
    "pe_resolution",
    "ple_resolution",
    "pos_scaler",
    "weight_decay",
    "batch_size",
    "out_hidden4",
    "betas1",
    "betas2",
    "train_frac",
    "bias",
]

# Indicator columns named with the criterion / elem_prop / hardware prefixes
indicator_columns = [
    "criterion_RobustL1",
    "criterion_RobustL2",
    "elem_prop_magpie",
    "elem_prop_mat2vec",
    "elem_prop_onehot",
    "hardware_2080ti",
]

# Features shared by every surrogate model
common_features = hyperparameter_columns + indicator_columns

## Objective Surrogate Models
#### no NaN values
### mae

In [29]:
# Inputs: the shared feature columns plus "mae_rank"; target: "mae"
mae_features = [*common_features, "mae_rank"]
X_array_mae = sobol_reg.loc[:, mae_features].to_numpy()
y_array_mae = sobol_reg["mae"].to_numpy()

# Rows with identical (rounded) shared-feature values get the same group
# label, which rfr_group_mae hands to GroupKFold.
mae_rounded_features = sobol_reg[common_features].round(6)
sobol_reg_mae_group = mae_rounded_features.apply(
    lambda row: "_".join(row.values.astype(str)), axis=1
)

mae_model_stem = path.join(model_dir, "sobol_reg_mae")
mae_results = rfr_group_mae(
    X_array=X_array_mae,
    y_array=y_array_mae,
    group_array=sobol_reg_mae_group,
    model_name_stem=mae_model_stem,
    objective_name="mae",
)

MAE for mae: 0.0217 +/- 0.0004


### rmse

In [30]:
# Inputs: the shared feature columns plus "rmse_rank"; target: "rmse"
rmse_features = [*common_features, "rmse_rank"]
X_array_rmse = sobol_reg.loc[:, rmse_features].to_numpy()
y_array_rmse = sobol_reg["rmse"].to_numpy()

# Rows with identical (rounded) shared-feature values get the same group
# label, which rfr_group_mae hands to GroupKFold.
rmse_rounded_features = sobol_reg[common_features].round(6)
sobol_reg_rmse_group = rmse_rounded_features.apply(
    lambda row: "_".join(row.values.astype(str)), axis=1
)

rmse_model_stem = path.join(model_dir, "sobol_reg_rmse")
rmse_results = rfr_group_mae(
    X_array=X_array_rmse,
    y_array=y_array_rmse,
    group_array=sobol_reg_rmse_group,
    model_name_stem=rmse_model_stem,
    objective_name="rmse",
)

MAE for rmse: 0.0265 +/- 0.0004


### model size ###

In [31]:
# Inputs: only the shared feature columns (no rank column); target: "model_size"
model_size_features = common_features
X_array_model_size = sobol_reg.loc[:, model_size_features].to_numpy()
y_array_model_size = sobol_reg["model_size"].to_numpy()

# Rows with identical (rounded) shared-feature values get the same group
# label, which rfr_group_mae hands to GroupKFold.
model_size_rounded_features = sobol_reg[common_features].round(6)
sobol_reg_model_size_group = model_size_rounded_features.apply(
    lambda row: "_".join(row.values.astype(str)), axis=1
)

model_size_model_stem = path.join(model_dir, "sobol_reg_model_size")
model_size_results = rfr_group_mae(
    X_array=X_array_model_size,
    y_array=y_array_model_size,
    group_array=sobol_reg_model_size_group,
    model_name_stem=model_size_model_stem,
    objective_name="model_size",
)

MAE for model_size: 317796.8646 +/- 6005.9939


### runtime

In [32]:
# Inputs: the shared feature columns plus "runtime_rank"; target: "runtime"
runtime_features = [*common_features, "runtime_rank"]
X_array_runtime = sobol_reg.loc[:, runtime_features].to_numpy()
y_array_runtime = sobol_reg["runtime"].to_numpy()

# Rows with identical (rounded) shared-feature values get the same group
# label, which rfr_group_mae hands to GroupKFold.
runtime_rounded_features = sobol_reg[common_features].round(6)
sobol_reg_runtime_group = runtime_rounded_features.apply(
    lambda row: "_".join(row.values.astype(str)), axis=1
)

runtime_model_stem = path.join(model_dir, "sobol_reg_runtime")
runtime_results = rfr_group_mae(
    X_array=X_array_runtime,
    y_array=y_array_runtime,
    group_array=sobol_reg_runtime_group,
    model_name_stem=runtime_model_stem,
    objective_name="runtime",
)

MAE for runtime: 20.5904 +/- 1.0632


In [33]:
# Collect the cross-validation results of all four objectives and write them
# to a single JSON file in the data directory.
main_results = dict(
    mae=mae_results,
    rmse=rmse_results,
    model_size=model_size_results,
    runtime=runtime_results,
)

metadata_path = path.join(data_dir, "model_metadata.json")
with open(metadata_path, "w") as metadata_file:
    json.dump(main_results, metadata_file)

In [34]:
# Path stems of the per-fold model files written during cross-validation
model_paths = dict(
    mae=mae_model_stem,
    rmse=rmse_model_stem,
    model_size=model_size_model_stem,
    runtime=runtime_model_stem,
)

# For each of the 5 folds, load that fold's model for every objective and
# bundle them into one compressed pickle in the cv directory.
for i in range(5):
    models = {
        key: joblib.load(f"{model_path}_{i}.pkl")
        for key, model_path in model_paths.items()
    }

    bundle_path = path.join(cv_model_dir, f"cross_validation_models_{i}.pkl")
    with open(bundle_path, "wb") as f:
        joblib.dump(models, f, compress=7)

# Display the bundle of the last fold
models

{'mae': RandomForestRegressor(random_state=13),
 'rmse': RandomForestRegressor(random_state=13),
 'model_size': RandomForestRegressor(random_state=13),
 'runtime': RandomForestRegressor(random_state=13)}

## Production models (full training data)
Four keys in the dictionary, one per objective ("mae", "rmse", "model_size", "runtime"); the value of each key is the model trained for that objective.
This dictionary of trained models is stored in the models folder with the pickle file name "surrogate_models.pkl"

In [35]:
def train_and_save(
    sr_feat_array,
    sr_labels_array,
    sr_label_names,
    random_state=13,
):
    """Fit one random forest regressor per target.

    Despite the name, nothing is written to disk here; the fitted models are
    only returned, and persisting them is left to the caller.

    Parameters
    ----------
    sr_feat_array : iterable of array-like
        One feature matrix per target.
    sr_labels_array : iterable of array-like
        One target vector per target, in the same order as ``sr_feat_array``.
    sr_label_names : iterable of str
        Dictionary key for each fitted model, in the same order.
    random_state : int, default 13
        Seed passed to every RandomForestRegressor.

    Returns
    -------
    dict
        Maps each name in ``sr_label_names`` to its fitted model.

    Raises
    ------
    ValueError
        If the three inputs do not contain the same number of items.
    """
    sr_feat_array = list(sr_feat_array)
    sr_labels_array = list(sr_labels_array)
    sr_label_names = list(sr_label_names)

    # zip() would silently drop the unmatched trailing items, which would
    # leave targets without a model, so fail loudly instead.
    if not (len(sr_feat_array) == len(sr_labels_array) == len(sr_label_names)):
        raise ValueError(
            "sr_feat_array, sr_labels_array and sr_label_names must have the "
            f"same length, got {len(sr_feat_array)}, {len(sr_labels_array)} "
            f"and {len(sr_label_names)}"
        )

    models = {}

    for X1, y1, name1 in zip(sr_feat_array, sr_labels_array, sr_label_names):
        print(f"X1 sr shape: {X1.shape}, Y1 sr shape: {y1.shape}")
        model = RandomForestRegressor(random_state=random_state)
        model.fit(X1, y1)
        models[name1] = model

    return models

In [36]:
# List of x_arrays, y_arrays, and target_names
sobol_reg_x_arrays = [X_array_mae, X_array_rmse, X_array_model_size, X_array_runtime]
sobol_reg_labels = [y_array_mae, y_array_rmse, y_array_model_size, y_array_runtime]
sobol_reg_target_names = ["mae", "rmse", "model_size", "runtime"]

# Train and save the model on all the data
models = train_and_save(
    sobol_reg_x_arrays,
    sobol_reg_labels,
    sobol_reg_target_names,
)

joblib.dump(models, path.join(model_dir, "surrogate_models.pkl"), compress=7)

X1 sr shape: (173219, 29), Y1 sr shape: (173219,)
X1 sr shape: (173219, 29), Y1 sr shape: (173219,)
X1 sr shape: (173219, 28), Y1 sr shape: (173219,)
X1 sr shape: (173219, 29), Y1 sr shape: (173219,)


['..\\..\\models\\crabnet_hyperparameter\\surrogate_models.pkl']