# CrabNet Hyperparameter Surrogate Model

Here, we train a surrogate model for the CrabNet hyperparameter optimization on Matbench
datasets. We capture average model performance (MAE and RMSE) as well as model
complexity and runtime.

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, GroupKFold
import pandas as pd
import numpy as np
import joblib
from os import path
import json

# attempted use of skl2onnx to convert to onnx failing due to protobuf error
# https://github.com/onnx/onnx/issues/4469

# from skl2onnx import convert_sklearn
# from skl2onnx.common.data_types import FloatTensorType

In [2]:
dummy = False

task_name = "crabnet_hyperparameter"

data_dir = path.join("..", "..", "data", "processed", task_name)
model_dir = path.join("..", "..", "models", task_name)

if dummy:
    model_dir = path.join(model_dir, "dummy")

## Import Data

In [3]:
sobol_reg = pd.read_csv(path.join(data_dir, "sobol_regression.csv"))

if dummy:
    sobol_reg = sobol_reg.head(100)

### Prepare data for Random Forest Classifier -- Convert Categorical Data to Numerical Data

In [4]:
# bias_ohe = pd.get_dummies(sobol_reg["bias"], prefix="bias")
criterion_ohe = pd.get_dummies(sobol_reg["criterion"], prefix="criterion")
elemprop_ohe = pd.get_dummies(sobol_reg["elem_prop"], prefix="elem_prop")
hardware_ohe = pd.get_dummies(sobol_reg["hardware"], prefix="hardware")

sobol_reg["bias"] = sobol_reg["bias"].astype(int)

sobol_reg = pd.concat([sobol_reg, criterion_ohe, elemprop_ohe, hardware_ohe], axis=1)

sobol_reg.drop(columns=["criterion", "elem_prop", "hardware"], inplace=True)

## define f(x) to calc mae scores 

In [5]:
# argument for rfr_mae, X_array, y_array, model_name to save model as .pkl
def rfr_group_mae(X_array, y_array, group_array, model_name_stem, objective_name, random_state=13):
    kf = GroupKFold(n_splits=5)
    mae_scores = []
    y_preds = []
    y_trues = []
    for i, (train_index, test_index) in enumerate(
        kf.split(X_array, y_array, group_array)
    ):
        X_train, X_test = X_array[train_index], X_array[test_index]
        y_train, y_test = y_array[train_index], y_array[test_index]
        y_test = y_test.tolist()

        model = RandomForestRegressor(random_state=random_state)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test).tolist()

        y_preds.append(y_pred)
        y_trues.append(y_test)
        mae = mean_absolute_error(y_test, y_pred)
        mae_scores.append(mae)
        # save model as .pkl
        joblib.dump(model, f"{model_name_stem}_{i}.pkl", compress=3)

    avg_mae = np.mean(mae_scores)
    std_mae = np.std(mae_scores)
    
    print(f"MAE for {objective_name}: {avg_mae:.4f} +/- {std_mae:.4f}")
    results = {"mae": mae_scores, "y_pred": y_preds, "y_true": y_trues}
    return results

## Features

In [6]:
common_features = [
    "N",
    "alpha",
    "d_model",
    "dim_feedforward",
    "dropout",
    "emb_scaler",
    "eps",
    "epochs_step",
    "fudge",
    "heads",
    "k",
    "lr",
    "pe_resolution",
    "ple_resolution",
    "pos_scaler",
    "weight_decay",
    "batch_size",
    "out_hidden4",
    "betas1",
    "betas2",
    "train_frac",
    "bias",
    "criterion_RobustL1",
    "criterion_RobustL2",
    "elem_prop_magpie",
    "elem_prop_mat2vec",
    "elem_prop_onehot",
    "hardware_2080ti"
]

# fba_isna_prob_features = common_features
# ls_isna_prob_features = common_features

# fba_features = common_features + ["fba_rank"]
# ls_features = common_features + ["ls_rank"]

# fba_time_s_features = common_features + ["fba_time_s_rank"]
# ls_time_s_features = common_features + ["ls_time_s_rank"]

## Objective Surrogate Models
#### no NaN values
### mae

In [None]:
mae_features = common_features + ["mae_rank"]  ## ToDo: mae or mae_rank (mae.1)

X_array_mae = sobol_reg[mae_features].to_numpy()
y_array_mae = sobol_reg[["mae"]].to_numpy().ravel()

sobol_reg_mae_group = (
    sobol_reg[mae_features]
    .round(6)
    .apply(lambda row: "_".join(row.values.astype(str)), axis=1)
)

mae_model_stem = path.join(model_dir, "sobol_reg_mae")
mae_results = rfr_group_mae(
    X_array_mae, y_array_mae, sobol_reg_mae_group, mae_model_stem, "mae"
)

### rmse

In [None]:
rmse_features = common_features + ["rmse_rank"]  ## ToDo: rmse or rmse_rank (rmse.1)

X_array_rmse = sobol_reg[rmse_features].to_numpy()
y_array_rmse = sobol_reg[["rmse"]].to_numpy().ravel()

sobol_reg_rmse_group = (
    sobol_reg[rmse_features]
    .round(6)
    .apply(lambda row: "_".join(row.values.astype(str)), axis=1)
)

rmse_model_stem = path.join(model_dir, "sobol_reg_rmse")
rmse_results = rfr_group_mae(
    X_array_rmse, y_array_rmse, sobol_reg_rmse_group, rmse_model_stem, "rmse"
)

MAE for rmse: 0.0131 +/- 0.0001


### model size ###

In [None]:
model_size_features = common_features 

X_array_model_size = sobol_reg[model_size_features].to_numpy()
y_array_model_size = sobol_reg[["model_size"]].to_numpy().ravel()

sobol_reg_model_size_group = (
    sobol_reg[model_size_features]
    .round(6)
    .apply(lambda row: "_".join(row.values.astype(str)), axis=1)
)

model_size_model_stem = path.join(model_dir, "sobol_reg_model_size")
model_size_results = rfr_group_mae(
    X_array_model_size, y_array_model_size, sobol_reg_model_size_group, model_size_model_stem, "model_size"
)

MAE for model_size: 317796.8646 +/- 6005.9939


### runtime

In [None]:
runtime_features = common_features + [
    "runtime_rank"
]  ## ToDo: runtime or runtime_rank (runtime.1)

X_array_runtime = sobol_reg[runtime_features].to_numpy()
y_array_runtime = sobol_reg[["runtime"]].to_numpy().ravel()

sobol_reg_runtime_group = (
    sobol_reg[runtime_features]
    .round(6)
    .apply(lambda row: "_".join(row.values.astype(str)), axis=1)
)

runtime_model_stem = path.join(model_dir, "sobol_reg_runtime")
runtime_results = rfr_group_mae(
    X_array_runtime,
    y_array_runtime,
    sobol_reg_runtime_group,
    runtime_model_stem,
    "runtime",
)

MAE for runtime: 13.9175 +/- 0.7062


In [None]:
main_results = {
    "mae": mae_results,
    "rmse": rmse_results,
    "model_size": model_size_results,
    "runtime": runtime_results,
}
with open(path.join(data_dir, "model_metadata.json"), "w") as f:
    json.dump(main_results, f)
    
main_results

{'mae': {'mae': [0.008843673155521613,
   0.00876743657668446,
   0.008776292228330054,
   0.008869791280505015,
   0.00888197709113839],
  'y_pred': [[1.143685956497153,
    1.049450676093647,
    1.0132582492980349,
    1.006013309125075,
    0.37358981739413893,
    0.39343932512986624,
    0.7384827751272646,
    0.45743197004532715,
    0.9390963918693199,
    0.275529778364774,
    0.5295399840993557,
    0.7153759215229314,
    0.8367099148292506,
    0.439834836188201,
    0.43128012443861946,
    0.1870799480683385,
    0.44988394259583875,
    0.7667295673562018,
    0.502403886077206,
    0.45904044102559155,
    0.4187973945480961,
    0.45323562016513086,
    0.44784118516092036,
    0.8945712809054374,
    0.4612768772530723,
    0.421191231800593,
    0.3664828980154279,
    0.565143254698097,
    0.4752084725455279,
    0.5645768780186434,
    0.3766565157304551,
    0.5714010503619917,
    0.5136586488561202,
    0.5257359084128548,
    0.4248311020967094,
    0.416483

In [None]:
model_paths = {
    "mae": mae_model_stem,
    "rmse": rmse_model_stem,
    "model_size": model_size_model_stem,
    "runtime": runtime_model_stem,
}
for i in range(5):
    models = {}
    for key, model_path in model_paths.items():
        models[key] = joblib.load(f"{model_path}_{i}.pkl")

    with open(path.join(data_dir, f"cross_validation_models_{i}.pkl"), "wb") as f:
        joblib.dump(models, f, compress=3)

models

{'mae': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)],
 'rmse': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)],
 'model_size': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)],
 'runtime': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)]}

## Production models (full training data)
Six keys in the dictionary, each key is a value of a label, and its value pair is the trained model.
This trained model is stored in the models folder with the pickle file name "trained_model.pkl"

In [None]:
def train_and_save(
    sr_feat_array,
    sr_labels_array,
    sr_label_names,
):
    models = {}

    for X1, y1, name1 in zip(sr_feat_array, sr_labels_array, sr_label_names):
        print(f"X1 sr shape: {X1.shape}, Y1 sr shape: {y1.shape}")
        model = RandomForestRegressor(random_state=13)
        model.fit(X1, y1)
        models[name1] = model

    return models

In [None]:
# List of x_arrays, y_arrays, and target_names
sobol_reg_x_arrays = [X_array_mae, X_array_rmse, X_array_model_size, X_array_runtime]
sobol_reg_labels = [y_array_mae, y_array_rmse, y_array_model_size, y_array_runtime]
sobol_reg_target_names = ["mae", "rmse", "model_size", "runtime"]

# Train and save the model on all the data
models = train_and_save(
    sobol_reg_x_arrays,
    sobol_reg_labels,
    sobol_reg_target_names,
)

joblib.dump(models, path.join(model_dir, "surrogate_models.pkl"), compress=3)

X1 spf shape: (41228, 9), Y1 spf shape: (41228,)
X1 spf shape: (41228, 9), Y1 spf shape: (41228,)
X2 sr shape: (408504, 10), Y2 sr shape: (408504,)
X2 sr shape: (338467, 10), Y2 sr shape: (338467,)
X2 sr shape: (438371, 10), Y2 sr shape: (438371,)
X2 sr shape: (438371, 10), Y2 sr shape: (438371,)


['..\\..\\models\\particle_packing\\surrogate_models.pkl']

### code graveyard 

In [None]:
### Ordinal Encoding

# "bias" [False, True]},
# "criterion" ["RobustL1", "RobustL2"]},
# "elem_prop"["mat2vec", "magpie", "onehot"],

# sobol_reg["bias"].replace(["False", "True"], [0,1], inplace=True)
# sobol_reg["criterion"].replace(["RobustL1", "RobustL2"], [0,1], inplace=True)
# sobol_reg["elem_prop"].replace(["mat2vec", "magpie", "onehot"], [0,1,2], inplace=True)
# sobol_reg["hardware"].replace(["2080ti"], [0], inplace=True)


In [None]:
# print("Average MAE for fba_isna_prob",rfr_mae(X_array_fba_isna_prob, y_array_fba_isna_prob,'fba_isna_prob.pkl'))

# load trained model
# loaded_model = joblib.load('fba_isna_prob_model.pkl')

# Save the model
# with open('../models/fba_isna_prob.pkl', 'wb') as f:
#     pickle.dump(model, f)

# # Load the model
# with open('path/to/save/model.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)

In [None]:
# url_sobol_filter = "https://zenodo.org/record/7513019/files/sobol_probability_filter.csv"
# sobol_filter = pd.read_csv(url_sobol_filter)

# url_sobol_reg = "https://zenodo.org/record/7513019/files/sobol_regression.csv"
# sobol_reg = pd.read_csv(url_sobol_reg)

In [None]:
# os.getcwd()
# os.chdir("../data/raw")

# sobol_filter.to_csv('sobol_filter.csv', index=False)

# sobol_reg.to_csv('sobol_reg.csv', index=False)

In [None]:
# read in sobol_regression.csv
# url_sobol_reg = "https://zenodo.org/record/7513019/files/sobol_regression.csv"

In [None]:
# sobol_reg_x = sobol_reg[
#     [
#         "mu1_div_mu3",
#         "mu2_div_mu3",
#         "std1",
#         "std2",
#         "std3",
#         "comp1",
#         "comp2",
#         "num_particles",
#         "safety_factor",
#         "fba_rank",
#         "ls_rank",
#         "fba_time_s_rank",
#         "ls_time_s_rank",
#     ]
# ]

In [None]:
# print(len(sobol_reg_x))
# print(len(fba))

In [None]:
# print(
#     "Average MAE for ls_time_s",
#     rfr_mae(X_array_fba_time_s, y_array_ls_time_s, "sobol_reg_ls_time_s.pkl"),
# )

In [None]:
# # parse data for target "fba_isna_prob"
# fba_isna_prob = sobol_filter["fba_isna_prob"]
# sobolPF_fba_isna_prob = sobol_filter.drop(["ls_isna_prob", "fba_isna_prob"], axis=1)
# fba_isna_prob = fba_isna_prob.to_frame()