# CrabNet Hyperparameter Surrogate Model

Here, we train a surrogate model for the CrabNet hyperparameter optimization on Matbench
datasets. We capture average model performance (MAE and RMSE) as well as model
complexity and runtime.

In [37]:
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import joblib
from os import path
import json

# attempted use of skl2onnx to convert to onnx failing due to protobuf error
# https://github.com/onnx/onnx/issues/4469

# from skl2onnx import convert_sklearn
# from skl2onnx.common.data_types import FloatTensorType

In [38]:
dummy = True

task_name = "crabnet_hyperparameter"

data_dir = path.join("..", "..", "data", "processed", task_name)
model_dir = path.join("..", "..", "models", task_name)

if dummy:
    model_dir = path.join(model_dir, "dummy")

## Import Data

In [39]:
sobol_reg = pd.read_csv(path.join(data_dir, "sobol_regression.csv"))

if dummy:
    sobol_reg = sobol_reg.head(100)

In [40]:
sobol_reg.hardware

0     2080ti
1     2080ti
2     2080ti
3     2080ti
4     2080ti
       ...  
95    2080ti
96    2080ti
97    2080ti
98    2080ti
99    2080ti
Name: hardware, Length: 100, dtype: object

### Prepare data for Random Forest Classifier -- Convert Categorical Data to Numerical Data

In [41]:
# "bias" [False, True]},
# "criterion" ["RobustL1", "RobustL2"]},
# "elem_prop"["mat2vec", "magpie", "onehot"],

sobol_reg["bias"].replace(["False", "True"], [0,1], inplace=True)
sobol_reg["criterion"].replace(["RobustL1", "RobustL2"], [0,1], inplace=True)
sobol_reg["elem_prop"].replace(["mat2vec", "magpie", "onehot"], [0,1,2], inplace=True)
sobol_reg["hardware"].replace(["2080ti"], [0], inplace=True)


## define f(x) to calc mae scores 

In [42]:
# argument for rfr_mae, X_array, y_array, model_name to save model as .pkl
def rfr_mae(X_array, y_array, model_name_stem, objective_name, random_state=13):
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    mae_scores = []
    y_preds = []
    y_trues = []
    for i, (train_index, test_index) in enumerate(kf.split(X_array)):
        X_train, X_test = X_array[train_index], X_array[test_index]
        y_train, y_test = y_array[train_index], y_array[test_index]
        y_test = y_test.tolist()

        model = RandomForestRegressor(random_state=random_state, )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test).tolist()

        y_preds.append(y_pred)
        y_trues.append(y_test)
        mae = mean_absolute_error(y_test, y_pred)
        mae_scores.append(mae)
        # save model as .pkl
        joblib.dump(model, f"{model_name_stem}_{i}.pkl")

    avg_mae = np.mean(mae_scores)
    std_mae = np.std(mae_scores)
    
    print(f"MAE for {objective_name}: {avg_mae:.4f} +/- {std_mae:.4f}")
    results = {"mae": mae_scores, "y_pred": y_preds, "y_true": y_trues}
    return results

## Features

In [43]:
common_features = [
    "N",
    "alpha",
    "d_model",
    "dim_feedforward",
    "dropout",
    "emb_scaler",
    "eps",
    "epochs_step",
    "fudge",
    "heads",
    "k",
    "lr",
    "pe_resolution",
    "ple_resolution",
    "pos_scaler",
    "weight_decay",
    "batch_size",
    "out_hidden4",
    "betas1",
    "betas2",
    "bias",
    "train_frac",
    "criterion",
    "elem_prop",
    "hardware"
]

# fba_isna_prob_features = common_features
# ls_isna_prob_features = common_features

# fba_features = common_features + ["fba_rank"]
# ls_features = common_features + ["ls_rank"]

# fba_time_s_features = common_features + ["fba_time_s_rank"]
# ls_time_s_features = common_features + ["ls_time_s_rank"]

## Objective Surrogate Models
#### no NaN values
### mae

In [44]:
mae_features = common_features + ["mae"] ## ToDo: mae or mae_rank (mae.1)

X_array_mae = sobol_reg[mae_features].to_numpy()
y_array_mae = sobol_reg[["mae"]].to_numpy().ravel()

mae_model_stem = path.join(model_dir, "sobol_reg_mae")
mae_results = rfr_mae(
    X_array_mae, y_array_mae, mae_model_stem, "mae"
)

MAE for mae: 0.0138 +/- 0.0041


### rmse

In [46]:
rmse_features = common_features + ["rmse"] ## ToDo: rmse or rmse_rank (rmse.1)

X_array_rmse = sobol_reg[rmse_features].to_numpy()
y_array_rmse = sobol_reg[["rmse"]].to_numpy().ravel()

rmse_model_stem = path.join(model_dir, "sobol_reg_rmse")
rmse_results = rfr_mae(
    X_array_rmse, y_array_rmse, rmse_model_stem, "rmse"
)

MAE for rmse: 0.0173 +/- 0.0086


### model size ###

In [47]:
model_size_features = common_features + ["model_size"]  

X_array_model_size = sobol_reg[model_size_features].to_numpy()
y_array_model_size = sobol_reg[["model_size"]].to_numpy().ravel()

model_size_model_stem = path.join(model_dir, "sobol_reg_model_size")
model_size_results = rfr_mae(
    X_array_model_size, y_array_model_size, model_size_model_stem, "model_size"
)

MAE for model_size: 1522247.3779 +/- 496881.7467


### runtime

In [48]:
runtime_features = common_features + ["runtime"] ## ToDo: runtime or runtime_rank (runtime.1)

X_array_runtime = sobol_reg[runtime_features].to_numpy()
y_array_runtime = sobol_reg[["runtime"]].to_numpy().ravel()

runtime_model_stem = path.join(model_dir, "sobol_reg_runtime")
runtime_results = rfr_mae(
    X_array_runtime, y_array_runtime, runtime_model_stem, "runtime"
)

MAE for runtime: 9.6531 +/- 4.0121


In [49]:
main_results = {
    "mae": mae_results,
    "rmse": rmse_results,
    "model_size": model_size_results,
    "runtime": runtime_results,
}
with open(path.join(data_dir, "model_metadata.json"), "w") as f:
    json.dump(main_results, f)
    
main_results

{'mae': {'mae': [0.00873629583998393,
   0.021125257885024798,
   0.01468574643052134,
   0.012036958824605478,
   0.012657993983800585],
  'y_pred': [[0.9804798298980143,
    1.0704216156827264,
    1.0384423449393112,
    0.7554203608418216,
    0.4552424729967268,
    0.9021641572926181,
    0.4564116129362864,
    0.3573934759859297,
    0.7487551073742119,
    0.2934022986528932,
    0.4457781215063901,
    0.4952723437591511,
    0.8675134726035815,
    0.3383145378388403,
    0.5095859582063202,
    0.44153246550223274,
    0.8548310985391713,
    0.7743755884528828,
    0.47692707807582124,
    0.6827234190285358],
   [1.0882155435541685,
    1.0882155435541685,
    1.0310824160431828,
    1.0882155435541685,
    0.7736987222597862,
    0.37050083409900025,
    0.4955759452782532,
    0.5077091741174563,
    0.8279504895360477,
    0.3057328192710067,
    0.6040742778862936,
    0.5071647923925914,
    0.4265712879103687,
    0.43884194442552626,
    0.396356306105412,
    0.47

In [50]:
model_paths = {
    "mae": mae_model_stem,
    "rmse": rmse_model_stem,
    "model_size": model_size_model_stem,
    "runtime": runtime_model_stem,
}
models = {}
for key, model_path in model_paths.items():
    models[key] = [joblib.load(f"{model_path}_{i}.pkl") for i in range(5)]

with open(path.join(data_dir, "cross_validation_models.pkl"), "wb") as f:
    pickle.dump(models, f)

models

{'mae': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)],
 'rmse': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)],
 'model_size': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)],
 'runtime': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)]}

### code graveyard 

In [None]:
# print("Average MAE for fba_isna_prob",rfr_mae(X_array_fba_isna_prob, y_array_fba_isna_prob,'fba_isna_prob.pkl'))

# load trained model
# loaded_model = joblib.load('fba_isna_prob_model.pkl')

# Save the model
# with open('../models/fba_isna_prob.pkl', 'wb') as f:
#     pickle.dump(model, f)

# # Load the model
# with open('path/to/save/model.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)

In [None]:
# url_sobol_filter = "https://zenodo.org/record/7513019/files/sobol_probability_filter.csv"
# sobol_filter = pd.read_csv(url_sobol_filter)

# url_sobol_reg = "https://zenodo.org/record/7513019/files/sobol_regression.csv"
# sobol_reg = pd.read_csv(url_sobol_reg)

In [None]:
# os.getcwd()
# os.chdir("../data/raw")

# sobol_filter.to_csv('sobol_filter.csv', index=False)

# sobol_reg.to_csv('sobol_reg.csv', index=False)

In [None]:
# read in sobol_regression.csv
# url_sobol_reg = "https://zenodo.org/record/7513019/files/sobol_regression.csv"

In [None]:
# sobol_reg_x = sobol_reg[
#     [
#         "mu1_div_mu3",
#         "mu2_div_mu3",
#         "std1",
#         "std2",
#         "std3",
#         "comp1",
#         "comp2",
#         "num_particles",
#         "safety_factor",
#         "fba_rank",
#         "ls_rank",
#         "fba_time_s_rank",
#         "ls_time_s_rank",
#     ]
# ]

In [None]:
# print(len(sobol_reg_x))
# print(len(fba))

In [None]:
# print(
#     "Average MAE for ls_time_s",
#     rfr_mae(X_array_fba_time_s, y_array_ls_time_s, "sobol_reg_ls_time_s.pkl"),
# )

In [None]:
# # parse data for target "fba_isna_prob"
# fba_isna_prob = sobol_filter["fba_isna_prob"]
# sobolPF_fba_isna_prob = sobol_filter.drop(["ls_isna_prob", "fba_isna_prob"], axis=1)
# fba_isna_prob = fba_isna_prob.to_frame()